From dd8869c2604ab9ce32e69397b23a601003de71df Mon Sep 17 00:00:00 2001 From: Edoardo Spadolini Date: Tue, 22 Jul 2025 18:52:09 +0200 Subject: [PATCH] Add an optional delay before Teleport shutdown --- .../config-reference/instance-wide.yaml | 7 +++ lib/config/configuration.go | 2 + lib/config/configuration_test.go | 4 ++ lib/config/fileconf.go | 4 ++ lib/config/testdata_test.go | 1 + lib/service/service.go | 52 ++++++++++++++++--- lib/service/servicecfg/config.go | 4 ++ 7 files changed, 66 insertions(+), 8 deletions(-) diff --git a/docs/pages/includes/config-reference/instance-wide.yaml b/docs/pages/includes/config-reference/instance-wide.yaml index ae423f2c1a2c9..d53638e5ac9ca 100644 --- a/docs/pages/includes/config-reference/instance-wide.yaml +++ b/docs/pages/includes/config-reference/instance-wide.yaml @@ -96,6 +96,13 @@ teleport: # # The cache is enabled by default, it can be disabled with this flag # enabled: true + # The duration (in string form) of the delay between receiving a termination + # signal and the beginning of the shutdown procedures. It can be used to + # give time to load balancers to stop routing connections to the Teleport + # instance while the instance is still capable of handling them. If unset or + # negative, no delay is applied. + #shutdown_delay: "0s" + # Teleport can limit the number of connections coming from each client # IP address to avoid abuse. Note that these limits are enforced separately # for each service (SSH, Kubernetes, etc.) diff --git a/lib/config/configuration.go b/lib/config/configuration.go index fb1f1df2ad980..e653511084396 100644 --- a/lib/config/configuration.go +++ b/lib/config/configuration.go @@ -618,6 +618,8 @@ func ApplyFileConfig(fc *FileConfig, cfg *servicecfg.Config) error { } cfg.CachePolicy = *cachePolicy + cfg.ShutdownDelay = time.Duration(fc.ShutdownDelay) + // Apply (TLS) cipher suites and (SSH) ciphers, KEX algorithms, and MAC // algorithms. if len(fc.CipherSuites) > 0 { diff --git a/lib/config/configuration_test.go b/lib/config/configuration_test.go index 17aedcdd1661c..90b60d32577c8 100644 --- a/lib/config/configuration_test.go +++ b/lib/config/configuration_test.go @@ -795,6 +795,8 @@ func TestApplyConfig(t *testing.T) { require.Equal(t, "tcp://127.0.0.1:3000", cfg.DiagnosticAddr.FullAddress()) + require.Equal(t, 7*time.Minute+35*time.Second, cfg.ShutdownDelay) + u2fCAFromFile, err := os.ReadFile("testdata/u2f_attestation_ca.pem") require.NoError(t, err) require.Empty(t, cmp.Diff(cfg.Auth.Preference, &types.AuthPreferenceV2{ @@ -1419,6 +1421,8 @@ func checkStaticConfig(t *testing.T, conf *FileConfig) { require.Equal(t, "10.10.10.1:3022", conf.AdvertiseIP) require.Equal(t, "/var/run/teleport.pid", conf.PIDFile) + require.Zero(t, conf.ShutdownDelay) + require.Empty(t, cmp.Diff(conf.Limits, ConnectionLimits{ MaxConnections: 90, MaxUsers: 91, diff --git a/lib/config/fileconf.go b/lib/config/fileconf.go index 3b04a5d6382d1..81a38e2aaafd3 100644 --- a/lib/config/fileconf.go +++ b/lib/config/fileconf.go @@ -612,6 +612,10 @@ type Global struct { AdvertiseIP string `yaml:"advertise_ip,omitempty"` CachePolicy CachePolicy `yaml:"cache,omitempty"` + // ShutdownDelay is a fixed delay between receiving a termination signal and + // the beginning of the shutdown procedures. + ShutdownDelay types.Duration `yaml:"shutdown_delay,omitempty"` + // CipherSuites is a list of TLS ciphersuites that Teleport supports. If // omitted, a Teleport selected list of defaults will be used. CipherSuites []string `yaml:"ciphersuites,omitempty"` diff --git a/lib/config/testdata_test.go b/lib/config/testdata_test.go index ba2d2d1bfdd62..83268682874b2 100644 --- a/lib/config/testdata_test.go +++ b/lib/config/testdata_test.go @@ -96,6 +96,7 @@ teleport: log: output: stderr severity: INFO + shutdown_delay: "7m35s" connection_limits: max_connections: 90 max_users: 91 diff --git a/lib/service/service.go b/lib/service/service.go index a36ffca81b539..abb7983e86d14 100644 --- a/lib/service/service.go +++ b/lib/service/service.go @@ -280,6 +280,18 @@ const ( // all listening sockets and exiting. TeleportExitEvent = "TeleportExit" + // TeleportTerminatingEvent is generated when the Teleport process receives + // a signal to shut down. It's always generated as part of the process + // lifecycle and it's always generated before TeleportExitEvent, but there + // might be some configured delay between this event and the + // TeleportExitEvent signaling the actual beginning of the shut down + // procedures. It should be used to advertise the fact that the Teleport + // instance is going to shut down at some near time in the future, not to + // reduce the functionality of services - i.e., it's perfectly fine for + // services to ignore this event altogether, and nothing should get closed + // as a result of this event. + TeleportTerminatingEvent = "TeleportTerminating" + // TeleportPhaseChangeEvent is generated to indicate that the CA rotation // phase has been updated, used in tests. TeleportPhaseChangeEvent = "TeleportPhaseChange" @@ -6497,24 +6509,45 @@ func (process *TeleportProcess) WaitWithContext(ctx context.Context) { // StartShutdown launches non-blocking graceful shutdown process that signals // completion, returns context that will be closed once the shutdown is done func (process *TeleportProcess) StartShutdown(ctx context.Context) context.Context { - // by the time we get here we've already extracted the parent pipe, which is - // the only potential imported file descriptor that's not a listening - // socket, so closing every imported FD with a prefix of "" will close all - // imported listeners that haven't been used so far - warnOnErr(process.ExitContext(), process.closeImportedDescriptors(""), process.logger) - warnOnErr(process.ExitContext(), process.stopListeners(), process.logger) + shutdownDelayTimer := process.Clock.NewTimer(process.Config.ShutdownDelay) + defer shutdownDelayTimer.Stop() hasChildren := process.forkedTeleportCount.Load() > 0 + if hasChildren { + ctx = services.ProcessForkedContext(ctx) + } + + process.BroadcastEvent(Event{Name: TeleportTerminatingEvent}) + if process.inventoryHandle != nil { deleteResources := !hasChildren if err := process.inventoryHandle.SetAndSendGoodbye(ctx, deleteResources, hasChildren); err != nil { process.logger.WarnContext(process.ExitContext(), "Failed sending inventory goodbye during shutdown", "error", err) } } - if hasChildren { - ctx = services.ProcessForkedContext(ctx) + + if d := process.Config.ShutdownDelay; d > 0 { + if hasChildren { + process.logger.InfoContext(ctx, "Ignoring shutdown delay due to the presence of forked processes") + } else { + process.logger.InfoContext(ctx, "Waiting for shutdown delay", "shutdown_delay", d.String()) + select { + case <-shutdownDelayTimer.Chan(): + case <-process.ExitContext().Done(): + process.logger.WarnContext(ctx, "Skipping shutdown delay early due to process exit") + case <-ctx.Done(): + process.logger.WarnContext(ctx, "Skipping shutdown delay early due to context cancellation") + } + } } + // by the time we get here we've already extracted the parent pipe, which is + // the only potential imported file descriptor that's not a listening + // socket, so closing every imported FD with a prefix of "" will close all + // imported listeners that haven't been used so far + warnOnErr(process.ExitContext(), process.closeImportedDescriptors(""), process.logger) + warnOnErr(process.ExitContext(), process.stopListeners(), process.logger) + process.BroadcastEvent(Event{Name: TeleportExitEvent, Payload: ctx}) localCtx, cancel := context.WithCancel(ctx) go func() { @@ -6555,6 +6588,9 @@ func (process *TeleportProcess) Shutdown(ctx context.Context) { // Close broadcasts close signals and exits immediately func (process *TeleportProcess) Close() error { + // generate a TeleportTerminatingEvent to unblock any service waiting on + // that event before TeleportExitEvent + process.BroadcastEvent(Event{Name: TeleportTerminatingEvent}) process.BroadcastEvent(Event{Name: TeleportExitEvent}) var errors []error diff --git a/lib/service/servicecfg/config.go b/lib/service/servicecfg/config.go index 99df9bf050910..5fc6dc8354a8b 100644 --- a/lib/service/servicecfg/config.go +++ b/lib/service/servicecfg/config.go @@ -84,6 +84,10 @@ type Config struct { // in case if they lose connection to auth servers CachePolicy CachePolicy + // ShutdownDelay is a fixed delay between receiving a termination signal and + // the beginning of the shutdown procedures. + ShutdownDelay time.Duration + // Auth service configuration. Manages cluster state and configuration. Auth AuthConfig