From 91c5c1f0da36a2ea7a0fc13db8404ec2f84d18b2 Mon Sep 17 00:00:00 2001 From: Edoardo Spadolini Date: Mon, 5 Feb 2024 22:57:22 +0100 Subject: [PATCH 1/5] clean up pidfile and reloads --- go.mod | 2 +- lib/service/service.go | 20 ++++----- lib/service/signals.go | 100 ++++++++++------------------------------- 3 files changed, 34 insertions(+), 88 deletions(-) diff --git a/go.mod b/go.mod index a6f686b77b6db..fb4bfc18afbd3 100644 --- a/go.mod +++ b/go.mod @@ -99,6 +99,7 @@ require ( github.com/google/go-containerregistry v0.17.0 github.com/google/go-querystring v1.1.0 github.com/google/go-tpm-tools v0.4.2 + github.com/google/renameio/v2 v2.0.0 github.com/google/shlex v0.0.0-20191202100458-e7afc7fbc510 github.com/google/uuid v1.5.0 github.com/googleapis/gax-go/v2 v2.12.0 @@ -342,7 +343,6 @@ require ( github.com/google/go-tpm v0.9.0 // indirect github.com/google/go-tspi v0.3.0 // indirect github.com/google/gofuzz v1.2.0 // indirect - github.com/google/renameio/v2 v2.0.0 // indirect github.com/google/s2a-go v0.1.7 // indirect github.com/googleapis/enterprise-certificate-proxy v0.3.2 // indirect github.com/gorilla/handlers v1.5.2 // indirect diff --git a/lib/service/service.go b/lib/service/service.go index a9c19aea4787b..a4eac7287c9fa 100644 --- a/lib/service/service.go +++ b/lib/service/service.go @@ -46,6 +46,7 @@ import ( "time" awscredentials "github.com/aws/aws-sdk-go/aws/credentials" + "github.com/google/renameio/v2" "github.com/google/uuid" "github.com/gravitational/roundtrip" "github.com/gravitational/trace" @@ -389,10 +390,9 @@ type TeleportProcess struct { // closed all the listeners) listenersClosed bool - // forkedPIDs is a collection of a teleport processes forked - // during restart used to collect their status in case if the - // child process crashed. - forkedPIDs []int + // forkedTeleportCount is the count of forked Teleport child processes + // currently active, as spawned by SIGHUP or SIGUSR2. + forkedTeleportCount atomic.Int32 // storage is a server local storage storage *auth.ProcessStorage @@ -738,6 +738,9 @@ func waitAndReload(ctx context.Context, cfg servicecfg.Config, srv Process, newT } newCfg := cfg newCfg.FileDescriptors = fileDescriptors + // our PID hasn't changed as we reload in-process, and if we're no longer + // the "main" Teleport process we don't want to overwrite the PID file + newCfg.PIDFile = "" newSrv, err := newTeleport(&newCfg) if err != nil { warnOnErr(srv.Close(), cfg.Log) @@ -1252,14 +1255,9 @@ func NewTeleport(cfg *servicecfg.Config) (*TeleportProcess, error) { // create the new pid file only after started successfully if cfg.PIDFile != "" { - f, err := os.OpenFile(cfg.PIDFile, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0o666) - if err != nil { + if err := renameio.WriteFile(cfg.PIDFile, []byte(strconv.Itoa(os.Getpid())), 0o644); err != nil { return nil, trace.ConvertSystemError(err) } - _, err = fmt.Fprintf(f, "%v", os.Getpid()) - if err = trace.NewAggregate(err, f.Close()); err != nil { - return nil, trace.Wrap(err) - } } // notify parent process that this process has started @@ -5528,7 +5526,7 @@ func (process *TeleportProcess) StartShutdown(ctx context.Context) context.Conte warnOnErr(process.stopListeners(), process.log) // populate context values - if len(process.getForkedPIDs()) > 0 { + if process.forkedTeleportCount.Load() > 0 { ctx = services.ProcessForkedContext(ctx) } diff --git a/lib/service/signals.go b/lib/service/signals.go index 4bae75ab4c978..70815456e8d8a 100644 --- a/lib/service/signals.go +++ b/lib/service/signals.go @@ -22,6 +22,7 @@ import ( "context" "encoding/json" "fmt" + "io" "net" "os" "os/exec" @@ -64,7 +65,6 @@ func (process *TeleportProcess) WaitForSignals(ctx context.Context) error { syscall.SIGUSR1, // log process diagnostic info syscall.SIGUSR2, // initiate process restart procedure syscall.SIGHUP, // graceful restart procedure - syscall.SIGCHLD, // collect child status ) defer signal.Stop(sigC) @@ -124,8 +124,6 @@ func (process *TeleportProcess) WaitForSignals(ctx context.Context) error { process.Shutdown(ctx) process.log.Infof("All services stopped, exiting.") return nil - case syscall.SIGCHLD: - process.collectStatuses() default: process.log.Infof("Ignoring %q.", signal) } @@ -160,8 +158,10 @@ func (process *TeleportProcess) WaitForSignals(ctx context.Context) error { } } -const defaultShutdownTimeout = time.Second * 3 -const maxShutdownTimeout = time.Minute * 10 +const ( + defaultShutdownTimeout = time.Second * 3 + maxShutdownTimeout = time.Minute * 10 +) func getShutdownTimeout(log logrus.FieldLogger) time.Duration { timeout := defaultShutdownTimeout @@ -528,85 +528,33 @@ func (process *TeleportProcess) forkChild() error { if err != nil { return trace.ConvertSystemError(err) } - process.pushForkedPID(p.Pid) - log.WithFields(logrus.Fields{"pid": p.Pid}).Infof("Forked new child process.") + log.WithField("pid", p.Pid).Infof("Forked new child process.") + log = process.log.WithField("pid", p.Pid) - messageReceived, cancel := context.WithCancel(context.TODO()) - defer cancel() + process.forkedTeleportCount.Add(1) go func() { - data := make([]byte, 1024) - len, err := readPipe.Read(data) + defer process.forkedTeleportCount.Add(-1) + state, err := p.Wait() if err != nil { - log.Debug("Failed to read from pipe") + log.WithError(err). + Error("Failed waiting for forked Teleport process.") return } - log.Infof("Received message from pid %v: %v", p.Pid, string(data[:len])) - cancel() + log.WithField("status", state.String()).Warn("Forked Teleport process has exited.") }() - select { - case <-time.After(signalPipeTimeout): - return trace.BadParameter("Failed waiting from process") - case <-messageReceived.Done(): - log.WithFields(logrus.Fields{"pid": p.Pid}).Infof("Child process signals success.") + _ = writePipe.Close() + readPipe.SetReadDeadline(time.Now().Add(signalPipeTimeout)) + buf := make([]byte, 1024) + // we require at least one byte from the child, otherwise we can't + // distinguish the child dying (and closing the pipe) and a deliberate close + // without data; conversely, we don't care if we get an I/O or timeout error + // if we know that the child has sent at least one byte + n, err := io.ReadAtLeast(readPipe, buf, 1) + if err != nil { + return trace.Wrap(err, "waiting for forked Teleport process to signal successful start") } + log.WithField("data", string(buf[:n])).Infof("Forked Teleport process signaled successful start.") return nil } - -// collectStatuses attempts to collect exit statuses from -// forked teleport child processes. -// If forked teleport process exited with an error during graceful -// restart, parent process has to collect the child process status -// otherwise the child process will become a zombie process. -// Call Wait4(-1) is trying to collect status of any child -// leads to warnings in logs, because other parts of the program could -// have tried to collect the status of this process. -// Instead this logic tries to collect statuses of the processes -// forked during restart procedure. -func (process *TeleportProcess) collectStatuses() { - pids := process.getForkedPIDs() - if len(pids) == 0 { - return - } - for _, pid := range pids { - var wait syscall.WaitStatus - rpid, err := syscall.Wait4(pid, &wait, syscall.WNOHANG, nil) - if err != nil { - process.log.Errorf("Wait call failed: %v.", err) - continue - } - if rpid == pid { - process.popForkedPID(pid) - process.log.Warningf("Forked teleport process %v has exited with status: %v.", pid, wait.ExitStatus()) - } - } -} - -func (process *TeleportProcess) pushForkedPID(pid int) { - process.Lock() - defer process.Unlock() - process.forkedPIDs = append(process.forkedPIDs, pid) -} - -func (process *TeleportProcess) popForkedPID(pid int) { - process.Lock() - defer process.Unlock() - for i, p := range process.forkedPIDs { - if p == pid { - process.forkedPIDs = append(process.forkedPIDs[:i], process.forkedPIDs[i+1:]...) - return - } - } -} - -func (process *TeleportProcess) getForkedPIDs() []int { - process.Lock() - defer process.Unlock() - if len(process.forkedPIDs) == 0 { - return nil - } - out := make([]int, len(process.forkedPIDs)) - copy(out, process.forkedPIDs) - return out -} From 6b68de0db1cad208d39276cf353b8db0e22aa297 Mon Sep 17 00:00:00 2001 From: Edoardo Spadolini Date: Wed, 7 Feb 2024 16:06:27 +0100 Subject: [PATCH 2/5] reduce log spam during graceful shutdowns --- lib/defaults/defaults.go | 4 ---- lib/service/signals.go | 5 ++++- lib/sshutils/server.go | 18 ++++++++++++++---- 3 files changed, 18 insertions(+), 9 deletions(-) diff --git a/lib/defaults/defaults.go b/lib/defaults/defaults.go index 2c15ca38f7963..d1fb08f33b425 100644 --- a/lib/defaults/defaults.go +++ b/lib/defaults/defaults.go @@ -315,10 +315,6 @@ const ( // LowResPollingPeriod is a default low resolution polling period LowResPollingPeriod = 600 * time.Second - // HighResReportingPeriod is a high resolution polling reporting - // period used in services - HighResReportingPeriod = 10 * time.Second - // SessionControlTimeout is the maximum amount of time a controlled session // may persist after contact with the auth server is lost (sessctl semaphore // leases are refreshed at a rate of ~1/2 this duration). diff --git a/lib/service/signals.go b/lib/service/signals.go index 70815456e8d8a..b96a2728fe2a3 100644 --- a/lib/service/signals.go +++ b/lib/service/signals.go @@ -41,13 +41,16 @@ import ( // printShutdownStatus prints running services until shut down func (process *TeleportProcess) printShutdownStatus(ctx context.Context) { - t := time.NewTicker(defaults.HighResReportingPeriod) + statusInterval := defaults.HighResPollingPeriod + t := time.NewTimer(statusInterval) defer t.Stop() for { select { case <-ctx.Done(): return case <-t.C: + statusInterval = min(statusInterval*2, defaults.LowResPollingPeriod) + t.Reset(statusInterval) process.log.Infof("Waiting for services: %v to finish.", process.Supervisor.Services()) } } diff --git a/lib/sshutils/server.go b/lib/sshutils/server.go index 67ad9dfb4434f..6a1252445da0e 100644 --- a/lib/sshutils/server.go +++ b/lib/sshutils/server.go @@ -374,20 +374,30 @@ func (s *Server) Shutdown(ctx context.Context) error { if activeConnections == 0 { return err } + minReportInterval := 10 * s.shutdownPollPeriod + maxReportInterval := 600 * s.shutdownPollPeriod s.log.Infof("Shutdown: waiting for %v connections to finish.", activeConnections) - lastReport := time.Time{} + reportedConnections := activeConnections + lastReport := time.Now() + reportInterval := minReportInterval ticker := time.NewTicker(s.shutdownPollPeriod) defer ticker.Stop() for { select { - case <-ticker.C: + case now := <-ticker.C: activeConnections = s.trackUserConnections(0) if activeConnections == 0 { return err } - if time.Since(lastReport) > 10*s.shutdownPollPeriod { + if activeConnections != reportedConnections || now.Sub(lastReport) > reportInterval { s.log.Infof("Shutdown: waiting for %v connections to finish.", activeConnections) - lastReport = time.Now() + lastReport = now + if activeConnections == reportedConnections { + reportInterval = min(reportInterval*2, maxReportInterval) + } else { + reportInterval = minReportInterval + reportedConnections = activeConnections + } } case <-ctx.Done(): s.log.Infof("Context canceled wait, returning.") From 9c9b21cc763762712285020b1c699e1f475a0e87 Mon Sep 17 00:00:00 2001 From: Edoardo Spadolini Date: Wed, 7 Feb 2024 17:50:53 +0100 Subject: [PATCH 3/5] hold an exclusive advisory lock on the pidfile --- lib/service/service.go | 37 +++++++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/lib/service/service.go b/lib/service/service.go index a4eac7287c9fa..62e5d2562e436 100644 --- a/lib/service/service.go +++ b/lib/service/service.go @@ -58,6 +58,7 @@ import ( "golang.org/x/crypto/acme" "golang.org/x/crypto/acme/autocert" "golang.org/x/crypto/ssh" + "golang.org/x/sys/unix" "google.golang.org/grpc" "google.golang.org/grpc/credentials" "google.golang.org/grpc/keepalive" @@ -1255,8 +1256,8 @@ func NewTeleport(cfg *servicecfg.Config) (*TeleportProcess, error) { // create the new pid file only after started successfully if cfg.PIDFile != "" { - if err := renameio.WriteFile(cfg.PIDFile, []byte(strconv.Itoa(os.Getpid())), 0o644); err != nil { - return nil, trace.ConvertSystemError(err) + if err := createLockedPIDFile(cfg.PIDFile); err != nil { + return nil, trace.Wrap(err, "creating pidfile") } } @@ -6099,3 +6100,35 @@ func (process *TeleportProcess) newExternalAuditStorageConfigurator() (*external statusService := local.NewStatusService(process.backend) return externalauditstorage.NewConfigurator(process.ExitContext(), ecaSvc, integrationSvc, statusService) } + +// createLockedPIDFile creates a PID file in the path specified by pidFile +// containing the current PID, atomically swapping it in the final place and +// leaving it with an exclusive advisory lock that will get released when the +// process ends, for the benefit of "pkill -L". +func createLockedPIDFile(pidFile string) error { + pending, err := renameio.NewPendingFile(pidFile, renameio.WithPermissions(0o644)) + if err != nil { + return trace.ConvertSystemError(err) + } + defer pending.Cleanup() + if _, err := fmt.Fprintf(pending, "%v\n", os.Getpid()); err != nil { + return trace.ConvertSystemError(err) + } + + const minimumDupFD = 3 // skip stdio + locker, err := unix.FcntlInt(pending.Fd(), unix.F_DUPFD_CLOEXEC, minimumDupFD) + runtime.KeepAlive(pending) + if err != nil { + return trace.ConvertSystemError(err) + } + if err := unix.Flock(locker, unix.LOCK_EX|unix.LOCK_NB); err != nil { + _ = unix.Close(locker) + return trace.ConvertSystemError(err) + } + // deliberately leak the fd to hold the lock until the process dies + + if err := pending.CloseAtomicallyReplace(); err != nil { + return trace.ConvertSystemError(err) + } + return nil +} From 18b4e9b2eb11f1ad4a6a91b6b5c37a487ac5d391 Mon Sep 17 00:00:00 2001 From: Edoardo Spadolini Date: Wed, 7 Feb 2024 17:51:21 +0100 Subject: [PATCH 4/5] do systemd reloads with the pidfile rather than $MAINPID --- assets/aws/files/system/teleport-acm.service | 4 ++-- assets/aws/files/system/teleport-auth.service | 4 ++-- assets/aws/files/system/teleport-node.service | 4 ++-- assets/aws/files/system/teleport-proxy-acm.service | 4 ++-- assets/aws/files/system/teleport-proxy.service | 4 ++-- assets/aws/files/system/teleport.service | 4 ++-- examples/systemd/fips/teleport.service | 2 +- examples/systemd/production/auth/teleport.service | 4 ++-- examples/systemd/production/node/teleport.service | 2 +- examples/systemd/production/proxy/teleport.service | 4 ++-- examples/systemd/teleport.service | 2 +- lib/config/systemd.go | 5 +++-- lib/config/testdata/TestWriteSystemdUnitFile.golden | 4 ++-- 13 files changed, 24 insertions(+), 23 deletions(-) diff --git a/assets/aws/files/system/teleport-acm.service b/assets/aws/files/system/teleport-acm.service index 231f68ee4ca98..91d7e1a84fa5d 100644 --- a/assets/aws/files/system/teleport-acm.service +++ b/assets/aws/files/system/teleport-acm.service @@ -11,6 +11,6 @@ Restart=always RestartSec=5 RuntimeDirectory=teleport ExecStart=/usr/local/bin/teleport start --config=/etc/teleport.yaml --diag-addr=127.0.0.1:3000 --pid-file=/run/teleport/teleport.pid -ExecReload=/bin/kill -HUP $MAINPID +ExecReload=pkill -HUP -L -F /run/teleport/teleport.pid PIDFile=/run/teleport/teleport.pid -LimitNOFILE=524288 \ No newline at end of file +LimitNOFILE=524288 diff --git a/assets/aws/files/system/teleport-auth.service b/assets/aws/files/system/teleport-auth.service index 35c65d496741c..f1b99b9879a0d 100644 --- a/assets/aws/files/system/teleport-auth.service +++ b/assets/aws/files/system/teleport-auth.service @@ -11,9 +11,9 @@ Restart=always RestartSec=5 RuntimeDirectory=teleport ExecStart=/usr/local/bin/teleport start --config=/etc/teleport.yaml --diag-addr=127.0.0.1:3000 --pid-file=/run/teleport/teleport.pid -ExecReload=/bin/kill -HUP $MAINPID +ExecReload=pkill -HUP -L -F /run/teleport/teleport.pid PIDFile=/run/teleport/teleport.pid LimitNOFILE=524288 [Install] -WantedBy=multi-user.target \ No newline at end of file +WantedBy=multi-user.target diff --git a/assets/aws/files/system/teleport-node.service b/assets/aws/files/system/teleport-node.service index ea60fcfdaa573..b0b26651ac970 100644 --- a/assets/aws/files/system/teleport-node.service +++ b/assets/aws/files/system/teleport-node.service @@ -12,9 +12,9 @@ RestartSec=5 RuntimeDirectory=teleport ExecStartPre=/usr/local/bin/teleport-ssm-get-token ExecStart=/usr/local/bin/teleport start --config=/etc/teleport.yaml --diag-addr=127.0.0.1:3000 --pid-file=/run/teleport/teleport.pid -ExecReload=/bin/kill -HUP $MAINPID +ExecReload=pkill -HUP -L -F /run/teleport/teleport.pid PIDFile=/run/teleport/teleport.pid LimitNOFILE=524288 [Install] -WantedBy=multi-user.target \ No newline at end of file +WantedBy=multi-user.target diff --git a/assets/aws/files/system/teleport-proxy-acm.service b/assets/aws/files/system/teleport-proxy-acm.service index 8eb909dbba9ef..e711afb8fc663 100644 --- a/assets/aws/files/system/teleport-proxy-acm.service +++ b/assets/aws/files/system/teleport-proxy-acm.service @@ -13,9 +13,9 @@ RuntimeDirectory=teleport EnvironmentFile=/etc/teleport.d/conf ExecStartPre=/usr/local/bin/teleport-ssm-get-token ExecStart=/usr/local/bin/teleport start --config=/etc/teleport.yaml --diag-addr=127.0.0.1:3000 --pid-file=/run/teleport/teleport.pid -ExecReload=/bin/kill -HUP $MAINPID +ExecReload=pkill -HUP -L -F /run/teleport/teleport.pid PIDFile=/run/teleport/teleport.pid LimitNOFILE=524288 [Install] -WantedBy=multi-user.target \ No newline at end of file +WantedBy=multi-user.target diff --git a/assets/aws/files/system/teleport-proxy.service b/assets/aws/files/system/teleport-proxy.service index d9b124c75c1e7..4d6f729bfc7d6 100644 --- a/assets/aws/files/system/teleport-proxy.service +++ b/assets/aws/files/system/teleport-proxy.service @@ -14,9 +14,9 @@ EnvironmentFile=/etc/teleport.d/conf ExecStartPre=/usr/local/bin/teleport-ssm-get-token ExecStartPre=/bin/aws s3 sync s3://${TELEPORT_S3_BUCKET}/live/${TELEPORT_DOMAIN_NAME} /var/lib/teleport ExecStart=/usr/local/bin/teleport start --config=/etc/teleport.yaml --diag-addr=127.0.0.1:3000 --pid-file=/run/teleport/teleport.pid -ExecReload=/bin/kill -HUP $MAINPID +ExecReload=pkill -HUP -L -F /run/teleport/teleport.pid PIDFile=/run/teleport/teleport.pid LimitNOFILE=524288 [Install] -WantedBy=multi-user.target \ No newline at end of file +WantedBy=multi-user.target diff --git a/assets/aws/files/system/teleport.service b/assets/aws/files/system/teleport.service index 5885ce464ab55..5e6a091f75d47 100644 --- a/assets/aws/files/system/teleport.service +++ b/assets/aws/files/system/teleport.service @@ -12,9 +12,9 @@ RestartSec=5 RuntimeDirectory=teleport ExecStartPre=/usr/local/bin/teleport-all-pre-start ExecStart=/usr/local/bin/teleport start --config=/etc/teleport.yaml --diag-addr=127.0.0.1:3000 --pid-file=/run/teleport/teleport.pid -ExecReload=/bin/kill -HUP $MAINPID +ExecReload=pkill -HUP -L -F /run/teleport/teleport.pid PIDFile=/run/teleport/teleport.pid LimitNOFILE=524288 [Install] -WantedBy=multi-user.target \ No newline at end of file +WantedBy=multi-user.target diff --git a/examples/systemd/fips/teleport.service b/examples/systemd/fips/teleport.service index ad30a0a4d424e..a87df21500486 100644 --- a/examples/systemd/fips/teleport.service +++ b/examples/systemd/fips/teleport.service @@ -7,7 +7,7 @@ Type=simple Restart=on-failure EnvironmentFile=-/etc/default/teleport ExecStart=/usr/local/bin/teleport start --config /etc/teleport.yaml --fips --pid-file=/run/teleport.pid -ExecReload=/bin/kill -HUP $MAINPID +ExecReload=pkill -HUP -L -F /run/teleport.pid PIDFile=/run/teleport.pid LimitNOFILE=524288 diff --git a/examples/systemd/production/auth/teleport.service b/examples/systemd/production/auth/teleport.service index db5c2109e11fd..51309784e2372 100644 --- a/examples/systemd/production/auth/teleport.service +++ b/examples/systemd/production/auth/teleport.service @@ -11,9 +11,9 @@ Restart=on-failure # --roles='proxy,auth,node' is the default value # if none is set ExecStart=/usr/local/bin/teleport start --roles=auth --config=/etc/teleport.yaml --diag-addr=127.0.0.1:3000 --pid-file=/run/teleport.pid -ExecReload=/bin/kill -HUP $MAINPID +ExecReload=pkill -HUP -L -F /run/teleport.pid PIDFile=/run/teleport.pid LimitNOFILE=524288 [Install] -WantedBy=multi-user.target \ No newline at end of file +WantedBy=multi-user.target diff --git a/examples/systemd/production/node/teleport.service b/examples/systemd/production/node/teleport.service index ca3e66dd2b564..0845c6c05ec8d 100644 --- a/examples/systemd/production/node/teleport.service +++ b/examples/systemd/production/node/teleport.service @@ -11,7 +11,7 @@ Restart=on-failure # --roles='proxy,auth,node' is the default value # if none is set ExecStart=/usr/local/bin/teleport start --roles=node --config=/etc/teleport.yaml --pid-file=/run/teleport.pid -ExecReload=/bin/kill -HUP $MAINPID +ExecReload=pkill -HUP -L -F /run/teleport.pid PIDFile=/run/teleport.pid LimitNOFILE=524288 diff --git a/examples/systemd/production/proxy/teleport.service b/examples/systemd/production/proxy/teleport.service index 7c64812aa52c6..029785694a49e 100644 --- a/examples/systemd/production/proxy/teleport.service +++ b/examples/systemd/production/proxy/teleport.service @@ -11,9 +11,9 @@ Restart=on-failure # --roles='proxy,auth,node' is the default value # if none is set ExecStart=/usr/local/bin/teleport start --roles=proxy --config=/etc/teleport.yaml --pid-file=/run/teleport.pid -ExecReload=/bin/kill -HUP $MAINPID +ExecReload=pkill -HUP -L -F /run/teleport.pid PIDFile=/run/teleport.pid LimitNOFILE=524288 [Install] -WantedBy=multi-user.target \ No newline at end of file +WantedBy=multi-user.target diff --git a/examples/systemd/teleport.service b/examples/systemd/teleport.service index 3a84fa0329119..226e2e373bd42 100644 --- a/examples/systemd/teleport.service +++ b/examples/systemd/teleport.service @@ -7,7 +7,7 @@ Type=simple Restart=on-failure EnvironmentFile=-/etc/default/teleport ExecStart=/usr/local/bin/teleport start --config /etc/teleport.yaml --pid-file=/run/teleport.pid -ExecReload=/bin/kill -HUP $MAINPID +ExecReload=pkill -HUP -L -F /run/teleport.pid PIDFile=/run/teleport.pid LimitNOFILE=524288 diff --git a/lib/config/systemd.go b/lib/config/systemd.go index 12fd56bbf8362..6146b89926752 100644 --- a/lib/config/systemd.go +++ b/lib/config/systemd.go @@ -47,12 +47,13 @@ Type=simple Restart=on-failure EnvironmentFile=-{{ .EnvironmentFile }} ExecStart={{ .TeleportInstallationFile }} start --config {{ .TeleportConfigPath }} --pid-file={{ .PIDFile }} -ExecReload=/bin/kill -HUP $MAINPID +ExecReload=pkill -HUP -L -F "{{ .PIDFile }}" PIDFile={{ .PIDFile }} LimitNOFILE={{ .FileDescriptorLimit }} [Install] -WantedBy=multi-user.target`)) +WantedBy=multi-user.target +`)) // SystemdFlags specifies configuration parameters for a systemd unit file. type SystemdFlags struct { diff --git a/lib/config/testdata/TestWriteSystemdUnitFile.golden b/lib/config/testdata/TestWriteSystemdUnitFile.golden index bfd0987b65225..8153d3270b129 100644 --- a/lib/config/testdata/TestWriteSystemdUnitFile.golden +++ b/lib/config/testdata/TestWriteSystemdUnitFile.golden @@ -7,9 +7,9 @@ Type=simple Restart=on-failure EnvironmentFile=-/custom/env/dir/teleport ExecStart=/custom/install/dir/teleport start --config /etc/teleport.yaml --pid-file=/custom/pid/dir/teleport.pid -ExecReload=/bin/kill -HUP $MAINPID +ExecReload=pkill -HUP -L -F "/custom/pid/dir/teleport.pid" PIDFile=/custom/pid/dir/teleport.pid LimitNOFILE=16384 [Install] -WantedBy=multi-user.target \ No newline at end of file +WantedBy=multi-user.target From 34d453ab7356b8416b1e0dff16459d9e86b0dd6c Mon Sep 17 00:00:00 2001 From: Edoardo Spadolini Date: Thu, 8 Feb 2024 19:28:23 +0100 Subject: [PATCH 5/5] prioritize signals over internal restarts --- lib/service/signals.go | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/lib/service/signals.go b/lib/service/signals.go index b96a2728fe2a3..c698293ce3a69 100644 --- a/lib/service/signals.go +++ b/lib/service/signals.go @@ -131,6 +131,16 @@ func (process *TeleportProcess) WaitForSignals(ctx context.Context) error { process.log.Infof("Ignoring %q.", signal) } case <-process.ReloadContext().Done(): + // it's fine to signal.Stop the same channel multiple times, and + // after the function returns we're guaranteed to have restored the + // default handlers for the signals and that no more signals are + // pushed into the channel + signal.Stop(sigC) + if len(sigC) > 0 { + // exhaust all signals before the internal reload, so we don't + // miss signals to exit or to graceful restart instead + continue + } process.log.Infof("Exiting signal handler: process has started internal reload.") return ErrTeleportReloading case <-process.ExitContext().Done():