From 97cf46f36446518af44a58644a7042691b2010d1 Mon Sep 17 00:00:00 2001
From: John Kyros <79665180+jkyros@users.noreply.github.com>
Date: Tue, 7 Jun 2022 01:28:40 -0500
Subject: [PATCH 1/4] cmd/common: add signal handler helper

In order to know when we need to shut down, we need a signal handler.
And since everything is context-driven, we need to make sure that
signal handler can cancel the context to signal everything else to
shut down.

This adds a signal handler function to the command helpers that the
operator and the controller can share, since they both perform leader
elections and they will both need to shut down cleanly to release
their leases.
---
 cmd/common/helpers.go | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/cmd/common/helpers.go b/cmd/common/helpers.go
index c129e6a721..fc77c49bf4 100644
--- a/cmd/common/helpers.go
+++ b/cmd/common/helpers.go
@@ -2,6 +2,8 @@ package common
 
 import (
 	"os"
+	"os/signal"
+	"syscall"
 
 	"github.com/golang/glog"
 	"github.com/openshift/machine-config-operator/internal/clients"
@@ -69,3 +71,23 @@ func GetLeaderElectionConfig(restcfg *rest.Config) configv1.LeaderElection {
 
 	return defaultLeaderElection
 }
+
+// SignalHandler catches SIGINT/SIGTERM signals and makes sure the passed context gets cancelled when those
+// signals happen. This allows us to use a context to shut down our operations cleanly when we are signalled to shut down.
+func SignalHandler(runCancel context.CancelFunc) {
+
+	// make a signal handling channel for os signals
+	ch := make(chan os.Signal, 1)
+	// stop listening for signals when we leave this function
+	defer func() { signal.Stop(ch) }()
+	// catch SIGINT and SIGTERM
+	signal.Notify(ch, os.Interrupt, syscall.SIGTERM)
+	sig := <-ch
+	glog.Infof("Shutting down due to: %s", sig)
+	// if we're shutting down, cancel the context so everything else will stop
+	runCancel()
+	glog.Infof("Context cancelled")
+	sig = <-ch
+	glog.Fatalf("Received shutdown signal twice, exiting: %s", sig)
+
+}
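For illustration only (not part of the patch series): a minimal, self-contained sketch of the pattern the helper above implements. The package layout, the handleSignals name, and the fmt-based logging are stand-ins, not the patch's glog-based common.SignalHandler.

```go
package main

import (
	"context"
	"fmt"
	"os"
	"os/signal"
	"syscall"
)

// handleSignals cancels the passed context on the first SIGINT/SIGTERM and
// exits immediately if a second signal arrives while shutdown is in progress.
func handleSignals(cancel context.CancelFunc) {
	ch := make(chan os.Signal, 1)
	defer signal.Stop(ch)
	signal.Notify(ch, os.Interrupt, syscall.SIGTERM)

	sig := <-ch
	fmt.Printf("shutting down due to: %s\n", sig)
	cancel()

	sig = <-ch
	fmt.Printf("received second signal (%s), exiting immediately\n", sig)
	os.Exit(1)
}

func main() {
	ctx, cancel := context.WithCancel(context.Background())
	go handleSignals(cancel)

	// Stand-in for the real work loop: block until the context is cancelled.
	<-ctx.Done()
	fmt.Println("context cancelled, cleaning up and exiting")
}
```

Since Go 1.16 the first half of this wiring can also be expressed with signal.NotifyContext; the helper above keeps an explicit channel so a second signal can force an immediate exit.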
From 49857fd3971547abe08c1623ba5bdb459bce472d Mon Sep 17 00:00:00 2001
From: John Kyros <79665180+jkyros@users.noreply.github.com>
Date: Tue, 7 Jun 2022 01:40:30 -0500
Subject: [PATCH 2/4] Add clean leader lease release to controller

Previously we did not cleanly release the leader lease on controller
shutdown, which resulted in slow leader elections as the existing lease
was forced to time out when a pod got rescheduled.

This adds a signal handler and a context, and sets the leader election
settings such that when the controller receives a shutdown signal, it
releases its leader lease and terminates so the new leader can take
over more quickly.
---
 cmd/machine-config-controller/start.go | 25 ++++++++++++++++++-------
 1 file changed, 18 insertions(+), 7 deletions(-)

diff --git a/cmd/machine-config-controller/start.go b/cmd/machine-config-controller/start.go
index d43e1dac09..d6a0a9dab4 100644
--- a/cmd/machine-config-controller/start.go
+++ b/cmd/machine-config-controller/start.go
@@ -4,6 +4,7 @@ import (
 	"context"
 	"flag"
 	"fmt"
+	"os"
 
 	"github.com/golang/glog"
 	"github.com/openshift/machine-config-operator/cmd/common"
@@ -18,6 +19,7 @@ import (
 	"github.com/openshift/machine-config-operator/pkg/version"
 	"github.com/spf13/cobra"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+
 	"k8s.io/client-go/tools/leaderelection"
 )
 
@@ -48,6 +50,10 @@ func runStartCmd(cmd *cobra.Command, args []string) {
 	flag.Set("logtostderr", "true")
 	flag.Parse()
 
+	// This is the 'main' context that we thread through the controller context and
+	// the leader elections. Cancelling this is "stop everything, we are shutting down".
+	runContext, runCancel := context.WithCancel(context.Background())
+
 	// To help debugging, immediately log version
 	glog.Infof("Version: %+v (%s)", version.Raw, version.Hash)
 
@@ -56,6 +62,8 @@ func runStartCmd(cmd *cobra.Command, args []string) {
 		ctrlcommon.WriteTerminationError(fmt.Errorf("creating clients: %w", err))
 	}
 	run := func(ctx context.Context) {
+		go common.SignalHandler(runCancel)
+
 		ctrlctx := ctrlcommon.CreateControllerContext(cb, ctx.Done(), componentName)
 
 		// Start the metrics handler
@@ -82,20 +90,23 @@ func runStartCmd(cmd *cobra.Command, args []string) {
 		}
 		go draincontroller.Run(5, ctrlctx.Stop)
 
-		select {}
+		// wait here in this function until the context gets cancelled (which tells us we are being shut down)
+		<-ctx.Done()
 	}
 
 	leaderElectionCfg := common.GetLeaderElectionConfig(cb.GetBuilderConfig())
 
-	leaderelection.RunOrDie(context.TODO(), leaderelection.LeaderElectionConfig{
-		Lock:          common.CreateResourceLock(cb, startOpts.resourceLockNamespace, componentName),
-		LeaseDuration: leaderElectionCfg.LeaseDuration.Duration,
-		RenewDeadline: leaderElectionCfg.RenewDeadline.Duration,
-		RetryPeriod:   leaderElectionCfg.RetryPeriod.Duration,
+	leaderelection.RunOrDie(runContext, leaderelection.LeaderElectionConfig{
+		Lock:            common.CreateResourceLock(cb, startOpts.resourceLockNamespace, componentName),
+		ReleaseOnCancel: true,
+		LeaseDuration:   leaderElectionCfg.LeaseDuration.Duration,
+		RenewDeadline:   leaderElectionCfg.RenewDeadline.Duration,
+		RetryPeriod:     leaderElectionCfg.RetryPeriod.Duration,
 		Callbacks: leaderelection.LeaderCallbacks{
 			OnStartedLeading: run,
 			OnStoppedLeading: func() {
-				glog.Fatalf("leaderelection lost")
+				glog.Infof("Stopped leading. Terminating.")
+				os.Exit(0)
 			},
 		},
 	})
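To show what ReleaseOnCancel buys here, a hedged, self-contained sketch of the client-go leader-election wiring this patch moves to. The clientset construction, lock name and namespace, identity, and durations are illustrative assumptions, not values taken from the MCO code.

```go
package main

import (
	"context"
	"log"
	"os"
	"os/signal"
	"syscall"
	"time"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/rest"
	"k8s.io/client-go/tools/leaderelection"
	"k8s.io/client-go/tools/leaderelection/resourcelock"
)

func main() {
	// The run context: cancelling it means "stop everything, we are shutting down".
	runContext, runCancel := context.WithCancel(context.Background())

	// Cancel the run context on SIGINT/SIGTERM, the same role common.SignalHandler plays.
	go func() {
		ch := make(chan os.Signal, 1)
		signal.Notify(ch, os.Interrupt, syscall.SIGTERM)
		<-ch
		runCancel()
	}()

	cfg, err := rest.InClusterConfig()
	if err != nil {
		log.Fatalf("building config: %v", err)
	}
	client := kubernetes.NewForConfigOrDie(cfg)

	id, err := os.Hostname()
	if err != nil {
		log.Fatalf("getting hostname: %v", err)
	}

	lock := &resourcelock.LeaseLock{
		LeaseMeta:  metav1.ObjectMeta{Name: "example-lock", Namespace: "default"},
		Client:     client.CoordinationV1(),
		LockConfig: resourcelock.ResourceLockConfig{Identity: id},
	}

	leaderelection.RunOrDie(runContext, leaderelection.LeaderElectionConfig{
		Lock: lock,
		// ReleaseOnCancel makes the elector give up the lease when runContext is
		// cancelled, instead of leaving it to expire after LeaseDuration.
		ReleaseOnCancel: true,
		LeaseDuration:   60 * time.Second,
		RenewDeadline:   30 * time.Second,
		RetryPeriod:     10 * time.Second,
		Callbacks: leaderelection.LeaderCallbacks{
			OnStartedLeading: func(ctx context.Context) {
				// Do the real work here; return when the context is cancelled.
				<-ctx.Done()
			},
			OnStoppedLeading: func() {
				// With ReleaseOnCancel this also fires on a clean shutdown, so it is
				// treated as a normal exit rather than a fatal "leaderelection lost".
				log.Print("stopped leading, terminating")
				os.Exit(0)
			},
		},
	})
}
```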
From 25aaa58c435e47c47ac61d01cc28afb9331e133d Mon Sep 17 00:00:00 2001
From: John Kyros <79665180+jkyros@users.noreply.github.com>
Date: Tue, 7 Jun 2022 01:40:55 -0500
Subject: [PATCH 3/4] Add clean leader lease release to operator

Much like the controller, the operator does not relinquish its leader
lease on shutdown. This results in additional delays when
updating/redeploying, especially because the controller depends on
controllerconfig/renderconfig being updated, and that has to wait
behind the operator leader election.

This makes sure the operator shuts down properly and releases its
leader lease when its context is cancelled on shutdown.
---
 cmd/machine-config-controller/start.go |  1 -
 cmd/machine-config-operator/start.go   | 23 ++++++++++++++++-------
 2 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/cmd/machine-config-controller/start.go b/cmd/machine-config-controller/start.go
index d6a0a9dab4..b437ae66ea 100644
--- a/cmd/machine-config-controller/start.go
+++ b/cmd/machine-config-controller/start.go
@@ -19,7 +19,6 @@ import (
 	"github.com/openshift/machine-config-operator/pkg/version"
 	"github.com/spf13/cobra"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
-
 	"k8s.io/client-go/tools/leaderelection"
 )
 
diff --git a/cmd/machine-config-operator/start.go b/cmd/machine-config-operator/start.go
index c6da99e420..46857b86c5 100644
--- a/cmd/machine-config-operator/start.go
+++ b/cmd/machine-config-operator/start.go
@@ -39,6 +39,10 @@ func runStartCmd(cmd *cobra.Command, args []string) {
 	flag.Set("logtostderr", "true")
 	flag.Parse()
 
+	// This is the 'main' context that we thread through the controller context and
+	// the leader elections. Cancelling this is "stop everything, we are shutting down".
+	runContext, runCancel := context.WithCancel(context.Background())
+
 	// To help debugging, immediately log version
 	glog.Infof("Version: %s (Raw: %s, Hash: %s)", os.Getenv("RELEASE_VERSION"), version.Raw, version.Hash)
 
@@ -51,6 +55,8 @@ func runStartCmd(cmd *cobra.Command, args []string) {
 		glog.Fatalf("error creating clients: %v", err)
 	}
 	run := func(ctx context.Context) {
+		go common.SignalHandler(runCancel)
+
 		ctrlctx := ctrlcommon.CreateControllerContext(cb, ctx.Done(), ctrlcommon.MCONamespace)
 		controller := operator.New(
 			ctrlcommon.MCONamespace, componentName,
@@ -91,20 +97,23 @@ func runStartCmd(cmd *cobra.Command, args []string) {
 
 		go controller.Run(2, ctrlctx.Stop)
 
-		select {}
+		// wait here in this function until the context gets cancelled (which tells us we are being shut down)
+		<-ctx.Done()
 	}
 
 	leaderElectionCfg := common.GetLeaderElectionConfig(cb.GetBuilderConfig())
 
-	leaderelection.RunOrDie(context.TODO(), leaderelection.LeaderElectionConfig{
-		Lock:          common.CreateResourceLock(cb, ctrlcommon.MCONamespace, componentName),
-		LeaseDuration: leaderElectionCfg.LeaseDuration.Duration,
-		RenewDeadline: leaderElectionCfg.RenewDeadline.Duration,
-		RetryPeriod:   leaderElectionCfg.RetryPeriod.Duration,
+	leaderelection.RunOrDie(runContext, leaderelection.LeaderElectionConfig{
+		Lock:            common.CreateResourceLock(cb, ctrlcommon.MCONamespace, componentName),
+		ReleaseOnCancel: true,
+		LeaseDuration:   leaderElectionCfg.LeaseDuration.Duration,
+		RenewDeadline:   leaderElectionCfg.RenewDeadline.Duration,
+		RetryPeriod:     leaderElectionCfg.RetryPeriod.Duration,
 		Callbacks: leaderelection.LeaderCallbacks{
 			OnStartedLeading: run,
 			OnStoppedLeading: func() {
-				glog.Fatalf("leaderelection lost")
+				glog.Info("Stopped leading. Terminating.")
+				os.Exit(0)
 			},
 		},
 	})
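A hypothetical (Unix-only) way to exercise this shutdown path outside a cluster: send the process a SIGTERM and check that the run context is cancelled, which is the event that lets ReleaseOnCancel give the lease back before the pod goes away. This sketch uses the standard library's signal.NotifyContext rather than the patch's common.SignalHandler, and the package, test name, and timeout are invented for illustration.

```go
package shutdown

import (
	"context"
	"os"
	"os/signal"
	"syscall"
	"testing"
	"time"
)

// TestSigtermCancelsRunContext signals the current test process and expects
// the signal-bound context to be cancelled promptly.
func TestSigtermCancelsRunContext(t *testing.T) {
	ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGTERM)
	defer stop()

	// Signal ourselves, roughly what the kubelet does when the pod is deleted.
	if err := syscall.Kill(os.Getpid(), syscall.SIGTERM); err != nil {
		t.Fatalf("sending SIGTERM: %v", err)
	}

	select {
	case <-ctx.Done():
		// Shutdown path triggered: the real binaries would now fall out of
		// OnStartedLeading and let the elector release the leader lease.
	case <-time.After(5 * time.Second):
		t.Fatal("run context was not cancelled after SIGTERM")
	}
}
```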
From 61a90054ba19564f9242fc882e7da313859ac700 Mon Sep 17 00:00:00 2001
From: John Kyros <79665180+jkyros@users.noreply.github.com>
Date: Tue, 7 Jun 2022 01:41:48 -0500
Subject: [PATCH 4/4] controller: clearer metrics handler shutdown logs

The metrics handler previously always emitted an error message when it
was shut down, even if the error was nil. That was tolerable because we
never actually shut the metrics handler down properly, but now that we
do, the logs need to be clearer.

This makes the metrics handler only emit an error when there actually
is an error, and emit an info message when it shuts down successfully.
---
 pkg/controller/common/metrics.go | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/pkg/controller/common/metrics.go b/pkg/controller/common/metrics.go
index c09b6ccba9..d3bf9049da 100644
--- a/pkg/controller/common/metrics.go
+++ b/pkg/controller/common/metrics.go
@@ -62,7 +62,12 @@ func StartMetricsListener(addr string, stopCh <-chan struct{}) {
 		}
 	}()
 	<-stopCh
-	if err := s.Shutdown(context.Background()); err != http.ErrServerClosed {
-		glog.Errorf("error stopping metrics listener: %v", err)
+	if err := s.Shutdown(context.Background()); err != nil {
+		if err != http.ErrServerClosed {
+			glog.Errorf("error stopping metrics listener: %v", err)
+		}
+	} else {
+		glog.Infof("Metrics listener successfully stopped")
 	}
+
 }
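For comparison, a hedged, standalone sketch of the net/http graceful-shutdown flow this hunk is cleaning up the logging for: ListenAndServe is expected to return http.ErrServerClosed after Shutdown is called, while Shutdown itself only returns an error when the stop actually failed. The function name, address, timeout, and log wording are illustrative, not the MCO code.

```go
package main

import (
	"context"
	"errors"
	"log"
	"net/http"
	"time"
)

// startListener serves until stopCh is closed, then shuts the server down
// gracefully and logs either a real error or a clean-stop message.
func startListener(addr string, stopCh <-chan struct{}) {
	s := &http.Server{Addr: addr}

	go func() {
		// After Shutdown is called, ListenAndServe returns http.ErrServerClosed,
		// which is expected and not worth logging as an error.
		if err := s.ListenAndServe(); err != nil && !errors.Is(err, http.ErrServerClosed) {
			log.Printf("metrics listener failed: %v", err)
		}
	}()

	<-stopCh

	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()
	// Shutdown only errors if draining the listeners failed or the context
	// expired, so a nil error really is a clean stop.
	if err := s.Shutdown(ctx); err != nil {
		log.Printf("error stopping metrics listener: %v", err)
		return
	}
	log.Print("metrics listener successfully stopped")
}

func main() {
	stop := make(chan struct{})
	go func() {
		time.Sleep(2 * time.Second)
		close(stop)
	}()
	startListener("127.0.0.1:9090", stop)
}
```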