From 9e56f4f53a572cdbcf7ad03b7797e0099497a6dd Mon Sep 17 00:00:00 2001 From: ehila Date: Mon, 13 Dec 2021 17:11:34 -0500 Subject: [PATCH 1/3] feat: added leader election conventons updated leader election to follow convention and use different leader election for SNO topology Signed-off-by: ehila --- cmd/package-server-manager/main.go | 12 +++-- cmd/package-server-manager/util.go | 16 ------ pkg/leaderelection/leaderelection.go | 77 ++++++++++++++++++++++++++++ 3 files changed, 85 insertions(+), 20 deletions(-) create mode 100644 pkg/leaderelection/leaderelection.go diff --git a/cmd/package-server-manager/main.go b/cmd/package-server-manager/main.go index 45a01fcd15..8c746e46e1 100644 --- a/cmd/package-server-manager/main.go +++ b/cmd/package-server-manager/main.go @@ -17,6 +17,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/log/zap" "sigs.k8s.io/controller-runtime/pkg/manager" + "github.com/openshift/operator-framework-olm/pkg/leaderelection" controllers "github.com/openshift/operator-framework-olm/pkg/package-server-manager" //+kubebuilder:scaffold:imports ) @@ -59,17 +60,20 @@ func run(cmd *cobra.Command, args []string) error { ctrl.SetLogger(zap.New(zap.UseDevMode(true))) setupLog := ctrl.Log.WithName("setup") + restConfig := ctrl.GetConfigOrDie() + le := leaderelection.GetLeaderElectionConfig(setupLog, restConfig, !disableLeaderElection) + packageserverCSVFields := fields.Set{"metadata.name": name} - mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), manager.Options{ + mgr, err := ctrl.NewManager(restConfig, manager.Options{ Scheme: setupScheme(), Namespace: namespace, MetricsBindAddress: defaultMetricsPort, LeaderElection: !disableLeaderElection, LeaderElectionNamespace: namespace, LeaderElectionID: leaderElectionConfigmapName, - RetryPeriod: timeDurationPtr(defaultRetryPeriod), - RenewDeadline: timeDurationPtr(defaultRenewDeadline), - LeaseDuration: timeDurationPtr(defaultLeaseDuration), + LeaseDuration: &le.LeaseDuration.Duration, + RenewDeadline: &le.RenewDeadline.Duration, + RetryPeriod: &le.RetryPeriod.Duration, HealthProbeBindAddress: healthCheckAddr, NewCache: cache.BuilderWithOptions(cache.Options{ SelectorsByObject: cache.SelectorsByObject{ diff --git a/cmd/package-server-manager/util.go b/cmd/package-server-manager/util.go index eb54b9e6e5..a5228e350a 100644 --- a/cmd/package-server-manager/util.go +++ b/cmd/package-server-manager/util.go @@ -1,8 +1,6 @@ package main import ( - "time" - configv1 "github.com/openshift/api/config/v1" olmv1alpha1 "github.com/operator-framework/api/pkg/operators/v1alpha1" @@ -11,20 +9,6 @@ import ( clientgoscheme "k8s.io/client-go/kubernetes/scheme" ) -const ( - // Note: In order for SNO to GA, controllers need to handle ~60s of API server - // disruptions when attempting to get and sustain leader election: - // - https://github.com/openshift/library-go/pull/1104#discussion_r649313822 - // - https://bugzilla.redhat.com/show_bug.cgi?id=1985697 - defaultRetryPeriod = 30 * time.Second - defaultRenewDeadline = 60 * time.Second - defaultLeaseDuration = 90 * time.Second -) - -func timeDurationPtr(t time.Duration) *time.Duration { - return &t -} - func setupScheme() *runtime.Scheme { scheme := runtime.NewScheme() utilruntime.Must(clientgoscheme.AddToScheme(scheme)) diff --git a/pkg/leaderelection/leaderelection.go b/pkg/leaderelection/leaderelection.go new file mode 100644 index 0000000000..e80281ee3c --- /dev/null +++ b/pkg/leaderelection/leaderelection.go @@ -0,0 +1,77 @@ +package leaderelection + +import ( + "context" + "fmt" + "time" + + "github.com/go-logr/logr" + + configv1 "github.com/openshift/api/config/v1" + + openshiftcorev1 "github.com/openshift/client-go/config/clientset/versioned/typed/config/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + "k8s.io/client-go/rest" +) + +const ( + infraResourceName = "cluster" + + // Defaults follow conventions + // https://github.com/openshift/enhancements/blob/master/CONVENTIONS.md#high-availability + // Impl Calculations: https://github.com/openshift/library-go/commit/7e7d216ed91c3119800219c9194e5e57113d059a + defaultLeaseDuration = 137 * time.Second + defaultRenewDeadline = 107 * time.Second + defaultRetryPeriod = 26 * time.Second +) + +func GetLeaderElectionConfig(log logr.Logger, restClient *rest.Config, enabled bool) (defaultConfig configv1.LeaderElection) { + defaultConfig = configv1.LeaderElection{ + Disable: !enabled, + LeaseDuration: metav1.Duration{Duration: defaultLeaseDuration}, + RenewDeadline: metav1.Duration{Duration: defaultRenewDeadline}, + RetryPeriod: metav1.Duration{Duration: defaultRetryPeriod}, + } + + if enabled { + ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(time.Second*3)) + defer cancel() + infra, err := getClusterInfraStatus(ctx, restClient) + if err != nil { + log.Error(err, "unable to get cluster infrastructure status, using HA cluster values for leader election") + return + } + if infra != nil && infra.ControlPlaneTopology == configv1.SingleReplicaTopologyMode { + return leaderElectionSNOConfig(defaultConfig) + } + } + return +} + +// Default leader election for SNO environments +// Impl Calculations: +// https://github.com/openshift/library-go/commit/2612981f3019479805ac8448b997266fc07a236a#diff-61dd95c7fd45fa18038e825205fbfab8a803f1970068157608b6b1e9e6c27248R127 +func leaderElectionSNOConfig(config configv1.LeaderElection) configv1.LeaderElection { + ret := *(&config).DeepCopy() + ret.LeaseDuration.Duration = 270 * time.Second + ret.RenewDeadline.Duration = 240 * time.Second + ret.RetryPeriod.Duration = 60 * time.Second + return ret +} + +// Retrieve the cluster status, used to determine if we should use different leader election. +func getClusterInfraStatus(ctx context.Context, restClient *rest.Config) (*configv1.InfrastructureStatus, error) { + client, err := openshiftcorev1.NewForConfig(restClient) + if err != nil { + return nil, err + } + infra, err := client.Infrastructures().Get(ctx, infraResourceName, metav1.GetOptions{}) + if err != nil { + return nil, err + } + if infra == nil { + return nil, fmt.Errorf("getting resource Infrastructure (name: %s) succeeded but object was nil", infraResourceName) + } + return &infra.Status, nil +} From da97b0d3fcafef4c995efc5fe388f28cc775b840 Mon Sep 17 00:00:00 2001 From: ehila Date: Mon, 13 Dec 2021 17:48:59 -0500 Subject: [PATCH 2/3] upkeep: ran make verify Signed-off-by: ehila --- go.mod | 1 + vendor/modules.txt | 1 + 2 files changed, 2 insertions(+) diff --git a/go.mod b/go.mod index 3a51a44c89..e41b3a377c 100644 --- a/go.mod +++ b/go.mod @@ -12,6 +12,7 @@ require ( github.com/mikefarah/yq/v3 v3.0.0-20201202084205-8846255d1c37 github.com/onsi/ginkgo v1.16.4 github.com/openshift/api v0.0.0-20200331152225-585af27e34fd + github.com/openshift/client-go v0.0.0-20200326155132-2a6cd50aedd0 github.com/operator-framework/api v0.10.8-0.20211210205029-40cb9fd4036a github.com/operator-framework/operator-lifecycle-manager v0.0.0-00010101000000-000000000000 github.com/operator-framework/operator-registry v1.17.5 diff --git a/vendor/modules.txt b/vendor/modules.txt index e2004212de..5f8cd7b16d 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -489,6 +489,7 @@ github.com/opencontainers/runc/libcontainer/user ## explicit github.com/openshift/api/config/v1 # github.com/openshift/client-go v0.0.0-20200326155132-2a6cd50aedd0 => github.com/openshift/client-go v0.0.0-20200326155132-2a6cd50aedd0 +## explicit github.com/openshift/client-go/config/clientset/versioned github.com/openshift/client-go/config/clientset/versioned/scheme github.com/openshift/client-go/config/clientset/versioned/typed/config/v1 From b2d3ba8908b0973614ad6284cc26f7fe6afc08fd Mon Sep 17 00:00:00 2001 From: ehila Date: Mon, 24 Jan 2022 14:21:48 -0500 Subject: [PATCH 3/3] test: added unit tests for leader election refactored to make unit testing easier added unit tests for leader election values ran go mod tidy/vendor/verify Signed-off-by: ehila --- go.mod | 1 - pkg/leaderelection/leaderelection.go | 56 +++++++----- pkg/leaderelection/leaderelection_test.go | 102 ++++++++++++++++++++++ vendor/modules.txt | 1 - 4 files changed, 135 insertions(+), 25 deletions(-) create mode 100644 pkg/leaderelection/leaderelection_test.go diff --git a/go.mod b/go.mod index e41b3a377c..3a51a44c89 100644 --- a/go.mod +++ b/go.mod @@ -12,7 +12,6 @@ require ( github.com/mikefarah/yq/v3 v3.0.0-20201202084205-8846255d1c37 github.com/onsi/ginkgo v1.16.4 github.com/openshift/api v0.0.0-20200331152225-585af27e34fd - github.com/openshift/client-go v0.0.0-20200326155132-2a6cd50aedd0 github.com/operator-framework/api v0.10.8-0.20211210205029-40cb9fd4036a github.com/operator-framework/operator-lifecycle-manager v0.0.0-00010101000000-000000000000 github.com/operator-framework/operator-registry v1.17.5 diff --git a/pkg/leaderelection/leaderelection.go b/pkg/leaderelection/leaderelection.go index e80281ee3c..cb597925d4 100644 --- a/pkg/leaderelection/leaderelection.go +++ b/pkg/leaderelection/leaderelection.go @@ -2,17 +2,16 @@ package leaderelection import ( "context" - "fmt" "time" "github.com/go-logr/logr" configv1 "github.com/openshift/api/config/v1" - openshiftcorev1 "github.com/openshift/client-go/config/clientset/versioned/typed/config/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - + "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/rest" + "sigs.k8s.io/controller-runtime/pkg/client" ) const ( @@ -24,54 +23,65 @@ const ( defaultLeaseDuration = 137 * time.Second defaultRenewDeadline = 107 * time.Second defaultRetryPeriod = 26 * time.Second + + // Default leader election for SNO environments + // Impl Calculations: + // https://github.com/openshift/library-go/commit/2612981f3019479805ac8448b997266fc07a236a#diff-61dd95c7fd45fa18038e825205fbfab8a803f1970068157608b6b1e9e6c27248R127 + defaultSingleNodeLeaseDuration = 270 * time.Second + defaultSingleNodeRenewDeadline = 240 * time.Second + defaultSingleNodeRetryPeriod = 60 * time.Second ) -func GetLeaderElectionConfig(log logr.Logger, restClient *rest.Config, enabled bool) (defaultConfig configv1.LeaderElection) { - defaultConfig = configv1.LeaderElection{ - Disable: !enabled, +var ( + defaultLeaderElectionConfig = configv1.LeaderElection{ LeaseDuration: metav1.Duration{Duration: defaultLeaseDuration}, RenewDeadline: metav1.Duration{Duration: defaultRenewDeadline}, RetryPeriod: metav1.Duration{Duration: defaultRetryPeriod}, } +) +func GetLeaderElectionConfig(log logr.Logger, restConfig *rest.Config, enabled bool) (defaultConfig configv1.LeaderElection) { + client, err := client.New(restConfig, client.Options{}) + if err != nil { + log.Error(err, "unable to create client, using HA cluster values for leader election") + return defaultLeaderElectionConfig + } + configv1.AddToScheme(client.Scheme()) + return getLeaderElectionConfig(log, client, enabled) +} + +func getLeaderElectionConfig(log logr.Logger, client client.Client, enabled bool) (config configv1.LeaderElection) { + config = defaultLeaderElectionConfig + config.Disable = !enabled if enabled { ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(time.Second*3)) defer cancel() - infra, err := getClusterInfraStatus(ctx, restClient) + infra, err := getClusterInfraStatus(ctx, client) if err != nil { log.Error(err, "unable to get cluster infrastructure status, using HA cluster values for leader election") return } if infra != nil && infra.ControlPlaneTopology == configv1.SingleReplicaTopologyMode { - return leaderElectionSNOConfig(defaultConfig) + return leaderElectionSNOConfig(config) } } return } -// Default leader election for SNO environments -// Impl Calculations: -// https://github.com/openshift/library-go/commit/2612981f3019479805ac8448b997266fc07a236a#diff-61dd95c7fd45fa18038e825205fbfab8a803f1970068157608b6b1e9e6c27248R127 func leaderElectionSNOConfig(config configv1.LeaderElection) configv1.LeaderElection { ret := *(&config).DeepCopy() - ret.LeaseDuration.Duration = 270 * time.Second - ret.RenewDeadline.Duration = 240 * time.Second - ret.RetryPeriod.Duration = 60 * time.Second + ret.LeaseDuration.Duration = defaultSingleNodeLeaseDuration + ret.RenewDeadline.Duration = defaultSingleNodeRenewDeadline + ret.RetryPeriod.Duration = defaultSingleNodeRetryPeriod return ret } // Retrieve the cluster status, used to determine if we should use different leader election. -func getClusterInfraStatus(ctx context.Context, restClient *rest.Config) (*configv1.InfrastructureStatus, error) { - client, err := openshiftcorev1.NewForConfig(restClient) +func getClusterInfraStatus(ctx context.Context, client client.Client) (*configv1.InfrastructureStatus, error) { + infra := &configv1.Infrastructure{} + err := client.Get(ctx, types.NamespacedName{Name: infraResourceName}, infra) if err != nil { return nil, err } - infra, err := client.Infrastructures().Get(ctx, infraResourceName, metav1.GetOptions{}) - if err != nil { - return nil, err - } - if infra == nil { - return nil, fmt.Errorf("getting resource Infrastructure (name: %s) succeeded but object was nil", infraResourceName) - } return &infra.Status, nil } diff --git a/pkg/leaderelection/leaderelection_test.go b/pkg/leaderelection/leaderelection_test.go new file mode 100644 index 0000000000..39f0396f02 --- /dev/null +++ b/pkg/leaderelection/leaderelection_test.go @@ -0,0 +1,102 @@ +package leaderelection + +import ( + "reflect" + "testing" + + configv1 "github.com/openshift/api/config/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client/fake" +) + +func TestGetLeaderElectionConfig(t *testing.T) { + sch := runtime.NewScheme() + configv1.AddToScheme(sch) + testCases := []struct { + desc string + enabled bool + clusterInfra configv1.Infrastructure + expected configv1.LeaderElection + }{ + { + desc: "single node leader election values when ControlPlaneTopology is SingleReplicaTopologyMode", + enabled: true, + clusterInfra: configv1.Infrastructure{ + ObjectMeta: metav1.ObjectMeta{Name: infraResourceName}, + Status: configv1.InfrastructureStatus{ + ControlPlaneTopology: configv1.SingleReplicaTopologyMode, + }}, + expected: configv1.LeaderElection{ + Disable: false, + LeaseDuration: metav1.Duration{ + Duration: defaultSingleNodeLeaseDuration, + }, + RenewDeadline: metav1.Duration{ + Duration: defaultSingleNodeRenewDeadline, + }, + RetryPeriod: metav1.Duration{ + Duration: defaultSingleNodeRetryPeriod, + }, + }, + }, + { + desc: "ha leader election values when ControlPlaneTopology is HighlyAvailableTopologyMode", + enabled: true, + clusterInfra: configv1.Infrastructure{ + ObjectMeta: metav1.ObjectMeta{Name: infraResourceName}, + Status: configv1.InfrastructureStatus{ + ControlPlaneTopology: configv1.HighlyAvailableTopologyMode, + }}, + expected: configv1.LeaderElection{ + Disable: false, + LeaseDuration: metav1.Duration{ + Duration: defaultLeaseDuration, + }, + RenewDeadline: metav1.Duration{ + Duration: defaultRenewDeadline, + }, + RetryPeriod: metav1.Duration{ + Duration: defaultRetryPeriod, + }, + }, + }, + { + desc: "when disabled the default HA values should be returned", + enabled: false, + clusterInfra: configv1.Infrastructure{ + ObjectMeta: metav1.ObjectMeta{Name: infraResourceName}, + Status: configv1.InfrastructureStatus{ + ControlPlaneTopology: configv1.SingleReplicaTopologyMode, + }}, + expected: configv1.LeaderElection{ + Disable: true, + LeaseDuration: metav1.Duration{ + Duration: defaultLeaseDuration, + }, + RenewDeadline: metav1.Duration{ + Duration: defaultRenewDeadline, + }, + RetryPeriod: metav1.Duration{ + Duration: defaultRetryPeriod, + }, + }, + }, + } + + for _, tC := range testCases { + t.Run(tC.desc, func(t *testing.T) { + client := fake.NewClientBuilder(). + WithRuntimeObjects(&tC.clusterInfra).WithScheme(sch).Build() + + setupLog := ctrl.Log.WithName("leaderelection_config_testing") + + result := getLeaderElectionConfig(setupLog, client, tC.enabled) + if !reflect.DeepEqual(result, tC.expected) { + t.Errorf("expected %+v but got %+v", tC.expected, result) + } + }) + } +} diff --git a/vendor/modules.txt b/vendor/modules.txt index 5f8cd7b16d..e2004212de 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -489,7 +489,6 @@ github.com/opencontainers/runc/libcontainer/user ## explicit github.com/openshift/api/config/v1 # github.com/openshift/client-go v0.0.0-20200326155132-2a6cd50aedd0 => github.com/openshift/client-go v0.0.0-20200326155132-2a6cd50aedd0 -## explicit github.com/openshift/client-go/config/clientset/versioned github.com/openshift/client-go/config/clientset/versioned/scheme github.com/openshift/client-go/config/clientset/versioned/typed/config/v1