diff --git a/cmd/package-server-manager/main.go b/cmd/package-server-manager/main.go
index 45a01fcd15..8c746e46e1 100644
--- a/cmd/package-server-manager/main.go
+++ b/cmd/package-server-manager/main.go
@@ -17,6 +17,7 @@ import (
 	"sigs.k8s.io/controller-runtime/pkg/log/zap"
 	"sigs.k8s.io/controller-runtime/pkg/manager"
 
+	"github.com/openshift/operator-framework-olm/pkg/leaderelection"
 	controllers "github.com/openshift/operator-framework-olm/pkg/package-server-manager"
 	//+kubebuilder:scaffold:imports
 )
@@ -59,17 +60,20 @@ func run(cmd *cobra.Command, args []string) error {
 	ctrl.SetLogger(zap.New(zap.UseDevMode(true)))
 	setupLog := ctrl.Log.WithName("setup")
 
+	restConfig := ctrl.GetConfigOrDie()
+	le := leaderelection.GetLeaderElectionConfig(setupLog, restConfig, !disableLeaderElection)
+
 	packageserverCSVFields := fields.Set{"metadata.name": name}
-	mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), manager.Options{
+	mgr, err := ctrl.NewManager(restConfig, manager.Options{
 		Scheme:                  setupScheme(),
 		Namespace:               namespace,
 		MetricsBindAddress:      defaultMetricsPort,
 		LeaderElection:          !disableLeaderElection,
 		LeaderElectionNamespace: namespace,
 		LeaderElectionID:        leaderElectionConfigmapName,
-		RetryPeriod:             timeDurationPtr(defaultRetryPeriod),
-		RenewDeadline:           timeDurationPtr(defaultRenewDeadline),
-		LeaseDuration:           timeDurationPtr(defaultLeaseDuration),
+		LeaseDuration:           &le.LeaseDuration.Duration,
+		RenewDeadline:           &le.RenewDeadline.Duration,
+		RetryPeriod:             &le.RetryPeriod.Duration,
 		HealthProbeBindAddress:  healthCheckAddr,
 		NewCache: cache.BuilderWithOptions(cache.Options{
 			SelectorsByObject: cache.SelectorsByObject{
diff --git a/cmd/package-server-manager/util.go b/cmd/package-server-manager/util.go
index eb54b9e6e5..a5228e350a 100644
--- a/cmd/package-server-manager/util.go
+++ b/cmd/package-server-manager/util.go
@@ -1,8 +1,6 @@
 package main
 
 import (
-	"time"
-
 	configv1 "github.com/openshift/api/config/v1"
 	olmv1alpha1 "github.com/operator-framework/api/pkg/operators/v1alpha1"
 
@@ -11,20 +9,6 @@ import (
 	clientgoscheme "k8s.io/client-go/kubernetes/scheme"
 )
 
-const (
-	// Note: In order for SNO to GA, controllers need to handle ~60s of API server
-	// disruptions when attempting to get and sustain leader election:
-	// - https://github.com/openshift/library-go/pull/1104#discussion_r649313822
-	// - https://bugzilla.redhat.com/show_bug.cgi?id=1985697
-	defaultRetryPeriod   = 30 * time.Second
-	defaultRenewDeadline = 60 * time.Second
-	defaultLeaseDuration = 90 * time.Second
-)
-
-func timeDurationPtr(t time.Duration) *time.Duration {
-	return &t
-}
-
 func setupScheme() *runtime.Scheme {
 	scheme := runtime.NewScheme()
 	utilruntime.Must(clientgoscheme.AddToScheme(scheme))
diff --git a/pkg/leaderelection/leaderelection.go b/pkg/leaderelection/leaderelection.go
new file mode 100644
index 0000000000..cb597925d4
--- /dev/null
+++ b/pkg/leaderelection/leaderelection.go
@@ -0,0 +1,87 @@
+package leaderelection
+
+import (
+	"context"
+	"time"
+
+	"github.com/go-logr/logr"
+
+	configv1 "github.com/openshift/api/config/v1"
+
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/types"
+	"k8s.io/client-go/rest"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+)
+
+const (
+	infraResourceName = "cluster"
+
+	// Defaults follow conventions
+	// https://github.com/openshift/enhancements/blob/master/CONVENTIONS.md#high-availability
+	// Impl Calculations: https://github.com/openshift/library-go/commit/7e7d216ed91c3119800219c9194e5e57113d059a
+	defaultLeaseDuration = 137 * time.Second
+	defaultRenewDeadline = 107 * time.Second
+	defaultRetryPeriod   = 26 * time.Second
+
+	// Default leader election for SNO environments
+	// Impl Calculations:
+	// https://github.com/openshift/library-go/commit/2612981f3019479805ac8448b997266fc07a236a#diff-61dd95c7fd45fa18038e825205fbfab8a803f1970068157608b6b1e9e6c27248R127
+	defaultSingleNodeLeaseDuration = 270 * time.Second
+	defaultSingleNodeRenewDeadline = 240 * time.Second
+	defaultSingleNodeRetryPeriod   = 60 * time.Second
+)
+
+var (
+	defaultLeaderElectionConfig = configv1.LeaderElection{
+		LeaseDuration: metav1.Duration{Duration: defaultLeaseDuration},
+		RenewDeadline: metav1.Duration{Duration: defaultRenewDeadline},
+		RetryPeriod:   metav1.Duration{Duration: defaultRetryPeriod},
+	}
+)
+
+func GetLeaderElectionConfig(log logr.Logger, restConfig *rest.Config, enabled bool) (defaultConfig configv1.LeaderElection) {
+	client, err := client.New(restConfig, client.Options{})
+	if err != nil {
+		log.Error(err, "unable to create client, using HA cluster values for leader election")
+		return defaultLeaderElectionConfig
+	}
+	configv1.AddToScheme(client.Scheme())
+	return getLeaderElectionConfig(log, client, enabled)
+}
+
+func getLeaderElectionConfig(log logr.Logger, client client.Client, enabled bool) (config configv1.LeaderElection) {
+	config = defaultLeaderElectionConfig
+	config.Disable = !enabled
+	if enabled {
+		ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(time.Second*3))
+		defer cancel()
+		infra, err := getClusterInfraStatus(ctx, client)
+		if err != nil {
+			log.Error(err, "unable to get cluster infrastructure status, using HA cluster values for leader election")
+			return
+		}
+		if infra != nil && infra.ControlPlaneTopology == configv1.SingleReplicaTopologyMode {
+			return leaderElectionSNOConfig(config)
+		}
+	}
+	return
+}
+
+func leaderElectionSNOConfig(config configv1.LeaderElection) configv1.LeaderElection {
+	ret := *(&config).DeepCopy()
+	ret.LeaseDuration.Duration = defaultSingleNodeLeaseDuration
+	ret.RenewDeadline.Duration = defaultSingleNodeRenewDeadline
+	ret.RetryPeriod.Duration = defaultSingleNodeRetryPeriod
+	return ret
+}
+
+// Retrieve the cluster status, used to determine if we should use different leader election.
+func getClusterInfraStatus(ctx context.Context, client client.Client) (*configv1.InfrastructureStatus, error) {
+	infra := &configv1.Infrastructure{}
+	err := client.Get(ctx, types.NamespacedName{Name: infraResourceName}, infra)
+	if err != nil {
+		return nil, err
+	}
+	return &infra.Status, nil
+}
diff --git a/pkg/leaderelection/leaderelection_test.go b/pkg/leaderelection/leaderelection_test.go
new file mode 100644
index 0000000000..39f0396f02
--- /dev/null
+++ b/pkg/leaderelection/leaderelection_test.go
@@ -0,0 +1,102 @@
+package leaderelection
+
+import (
+	"reflect"
+	"testing"
+
+	configv1 "github.com/openshift/api/config/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/runtime"
+
+	ctrl "sigs.k8s.io/controller-runtime"
+	"sigs.k8s.io/controller-runtime/pkg/client/fake"
+)
+
+func TestGetLeaderElectionConfig(t *testing.T) {
+	sch := runtime.NewScheme()
+	configv1.AddToScheme(sch)
+	testCases := []struct {
+		desc         string
+		enabled      bool
+		clusterInfra configv1.Infrastructure
+		expected     configv1.LeaderElection
+	}{
+		{
+			desc:    "single node leader election values when ControlPlaneTopology is SingleReplicaTopologyMode",
+			enabled: true,
+			clusterInfra: configv1.Infrastructure{
+				ObjectMeta: metav1.ObjectMeta{Name: infraResourceName},
+				Status: configv1.InfrastructureStatus{
+					ControlPlaneTopology: configv1.SingleReplicaTopologyMode,
+				}},
+			expected: configv1.LeaderElection{
+				Disable: false,
+				LeaseDuration: metav1.Duration{
+					Duration: defaultSingleNodeLeaseDuration,
+				},
+				RenewDeadline: metav1.Duration{
+					Duration: defaultSingleNodeRenewDeadline,
+				},
+				RetryPeriod: metav1.Duration{
+					Duration: defaultSingleNodeRetryPeriod,
+				},
+			},
+		},
+		{
+			desc:    "ha leader election values when ControlPlaneTopology is HighlyAvailableTopologyMode",
+			enabled: true,
+			clusterInfra: configv1.Infrastructure{
+				ObjectMeta: metav1.ObjectMeta{Name: infraResourceName},
+				Status: configv1.InfrastructureStatus{
+					ControlPlaneTopology: configv1.HighlyAvailableTopologyMode,
+				}},
+			expected: configv1.LeaderElection{
+				Disable: false,
+				LeaseDuration: metav1.Duration{
+					Duration: defaultLeaseDuration,
+				},
+				RenewDeadline: metav1.Duration{
+					Duration: defaultRenewDeadline,
+				},
+				RetryPeriod: metav1.Duration{
+					Duration: defaultRetryPeriod,
+				},
+			},
+		},
+		{
+			desc:    "when disabled the default HA values should be returned",
+			enabled: false,
+			clusterInfra: configv1.Infrastructure{
+				ObjectMeta: metav1.ObjectMeta{Name: infraResourceName},
+				Status: configv1.InfrastructureStatus{
+					ControlPlaneTopology: configv1.SingleReplicaTopologyMode,
+				}},
+			expected: configv1.LeaderElection{
+				Disable: true,
+				LeaseDuration: metav1.Duration{
+					Duration: defaultLeaseDuration,
+				},
+				RenewDeadline: metav1.Duration{
+					Duration: defaultRenewDeadline,
+				},
+				RetryPeriod: metav1.Duration{
+					Duration: defaultRetryPeriod,
+				},
+			},
+		},
+	}
+
+	for _, tC := range testCases {
+		t.Run(tC.desc, func(t *testing.T) {
+			client := fake.NewClientBuilder().
+				WithRuntimeObjects(&tC.clusterInfra).WithScheme(sch).Build()
+
+			setupLog := ctrl.Log.WithName("leaderelection_config_testing")
+
+			result := getLeaderElectionConfig(setupLog, client, tC.enabled)
+			if !reflect.DeepEqual(result, tC.expected) {
+				t.Errorf("expected %+v but got %+v", tC.expected, result)
+			}
+		})
+	}
+}
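
A note on the chosen timings. The standalone sketch below is illustrative only and not part of the diff: it checks both timing profiles against the validation that k8s.io/client-go/tools/leaderelection applies when building a leader elector, which rejects configs unless LeaseDuration > RenewDeadline and RenewDeadline > JitterFactor * RetryPeriod (JitterFactor is 1.2 in client-go). The profile values are copied from the constants introduced in pkg/leaderelection/leaderelection.go; the rest is plain standard-library Go.

package main

import (
	"fmt"
	"time"
)

func main() {
	// lease, renew, retry per profile, copied from the PR's constants.
	profiles := map[string][3]time.Duration{
		"ha":          {137 * time.Second, 107 * time.Second, 26 * time.Second},
		"single-node": {270 * time.Second, 240 * time.Second, 60 * time.Second},
	}
	// JitterFactor used by k8s.io/client-go/tools/leaderelection.
	const jitterFactor = 1.2
	for name, p := range profiles {
		lease, renew, retry := p[0], p[1], p[2]
		// Mirrors client-go's config validation: the lease must outlive the
		// renew deadline, and the renew deadline must exceed one jittered retry.
		ok := lease > renew && float64(renew) > jitterFactor*float64(retry)
		fmt.Printf("%s: lease=%v renew=%v retry=%v valid=%v\n", name, lease, renew, retry, ok)
	}
}

Both profiles pass: for single-node, 240s > 1.2 * 60s = 72s and 270s > 240s; for HA, 107s > 1.2 * 26s = 31.2s and 137s > 107s. This is why the manager can plumb the returned durations straight into manager.Options without further adjustment.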