diff --git a/bindata/bootkube/manifests/00_openshift-etcd-ns.yaml b/bindata/bootkube/manifests/00_openshift-etcd-ns.yaml
index 300f2b0863..72885cc3b5 100644
--- a/bindata/bootkube/manifests/00_openshift-etcd-ns.yaml
+++ b/bindata/bootkube/manifests/00_openshift-etcd-ns.yaml
@@ -3,6 +3,9 @@ kind: Namespace
 metadata:
   annotations:
     openshift.io/node-selector: ""
+{{- range $key, $val := .NamespaceAnnotations }}
+    {{$key}}: "{{$val}}"
+{{- end}}
   name: openshift-etcd
   labels:
     openshift.io/run-level: "0"
diff --git a/pkg/cmd/render/render.go b/pkg/cmd/render/render.go
index ef81a58d56..d827f3b856 100644
--- a/pkg/cmd/render/render.go
+++ b/pkg/cmd/render/render.go
@@ -15,6 +15,7 @@ import (
     "strings"

     "github.com/openshift/cluster-etcd-operator/pkg/dnshelpers"
+    "github.com/openshift/cluster-etcd-operator/pkg/operator/ceohelpers"
     "github.com/openshift/cluster-etcd-operator/pkg/tlshelpers"

     "github.com/ghodss/yaml"
@@ -42,6 +43,8 @@ type renderOpts struct {
     clusterConfigMapFile string
     infraConfigFile      string
     bootstrapIP          string
+
+    delayedHABootstrapScalingStrategyMarker string
 }

 // NewRenderCommand creates a render command.
@@ -132,6 +135,7 @@ func (r *renderOpts) AddFlags(fs *pflag.FlagSet) {
     fs.StringVar(&r.clusterConfigMapFile, "cluster-configmap-file", "/assets/manifests/cluster-config.yaml", "File containing the cluster-config-v1 configmap.")
     fs.StringVar(&r.infraConfigFile, "infra-config-file", "/assets/manifests/cluster-infrastructure-02-config.yml", "File containing infrastructure.config.openshift.io manifest.")
     fs.StringVar(&r.bootstrapIP, "bootstrap-ip", r.bootstrapIP, "bootstrap IP used to indicate where to find the first etcd endpoint")
+    fs.StringVar(&r.delayedHABootstrapScalingStrategyMarker, "delayed-ha-bootstrap-scaling-marker-file", "/assets/assisted-install-bootstrap", "Marker file that, if present, enables the delayed HA bootstrap scaling strategy")
 }

 // Validate verifies the inputs.
@@ -198,6 +202,9 @@ type TemplateData struct {

     // ComputedEnvVars name/value pairs to populate env: for static pod.
     ComputedEnvVars string
+
+    // NamespaceAnnotations are additional annotations to apply to the etcd namespace.
+    NamespaceAnnotations map[string]string
 }

 type StaticFile struct {
@@ -279,6 +286,15 @@ func newTemplateData(opts *renderOpts) (*TemplateData, error) {
         return nil, err
     }

+    // Use a marker file to configure the bootstrap scaling strategy.
+    if _, err := os.Stat(opts.delayedHABootstrapScalingStrategyMarker); err == nil {
+        if templateData.NamespaceAnnotations == nil {
+            templateData.NamespaceAnnotations = map[string]string{}
+        }
+        templateData.NamespaceAnnotations[ceohelpers.DelayedHABootstrapScalingStrategyAnnotation] = ""
+        klog.Infof("using delayed HA bootstrap scaling strategy due to presence of marker file %s", opts.delayedHABootstrapScalingStrategyMarker)
+    }
+
     return &templateData, nil
 }

diff --git a/pkg/etcdenvvar/envvarcontroller.go b/pkg/etcdenvvar/envvarcontroller.go
index 3a212a6d49..d6589bb073 100644
--- a/pkg/etcdenvvar/envvarcontroller.go
+++ b/pkg/etcdenvvar/envvarcontroller.go
@@ -9,7 +9,6 @@ import (
     operatorv1 "github.com/openshift/api/operator/v1"
     configv1informers "github.com/openshift/client-go/config/informers/externalversions/config/v1"
     configv1listers "github.com/openshift/client-go/config/listers/config/v1"
-    "github.com/openshift/cluster-etcd-operator/pkg/operator/operatorclient"
     "github.com/openshift/library-go/pkg/operator/events"
     "github.com/openshift/library-go/pkg/operator/v1helpers"
     utilruntime "k8s.io/apimachinery/pkg/util/runtime"
@@ -18,6 +17,9 @@ import (
     "k8s.io/client-go/tools/cache"
     "k8s.io/client-go/util/workqueue"
     "k8s.io/klog/v2"
+
+    "github.com/openshift/cluster-etcd-operator/pkg/operator/ceohelpers"
+    "github.com/openshift/cluster-etcd-operator/pkg/operator/operatorclient"
 )

 const workQueueKey = "key"
@@ -34,6 +36,7 @@ type EnvVarController struct {
     networkLister        configv1listers.NetworkLister
     configmapLister      corev1listers.ConfigMapLister
     nodeLister           corev1listers.NodeLister
+    namespaceLister      corev1listers.NamespaceLister

     // queue only ever has one item, but it has nice error handling backoff/retry semantics
     queue workqueue.RateLimitingInterface
@@ -57,6 +60,7 @@ func NewEnvVarController(
         operatorClient:       operatorClient,
         infrastructureLister: infrastructureInformer.Lister(),
         networkLister:        networkInformer.Lister(),
+        namespaceLister:      kubeInformersForNamespaces.InformersFor("").Core().V1().Namespaces().Lister(),
         configmapLister:      kubeInformersForNamespaces.InformersFor(operatorclient.TargetNamespace).Core().V1().ConfigMaps().Lister(),
         nodeLister:           kubeInformersForNamespaces.InformersFor("").Core().V1().Nodes().Lister(),
         targetImagePullSpec:  targetImagePullSpec,
@@ -123,6 +127,10 @@ func (c *EnvVarController) sync() error {
 }

 func (c *EnvVarController) checkEnvVars() error {
+    if err := ceohelpers.CheckSafeToScaleCluster(c.configmapLister, c.operatorClient, c.namespaceLister); err != nil {
+        return fmt.Errorf("can't update etcd pod configurations because scaling is currently unsafe: %w", err)
+    }
+
     operatorSpec, operatorStatus, _, err := c.operatorClient.GetStaticPodOperatorState()
     if err != nil {
         return err
diff --git a/pkg/etcdenvvar/etcd_env.go b/pkg/etcdenvvar/etcd_env.go
index 8b63b289cd..947213a411 100644
--- a/pkg/etcdenvvar/etcd_env.go
+++ b/pkg/etcdenvvar/etcd_env.go
@@ -6,7 +6,6 @@ import (
     "strings"

     "github.com/openshift/cluster-etcd-operator/pkg/dnshelpers"
-    "github.com/openshift/cluster-etcd-operator/pkg/operator/ceohelpers"
     "github.com/openshift/cluster-etcd-operator/pkg/operator/operatorclient"

     operatorv1 "github.com/openshift/api/operator/v1"
@@ -59,20 +58,6 @@ var envVarFns = []envVarFunc{
 // NODE_%s_ETCD_URL_HOST
 // NODE_%s_ETCD_NAME
 func getEtcdEnvVars(envVarContext envVarContext) (map[string]string, error) {
-    // TODO once we are past bootstrapping, this restriction shouldn't be needed anymore.
-    // we have it because the env vars were not getting set in the pod and the static pod operator started
-    // rolling out to another node, which caused a failure.
-    isUnsupportedUnsafeEtcd, err := ceohelpers.IsUnsupportedUnsafeEtcd(&envVarContext.spec)
-    if err != nil {
-        return nil, err
-    }
-    switch {
-    case isUnsupportedUnsafeEtcd && len(envVarContext.status.NodeStatuses) < 1:
-        return nil, fmt.Errorf("at least one node is required to have a valid configuration")
-    case !isUnsupportedUnsafeEtcd && len(envVarContext.status.NodeStatuses) < 3:
-        return nil, fmt.Errorf("at least three nodes are required to have a valid configuration")
-    }
-
     ret := map[string]string{}

     for _, envVarFn := range envVarFns {
diff --git a/pkg/operator/bootstrapteardown/bootstrap_teardown_controller.go b/pkg/operator/bootstrapteardown/bootstrap_teardown_controller.go
index a8769e2e98..57202e0fed 100644
--- a/pkg/operator/bootstrapteardown/bootstrap_teardown_controller.go
+++ b/pkg/operator/bootstrapteardown/bootstrap_teardown_controller.go
@@ -2,6 +2,7 @@ package bootstrapteardown

 import (
     "context"
+    "fmt"
     "time"

     apierrors "k8s.io/apimachinery/pkg/api/errors"
@@ -20,6 +21,7 @@ type BootstrapTeardownController struct {
     operatorClient  v1helpers.StaticPodOperatorClient
     etcdClient      etcdcli.EtcdClient
     configmapLister corev1listers.ConfigMapLister
+    namespaceLister corev1listers.NamespaceLister
 }

 func NewBootstrapTeardownController(
@@ -32,6 +34,7 @@ func NewBootstrapTeardownController(
         operatorClient:  operatorClient,
         etcdClient:      etcdClient,
         configmapLister: kubeInformersForNamespaces.InformersFor("kube-system").Core().V1().ConfigMaps().Lister(),
+        namespaceLister: kubeInformersForNamespaces.InformersFor("").Core().V1().Namespaces().Lister(),
     }

     return factory.New().ResyncEvery(time.Minute).WithInformers(
@@ -147,23 +150,24 @@ func (c *BootstrapTeardownController) canRemoveEtcdBootstrap() (bool, bool, erro
         return false, hasBootstrap, nil
     }

-    isUnsupportedUnsafeEtcd, err := c.isUnsupportedUnsafeEtcd()
+    scalingStrategy, err := ceohelpers.GetBootstrapScalingStrategy(c.operatorClient, c.namespaceLister)
     if err != nil {
-        return false, hasBootstrap, err
+        return false, hasBootstrap, fmt.Errorf("failed to get bootstrap scaling strategy: %w", err)
     }

-    switch {
-    case !isUnsupportedUnsafeEtcd && len(members) < 4:
-        // bootstrap is not safe to remove until we scale to 4
-        return false, hasBootstrap, nil
-    case isUnsupportedUnsafeEtcd && len(members) < 2:
-        // if etcd is unsupported, bootstrap is not safe to remove
-        // until we scale to 2
-        return false, hasBootstrap, nil
-    default:
-        // do nothing fall through on checking the unhealthy members
+    // First, enforce the main HA invariants in terms of member counts.
+    switch scalingStrategy {
+    case ceohelpers.HAScalingStrategy:
+        if len(members) < 4 {
+            return false, hasBootstrap, nil
+        }
+    case ceohelpers.DelayedHAScalingStrategy, ceohelpers.UnsafeScalingStrategy:
+        if len(members) < 2 {
+            return false, hasBootstrap, nil
+        }
     }

+    // Next, given member counts are satisfied, check member health.
     unhealthyMembers, err := c.etcdClient.UnhealthyMembers()
     if err != nil {
         return false, hasBootstrap, nil
@@ -182,11 +186,3 @@ func (c *BootstrapTeardownController) canRemoveEtcdBootstrap() (bool, bool, erro
         return false, hasBootstrap, nil
     }
 }
-
-func (c *BootstrapTeardownController) isUnsupportedUnsafeEtcd() (bool, error) {
-    spec, _, _, err := c.operatorClient.GetStaticPodOperatorState()
-    if err != nil {
-        return false, err
-    }
-    return ceohelpers.IsUnsupportedUnsafeEtcd(spec)
-}
diff --git a/pkg/operator/ceohelpers/bootstrap.go b/pkg/operator/ceohelpers/bootstrap.go
new file mode 100644
index 0000000000..dda79512a7
--- /dev/null
+++ b/pkg/operator/ceohelpers/bootstrap.go
@@ -0,0 +1,164 @@
+package ceohelpers
+
+import (
+    "fmt"
+
+    "github.com/openshift/library-go/pkg/operator/v1helpers"
+    "k8s.io/apimachinery/pkg/api/errors"
+    corev1listers "k8s.io/client-go/listers/core/v1"
+    "k8s.io/klog/v2"
+
+    "github.com/openshift/cluster-etcd-operator/pkg/operator/operatorclient"
+)
+
+// BootstrapScalingStrategy describes the invariants which will be enforced when
+// scaling the etcd cluster.
+type BootstrapScalingStrategy string
+
+const (
+    // HAScalingStrategy means the etcd cluster will only be scaled up when at least
+    // 3 nodes are available so that HA is enforced at all times. This rule applies
+    // during bootstrapping and the steady state.
+    //
+    // This is the default strategy.
+    HAScalingStrategy BootstrapScalingStrategy = "HAScalingStrategy"
+
+    // DelayedHAScalingStrategy means that during bootstrapping, the etcd cluster will
+    // be allowed to scale when at least 2 members are available (which is not HA),
+    // but after bootstrapping any further scaling will require 3 nodes in the same
+    // way as HAScalingStrategy.
+    //
+    // This strategy is selected by adding the `openshift.io/delayed-ha-bootstrap`
+    // annotation to the openshift-etcd namespace.
+    DelayedHAScalingStrategy BootstrapScalingStrategy = "DelayedHAScalingStrategy"
+
+    // UnsafeScalingStrategy means scaling will occur without regard to nodes and
+    // any effect on quorum. Use of this strategy isn't officially tested or supported,
+    // but is made available for ad-hoc use.
+    //
+    // This strategy is selected by setting unsupportedConfigOverrides on the
+    // operator config.
+    UnsafeScalingStrategy BootstrapScalingStrategy = "UnsafeScalingStrategy"
+)
+
+const (
+    // DelayedHABootstrapScalingStrategyAnnotation is an annotation on the openshift-etcd
+    // namespace which if present indicates the DelayedHAScalingStrategy strategy
+    // should be used.
+    DelayedHABootstrapScalingStrategyAnnotation = "openshift.io/delayed-ha-bootstrap"
+)
+
+// GetBootstrapScalingStrategy determines the scaling strategy to use.
+func GetBootstrapScalingStrategy(staticPodClient v1helpers.StaticPodOperatorClient, namespaceLister corev1listers.NamespaceLister) (BootstrapScalingStrategy, error) {
+    var strategy BootstrapScalingStrategy
+
+    operatorSpec, _, _, err := staticPodClient.GetStaticPodOperatorState()
+    if err != nil {
+        return strategy, fmt.Errorf("failed to get operator state: %w", err)
+    }
+
+    isUnsupportedUnsafeEtcd, err := isUnsupportedUnsafeEtcd(operatorSpec)
+    if err != nil {
+        return strategy, fmt.Errorf("couldn't determine etcd unsupported override status, assuming default HA scaling strategy: %w", err)
+    }
+
+    etcdNamespace, err := namespaceLister.Get(operatorclient.TargetNamespace)
+    if err != nil {
+        return strategy, fmt.Errorf("failed to get %s namespace: %w", operatorclient.TargetNamespace, err)
+    }
+    _, hasDelayedHAAnnotation := etcdNamespace.Annotations[DelayedHABootstrapScalingStrategyAnnotation]
+
+    switch {
+    case isUnsupportedUnsafeEtcd:
+        strategy = UnsafeScalingStrategy
+    case hasDelayedHAAnnotation:
+        strategy = DelayedHAScalingStrategy
+    default:
+        strategy = HAScalingStrategy
+    }
+    return strategy, nil
+}
+
+// CheckSafeToScaleCluster is used to implement the bootstrap scaling strategy invariants.
+// This function returns nil if cluster conditions are such that it's safe to scale
+// the etcd cluster based on the scaling strategy in use, and otherwise will return
+// an error explaining why it's unsafe to scale.
+func CheckSafeToScaleCluster(configmapLister corev1listers.ConfigMapLister, staticPodClient v1helpers.StaticPodOperatorClient, namespaceLister corev1listers.NamespaceLister) error {
+    bootstrapComplete, err := IsBootstrapComplete(configmapLister, staticPodClient)
+    if err != nil {
+        return fmt.Errorf("failed to determine bootstrap status: %w", err)
+    }
+
+    _, operatorStatus, _, err := staticPodClient.GetStaticPodOperatorState()
+    if err != nil {
+        return fmt.Errorf("failed to get operator state: %w", err)
+    }
+
+    scalingStrategy, err := GetBootstrapScalingStrategy(staticPodClient, namespaceLister)
+    if err != nil {
+        return fmt.Errorf("failed to get bootstrap scaling strategy: %w", err)
+    }
+
+    var minimumNodes int
+    switch scalingStrategy {
+    case HAScalingStrategy:
+        minimumNodes = 3
+    case UnsafeScalingStrategy:
+        minimumNodes = 1
+    case DelayedHAScalingStrategy:
+        if bootstrapComplete {
+            minimumNodes = 3
+        } else {
+            minimumNodes = 2
+        }
+    default:
+        return fmt.Errorf("unrecognized scaling strategy %q", scalingStrategy)
+    }
+
+    nodeCount := len(operatorStatus.NodeStatuses)
+    if nodeCount < minimumNodes {
+        return fmt.Errorf("%d nodes are required, but only %d are available", minimumNodes, nodeCount)
+    }
+
+    klog.V(4).Infof("node count %d satisfies minimum of %d required by the %s bootstrap scaling strategy", nodeCount, minimumNodes, scalingStrategy)
+    return nil
+}
+
+// IsBootstrapComplete returns true if bootstrap has completed.
+func IsBootstrapComplete(configMapClient corev1listers.ConfigMapLister, staticPodClient v1helpers.StaticPodOperatorClient) (bool, error) {
+    // do a cheap check to see if the annotation is already gone.
+    // check to see if bootstrapping is complete
+    bootstrapFinishedConfigMap, err := configMapClient.ConfigMaps("kube-system").Get("bootstrap")
+    if err != nil {
+        if errors.IsNotFound(err) {
+            // If the resource was deleted (e.g. by an admin) after bootstrap is actually complete,
+            // this is a false negative.
+            klog.V(4).Infof("bootstrap considered incomplete because the kube-system/bootstrap configmap wasn't found")
+            return false, nil
+        }
+        // We don't know, give up quickly.
+        return false, fmt.Errorf("failed to get configmap %s/%s: %w", "kube-system", "bootstrap", err)
+    }
+
+    if status, ok := bootstrapFinishedConfigMap.Data["status"]; !ok || status != "complete" {
+        // do nothing, not torn down
+        klog.V(4).Infof("bootstrap considered incomplete because status is %q", status)
+        return false, nil
+    }
+
+    // now run check to stability of revisions
+    _, status, _, err := staticPodClient.GetStaticPodOperatorState()
+    if err != nil {
+        return false, fmt.Errorf("failed to get static pod operator state: %w", err)
+    }
+    if status.LatestAvailableRevision == 0 {
+        return false, nil
+    }
+    for _, curr := range status.NodeStatuses {
+        if curr.CurrentRevision != status.LatestAvailableRevision {
+            klog.V(4).Infof("bootstrap considered incomplete because revision %d is still in progress", status.LatestAvailableRevision)
+            return false, nil
+        }
+    }
+    return true, nil
+}
diff --git a/pkg/operator/ceohelpers/bootstrap_test.go b/pkg/operator/ceohelpers/bootstrap_test.go
new file mode 100644
index 0000000000..82a28c55d3
--- /dev/null
+++ b/pkg/operator/ceohelpers/bootstrap_test.go
@@ -0,0 +1,291 @@
+package ceohelpers
+
+import (
+    "fmt"
+    "testing"
+
+    "github.com/openshift/library-go/pkg/operator/v1helpers"
+    corev1 "k8s.io/api/core/v1"
+    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+    "k8s.io/apimachinery/pkg/runtime"
+    corev1listers "k8s.io/client-go/listers/core/v1"
+    "k8s.io/client-go/tools/cache"
+
+    operatorv1 "github.com/openshift/api/operator/v1"
+
+    "github.com/openshift/cluster-etcd-operator/pkg/operator/operatorclient"
+)
+
+var (
+    defaultNamespace = &corev1.Namespace{
+        ObjectMeta: metav1.ObjectMeta{Name: operatorclient.TargetNamespace},
+    }
+
+    namespaceWithDelayedHAEnabled = &corev1.Namespace{
+        ObjectMeta: metav1.ObjectMeta{
+            Name: operatorclient.TargetNamespace,
+            Annotations: map[string]string{
+                DelayedHABootstrapScalingStrategyAnnotation: "",
+            },
+        },
+    }
+
+    defaultOperatorConfig = operatorv1.StaticPodOperatorSpec{}
+
+    unsupportedOperatorConfig = operatorv1.StaticPodOperatorSpec{
+        OperatorSpec: operatorv1.OperatorSpec{
+            UnsupportedConfigOverrides: runtime.RawExtension{
+                Raw: []byte(`useUnsupportedUnsafeNonHANonProductionUnstableEtcd: "true"`),
+            },
+        },
+    }
+
+    bootstrapComplete = &corev1.ConfigMap{
+        ObjectMeta: metav1.ObjectMeta{Name: "bootstrap", Namespace: "kube-system"},
+        Data:       map[string]string{"status": "complete"},
+    }
+
+    bootstrapProgressing = &corev1.ConfigMap{
+        ObjectMeta: metav1.ObjectMeta{Name: "bootstrap", Namespace: "kube-system"},
+        Data:       map[string]string{"status": "progressing"},
+    }
+
+    oneNodeAtCurrentRevision = []operatorv1.NodeStatus{
+        {NodeName: "node-1", CurrentRevision: 1},
+    }
+
+    twoNodesAtCurrentRevision = []operatorv1.NodeStatus{
+        {NodeName: "node-1", CurrentRevision: 1},
+        {NodeName: "node-2", CurrentRevision: 1},
+    }
+
+    twoNodesProgressingTowardsCurrentRevision = []operatorv1.NodeStatus{
+        {NodeName: "node-1", CurrentRevision: 1},
+        {NodeName: "node-2", CurrentRevision: 0},
+    }
+
+    threeNodesAtCurrentRevision = []operatorv1.NodeStatus{
+        {NodeName: "node-1", CurrentRevision: 1},
+        {NodeName: "node-2", CurrentRevision: 1},
+        {NodeName: "node-3", CurrentRevision: 1},
+    }
+
+    zeroNodesAtAnyRevision = []operatorv1.NodeStatus{}
+)
+
+func Test_GetBootstrapScalingStrategy(t *testing.T) {
+    tests := map[string]struct {
+        namespace      *corev1.Namespace
+        operatorConfig operatorv1.StaticPodOperatorSpec
+        expectStrategy BootstrapScalingStrategy
+    }{
+        "default should be HA": {
+            namespace:      defaultNamespace,
+            operatorConfig: defaultOperatorConfig,
+            expectStrategy: HAScalingStrategy,
+        },
+        "unsupported": {
+            namespace:      defaultNamespace,
+            operatorConfig: unsupportedOperatorConfig,
+            expectStrategy: UnsafeScalingStrategy,
+        },
+        "delayed HA": {
+            namespace:      namespaceWithDelayedHAEnabled,
+            operatorConfig: defaultOperatorConfig,
+            expectStrategy: DelayedHAScalingStrategy,
+        },
+    }
+
+    for name, test := range tests {
+        t.Run(name, func(t *testing.T) {
+            indexer := cache.NewIndexer(cache.MetaNamespaceKeyFunc, cache.Indexers{cache.NamespaceIndex: cache.MetaNamespaceIndexFunc})
+            if test.namespace != nil {
+                if err := indexer.Add(test.namespace); err != nil {
+                    t.Fatal(err)
+                }
+            }
+            fakeNamespaceMapLister := corev1listers.NewNamespaceLister(indexer)
+
+            fakeStaticPodClient := v1helpers.NewFakeStaticPodOperatorClient(&test.operatorConfig, nil, nil, nil)
+
+            actualStrategy, err := GetBootstrapScalingStrategy(fakeStaticPodClient, fakeNamespaceMapLister)
+            if err != nil {
+                t.Errorf("unexpected error: %s", err)
+                return
+            }
+            if test.expectStrategy != actualStrategy {
+                t.Errorf("expected strategy=%v, got %v", test.expectStrategy, actualStrategy)
+            }
+        })
+    }
+}
+
+func Test_IsBootstrapComplete(t *testing.T) {
+    tests := map[string]struct {
+        bootstrapConfigMap *corev1.ConfigMap
+        nodes              []operatorv1.NodeStatus
+        expectComplete     bool
+        expectError        error
+    }{
+        "bootstrap complete, nodes up to date": {
+            bootstrapConfigMap: bootstrapComplete,
+            nodes:              twoNodesAtCurrentRevision,
+            expectComplete:     true,
+            expectError:        nil,
+        },
+        "bootstrap progressing, nodes up to date": {
+            bootstrapConfigMap: bootstrapProgressing,
+            nodes:              twoNodesAtCurrentRevision,
+            expectComplete:     false,
+            expectError:        nil,
+        },
+        "bootstrap configmap missing": {
+            bootstrapConfigMap: nil,
+            nodes:              twoNodesAtCurrentRevision,
+            expectComplete:     false,
+            expectError:        nil,
+        },
+        "bootstrap complete, no recorded revisions": {
+            bootstrapConfigMap: bootstrapComplete,
+            nodes:              zeroNodesAtAnyRevision,
+            expectComplete:     true,
+            expectError:        nil,
+        },
+        "bootstrap complete, node progressing": {
+            bootstrapConfigMap: bootstrapComplete,
+            nodes:              twoNodesProgressingTowardsCurrentRevision,
+            expectComplete:     false,
+            expectError:        nil,
+        },
+    }
+
+    for name, test := range tests {
+        t.Run(name, func(t *testing.T) {
+            indexer := cache.NewIndexer(cache.MetaNamespaceKeyFunc, cache.Indexers{cache.NamespaceIndex: cache.MetaNamespaceIndexFunc})
+            if test.bootstrapConfigMap != nil {
+                if err := indexer.Add(test.bootstrapConfigMap); err != nil {
+                    t.Fatal(err)
+                }
+            }
+            fakeConfigMapLister := corev1listers.NewConfigMapLister(indexer)
+
+            operatorStatus := &operatorv1.StaticPodOperatorStatus{
+                LatestAvailableRevision: 1,
+                NodeStatuses:            test.nodes,
+            }
+            fakeStaticPodClient := v1helpers.NewFakeStaticPodOperatorClient(nil, operatorStatus, nil, nil)
+
+            actualComplete, actualErr := IsBootstrapComplete(fakeConfigMapLister, fakeStaticPodClient)
+
+            if test.expectComplete != actualComplete {
+                t.Errorf("expected complete=%v, got %v", test.expectComplete, actualComplete)
+            }
+            if test.expectError != actualErr {
+                t.Errorf("expected error=%v, got %v", test.expectError, actualErr)
+            }
+        })
+    }
+}
+
+func Test_CheckSafeToScaleCluster(t *testing.T) {
+    tests := map[string]struct {
+        namespace          *corev1.Namespace
+        bootstrapConfigMap *corev1.ConfigMap
+        operatorConfig     operatorv1.StaticPodOperatorSpec
+        nodes              []operatorv1.NodeStatus
+        expectComplete     bool
+        expectError        error
+    }{
+        "HA with sufficient nodes": {
+            namespace:          defaultNamespace,
+            bootstrapConfigMap: bootstrapComplete,
+            operatorConfig:     defaultOperatorConfig,
+            nodes:              threeNodesAtCurrentRevision,
+            expectError:        nil,
+        },
+        "HA with insufficient nodes": {
+            namespace:          defaultNamespace,
+            bootstrapConfigMap: bootstrapComplete,
+            operatorConfig:     defaultOperatorConfig,
+            nodes:              twoNodesAtCurrentRevision,
+            expectError:        fmt.Errorf("not enough nodes"),
+        },
+        "unsupported with sufficient nodes": {
+            namespace:          defaultNamespace,
+            bootstrapConfigMap: bootstrapComplete,
+            operatorConfig:     unsupportedOperatorConfig,
+            nodes:              oneNodeAtCurrentRevision,
+            expectError:        nil,
+        },
+        "unsupported with insufficient nodes": {
+            namespace:          defaultNamespace,
+            bootstrapConfigMap: bootstrapComplete,
+            operatorConfig:     unsupportedOperatorConfig,
+            nodes:              zeroNodesAtAnyRevision,
+            expectError:        fmt.Errorf("not enough nodes"),
+        },
+        "delayed HA with sufficient nodes during bootstrap": {
+            namespace:          namespaceWithDelayedHAEnabled,
+            bootstrapConfigMap: bootstrapProgressing,
+            operatorConfig:     defaultOperatorConfig,
+            nodes:              twoNodesAtCurrentRevision,
+            expectError:        nil,
+        },
+        "delayed HA with insufficient nodes during bootstrap": {
+            namespace:          namespaceWithDelayedHAEnabled,
+            bootstrapConfigMap: bootstrapProgressing,
+            operatorConfig:     defaultOperatorConfig,
+            nodes:              oneNodeAtCurrentRevision,
+            expectError:        fmt.Errorf("not enough nodes"),
+        },
+        "delayed HA with sufficient nodes during steady state": {
+            namespace:          namespaceWithDelayedHAEnabled,
+            bootstrapConfigMap: bootstrapComplete,
+            operatorConfig:     defaultOperatorConfig,
+            nodes:              threeNodesAtCurrentRevision,
+            expectError:        nil,
+        },
+        "delayed HA with insufficient nodes during steady state": {
+            namespace:          namespaceWithDelayedHAEnabled,
+            bootstrapConfigMap: bootstrapComplete,
+            operatorConfig:     defaultOperatorConfig,
+            nodes:              twoNodesAtCurrentRevision,
+            expectError:        fmt.Errorf("not enough nodes"),
+        },
+    }
+
+    for name, test := range tests {
+        t.Run(name, func(t *testing.T) {
+            namespaceIndexer := cache.NewIndexer(cache.MetaNamespaceKeyFunc, cache.Indexers{cache.NamespaceIndex: cache.MetaNamespaceIndexFunc})
+            if test.namespace != nil {
+                if err := namespaceIndexer.Add(test.namespace); err != nil {
+                    t.Fatal(err)
+                }
+            }
+            fakeNamespaceMapLister := corev1listers.NewNamespaceLister(namespaceIndexer)
+
+            configmapIndexer := cache.NewIndexer(cache.MetaNamespaceKeyFunc, cache.Indexers{cache.NamespaceIndex: cache.MetaNamespaceIndexFunc})
+            if test.bootstrapConfigMap != nil {
+                if err := configmapIndexer.Add(test.bootstrapConfigMap); err != nil {
+                    t.Fatal(err)
+                }
+            }
+            fakeConfigMapLister := corev1listers.NewConfigMapLister(configmapIndexer)
+
+            operatorStatus := &operatorv1.StaticPodOperatorStatus{
+                LatestAvailableRevision: 1,
+                NodeStatuses:            test.nodes,
+            }
+            fakeStaticPodClient := v1helpers.NewFakeStaticPodOperatorClient(&test.operatorConfig, operatorStatus, nil, nil)
+
+            actualErr := CheckSafeToScaleCluster(fakeConfigMapLister, fakeStaticPodClient, fakeNamespaceMapLister)
+
+            if test.expectError != nil && actualErr == nil {
+                t.Errorf("expected error=%v, got %v", test.expectError, actualErr)
+            }
+            if test.expectError == nil && actualErr != nil {
+                t.Errorf("expected error=%v, got %v", test.expectError, actualErr)
+            }
+        })
+    }
+}
diff --git a/pkg/operator/ceohelpers/unsupported_override.go b/pkg/operator/ceohelpers/unsupported_override.go
index 4f621dec04..d69868ee6d 100644
--- a/pkg/operator/ceohelpers/unsupported_override.go
+++ b/pkg/operator/ceohelpers/unsupported_override.go
@@ -11,10 +11,10 @@ import (
     "k8s.io/klog/v2"
 )

-// IsUnsupportedUnsafeEtcd returns true if
+// isUnsupportedUnsafeEtcd returns true if
 // useUnsupportedUnsafeNonHANonProductionUnstableEtcd key is set
 // to any parsable value
-func IsUnsupportedUnsafeEtcd(spec *operatorv1.StaticPodOperatorSpec) (bool, error) {
+func isUnsupportedUnsafeEtcd(spec *operatorv1.StaticPodOperatorSpec) (bool, error) {
     unsupportedConfig := map[string]interface{}{}
     if spec.UnsupportedConfigOverrides.Raw == nil {
         return false, nil
diff --git a/pkg/operator/ceohelpers/unsupported_override_test.go b/pkg/operator/ceohelpers/unsupported_override_test.go
index 50babb74a1..24aea9cfd4 100644
--- a/pkg/operator/ceohelpers/unsupported_override_test.go
+++ b/pkg/operator/ceohelpers/unsupported_override_test.go
@@ -104,7 +104,7 @@ func TestIsUnsupportedUnsafeEtcd(t *testing.T) {
     }
     for _, tt := range tests {
         t.Run(tt.name, func(t *testing.T) {
-            got, err := IsUnsupportedUnsafeEtcd(tt.args.spec)
+            got, err := isUnsupportedUnsafeEtcd(tt.args.spec)
             if (err != nil) != tt.wantErr {
                 t.Errorf("IsUnsupportedUnsafeEtcd() error = %v, wantErr %v", err, tt.wantErr)
                 return
diff --git a/pkg/operator/etcdendpointscontroller/etcdendpointscontroller.go b/pkg/operator/etcdendpointscontroller/etcdendpointscontroller.go
index f96ee40163..e2cbd1f226 100644
--- a/pkg/operator/etcdendpointscontroller/etcdendpointscontroller.go
+++ b/pkg/operator/etcdendpointscontroller/etcdendpointscontroller.go
@@ -19,9 +19,9 @@ import (
     "k8s.io/client-go/kubernetes"
     corev1client "k8s.io/client-go/kubernetes/typed/core/v1"
     corev1listers "k8s.io/client-go/listers/core/v1"
-    "k8s.io/klog/v2"

     "github.com/openshift/cluster-etcd-operator/pkg/etcdcli"
+    "github.com/openshift/cluster-etcd-operator/pkg/operator/ceohelpers"
     "github.com/openshift/cluster-etcd-operator/pkg/operator/operatorclient"
 )

@@ -87,7 +87,7 @@ func (c *EtcdEndpointsController) sync(ctx context.Context, syncCtx factory.Sync
 }

 func (c *EtcdEndpointsController) syncConfigMap(recorder events.Recorder) error {
-    bootstrapComplete, err := isBootstrapComplete(c.configmapLister, c.operatorClient)
+    bootstrapComplete, err := ceohelpers.IsBootstrapComplete(c.configmapLister, c.operatorClient)
     if err != nil {
         return fmt.Errorf("couldn't determine bootstrap status: %w", err)
     }
@@ -158,43 +158,3 @@ func configMapAsset() *corev1.ConfigMap {
         },
     }
 }
-
-// isBootstrapComplete returns true if bootstrap has completed. This is used to
-// indicate whether it's safe for clients to forget about the bootstrap member IP.
-func isBootstrapComplete(configMapClient corev1listers.ConfigMapLister, staticPodClient v1helpers.StaticPodOperatorClient) (bool, error) {
-    // do a cheap check to see if the annotation is already gone.
-    // check to see if bootstrapping is complete
-    bootstrapFinishedConfigMap, err := configMapClient.ConfigMaps("kube-system").Get("bootstrap")
-    if err != nil {
-        if errors.IsNotFound(err) {
-            // If the resource was deleted (e.g. by an admin) after bootstrap is actually complete,
-            // this is a false negative.
-            klog.V(4).Infof("bootstrap considered incomplete because the kube-system/bootstrap configmap wasn't found")
-            return false, nil
-        }
-        // We don't know, give up quickly.
-        return false, fmt.Errorf("failed to get configmap %s/%s: %w", "kube-system", "bootstrap", err)
-    }
-
-    if status, ok := bootstrapFinishedConfigMap.Data["status"]; !ok || status != "complete" {
-        // do nothing, not torn down
-        klog.V(4).Infof("bootstrap considered incomplete because status is %q", status)
-        return false, nil
-    }
-
-    // now run check to stability of revisions
-    _, status, _, err := staticPodClient.GetStaticPodOperatorState()
-    if err != nil {
-        return false, fmt.Errorf("failed to get static pod operator state: %w", err)
-    }
-    if status.LatestAvailableRevision == 0 {
-        return false, nil
-    }
-    for _, curr := range status.NodeStatuses {
-        if curr.CurrentRevision != status.LatestAvailableRevision {
-            klog.V(4).Infof("bootstrap considered incomplete because revision %d is still in progress", status.LatestAvailableRevision)
-            return false, nil
-        }
-    }
-    return true, nil
-}
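
Not part of the patch, but for reviewers: a minimal, illustrative sketch of how the new pieces fit together, built only from the helpers and test fixtures added above. When the /assets/assisted-install-bootstrap marker file exists, render stamps the openshift.io/delayed-ha-bootstrap annotation onto the openshift-etcd namespace; the operator then derives the scaling strategy from that annotation and relaxes the node minimum to 2 only while the kube-system/bootstrap configmap is not yet "complete". The main() wrapper and the inline fixtures are hypothetical; the fake clients are the same ones bootstrap_test.go uses.

// Illustrative sketch: delayed-HA strategy selection and the scaling check during bootstrap.
package main

import (
    "fmt"

    operatorv1 "github.com/openshift/api/operator/v1"
    "github.com/openshift/library-go/pkg/operator/v1helpers"
    corev1 "k8s.io/api/core/v1"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    corev1listers "k8s.io/client-go/listers/core/v1"
    "k8s.io/client-go/tools/cache"

    "github.com/openshift/cluster-etcd-operator/pkg/operator/ceohelpers"
    "github.com/openshift/cluster-etcd-operator/pkg/operator/operatorclient"
)

func main() {
    // The openshift-etcd namespace as rendered when the marker file was present,
    // i.e. carrying the delayed-HA bootstrap annotation.
    nsIndexer := cache.NewIndexer(cache.MetaNamespaceKeyFunc, cache.Indexers{cache.NamespaceIndex: cache.MetaNamespaceIndexFunc})
    _ = nsIndexer.Add(&corev1.Namespace{ObjectMeta: metav1.ObjectMeta{
        Name:        operatorclient.TargetNamespace,
        Annotations: map[string]string{ceohelpers.DelayedHABootstrapScalingStrategyAnnotation: ""},
    }})
    nsLister := corev1listers.NewNamespaceLister(nsIndexer)

    // kube-system/bootstrap still reports "progressing", so bootstrap is not complete.
    cmIndexer := cache.NewIndexer(cache.MetaNamespaceKeyFunc, cache.Indexers{cache.NamespaceIndex: cache.MetaNamespaceIndexFunc})
    _ = cmIndexer.Add(&corev1.ConfigMap{
        ObjectMeta: metav1.ObjectMeta{Name: "bootstrap", Namespace: "kube-system"},
        Data:       map[string]string{"status": "progressing"},
    })
    cmLister := corev1listers.NewConfigMapLister(cmIndexer)

    // Two nodes at the latest revision: enough for DelayedHAScalingStrategy during
    // bootstrap, but not enough for the default HAScalingStrategy.
    status := &operatorv1.StaticPodOperatorStatus{
        LatestAvailableRevision: 1,
        NodeStatuses: []operatorv1.NodeStatus{
            {NodeName: "node-1", CurrentRevision: 1},
            {NodeName: "node-2", CurrentRevision: 1},
        },
    }
    client := v1helpers.NewFakeStaticPodOperatorClient(&operatorv1.StaticPodOperatorSpec{}, status, nil, nil)

    strategy, err := ceohelpers.GetBootstrapScalingStrategy(client, nsLister)
    if err != nil {
        panic(err)
    }
    fmt.Println(strategy) // DelayedHAScalingStrategy

    // Prints <nil>: two nodes satisfy the delayed-HA minimum while bootstrap is in
    // progress; once the configmap reports "complete", three nodes are required again.
    fmt.Println(ceohelpers.CheckSafeToScaleCluster(cmLister, client, nsLister))
}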