diff --git a/pkg/etcdenvvar/envvarcontroller.go b/pkg/etcdenvvar/envvarcontroller.go index 3a212a6d49..8191632a91 100644 --- a/pkg/etcdenvvar/envvarcontroller.go +++ b/pkg/etcdenvvar/envvarcontroller.go @@ -1,6 +1,7 @@ package etcdenvvar import ( + "context" "fmt" "reflect" "sync" @@ -9,11 +10,15 @@ import ( operatorv1 "github.com/openshift/api/operator/v1" configv1informers "github.com/openshift/client-go/config/informers/externalversions/config/v1" configv1listers "github.com/openshift/client-go/config/listers/config/v1" + "github.com/openshift/cluster-etcd-operator/pkg/operator/ceohelpers" "github.com/openshift/cluster-etcd-operator/pkg/operator/operatorclient" "github.com/openshift/library-go/pkg/operator/events" "github.com/openshift/library-go/pkg/operator/v1helpers" + "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" utilruntime "k8s.io/apimachinery/pkg/util/runtime" "k8s.io/apimachinery/pkg/util/wait" + "k8s.io/client-go/kubernetes" corev1listers "k8s.io/client-go/listers/core/v1" "k8s.io/client-go/tools/cache" "k8s.io/client-go/util/workqueue" @@ -24,6 +29,7 @@ const workQueueKey = "key" type EnvVarController struct { operatorClient v1helpers.StaticPodOperatorClient + kubeClient kubernetes.Interface envVarMapLock sync.Mutex envVarMap map[string]string @@ -48,6 +54,7 @@ type Enqueueable interface { func NewEnvVarController( targetImagePullSpec string, operatorClient v1helpers.StaticPodOperatorClient, + kubeClient kubernetes.Interface, kubeInformersForNamespaces v1helpers.KubeInformersForNamespaces, infrastructureInformer configv1informers.InfrastructureInformer, networkInformer configv1informers.NetworkInformer, @@ -55,6 +62,7 @@ func NewEnvVarController( ) *EnvVarController { c := &EnvVarController{ operatorClient: operatorClient, + kubeClient: kubeClient, infrastructureLister: infrastructureInformer.Lister(), networkLister: networkInformer.Lister(), configmapLister: 
kubeInformersForNamespaces.InformersFor(operatorclient.TargetNamespace).Core().V1().ConfigMaps().Lister(), @@ -128,7 +136,7 @@ func (c *EnvVarController) checkEnvVars() error { return err } - currEnvVarMap, err := getEtcdEnvVars(envVarContext{ + ctx := envVarContext{ targetImagePullSpec: c.targetImagePullSpec, spec: *operatorSpec, status: *operatorStatus, @@ -136,7 +144,64 @@ func (c *EnvVarController) checkEnvVars() error { nodeLister: c.nodeLister, infrastructureLister: c.infrastructureLister, networkLister: c.networkLister, - }) + } + + bootstrapComplete, err := isBootstrapComplete(c.configmapLister, c.operatorClient) + if err != nil { + return fmt.Errorf("couldn't determine bootstrap status: %w", err) + } + isUnsupportedUnsafeEtcd, err := ceohelpers.IsUnsupportedUnsafeEtcd(&ctx.spec) + if err != nil { + return fmt.Errorf("couldn't determine etcd unsupported override status: %w", err) + } + // TODO: determine this through a state check + isManagedByAssistedInstaller := true + nodeCount := len(ctx.status.NodeStatuses) + + // check for valid PDB, while we do internal health checks against etcd directly, + // PDB has the data we need without an out of band query to etcd. + // see: https://github.com/etcd-io/etcd/blob/master/Documentation/faq.md#what-is-failure-tolerance + var isEtcdFailureTolerant bool + pdb, err := c.kubeClient.PolicyV1beta1().PodDisruptionBudgets(operatorclient.TargetNamespace).Get(context.Background(), "etcd-quorum-guard", metav1.GetOptions{}) + if err != nil { + return err + } + currentHealthyEtcds := pdb.Status.CurrentHealthy + if currentHealthyEtcds > 2 { + isEtcdFailureTolerant = true + } + + // There are three discrete validation paths to enforce HA invariants + // depending on whether the unsupported/unsafe config is set, whether the + // cluster is managed by assisted installer, and the default configuration. + switch { + case isUnsupportedUnsafeEtcd: + // TODO: this is to help make sure assisted installer can run in a supported config. 
+ // TODO: can remove this after integration testing. + if isManagedByAssistedInstaller { + return fmt.Errorf("assisted installer is running with an unsupported etcd configuration") + } + // When running in an unsupported/unsafe configuration, no guarantees are + // made once a single node is available. + if nodeCount < 1 { + return fmt.Errorf("at least one node is required to have a valid configuration") + } + case isManagedByAssistedInstaller: + // When managed by assisted installer, tolerate unsafe conditions only up + // until bootstrap is complete, and then enforce as in the supported case. + // Only if etcd is failure tolerant is it safe for a new static pod revision. + if !isEtcdFailureTolerant && bootstrapComplete { + return fmt.Errorf("at least three healthy etcd members are required to have a valid configuration (have %d)", currentHealthyEtcds) + } + default: + // When running in a normal supported configuration, enforce HA invariants + // at all times. + if nodeCount < 3 { + return fmt.Errorf("at least three nodes are required to have a valid configuration (have %d)", nodeCount) + } + } + + currEnvVarMap, err := getEtcdEnvVars(ctx) if err != nil { return err } @@ -208,3 +273,43 @@ func (c *EnvVarController) eventHandler() cache.ResourceEventHandler { DeleteFunc: func(obj interface{}) { c.queue.Add(workQueueKey) }, } } + +// isBootstrapComplete returns true if bootstrap has completed. +// TODO: consider sharing with etcdendpointscontroller. +func isBootstrapComplete(configMapClient corev1listers.ConfigMapLister, staticPodClient v1helpers.StaticPodOperatorClient) (bool, error) { + // do a cheap check to see if the annotation is already gone. + // check to see if bootstrapping is complete + bootstrapFinishedConfigMap, err := configMapClient.ConfigMaps("kube-system").Get("bootstrap") + if err != nil { + if errors.IsNotFound(err) { + // If the resource was deleted (e.g. by an admin) after bootstrap is actually complete, + // this is a false negative. 
+ klog.V(4).Infof("bootstrap considered incomplete because the kube-system/bootstrap configmap wasn't found") + return false, nil + } + // We don't know, give up quickly. + return false, fmt.Errorf("failed to get configmap %s/%s: %w", "kube-system", "bootstrap", err) + } + + if status, ok := bootstrapFinishedConfigMap.Data["status"]; !ok || status != "complete" { + // do nothing, not torn down + klog.V(4).Infof("bootstrap considered incomplete because status is %q", status) + return false, nil + } + + // now check the stability of revisions + _, status, _, err := staticPodClient.GetStaticPodOperatorState() + if err != nil { + return false, fmt.Errorf("failed to get static pod operator state: %w", err) + } + if status.LatestAvailableRevision == 0 { + return false, nil + } + for _, curr := range status.NodeStatuses { + if curr.CurrentRevision != status.LatestAvailableRevision { + klog.V(4).Infof("bootstrap considered incomplete because revision %d is still in progress", status.LatestAvailableRevision) + return false, nil + } + } + return true, nil +} diff --git a/pkg/etcdenvvar/etcd_env.go b/pkg/etcdenvvar/etcd_env.go index 8b63b289cd..947213a411 100644 --- a/pkg/etcdenvvar/etcd_env.go +++ b/pkg/etcdenvvar/etcd_env.go @@ -6,7 +6,6 @@ import ( "strings" "github.com/openshift/cluster-etcd-operator/pkg/dnshelpers" - "github.com/openshift/cluster-etcd-operator/pkg/operator/ceohelpers" "github.com/openshift/cluster-etcd-operator/pkg/operator/operatorclient" operatorv1 "github.com/openshift/api/operator/v1" @@ -59,20 +58,6 @@ var envVarFns = []envVarFunc{ // NODE_%s_ETCD_URL_HOST // NODE_%s_ETCD_NAME func getEtcdEnvVars(envVarContext envVarContext) (map[string]string, error) { - // TODO once we are past bootstrapping, this restriction shouldn't be needed anymore. - // we have it because the env vars were not getting set in the pod and the static pod operator started - // rolling out to another node, which caused a failure. 
- isUnsupportedUnsafeEtcd, err := ceohelpers.IsUnsupportedUnsafeEtcd(&envVarContext.spec) - if err != nil { - return nil, err - } - switch { - case isUnsupportedUnsafeEtcd && len(envVarContext.status.NodeStatuses) < 1: - return nil, fmt.Errorf("at least one node is required to have a valid configuration") - case !isUnsupportedUnsafeEtcd && len(envVarContext.status.NodeStatuses) < 3: - return nil, fmt.Errorf("at least three nodes are required to have a valid configuration") - } - ret := map[string]string{} for _, envVarFn := range envVarFns { diff --git a/pkg/operator/bootstrapteardown/bootstrap_teardown_controller.go b/pkg/operator/bootstrapteardown/bootstrap_teardown_controller.go index a8769e2e98..ec1b7d9894 100644 --- a/pkg/operator/bootstrapteardown/bootstrap_teardown_controller.go +++ b/pkg/operator/bootstrapteardown/bootstrap_teardown_controller.go @@ -152,6 +152,12 @@ func (c *BootstrapTeardownController) canRemoveEtcdBootstrap() (bool, bool, erro return false, hasBootstrap, err } + // TODO: determine this through a state check + isManagedByAssistedInstaller := true + if isManagedByAssistedInstaller { + isUnsupportedUnsafeEtcd = true + } + switch { case !isUnsupportedUnsafeEtcd && len(members) < 4: // bootstrap is not safe to remove until we scale to 4 diff --git a/pkg/operator/starter.go b/pkg/operator/starter.go index 0beb2d9487..3e31f400dc 100644 --- a/pkg/operator/starter.go +++ b/pkg/operator/starter.go @@ -121,6 +121,7 @@ func RunOperator(ctx context.Context, controllerContext *controllercmd.Controlle envVarController := etcdenvvar.NewEnvVarController( os.Getenv("IMAGE"), operatorClient, + kubeClient, kubeInformersForNamespaces, configInformers.Config().V1().Infrastructures(), configInformers.Config().V1().Networks(),