Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
109 changes: 107 additions & 2 deletions pkg/etcdenvvar/envvarcontroller.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package etcdenvvar

import (
"context"
"fmt"
"reflect"
"sync"
Expand All @@ -9,11 +10,15 @@ import (
operatorv1 "github.com/openshift/api/operator/v1"
configv1informers "github.com/openshift/client-go/config/informers/externalversions/config/v1"
configv1listers "github.com/openshift/client-go/config/listers/config/v1"
"github.com/openshift/cluster-etcd-operator/pkg/operator/ceohelpers"
"github.com/openshift/cluster-etcd-operator/pkg/operator/operatorclient"
"github.com/openshift/library-go/pkg/operator/events"
"github.com/openshift/library-go/pkg/operator/v1helpers"
"k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/client-go/kubernetes"
corev1listers "k8s.io/client-go/listers/core/v1"
"k8s.io/client-go/tools/cache"
"k8s.io/client-go/util/workqueue"
Expand All @@ -24,6 +29,7 @@ const workQueueKey = "key"

type EnvVarController struct {
operatorClient v1helpers.StaticPodOperatorClient
kubeClient kubernetes.Interface

envVarMapLock sync.Mutex
envVarMap map[string]string
Expand All @@ -48,13 +54,15 @@ type Enqueueable interface {
func NewEnvVarController(
targetImagePullSpec string,
operatorClient v1helpers.StaticPodOperatorClient,
kubeClient kubernetes.Interface,
kubeInformersForNamespaces v1helpers.KubeInformersForNamespaces,
infrastructureInformer configv1informers.InfrastructureInformer,
networkInformer configv1informers.NetworkInformer,
eventRecorder events.Recorder,
) *EnvVarController {
c := &EnvVarController{
operatorClient: operatorClient,
kubeClient: kubeClient,
infrastructureLister: infrastructureInformer.Lister(),
networkLister: networkInformer.Lister(),
configmapLister: kubeInformersForNamespaces.InformersFor(operatorclient.TargetNamespace).Core().V1().ConfigMaps().Lister(),
Expand Down Expand Up @@ -128,15 +136,72 @@ func (c *EnvVarController) checkEnvVars() error {
return err
}

currEnvVarMap, err := getEtcdEnvVars(envVarContext{
ctx := envVarContext{
targetImagePullSpec: c.targetImagePullSpec,
spec: *operatorSpec,
status: *operatorStatus,
configmapLister: c.configmapLister,
nodeLister: c.nodeLister,
infrastructureLister: c.infrastructureLister,
networkLister: c.networkLister,
})
}

bootstrapComplete, err := isBootstrapComplete(c.configmapLister, c.operatorClient)
if err != nil {
return fmt.Errorf("couldn't determine bootstrap status: %w", err)
}
isUnsupportedUnsafeEtcd, err := ceohelpers.IsUnsupportedUnsafeEtcd(&ctx.spec)
if err != nil {
return fmt.Errorf("couldn't determine etcd unsupported override status: %w", err)
}
// TODO: determine this through a state check
isManagedByAssistedInstaller := true
nodeCount := len(ctx.status.NodeStatuses)

// check for valid PDB, while we do internal health checks aginst etcd directly,
// PDB has the data we need without an out of band query to etcd.
// see: https://github.com/etcd-io/etcd/blob/master/Documentation/faq.md#what-is-failure-tolerance
var isEtcdFailureTolerant bool
pdb, err := c.kubeClient.PolicyV1beta1().PodDisruptionBudgets(operatorclient.TargetNamespace).Get(context.Background(), "etcd-quorum-guard", metav1.GetOptions{})
if err != nil {
return err
}
currentHealthyEtcds := pdb.Status.CurrentHealthy
if currentHealthyEtcds > 2 {
isEtcdFailureTolerant = true
}

// There are three discrete validation paths to enforce HA invariants
// depending on whether the unsupported/unsafe config is set, whether the
// cluster is managed by assisted installer, and the default configuration.
switch {
case isUnsupportedUnsafeEtcd:
// TODO: this is to help make sure assisted installer can run in a supported config.
// TODO: can remove this after integration testing.
if isManagedByAssistedInstaller {
return fmt.Errorf("assisted installer is running with an unsupported etcd configuration")
}
// When running in an unsupported/unsafe configuration, no guarantees are
// made once a single node is available.
if nodeCount < 1 {
return fmt.Errorf("at least one node is required to have a valid configuration")
}
case isManagedByAssistedInstaller:
// When managed by assisted installer, tolerate unsafe conditions only up
// until bootstrap is complete, and then enforce as in the supported case.
// Only if etcd if failure tolerant is it safe for a new static pod revision.
if !isEtcdFailureTolerant && bootstrapComplete {
return fmt.Errorf("at least three healthy etcd members are required to have a valid configuration (have %d)", currentHealthyEtcds)
}
default:
// When running in a normal supported configuration, enforce HA invariants
// at all times.
if nodeCount < 3 {
return fmt.Errorf("at least three nodes are required to have a valid configuration (have %d)", nodeCount)
}
}

currEnvVarMap, err := getEtcdEnvVars(ctx)
if err != nil {
return err
}
Expand Down Expand Up @@ -208,3 +273,43 @@ func (c *EnvVarController) eventHandler() cache.ResourceEventHandler {
DeleteFunc: func(obj interface{}) { c.queue.Add(workQueueKey) },
}
}

// isBootstrapComplete returns true if bootstrap has completed.
// TODO: consider sharing with etcdendpointscontroller.
func isBootstrapComplete(configMapClient corev1listers.ConfigMapLister, staticPodClient v1helpers.StaticPodOperatorClient) (bool, error) {
// do a cheap check to see if the annotation is already gone.
// check to see if bootstrapping is complete
bootstrapFinishedConfigMap, err := configMapClient.ConfigMaps("kube-system").Get("bootstrap")
if err != nil {
if errors.IsNotFound(err) {
// If the resource was deleted (e.g. by an admin) after bootstrap is actually complete,
// this is a false negative.
klog.V(4).Infof("bootstrap considered incomplete because the kube-system/bootstrap configmap wasn't found")
return false, nil
}
// We don't know, give up quickly.
return false, fmt.Errorf("failed to get configmap %s/%s: %w", "kube-system", "bootstrap", err)
}

if status, ok := bootstrapFinishedConfigMap.Data["status"]; !ok || status != "complete" {
// do nothing, not torn down
klog.V(4).Infof("bootstrap considered incomplete because status is %q", status)
return false, nil
}

// now run check to stability of revisions
_, status, _, err := staticPodClient.GetStaticPodOperatorState()
if err != nil {
return false, fmt.Errorf("failed to get static pod operator state: %w", err)
}
if status.LatestAvailableRevision == 0 {
return false, nil
}
for _, curr := range status.NodeStatuses {
if curr.CurrentRevision != status.LatestAvailableRevision {
klog.V(4).Infof("bootstrap considered incomplete because revision %d is still in progress", status.LatestAvailableRevision)
return false, nil
}
}
return true, nil
}
15 changes: 0 additions & 15 deletions pkg/etcdenvvar/etcd_env.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@ import (
"strings"

"github.com/openshift/cluster-etcd-operator/pkg/dnshelpers"
"github.com/openshift/cluster-etcd-operator/pkg/operator/ceohelpers"
"github.com/openshift/cluster-etcd-operator/pkg/operator/operatorclient"

operatorv1 "github.com/openshift/api/operator/v1"
Expand Down Expand Up @@ -59,20 +58,6 @@ var envVarFns = []envVarFunc{
// NODE_%s_ETCD_URL_HOST
// NODE_%s_ETCD_NAME
func getEtcdEnvVars(envVarContext envVarContext) (map[string]string, error) {
// TODO once we are past bootstrapping, this restriction shouldn't be needed anymore.
// we have it because the env vars were not getting set in the pod and the static pod operator started
// rolling out to another node, which caused a failure.
isUnsupportedUnsafeEtcd, err := ceohelpers.IsUnsupportedUnsafeEtcd(&envVarContext.spec)
if err != nil {
return nil, err
}
switch {
case isUnsupportedUnsafeEtcd && len(envVarContext.status.NodeStatuses) < 1:
return nil, fmt.Errorf("at least one node is required to have a valid configuration")
case !isUnsupportedUnsafeEtcd && len(envVarContext.status.NodeStatuses) < 3:
return nil, fmt.Errorf("at least three nodes are required to have a valid configuration")
}

ret := map[string]string{}

for _, envVarFn := range envVarFns {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,12 @@ func (c *BootstrapTeardownController) canRemoveEtcdBootstrap() (bool, bool, erro
return false, hasBootstrap, err
}

// TODO: determine this through a state check
isManagedByAssistedInstaller := true
if isManagedByAssistedInstaller {
isUnsupportedUnsafeEtcd = true
}

switch {
case !isUnsupportedUnsafeEtcd && len(members) < 4:
// bootstrap is not safe to remove until we scale to 4
Expand Down
1 change: 1 addition & 0 deletions pkg/operator/starter.go
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,7 @@ func RunOperator(ctx context.Context, controllerContext *controllercmd.Controlle
envVarController := etcdenvvar.NewEnvVarController(
os.Getenv("IMAGE"),
operatorClient,
kubeClient,
kubeInformersForNamespaces,
configInformers.Config().V1().Infrastructures(),
configInformers.Config().V1().Networks(),
Expand Down