Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions bindata/bootkube/manifests/00_openshift-etcd-ns.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@ kind: Namespace
metadata:
annotations:
openshift.io/node-selector: ""
{{- range $key, $val := .NamespaceAnnotations }}
{{$key}}: "{{$val}}"
{{- end}}
name: openshift-etcd
labels:
openshift.io/run-level: "0"
16 changes: 16 additions & 0 deletions pkg/cmd/render/render.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ import (
"strings"

"github.com/openshift/cluster-etcd-operator/pkg/dnshelpers"
"github.com/openshift/cluster-etcd-operator/pkg/operator/ceohelpers"
"github.com/openshift/cluster-etcd-operator/pkg/tlshelpers"

"github.com/ghodss/yaml"
Expand Down Expand Up @@ -42,6 +43,8 @@ type renderOpts struct {
clusterConfigMapFile string
infraConfigFile string
bootstrapIP string

delayedHABootstrapScalingStrategyMarker string
}

// NewRenderCommand creates a render command.
Expand Down Expand Up @@ -132,6 +135,7 @@ func (r *renderOpts) AddFlags(fs *pflag.FlagSet) {
fs.StringVar(&r.clusterConfigMapFile, "cluster-configmap-file", "/assets/manifests/cluster-config.yaml", "File containing the cluster-config-v1 configmap.")
fs.StringVar(&r.infraConfigFile, "infra-config-file", "/assets/manifests/cluster-infrastructure-02-config.yml", "File containing infrastructure.config.openshift.io manifest.")
fs.StringVar(&r.bootstrapIP, "bootstrap-ip", r.bootstrapIP, "bootstrap IP used to indicate where to find the first etcd endpoint")
fs.StringVar(&r.delayedHABootstrapScalingStrategyMarker, "delayed-ha-bootstrap-scaling-marker-file", "/assets/assisted-install-bootstrap", "Marker file that, if present, enables the delayed HA bootstrap scaling strategy")
}

// Validate verifies the inputs.
Expand Down Expand Up @@ -198,6 +202,9 @@ type TemplateData struct {

// ComputedEnvVars name/value pairs to populate env: for static pod.
ComputedEnvVars string

// NamespaceAnnotations are additional annotations to apply to the etcd namespace.
NamespaceAnnotations map[string]string
}

type StaticFile struct {
Expand Down Expand Up @@ -279,6 +286,15 @@ func newTemplateData(opts *renderOpts) (*TemplateData, error) {
return nil, err
}

// Use a marker file to configure the bootstrap scaling strategy.
if _, err := os.Stat(opts.delayedHABootstrapScalingStrategyMarker); err == nil {
if templateData.NamespaceAnnotations == nil {
templateData.NamespaceAnnotations = map[string]string{}
}
templateData.NamespaceAnnotations[ceohelpers.DelayedHABootstrapScalingStrategyAnnotation] = ""
klog.Infof("using delayed HA bootstrap scaling strategy due to presence of marker file %s", opts.delayedHABootstrapScalingStrategyMarker)
}

return &templateData, nil
}

Expand Down
10 changes: 9 additions & 1 deletion pkg/etcdenvvar/envvarcontroller.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@ import (
operatorv1 "github.com/openshift/api/operator/v1"
configv1informers "github.com/openshift/client-go/config/informers/externalversions/config/v1"
configv1listers "github.com/openshift/client-go/config/listers/config/v1"
"github.com/openshift/cluster-etcd-operator/pkg/operator/operatorclient"
"github.com/openshift/library-go/pkg/operator/events"
"github.com/openshift/library-go/pkg/operator/v1helpers"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
Expand All @@ -18,6 +17,9 @@ import (
"k8s.io/client-go/tools/cache"
"k8s.io/client-go/util/workqueue"
"k8s.io/klog/v2"

"github.com/openshift/cluster-etcd-operator/pkg/operator/ceohelpers"
"github.com/openshift/cluster-etcd-operator/pkg/operator/operatorclient"
)

const workQueueKey = "key"
Expand All @@ -34,6 +36,7 @@ type EnvVarController struct {
networkLister configv1listers.NetworkLister
configmapLister corev1listers.ConfigMapLister
nodeLister corev1listers.NodeLister
namespaceLister corev1listers.NamespaceLister

// queue only ever has one item, but it has nice error handling backoff/retry semantics
queue workqueue.RateLimitingInterface
Expand All @@ -57,6 +60,7 @@ func NewEnvVarController(
operatorClient: operatorClient,
infrastructureLister: infrastructureInformer.Lister(),
networkLister: networkInformer.Lister(),
namespaceLister: kubeInformersForNamespaces.InformersFor("").Core().V1().Namespaces().Lister(),
configmapLister: kubeInformersForNamespaces.InformersFor(operatorclient.TargetNamespace).Core().V1().ConfigMaps().Lister(),
nodeLister: kubeInformersForNamespaces.InformersFor("").Core().V1().Nodes().Lister(),
targetImagePullSpec: targetImagePullSpec,
Expand Down Expand Up @@ -123,6 +127,10 @@ func (c *EnvVarController) sync() error {
}

func (c *EnvVarController) checkEnvVars() error {
if err := ceohelpers.CheckSafeToScaleCluster(c.configmapLister, c.operatorClient, c.namespaceLister); err != nil {
return fmt.Errorf("can't update etcd pod configurations because scaling is currently unsafe: %w", err)
}

operatorSpec, operatorStatus, _, err := c.operatorClient.GetStaticPodOperatorState()
if err != nil {
return err
Expand Down
15 changes: 0 additions & 15 deletions pkg/etcdenvvar/etcd_env.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@ import (
"strings"

"github.com/openshift/cluster-etcd-operator/pkg/dnshelpers"
"github.com/openshift/cluster-etcd-operator/pkg/operator/ceohelpers"
"github.com/openshift/cluster-etcd-operator/pkg/operator/operatorclient"

operatorv1 "github.com/openshift/api/operator/v1"
Expand Down Expand Up @@ -59,20 +58,6 @@ var envVarFns = []envVarFunc{
// NODE_%s_ETCD_URL_HOST
// NODE_%s_ETCD_NAME
func getEtcdEnvVars(envVarContext envVarContext) (map[string]string, error) {
// TODO once we are past bootstrapping, this restriction shouldn't be needed anymore.
// we have it because the env vars were not getting set in the pod and the static pod operator started
// rolling out to another node, which caused a failure.
isUnsupportedUnsafeEtcd, err := ceohelpers.IsUnsupportedUnsafeEtcd(&envVarContext.spec)
if err != nil {
return nil, err
}
switch {
case isUnsupportedUnsafeEtcd && len(envVarContext.status.NodeStatuses) < 1:
return nil, fmt.Errorf("at least one node is required to have a valid configuration")
case !isUnsupportedUnsafeEtcd && len(envVarContext.status.NodeStatuses) < 3:
return nil, fmt.Errorf("at least three nodes are required to have a valid configuration")
}

ret := map[string]string{}

for _, envVarFn := range envVarFns {
Expand Down
36 changes: 16 additions & 20 deletions pkg/operator/bootstrapteardown/bootstrap_teardown_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package bootstrapteardown

import (
"context"
"fmt"
"time"

apierrors "k8s.io/apimachinery/pkg/api/errors"
Expand All @@ -20,6 +21,7 @@ type BootstrapTeardownController struct {
operatorClient v1helpers.StaticPodOperatorClient
etcdClient etcdcli.EtcdClient
configmapLister corev1listers.ConfigMapLister
namespaceLister corev1listers.NamespaceLister
}

func NewBootstrapTeardownController(
Expand All @@ -32,6 +34,7 @@ func NewBootstrapTeardownController(
operatorClient: operatorClient,
etcdClient: etcdClient,
configmapLister: kubeInformersForNamespaces.InformersFor("kube-system").Core().V1().ConfigMaps().Lister(),
namespaceLister: kubeInformersForNamespaces.InformersFor("").Core().V1().Namespaces().Lister(),
}

return factory.New().ResyncEvery(time.Minute).WithInformers(
Expand Down Expand Up @@ -147,23 +150,24 @@ func (c *BootstrapTeardownController) canRemoveEtcdBootstrap() (bool, bool, erro
return false, hasBootstrap, nil
}

isUnsupportedUnsafeEtcd, err := c.isUnsupportedUnsafeEtcd()
scalingStrategy, err := ceohelpers.GetBootstrapScalingStrategy(c.operatorClient, c.namespaceLister)
if err != nil {
return false, hasBootstrap, err
return false, hasBootstrap, fmt.Errorf("failed to get bootstrap scaling strategy: %w", err)
}

switch {
case !isUnsupportedUnsafeEtcd && len(members) < 4:
// bootstrap is not safe to remove until we scale to 4
return false, hasBootstrap, nil
case isUnsupportedUnsafeEtcd && len(members) < 2:
// if etcd is unsupported, bootstrap is not safe to remove
// until we scale to 2
return false, hasBootstrap, nil
default:
// do nothing fall through on checking the unhealthy members
// First, enforce the main HA invariants in terms of member counts.
switch scalingStrategy {
case ceohelpers.HAScalingStrategy:
if len(members) < 4 {
return false, hasBootstrap, nil
}
case ceohelpers.DelayedHAScalingStrategy, ceohelpers.UnsafeScalingStrategy:
if len(members) < 2 {
return false, hasBootstrap, nil
}
}

// Next, given member counts are satisfied, check member health.
unhealthyMembers, err := c.etcdClient.UnhealthyMembers()
if err != nil {
return false, hasBootstrap, nil
Expand All @@ -182,11 +186,3 @@ func (c *BootstrapTeardownController) canRemoveEtcdBootstrap() (bool, bool, erro
return false, hasBootstrap, nil
}
}

func (c *BootstrapTeardownController) isUnsupportedUnsafeEtcd() (bool, error) {
spec, _, _, err := c.operatorClient.GetStaticPodOperatorState()
if err != nil {
return false, err
}
return ceohelpers.IsUnsupportedUnsafeEtcd(spec)
}
164 changes: 164 additions & 0 deletions pkg/operator/ceohelpers/bootstrap.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
package ceohelpers

import (
"fmt"

"github.com/openshift/library-go/pkg/operator/v1helpers"
"k8s.io/apimachinery/pkg/api/errors"
corev1listers "k8s.io/client-go/listers/core/v1"
"k8s.io/klog/v2"

"github.com/openshift/cluster-etcd-operator/pkg/operator/operatorclient"
)

// BootstrapScalingStrategy describes the invariants which will be enforced when
// scaling the etcd cluster.
type BootstrapScalingStrategy string

const (
	// HAScalingStrategy means the etcd cluster will only be scaled up when at least
	// 3 nodes are available so that HA is enforced at all times. This rule applies
	// during bootstrapping and the steady state.
	//
	// This is the default strategy.
	HAScalingStrategy BootstrapScalingStrategy = "HAScalingStrategy"

	// DelayedHAScalingStrategy means that during bootstrapping, the etcd cluster will
	// be allowed to scale when at least 2 members are available (which is not HA),
	// but after bootstrapping any further scaling will require 3 nodes in the same
	// way as HAScalingStrategy.
	//
	// This strategy is selected by adding the `openshift.io/delayed-ha-bootstrap`
	// annotation to the openshift-etcd namespace.
	DelayedHAScalingStrategy BootstrapScalingStrategy = "DelayedHAScalingStrategy"

	// UnsafeScalingStrategy means scaling will occur without regards to nodes and
	// any effect on quorum. Use of this strategy isn't officially tested or supported,
	// but is made available for ad-hoc use.
	//
	// This strategy is selected by setting unsupportedConfigOverrides on the
	// operator config.
	UnsafeScalingStrategy BootstrapScalingStrategy = "UnsafeScalingStrategy"
)

const (
	// DelayedHABootstrapScalingStrategyAnnotation is an annotation on the openshift-etcd
	// namespace which if present indicates the DelayedHAScalingStrategy strategy
	// should be used.
	DelayedHABootstrapScalingStrategyAnnotation = "openshift.io/delayed-ha-bootstrap"
)

// GetBootstrapScalingStrategy determines the scaling strategy to use.
//
// The strategy is derived from two inputs, in priority order:
//
//  1. UnsafeScalingStrategy if the operator spec carries the unsupported unsafe
//     etcd override.
//  2. DelayedHAScalingStrategy if the openshift-etcd namespace carries the
//     DelayedHABootstrapScalingStrategyAnnotation.
//  3. HAScalingStrategy otherwise (the default).
//
// On error the returned strategy is the zero value and must not be used; no
// default strategy is assumed on behalf of the caller.
func GetBootstrapScalingStrategy(staticPodClient v1helpers.StaticPodOperatorClient, namespaceLister corev1listers.NamespaceLister) (BootstrapScalingStrategy, error) {
	var strategy BootstrapScalingStrategy

	operatorSpec, _, _, err := staticPodClient.GetStaticPodOperatorState()
	if err != nil {
		return strategy, fmt.Errorf("failed to get operator state: %w", err)
	}

	isUnsupportedUnsafeEtcd, err := isUnsupportedUnsafeEtcd(operatorSpec)
	if err != nil {
		// The original message claimed a default HA strategy would be assumed,
		// but this function propagates the error instead — callers treat it as
		// a hard failure, so the message must not suggest a fallback occurred.
		return strategy, fmt.Errorf("couldn't determine etcd unsupported override status: %w", err)
	}

	etcdNamespace, err := namespaceLister.Get(operatorclient.TargetNamespace)
	if err != nil {
		return strategy, fmt.Errorf("failed to get %s namespace: %w", operatorclient.TargetNamespace, err)
	}
	_, hasDelayedHAAnnotation := etcdNamespace.Annotations[DelayedHABootstrapScalingStrategyAnnotation]

	switch {
	case isUnsupportedUnsafeEtcd:
		strategy = UnsafeScalingStrategy
	case hasDelayedHAAnnotation:
		strategy = DelayedHAScalingStrategy
	default:
		strategy = HAScalingStrategy
	}
	return strategy, nil
}

// CheckSafeToScaleCluster is used to implement the bootstrap scaling strategy invariants.
// This function returns nil if cluster conditions are such that it's safe to scale
// the etcd cluster based on the scaling strategy in use, and otherwise will return
// an error explaining why it's unsafe to scale.
func CheckSafeToScaleCluster(configmapLister corev1listers.ConfigMapLister, staticPodClient v1helpers.StaticPodOperatorClient, namespaceLister corev1listers.NamespaceLister) error {
	bootstrapDone, err := IsBootstrapComplete(configmapLister, staticPodClient)
	if err != nil {
		return fmt.Errorf("failed to determine bootstrap status: %w", err)
	}

	_, operatorStatus, _, err := staticPodClient.GetStaticPodOperatorState()
	if err != nil {
		return fmt.Errorf("failed to get operator state: %w", err)
	}

	strategy, err := GetBootstrapScalingStrategy(staticPodClient, namespaceLister)
	if err != nil {
		return fmt.Errorf("failed to get bootstrap scaling strategy: %w", err)
	}

	// Translate the strategy in use into the minimum node count it demands.
	var required int
	switch strategy {
	case HAScalingStrategy:
		required = 3
	case UnsafeScalingStrategy:
		required = 1
	case DelayedHAScalingStrategy:
		// Delayed HA relaxes the requirement only while bootstrap is still running.
		required = 2
		if bootstrapDone {
			required = 3
		}
	default:
		return fmt.Errorf("unrecognized scaling strategy %q", strategy)
	}

	available := len(operatorStatus.NodeStatuses)
	if available < required {
		return fmt.Errorf("%d nodes are required, but only %d are available", required, available)
	}

	klog.V(4).Infof("node count %d satisfies minimum of %d required by the %s bootstrap scaling strategy", available, required, strategy)
	return nil
}

// IsBootstrapComplete returns true if bootstrap has completed.
func IsBootstrapComplete(configMapClient corev1listers.ConfigMapLister, staticPodClient v1helpers.StaticPodOperatorClient) (bool, error) {
	// Cheap check first: the kube-system/bootstrap configmap records progress.
	cm, err := configMapClient.ConfigMaps("kube-system").Get("bootstrap")
	switch {
	case errors.IsNotFound(err):
		// If the resource was deleted (e.g. by an admin) after bootstrap is
		// actually complete, this is a false negative.
		klog.V(4).Infof("bootstrap considered incomplete because the kube-system/bootstrap configmap wasn't found")
		return false, nil
	case err != nil:
		// We don't know, give up quickly.
		return false, fmt.Errorf("failed to get configmap %s/%s: %w", "kube-system", "bootstrap", err)
	}

	if value, ok := cm.Data["status"]; !ok || value != "complete" {
		// Bootstrap has not been torn down yet.
		klog.V(4).Infof("bootstrap considered incomplete because status is %q", value)
		return false, nil
	}

	// Bootstrap is only considered done once every node has converged on the
	// latest available revision.
	_, operatorStatus, _, err := staticPodClient.GetStaticPodOperatorState()
	if err != nil {
		return false, fmt.Errorf("failed to get static pod operator state: %w", err)
	}
	latest := operatorStatus.LatestAvailableRevision
	if latest == 0 {
		return false, nil
	}
	for i := range operatorStatus.NodeStatuses {
		if operatorStatus.NodeStatuses[i].CurrentRevision != latest {
			klog.V(4).Infof("bootstrap considered incomplete because revision %d is still in progress", latest)
			return false, nil
		}
	}
	return true, nil
}
Loading