diff --git a/pkg/controller/clusterdeployment/clusterinstalls.go b/pkg/controller/clusterdeployment/clusterinstalls.go
index cc6d6ab2ecc..dba5cf1f2f8 100644
--- a/pkg/controller/clusterdeployment/clusterinstalls.go
+++ b/pkg/controller/clusterdeployment/clusterinstalls.go
@@ -140,6 +140,9 @@ func (r *ReconcileClusterDeployment) reconcileExistingInstallingClusterInstall(c
 	}
 
 	updated = false
+	// Fun extra variable to keep track of whether we should increment metricProvisionFailedTerminal
+	// later: we only want to do that if we change that status and the status update succeeds.
+	provisionFailedTerminal := false
 	conditions, updated = controllerutils.SetClusterDeploymentConditionWithChangeCheck(conditions,
 		hivev1.ProvisionStoppedCondition,
 		stopped.Status,
@@ -149,6 +152,7 @@ func (r *ReconcileClusterDeployment) reconcileExistingInstallingClusterInstall(c
 	)
 	if updated {
 		statusModified = true
+		provisionFailedTerminal = true
 	}
 
 	completed = controllerutils.FindClusterDeploymentCondition(conditions, hivev1.ClusterInstallCompletedClusterDeploymentCondition)
@@ -196,6 +200,10 @@ func (r *ReconcileClusterDeployment) reconcileExistingInstallingClusterInstall(c
 			logger.WithError(err).Error("failed to update the status of clusterdeployment")
 			return reconcile.Result{}, err
 		}
+		// If we declared the provision terminally failed, bump our metric
+		if provisionFailedTerminal {
+			incProvisionFailedTerminal(cd)
+		}
 	}
 
 	return reconcile.Result{}, nil
diff --git a/pkg/controller/clusterdeployment/clusterprovisions.go b/pkg/controller/clusterdeployment/clusterprovisions.go
index de94aa4188c..70a1c55c3d6 100644
--- a/pkg/controller/clusterdeployment/clusterprovisions.go
+++ b/pkg/controller/clusterdeployment/clusterprovisions.go
@@ -52,14 +52,14 @@ func (r *ReconcileClusterDeployment) startNewProvision(
 
 	r.deleteStaleProvisions(existingProvisions, logger)
 
-	if cd.Status.InstallRestarts > 0 && cd.Annotations[tryInstallOnceAnnotation] == "true" {
-		logger.Debug("not creating new provision since the deployment is set to try install only once")
+	setProvisionStoppedTrue := func(reason, message string) (reconcile.Result, error) {
+		logger.Debugf("not creating new provision: %s", message)
 		conditions, changed := controllerutils.SetClusterDeploymentConditionWithChangeCheck(
 			cd.Status.Conditions,
 			hivev1.ProvisionStoppedCondition,
 			corev1.ConditionTrue,
-			installOnlyOnceSetReason,
-			"Deployment is set to try install only once",
+			reason,
+			message,
 			controllerutils.UpdateConditionIfReasonOrMessageChange)
 		if changed {
 			cd.Status.Conditions = conditions
@@ -68,27 +68,16 @@ func (r *ReconcileClusterDeployment) startNewProvision(
 				logger.WithError(err).Log(controllerutils.LogLevel(err), "failed to update cluster deployment status")
 				return reconcile.Result{}, err
 			}
+			incProvisionFailedTerminal(cd)
 		}
 		return reconcile.Result{}, nil
 	}
+
+	if cd.Status.InstallRestarts > 0 && cd.Annotations[tryInstallOnceAnnotation] == "true" {
+		return setProvisionStoppedTrue(installOnlyOnceSetReason, "Deployment is set to try install only once")
+	}
 	if cd.Spec.InstallAttemptsLimit != nil && cd.Status.InstallRestarts >= int(*cd.Spec.InstallAttemptsLimit) {
-		logger.Debug("not creating new provision since the install attempts limit has been reached")
-		conditions, changed := controllerutils.SetClusterDeploymentConditionWithChangeCheck(
-			cd.Status.Conditions,
-			hivev1.ProvisionStoppedCondition,
-			corev1.ConditionTrue,
-			installAttemptsLimitReachedReason,
-			"Install attempts limit reached",
-			controllerutils.UpdateConditionIfReasonOrMessageChange)
-		if changed {
-			cd.Status.Conditions = conditions
-			logger.Debugf("setting ProvisionStoppedCondition to %v", corev1.ConditionTrue)
-			if err := r.Status().Update(context.TODO(), cd); err != nil {
-				logger.WithError(err).Log(controllerutils.LogLevel(err), "failed to update cluster deployment status")
-				return reconcile.Result{}, err
-			}
-		}
-		return reconcile.Result{}, nil
+		return setProvisionStoppedTrue(installAttemptsLimitReachedReason, "Install attempts limit reached")
 	}
 
 	conditions, changed := controllerutils.SetClusterDeploymentConditionWithChangeCheck(
diff --git a/pkg/controller/clusterdeployment/metrics.go b/pkg/controller/clusterdeployment/metrics.go
index 43cff0496c8..c2d057398f4 100644
--- a/pkg/controller/clusterdeployment/metrics.go
+++ b/pkg/controller/clusterdeployment/metrics.go
@@ -4,6 +4,8 @@ import (
 	"github.com/prometheus/client_golang/prometheus"
 
 	"sigs.k8s.io/controller-runtime/pkg/metrics"
+
+	hivev1 "github.com/openshift/hive/apis/hive/v1"
 )
 
 var (
@@ -61,8 +63,22 @@ var (
 			Buckets: []float64{10, 30, 60, 300, 600, 1200, 1800},
 		},
 	)
+	metricProvisionFailedTerminal = prometheus.NewCounterVec(prometheus.CounterOpts{
+		Name: "hive_cluster_deployments_provision_failed_terminal_total",
+		Help: "Counter incremented when a cluster provision has failed and won't be retried.",
+	},
+		[]string{"clusterpool_namespacedname"},
+	)
 )
 
+func incProvisionFailedTerminal(cd *hivev1.ClusterDeployment) {
+	poolNSName := ""
+	if poolRef := cd.Spec.ClusterPoolRef; poolRef != nil {
+		poolNSName = poolRef.Namespace + "/" + poolRef.PoolName
+	}
+	metricProvisionFailedTerminal.WithLabelValues(poolNSName).Inc()
+}
+
 func init() {
 	metrics.Registry.MustRegister(metricInstallJobDuration)
 	metrics.Registry.MustRegister(metricCompletedInstallJobRestarts)
@@ -72,4 +88,5 @@ func init() {
 	metrics.Registry.MustRegister(metricClustersInstalled)
 	metrics.Registry.MustRegister(metricClustersDeleted)
 	metrics.Registry.MustRegister(metricDNSDelaySeconds)
+	metrics.Registry.MustRegister(metricProvisionFailedTerminal)
 }
diff --git a/pkg/controller/clusterpool/clusterpool_controller.go b/pkg/controller/clusterpool/clusterpool_controller.go
index 29e55d877e5..037646dc844 100644
--- a/pkg/controller/clusterpool/clusterpool_controller.go
+++ b/pkg/controller/clusterpool/clusterpool_controller.go
@@ -301,7 +301,7 @@ func (r *ReconcileClusterPool) Reconcile(ctx context.Context, request reconcile.
 	cds.SyncClaimAssignments(r.Client, claims, logger)
 
 	origStatus := clp.Status.DeepCopy()
-	clp.Status.Size = int32(len(cds.Installing()) + len(cds.Assignable()))
+	clp.Status.Size = int32(len(cds.Installing()) + len(cds.Assignable()) + len(cds.Broken()))
 	clp.Status.Ready = int32(len(cds.Assignable()))
 	if !reflect.DeepEqual(origStatus, &clp.Status) {
 		if err := r.Status().Update(context.Background(), clp); err != nil {
@@ -313,7 +313,7 @@ func (r *ReconcileClusterPool) Reconcile(ctx context.Context, request reconcile.
 	availableCapacity := math.MaxInt32
 	if clp.Spec.MaxSize != nil {
 		availableCapacity = int(*clp.Spec.MaxSize) - cds.Total()
-		numUnassigned := len(cds.Installing()) + len(cds.Assignable())
+		numUnassigned := len(cds.Installing()) + len(cds.Assignable()) + len(cds.Broken())
 		numAssigned := cds.NumAssigned()
 		if availableCapacity <= 0 {
 			logger.WithFields(log.Fields{
@@ -329,7 +329,7 @@ func (r *ReconcileClusterPool) Reconcile(ctx context.Context, request reconcile.
 	}
 
 	// reserveSize is the number of clusters that the pool currently has in reserve
-	reserveSize := len(cds.Installing()) + len(cds.Assignable()) - len(claims.Unassigned())
+	reserveSize := len(cds.Installing()) + len(cds.Assignable()) + len(cds.Broken()) - len(claims.Unassigned())
 
 	if err := assignClustersToClaims(r.Client, claims, cds, logger); err != nil {
 		logger.WithError(err).Error("error assigning clusters <=> claims")
@@ -364,12 +364,6 @@ func (r *ReconcileClusterPool) Reconcile(ctx context.Context, request reconcile.
 			"MaxConcurrent": *clp.Spec.MaxConcurrent,
 			"Available": availableCurrent,
 		}).Info("Cannot create/delete clusters as max concurrent quota exceeded.")
-	// If too many, delete some.
-	case drift > 0:
-		toDel := minIntVarible(drift, availableCurrent)
-		if err := r.deleteExcessClusters(cds, toDel, logger); err != nil {
-			return reconcile.Result{}, err
-		}
 	// If too few, create new InstallConfig and ClusterDeployment.
 	case drift < 0:
 		if availableCapacity <= 0 {
@@ -383,6 +377,19 @@ func (r *ReconcileClusterPool) Reconcile(ctx context.Context, request reconcile.
 			log.WithError(err).Error("error adding clusters")
 			return reconcile.Result{}, err
 		}
+	// Delete broken CDs. We put this case after the "addClusters" case because we would rather
+	// consume our maxConcurrent with additions than deletions. But we put it before the
+	// "deleteExcessClusters" case because we would rather trim broken clusters than viable ones.
+	case len(cds.Broken()) > 0:
+		if err := r.deleteBrokenClusters(cds, availableCurrent, logger); err != nil {
+			return reconcile.Result{}, err
+		}
+	// If too many, delete some.
+	case drift > 0:
+		toDel := minIntVarible(drift, availableCurrent)
+		if err := r.deleteExcessClusters(cds, toDel, logger); err != nil {
+			return reconcile.Result{}, err
+		}
 	// Special case for stale CDs: allow deleting one if all CDs are installed.
 	case drift == 0 && len(cds.Installing()) == 0 && len(cds.Stale()) > 0:
 		toDelete := cds.Stale()[0]
@@ -710,6 +717,21 @@ func (r *ReconcileClusterPool) deleteExcessClusters(
 	return nil
 }
 
+func (r *ReconcileClusterPool) deleteBrokenClusters(cds *cdCollection, maxToDelete int, logger log.FieldLogger) error {
+	numToDel := minIntVarible(maxToDelete, len(cds.Broken()))
+	logger.WithField("numberToDelete", numToDel).Info("deleting broken clusters")
+	clustersToDelete := make([]*hivev1.ClusterDeployment, numToDel)
+	copy(clustersToDelete, cds.Broken()[:numToDel])
+	for _, cd := range clustersToDelete {
+		logger.WithField("cluster", cd.Name).Info("deleting broken cluster deployment")
+		if err := cds.Delete(r.Client, cd.Name); err != nil {
+			logger.WithError(err).Error("error deleting cluster deployment")
+			return err
+		}
+	}
+	return nil
+}
+
 func (r *ReconcileClusterPool) reconcileDeletedPool(pool *hivev1.ClusterPool, logger log.FieldLogger) error {
 	if !controllerutils.HasFinalizer(pool, finalizer) {
 		return nil
diff --git a/pkg/controller/clusterpool/clusterpool_controller_test.go b/pkg/controller/clusterpool/clusterpool_controller_test.go
index 6554d2ae1f9..1903132d6f5 100644
--- a/pkg/controller/clusterpool/clusterpool_controller_test.go
+++ b/pkg/controller/clusterpool/clusterpool_controller_test.go
@@ -83,7 +83,6 @@ func TestReconcileClusterPool(t *testing.T) {
 	unclaimedCDBuilder := func(name string) testcd.Builder {
 		return cdBuilder(name).Options(
 			testcd.WithUnclaimedClusterPoolReference(testNamespace, testLeasePoolName),
-			testcd.WithPoolVersion(initialPoolVersion),
 		)
 	}
 
@@ -260,6 +259,64 @@ func TestReconcileClusterPool(t *testing.T) {
 			// more quickly. Possible subject of a future optimization.
 			expectedDeletedClusters: []string{"c1"},
 		},
+		{
+			name: "delete broken unclaimed clusters",
+			existing: []runtime.Object{
+				initializedPoolBuilder.Build(testcp.WithSize(4)),
+				unclaimedCDBuilder("c1").Build(testcd.Broken()),
+				// ProvisionStopped+Installed doesn't make sense IRL; but we're proving it would
+				// be counted as broken if it did.
+				unclaimedCDBuilder("c2").Build(testcd.Broken(), testcd.Installed()),
+				// Unbroken CDs don't get deleted
+				unclaimedCDBuilder("c3").Build(),
+				// ...including assignable ones
+				unclaimedCDBuilder("c4").Build(testcd.Installed()),
+				// Claimed CDs don't get deleted
+				cdBuilder("c5").Build(testcd.Installed(), testcd.WithClusterPoolReference(testNamespace, testLeasePoolName, "test1")),
+				// ...even if they're broken
+				cdBuilder("c6").Build(testcd.Broken(), testcd.WithClusterPoolReference(testNamespace, testLeasePoolName, "test2")),
+			},
+			expectedTotalClusters: 4,
+			expectedObservedSize: 4,
+			// The broken one doesn't get counted as Ready
+			expectedObservedReady: 1,
+			expectedDeletedClusters: []string{"c1", "c2"},
+			expectedAssignedCDs: 2,
+		},
+		{
+			name: "deleting broken clusters is bounded by maxConcurrent",
+			existing: []runtime.Object{
+				initializedPoolBuilder.Build(testcp.WithSize(4), testcp.WithMaxConcurrent(2)),
+				unclaimedCDBuilder("c1").Build(testcd.Broken()),
+				unclaimedCDBuilder("c2").Build(testcd.Broken()),
+				unclaimedCDBuilder("c3").Build(testcd.Broken()),
+				unclaimedCDBuilder("c4").Build(testcd.Broken()),
+			},
+			expectedTotalClusters: 2,
+			expectedObservedSize: 4,
+			// Note: We can't expectDeletedClusters because we delete broken clusters in arbitrary order
+		},
+		{
+			name: "adding takes precedence over deleting broken clusters",
+			existing: []runtime.Object{
+				initializedPoolBuilder.Build(testcp.WithSize(3)),
+				unclaimedCDBuilder("c1").Build(testcd.Broken()),
+				unclaimedCDBuilder("c2").Build(),
+			},
+			expectedTotalClusters: 3,
+			expectedObservedSize: 2,
+		},
+		{
+			name: "delete broken clusters first when reducing capacity",
+			existing: []runtime.Object{
+				initializedPoolBuilder.Build(testcp.WithSize(1)),
+				unclaimedCDBuilder("c1").Build(testcd.Broken()),
+				unclaimedCDBuilder("c2").Build(),
+			},
+			expectedTotalClusters: 1,
+			expectedObservedSize: 2,
+			expectedDeletedClusters: []string{"c1"},
+		},
 		{
 			name: "create all clusters",
 			existing: []runtime.Object{
@@ -1272,7 +1329,7 @@ func TestReconcileClusterPool(t *testing.T) {
 			assert.Equal(t, test.expectedAssignedCDs, actualAssignedCDs, "unexpected number of assigned CDs")
 			assert.Equal(t, test.expectedTotalClusters-test.expectedAssignedCDs, actualUnassignedCDs, "unexpected number of unassigned CDs")
 			assert.Equal(t, test.expectedRunning, actualRunning, "unexpected number of running CDs")
-			assert.Equal(t, test.expectedTotalClusters-test.expectedRunning, actualHibernating, "unexpected number of assigned CDs")
+			assert.Equal(t, test.expectedTotalClusters-test.expectedRunning, actualHibernating, "unexpected number of hibernating CDs")
 
 			pool := &hivev1.ClusterPool{}
 			err = fakeClient.Get(context.Background(), client.ObjectKey{Namespace: testNamespace, Name: testLeasePoolName}, pool)
diff --git a/pkg/controller/clusterpool/collections.go b/pkg/controller/clusterpool/collections.go
index 7e029671dd0..d4add31178d 100644
--- a/pkg/controller/clusterpool/collections.go
+++ b/pkg/controller/clusterpool/collections.go
@@ -196,6 +196,8 @@ type cdCollection struct {
 	installing []*hivev1.ClusterDeployment
 	// Clusters with a DeletionTimestamp. Mutually exclusive with markedForDeletion.
 	deleting []*hivev1.ClusterDeployment
+	// Clusters we've declared unusable (e.g. provision failed terminally).
+	broken []*hivev1.ClusterDeployment
 	// Clusters with the ClusterClaimRemoveClusterAnnotation. Mutually exclusive with deleting.
 	markedForDeletion []*hivev1.ClusterDeployment
 	// Clusters with a missing or empty pool version annotation
@@ -208,6 +210,20 @@ type cdCollection struct {
 	byClaimName map[string]*hivev1.ClusterDeployment
 }
 
+func isBroken(cd *hivev1.ClusterDeployment) bool {
+	cond := controllerutils.FindClusterDeploymentCondition(cd.Status.Conditions, hivev1.ProvisionStoppedCondition)
+	if cond == nil {
+		// Since we should be initializing conditions, this probably means the CD is super fresh.
+		// Don't declare it broken yet -- give it a chance to come to life.
+		return false
+	}
+	if cond.Status == corev1.ConditionTrue {
+		return true
+	}
+	// Other causes of broken-ness to be added here later. E.g. resume failed.
+	return false
+}
+
 // getAllClusterDeploymentsForPool is the constructor for a cdCollection
 // comprising all the ClusterDeployments created for the specified ClusterPool.
 func getAllClusterDeploymentsForPool(c client.Client, pool *hivev1.ClusterPool, poolVersion string, logger log.FieldLogger) (*cdCollection, error) {
@@ -221,6 +237,7 @@ func getAllClusterDeploymentsForPool(c client.Client, pool *hivev1.ClusterPool,
 		assignable: make([]*hivev1.ClusterDeployment, 0),
 		installing: make([]*hivev1.ClusterDeployment, 0),
 		deleting: make([]*hivev1.ClusterDeployment, 0),
+		broken: make([]*hivev1.ClusterDeployment, 0),
 		unknownPoolVersion: make([]*hivev1.ClusterDeployment, 0),
 		mismatchedPoolVersion: make([]*hivev1.ClusterDeployment, 0),
 		byCDName: make(map[string]*hivev1.ClusterDeployment),
@@ -246,7 +263,9 @@ func getAllClusterDeploymentsForPool(c client.Client, pool *hivev1.ClusterPool,
 			// Do *not* double count "deleting" and "marked for deletion"
 			cdCol.markedForDeletion = append(cdCol.markedForDeletion, ref)
 		} else if claimName == "" {
-			if cd.Spec.Installed {
+			if isBroken(&cd) {
+				cdCol.broken = append(cdCol.broken, ref)
+			} else if cd.Spec.Installed {
 				cdCol.assignable = append(cdCol.assignable, ref)
 			} else {
 				cdCol.installing = append(cdCol.installing, ref)
@@ -306,6 +325,8 @@ func getAllClusterDeploymentsForPool(c client.Client, pool *hivev1.ClusterPool,
 		"deleting": len(cdCol.deleting),
 		"installing": len(cdCol.installing),
 		"unclaimed": len(cdCol.installing) + len(cdCol.assignable),
+		"stale": len(cdCol.unknownPoolVersion) + len(cdCol.mismatchedPoolVersion),
+		"broken": len(cdCol.broken),
 	}).Debug("found clusters for ClusterPool")
 	return &cdCol, nil
 }
@@ -351,6 +372,11 @@ func (cds *cdCollection) Installing() []*hivev1.ClusterDeployment {
 	return cds.installing
 }
 
+// Broken returns the list of ClusterDeployments we've deemed unrecoverably broken.
+func (cds *cdCollection) Broken() []*hivev1.ClusterDeployment {
+	return cds.broken
+}
+
 // UnknownPoolVersion returns the list of ClusterDeployments whose pool version annotation is
 // missing or empty.
 func (cds *cdCollection) UnknownPoolVersion() []*hivev1.ClusterDeployment {
@@ -455,7 +481,7 @@ func (cds *cdCollection) Delete(c client.Client, cdName string) error {
 	// Remove from any of the other lists it might be in
 	removeCDsFromSlice(&cds.assignable, cdName)
 	removeCDsFromSlice(&cds.installing, cdName)
-	removeCDsFromSlice(&cds.assignable, cdName)
+	removeCDsFromSlice(&cds.broken, cdName)
 	removeCDsFromSlice(&cds.unknownPoolVersion, cdName)
 	removeCDsFromSlice(&cds.mismatchedPoolVersion, cdName)
 	removeCDsFromSlice(&cds.markedForDeletion, cdName)
diff --git a/pkg/test/clusterdeployment/clusterdeployment.go b/pkg/test/clusterdeployment/clusterdeployment.go
index 431cb945880..4eb238d99f1 100644
--- a/pkg/test/clusterdeployment/clusterdeployment.go
+++ b/pkg/test/clusterdeployment/clusterdeployment.go
@@ -3,6 +3,7 @@ package clusterdeployment
 import (
 	"time"
 
+	v1 "k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/runtime"
 
@@ -116,6 +117,14 @@ func WithCondition(cond hivev1.ClusterDeploymentCondition) Option {
 	}
 }
 
+// Broken uses ProvisionStopped=True to make the CD be recognized as broken.
+func Broken() Option {
+	return WithCondition(hivev1.ClusterDeploymentCondition{
+		Type: hivev1.ProvisionStoppedCondition,
+		Status: v1.ConditionTrue,
+	})
+}
+
 func WithUnclaimedClusterPoolReference(namespace, poolName string) Option {
 	return WithClusterPoolReference(namespace, poolName, "")
 }
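
Reviewer note (not part of the diff): below is a minimal sketch of a package-internal test that exercises the new incProvisionFailedTerminal helper. It assumes the upstream prometheus testutil package and that cd.Spec.ClusterPoolRef is of type hivev1.ClusterPoolReference; only the Namespace/PoolName field names are confirmed by the metrics.go hunk above.

package clusterdeployment

import (
	"testing"

	"github.com/prometheus/client_golang/prometheus/testutil"
	"github.com/stretchr/testify/assert"

	hivev1 "github.com/openshift/hive/apis/hive/v1"
)

// Sketch only: checks that the terminal-failure counter is bumped once per call,
// labeled with "<namespace>/<poolName>" as formatted by incProvisionFailedTerminal.
func TestIncProvisionFailedTerminal(t *testing.T) {
	cd := &hivev1.ClusterDeployment{}
	// Assumption: hivev1.ClusterPoolReference is the type behind cd.Spec.ClusterPoolRef.
	cd.Spec.ClusterPoolRef = &hivev1.ClusterPoolReference{Namespace: "pools", PoolName: "p1"}

	before := testutil.ToFloat64(metricProvisionFailedTerminal.WithLabelValues("pools/p1"))
	incProvisionFailedTerminal(cd)
	after := testutil.ToFloat64(metricProvisionFailedTerminal.WithLabelValues("pools/p1"))

	assert.Equal(t, before+1, after, "expected exactly one increment for the pool's label")
}

Reading the counter before and after keeps the check valid even if other tests in the package have already incremented the same label.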