ClusterPool: Delete broken (ProvisionStopped) clusters #1524
```diff
@@ -140,6 +140,9 @@ func (r *ReconcileClusterDeployment) reconcileExistingInstallingClusterInstall(c
 	}
 
 	updated = false
+	// Fun extra variable to keep track of whether we should increment metricProvisionFailedTerminal
+	// later; because we only want to do that if (we change that status and) the status update succeeds.
+	provisionFailedTerminal := false
 	conditions, updated = controllerutils.SetClusterDeploymentConditionWithChangeCheck(conditions,
 		hivev1.ProvisionStoppedCondition,
 		stopped.Status,
@@ -149,6 +152,7 @@ func (r *ReconcileClusterDeployment) reconcileExistingInstallingClusterInstall(c
 	)
 	if updated {
 		statusModified = true
+		provisionFailedTerminal = true
 	}
 
 	completed = controllerutils.FindClusterDeploymentCondition(conditions, hivev1.ClusterInstallCompletedClusterDeploymentCondition)
@@ -196,6 +200,10 @@ func (r *ReconcileClusterDeployment) reconcileExistingInstallingClusterInstall(c
 			logger.WithError(err).Error("failed to update the status of clusterdeployment")
 			return reconcile.Result{}, err
 		}
+		// If we declared the provision terminally failed, bump our metric
+		if provisionFailedTerminal {
+			incProvisionFailedTerminal(cd)
```
Contributor: On a related note, you don't actually need to observe it in metrics reconcile, you can fire the metric.observe right here (make it a global variable and set it here)

Member (Author): I made
```diff
+		}
 	}
 
 	return reconcile.Result{}, nil
```
```diff
@@ -4,6 +4,8 @@ import (
 	"github.com/prometheus/client_golang/prometheus"
 
 	"sigs.k8s.io/controller-runtime/pkg/metrics"
+
+	hivev1 "github.com/openshift/hive/apis/hive/v1"
 )
 
 var (
@@ -61,8 +63,22 @@ var (
 			Buckets: []float64{10, 30, 60, 300, 600, 1200, 1800},
 		},
 	)
+	metricProvisionFailedTerminal = prometheus.NewCounterVec(prometheus.CounterOpts{
+		Name: "hive_cluster_deployments_provision_failed_terminal_total",
+		Help: "Counter incremented when a cluster provision has failed and won't be retried.",
+	},
+		[]string{"clusterpool_namespacedname"},
```
Contributor: Wouldn't we want to clear the metric when provision succeeds?

Member (Author): The provision will not succeed. We set this when we've failed for the last time.

Contributor: Given how you're logging this metric, it would be deleted only when the hive controller restarts - since it is not attached to anything else. So it should be cleared in the clusterpool controller for the relevant clusterpool.

Member (Author): I'm still not following this. I don't see why we should ever clear this metric. I definitely don't see why controller restart should be a significant event. Why should this metric behave any differently from e.g. hive_cluster_deployments_installed_total?
```diff
+	)
 )
 
+func incProvisionFailedTerminal(cd *hivev1.ClusterDeployment) {
```
Contributor: If for every CD the counter is incremented, then it should also be decremented if the relevant clusterdeployment is deleted.

Member (Author): We're using a counter because we want to track how many times pool CDs failed to provision, ever. We'll use prometheus to report things like the rate at which these failures are occurring. We talked about metrics when doing stale CD replacement, which is very similar to this. We discussed using a (separate) gauge metric to indicate the number of stale CDs in a pool at any given time, but ended up deciding YAGNI.

Contributor: Ah! This clarifies things - I had misjudged what the metric was used for.
```diff
+	poolNSName := ""
+	if poolRef := cd.Spec.ClusterPoolRef; poolRef != nil {
+		poolNSName = poolRef.Namespace + "/" + poolRef.PoolName
+	}
+	metricProvisionFailedTerminal.WithLabelValues(poolNSName).Inc()
+}
+
 func init() {
 	metrics.Registry.MustRegister(metricInstallJobDuration)
 	metrics.Registry.MustRegister(metricCompletedInstallJobRestarts)
@@ -72,4 +88,5 @@ func init() {
 	metrics.Registry.MustRegister(metricClustersInstalled)
 	metrics.Registry.MustRegister(metricClustersDeleted)
 	metrics.Registry.MustRegister(metricDNSDelaySeconds)
+	metrics.Registry.MustRegister(metricProvisionFailedTerminal)
 }
```
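Purely for illustration (not part of the PR): a minimal, runnable sketch of the counter semantics discussed in the thread above. The metric name, help text, and label match the diff; the standalone registry and the pool names are made up for the example, while the real code registers against controller-runtime's metrics.Registry.

```go
package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/testutil"
)

func main() {
	// Same shape as metricProvisionFailedTerminal in the diff above.
	failedTerminal := prometheus.NewCounterVec(prometheus.CounterOpts{
		Name: "hive_cluster_deployments_provision_failed_terminal_total",
		Help: "Counter incremented when a cluster provision has failed and won't be retried.",
	}, []string{"clusterpool_namespacedname"})

	// Throwaway registry for the example.
	reg := prometheus.NewRegistry()
	reg.MustRegister(failedTerminal)

	// Each terminal failure bumps the per-pool series. The value only ever increases,
	// which is what lets Prometheus derive failure rates from it; there is nothing to "clear".
	failedTerminal.WithLabelValues("hive/my-pool").Inc()
	failedTerminal.WithLabelValues("hive/my-pool").Inc()
	failedTerminal.WithLabelValues("hive/other-pool").Inc()

	fmt.Println(testutil.ToFloat64(failedTerminal.WithLabelValues("hive/my-pool")))    // 2
	fmt.Println(testutil.ToFloat64(failedTerminal.WithLabelValues("hive/other-pool"))) // 1
}
```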
```diff
@@ -301,7 +301,7 @@ func (r *ReconcileClusterPool) Reconcile(ctx context.Context, request reconcile.
 	cds.SyncClaimAssignments(r.Client, claims, logger)
 
 	origStatus := clp.Status.DeepCopy()
-	clp.Status.Size = int32(len(cds.Installing()) + len(cds.Assignable()))
+	clp.Status.Size = int32(len(cds.Installing()) + len(cds.Assignable()) + len(cds.Broken()))
 	clp.Status.Ready = int32(len(cds.Assignable()))
 	if !reflect.DeepEqual(origStatus, &clp.Status) {
 		if err := r.Status().Update(context.Background(), clp); err != nil {
@@ -313,7 +313,7 @@ func (r *ReconcileClusterPool) Reconcile(ctx context.Context, request reconcile.
 	availableCapacity := math.MaxInt32
 	if clp.Spec.MaxSize != nil {
 		availableCapacity = int(*clp.Spec.MaxSize) - cds.Total()
-	numUnassigned := len(cds.Installing()) + len(cds.Assignable())
+	numUnassigned := len(cds.Installing()) + len(cds.Assignable()) + len(cds.Broken())
 	numAssigned := cds.NumAssigned()
 	if availableCapacity <= 0 {
 		logger.WithFields(log.Fields{
@@ -329,7 +329,7 @@ func (r *ReconcileClusterPool) Reconcile(ctx context.Context, request reconcile.
 	}
 
 	// reserveSize is the number of clusters that the pool currently has in reserve
-	reserveSize := len(cds.Installing()) + len(cds.Assignable()) - len(claims.Unassigned())
+	reserveSize := len(cds.Installing()) + len(cds.Assignable()) + len(cds.Broken()) - len(claims.Unassigned())
 
 	if err := assignClustersToClaims(r.Client, claims, cds, logger); err != nil {
 		logger.WithError(err).Error("error assigning clusters <=> claims")
@@ -364,12 +364,6 @@ func (r *ReconcileClusterPool) Reconcile(ctx context.Context, request reconcile.
 			"MaxConcurrent": *clp.Spec.MaxConcurrent,
 			"Available":     availableCurrent,
 		}).Info("Cannot create/delete clusters as max concurrent quota exceeded.")
-	// If too many, delete some.
-	case drift > 0:
-		toDel := minIntVarible(drift, availableCurrent)
-		if err := r.deleteExcessClusters(cds, toDel, logger); err != nil {
-			return reconcile.Result{}, err
-		}
 	// If too few, create new InstallConfig and ClusterDeployment.
 	case drift < 0:
 		if availableCapacity <= 0 {
@@ -383,6 +377,19 @@ func (r *ReconcileClusterPool) Reconcile(ctx context.Context, request reconcile.
 			log.WithError(err).Error("error adding clusters")
 			return reconcile.Result{}, err
 		}
+	// Delete broken CDs. We put this case after the "addClusters" case because we would rather
+	// consume our maxConcurrent with additions than deletions. But we put it before the
+	// "deleteExcessClusters" case because we would rather trim broken clusters than viable ones.
+	case len(cds.Broken()) > 0:
+		if err := r.deleteBrokenClusters(cds, availableCurrent, logger); err != nil {
+			return reconcile.Result{}, err
+		}
+	// If too many, delete some.
+	case drift > 0:
+		toDel := minIntVarible(drift, availableCurrent)
+		if err := r.deleteExcessClusters(cds, toDel, logger); err != nil {
+			return reconcile.Result{}, err
+		}
```
Comment on lines +387 to +392

Member (Author): Note to reviewers: this block moved down unchanged. Since
```diff
 	// Special case for stale CDs: allow deleting one if all CDs are installed.
 	case drift == 0 && len(cds.Installing()) == 0 && len(cds.Stale()) > 0:
 		toDelete := cds.Stale()[0]
@@ -710,6 +717,21 @@ func (r *ReconcileClusterPool) deleteExcessClusters(
 	return nil
 }
 
+func (r *ReconcileClusterPool) deleteBrokenClusters(cds *cdCollection, maxToDelete int, logger log.FieldLogger) error {
+	numToDel := minIntVarible(maxToDelete, len(cds.Broken()))
+	logger.WithField("numberToDelete", numToDel).Info("deleting broken clusters")
+	clustersToDelete := make([]*hivev1.ClusterDeployment, numToDel)
+	copy(clustersToDelete, cds.Broken()[:numToDel])
+	for _, cd := range clustersToDelete {
+		logger.WithField("cluster", cd.Name).Info("deleting broken cluster deployment")
+		if err := cds.Delete(r.Client, cd.Name); err != nil {
+			logger.WithError(err).Error("error deleting cluster deployment")
+			return err
+		}
+	}
+	return nil
+}
+
 func (r *ReconcileClusterPool) reconcileDeletedPool(pool *hivev1.ClusterPool, logger log.FieldLogger) error {
 	if !controllerutils.HasFinalizer(pool, finalizer) {
 		return nil
```
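For context on the ordering argument above: in a tagless Go switch, cases are evaluated top to bottom and the first true case wins, so the placement of the new case is what encodes the add-first, then trim-broken, then trim-excess priority. A toy sketch of that behavior (not Hive code; names and values are made up):

```go
package main

import "fmt"

func main() {
	// Pretend the pool is over size (drift > 0) AND has broken clusters.
	drift, numBroken := 1, 2

	switch {
	case drift < 0:
		fmt.Println("add clusters")
	case numBroken > 0:
		// Chosen here: listed before the excess case, so broken clusters are trimmed first.
		fmt.Println("delete broken clusters")
	case drift > 0:
		fmt.Println("delete excess clusters")
	}
}
```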
```diff
@@ -83,7 +83,6 @@ func TestReconcileClusterPool(t *testing.T) {
 	unclaimedCDBuilder := func(name string) testcd.Builder {
 		return cdBuilder(name).Options(
 			testcd.WithUnclaimedClusterPoolReference(testNamespace, testLeasePoolName),
-			testcd.WithPoolVersion(initialPoolVersion),
 		)
 	}
 
@@ -260,6 +259,64 @@ func TestReconcileClusterPool(t *testing.T) {
 			// more quickly. Possible subject of a future optimization.
 			expectedDeletedClusters: []string{"c1"},
 		},
+		{
+			name: "delete broken unclaimed clusters",
+			existing: []runtime.Object{
+				initializedPoolBuilder.Build(testcp.WithSize(4)),
+				unclaimedCDBuilder("c1").Build(testcd.Broken()),
+				// ProvisionStopped+Installed doesn't make sense IRL; but we're proving it would
+				// be counted as broken if it did.
+				unclaimedCDBuilder("c2").Build(testcd.Broken(), testcd.Installed()),
+				// Unbroken CDs don't get deleted
+				unclaimedCDBuilder("c3").Build(),
+				// ...including assignable ones
+				unclaimedCDBuilder("c4").Build(testcd.Installed()),
+				// Claimed CDs don't get deleted
+				cdBuilder("c5").Build(testcd.Installed(), testcd.WithClusterPoolReference(testNamespace, testLeasePoolName, "test1")),
+				// ...even if they're broken
+				cdBuilder("c6").Build(testcd.Broken(), testcd.WithClusterPoolReference(testNamespace, testLeasePoolName, "test2")),
+			},
+			expectedTotalClusters: 4,
+			expectedObservedSize:  4,
+			// The broken one doesn't get counted as Ready
+			expectedObservedReady:   1,
+			expectedDeletedClusters: []string{"c1", "c2"},
+			expectedAssignedCDs:     2,
+		},
+		{
+			name: "deleting broken clusters is bounded by maxConcurrent",
+			existing: []runtime.Object{
+				initializedPoolBuilder.Build(testcp.WithSize(4), testcp.WithMaxConcurrent(2)),
+				unclaimedCDBuilder("c1").Build(testcd.Broken()),
+				unclaimedCDBuilder("c2").Build(testcd.Broken()),
+				unclaimedCDBuilder("c3").Build(testcd.Broken()),
+				unclaimedCDBuilder("c4").Build(testcd.Broken()),
+			},
+			expectedTotalClusters: 2,
+			expectedObservedSize:  4,
+			// Note: We can't expectDeletedClusters because we delete broken clusters in arbitrary order
+		},
+		{
+			name: "adding takes precedence over deleting broken clusters",
+			existing: []runtime.Object{
+				initializedPoolBuilder.Build(testcp.WithSize(3)),
+				unclaimedCDBuilder("c1").Build(testcd.Broken()),
+				unclaimedCDBuilder("c2").Build(),
+			},
+			expectedTotalClusters: 3,
+			expectedObservedSize:  2,
+		},
+		{
+			name: "delete broken clusters first when reducing capacity",
+			existing: []runtime.Object{
+				initializedPoolBuilder.Build(testcp.WithSize(1)),
+				unclaimedCDBuilder("c1").Build(testcd.Broken()),
+				unclaimedCDBuilder("c2").Build(),
+			},
+			expectedTotalClusters:   1,
+			expectedObservedSize:    2,
+			expectedDeletedClusters: []string{"c1"},
+		},
 		{
 			name: "create all clusters",
 			existing: []runtime.Object{
@@ -1272,7 +1329,7 @@ func TestReconcileClusterPool(t *testing.T) {
 			assert.Equal(t, test.expectedAssignedCDs, actualAssignedCDs, "unexpected number of assigned CDs")
 			assert.Equal(t, test.expectedTotalClusters-test.expectedAssignedCDs, actualUnassignedCDs, "unexpected number of unassigned CDs")
 			assert.Equal(t, test.expectedRunning, actualRunning, "unexpected number of running CDs")
-			assert.Equal(t, test.expectedTotalClusters-test.expectedRunning, actualHibernating, "unexpected number of assigned CDs")
+			assert.Equal(t, test.expectedTotalClusters-test.expectedRunning, actualHibernating, "unexpected number of hibernating CDs")
```
Member (Author): Latent copypasta.
```diff
 
 			pool := &hivev1.ClusterPool{}
 			err = fakeClient.Get(context.Background(), client.ObjectKey{Namespace: testNamespace, Name: testLeasePoolName}, pool)
```
```diff
@@ -196,6 +196,8 @@ type cdCollection struct {
 	installing []*hivev1.ClusterDeployment
 	// Clusters with a DeletionTimestamp. Mutually exclusive with markedForDeletion.
 	deleting []*hivev1.ClusterDeployment
+	// Clusters we've declared unusable (e.g. provision failed terminally).
+	broken []*hivev1.ClusterDeployment
 	// Clusters with the ClusterClaimRemoveClusterAnnotation. Mutually exclusive with deleting.
 	markedForDeletion []*hivev1.ClusterDeployment
 	// Clusters with a missing or empty pool version annotation
@@ -208,6 +210,20 @@ type cdCollection struct {
 	byClaimName map[string]*hivev1.ClusterDeployment
 }
 
+func isBroken(cd *hivev1.ClusterDeployment) bool {
+	cond := controllerutils.FindClusterDeploymentCondition(cd.Status.Conditions, hivev1.ProvisionStoppedCondition)
+	if cond == nil {
+		// Since we should be initializing conditions, this probably means the CD is super fresh.
+		// Don't declare it broken yet -- give it a chance to come to life.
+		return false
+	}
+	if cond.Status == corev1.ConditionTrue {
+		return true
+	}
+	// Other causes of broken-ness to be added here later. E.g. resume failed.
+	return false
+}
+
 // getAllClusterDeploymentsForPool is the constructor for a cdCollection
 // comprising all the ClusterDeployments created for the specified ClusterPool.
 func getAllClusterDeploymentsForPool(c client.Client, pool *hivev1.ClusterPool, poolVersion string, logger log.FieldLogger) (*cdCollection, error) {
@@ -221,6 +237,7 @@ func getAllClusterDeploymentsForPool(c client.Client, pool *hivev1.ClusterPool,
 		assignable:            make([]*hivev1.ClusterDeployment, 0),
 		installing:            make([]*hivev1.ClusterDeployment, 0),
 		deleting:              make([]*hivev1.ClusterDeployment, 0),
+		broken:                make([]*hivev1.ClusterDeployment, 0),
 		unknownPoolVersion:    make([]*hivev1.ClusterDeployment, 0),
 		mismatchedPoolVersion: make([]*hivev1.ClusterDeployment, 0),
 		byCDName:              make(map[string]*hivev1.ClusterDeployment),
@@ -246,7 +263,9 @@ func getAllClusterDeploymentsForPool(c client.Client, pool *hivev1.ClusterPool,
 			// Do *not* double count "deleting" and "marked for deletion"
 			cdCol.markedForDeletion = append(cdCol.markedForDeletion, ref)
 		} else if claimName == "" {
-			if cd.Spec.Installed {
+			if isBroken(&cd) {
+				cdCol.broken = append(cdCol.broken, ref)
+			} else if cd.Spec.Installed {
 				cdCol.assignable = append(cdCol.assignable, ref)
 			} else {
 				cdCol.installing = append(cdCol.installing, ref)
@@ -306,6 +325,8 @@ func getAllClusterDeploymentsForPool(c client.Client, pool *hivev1.ClusterPool,
 		"deleting":   len(cdCol.deleting),
 		"installing": len(cdCol.installing),
 		"unclaimed":  len(cdCol.installing) + len(cdCol.assignable),
+		"stale":      len(cdCol.unknownPoolVersion) + len(cdCol.mismatchedPoolVersion),
+		"broken":     len(cdCol.broken),
 	}).Debug("found clusters for ClusterPool")
 	return &cdCol, nil
 }
@@ -351,6 +372,11 @@ func (cds *cdCollection) Installing() []*hivev1.ClusterDeployment {
 	return cds.installing
 }
 
+// Broken returns the list of ClusterDeployments we've deemed unrecoverably broken.
+func (cds *cdCollection) Broken() []*hivev1.ClusterDeployment {
+	return cds.broken
+}
+
 // UnknownPoolVersion returns the list of ClusterDeployments whose pool version annotation is
 // missing or empty.
 func (cds *cdCollection) UnknownPoolVersion() []*hivev1.ClusterDeployment {
@@ -455,7 +481,7 @@ func (cds *cdCollection) Delete(c client.Client, cdName string) error {
 	// Remove from any of the other lists it might be in
 	removeCDsFromSlice(&cds.assignable, cdName)
 	removeCDsFromSlice(&cds.installing, cdName)
-	removeCDsFromSlice(&cds.assignable, cdName)
+	removeCDsFromSlice(&cds.broken, cdName)
 	removeCDsFromSlice(&cds.unknownPoolVersion, cdName)
 	removeCDsFromSlice(&cds.mismatchedPoolVersion, cdName)
 	removeCDsFromSlice(&cds.markedForDeletion, cdName)
```
Reviewer: What is the drawback of not having this flag as a gating condition to report the metric?

Author: Having trouble parsing your sentence (English is hard). Are you suggesting we don't need this variable? How else would we know when to increment the counter?

Reviewer: So I had a thorough look at the code, and I think you don't actually need this boolean. Simply observe the metric where you're setting this metric to true.

Author: I don't want to observe the metric unless we actually changed the ProvisionStopped condition to True due to a final ProvisionFailed. If I don't use this boolean then, for example, we'll increment it any time we change any of the ClusterInstall* conditions. We don't want that.
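To make that gating argument concrete, here is a simplified, self-contained sketch of the pattern (stand-in types and names, not Hive's actual API): any condition change marks the status as modified, but only a ProvisionStopped change arms the metric, and the metric fires only if the status update succeeds.

```go
package main

import (
	"errors"
	"fmt"
)

// conditionUpdate is a stand-in for the result of syncing one ClusterDeployment condition.
type conditionUpdate struct {
	name    string
	updated bool
}

func reconcileConditions(updates []conditionUpdate, statusUpdate func() error) error {
	statusModified := false          // set when ANY condition changed
	provisionFailedTerminal := false // set ONLY when ProvisionStopped changed

	for _, u := range updates {
		if u.updated {
			statusModified = true
			if u.name == "ProvisionStopped" {
				provisionFailedTerminal = true
			}
		}
	}

	if statusModified {
		if err := statusUpdate(); err != nil {
			return err // metric is NOT bumped if the status update fails
		}
		if provisionFailedTerminal {
			fmt.Println("bump hive_cluster_deployments_provision_failed_terminal_total")
		}
	}
	return nil
}

func main() {
	// A ClusterInstall* condition change alone modifies status but does not bump the metric.
	_ = reconcileConditions([]conditionUpdate{{name: "ClusterInstallCompleted", updated: true}},
		func() error { return nil })

	// Only the ProvisionStopped flip (with a successful status update) bumps it.
	_ = reconcileConditions([]conditionUpdate{{name: "ProvisionStopped", updated: true}},
		func() error { return nil })

	// If the status update fails, nothing is bumped even for ProvisionStopped.
	_ = reconcileConditions([]conditionUpdate{{name: "ProvisionStopped", updated: true}},
		func() error { return errors.New("conflict") })
}
```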