8 changes: 8 additions & 0 deletions pkg/controller/clusterdeployment/clusterinstalls.go
@@ -140,6 +140,9 @@ func (r *ReconcileClusterDeployment) reconcileExistingInstallingClusterInstall(c
}

updated = false
// Fun extra variable to keep track of whether we should increment metricProvisionFailedTerminal
// later; because we only want to do that if (we change that status and) the status update succeeds.
provisionFailedTerminal := false
Contributor: What is the drawback of not having this flag as a gating condition to report the metric?

Member Author: Having trouble parsing your sentence (English is hard). Are you suggesting we don't need this variable? How else would we know when to increment the counter?

Contributor: So I had a thorough look at the code, and I think you don't actually need this boolean. Simply observe the metric where you're setting this boolean to true.

Member Author: I don't want to observe the metric unless we actually changed the ProvisionStopped condition to True due to a final ProvisionFailed. If I don't use this boolean then, for example, we'll increment it any time we change any of the ClusterInstall* conditions. We don't want that.
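A condensed sketch of the gating pattern being described, stitched together from the hunks below (names follow the surrounding diff; not a verbatim excerpt):

    provisionFailedTerminal := false

    // Set the flag only when this specific condition actually changed...
    conditions, updated = controllerutils.SetClusterDeploymentConditionWithChangeCheck(conditions,
        hivev1.ProvisionStoppedCondition,
        stopped.Status,
        // ...remaining reason/message/update-check arguments as in the hunk below...
    )
    if updated {
        statusModified = true
        provisionFailedTerminal = true
    }

    // ...other ClusterInstall* condition updates may also set statusModified, but never the flag...

    if statusModified {
        if err := r.Status().Update(context.TODO(), cd); err != nil {
            return reconcile.Result{}, err
        }
        // ...so the counter moves only after the terminal failure has actually been persisted.
        if provisionFailedTerminal {
            incProvisionFailedTerminal(cd)
        }
    }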

conditions, updated = controllerutils.SetClusterDeploymentConditionWithChangeCheck(conditions,
hivev1.ProvisionStoppedCondition,
stopped.Status,
@@ -149,6 +152,7 @@ func (r *ReconcileClusterDeployment) reconcileExistingInstallingClusterInstall(c
)
if updated {
statusModified = true
provisionFailedTerminal = true
}

completed = controllerutils.FindClusterDeploymentCondition(conditions, hivev1.ClusterInstallCompletedClusterDeploymentCondition)
@@ -196,6 +200,10 @@ func (r *ReconcileClusterDeployment) reconcileExistingInstallingClusterInstall(c
logger.WithError(err).Error("failed to update the status of clusterdeployment")
return reconcile.Result{}, err
}
// If we declared the provision terminally failed, bump our metric
if provisionFailedTerminal {
incProvisionFailedTerminal(cd)
Contributor: On a related note, you don't actually need to observe it in metrics reconcile; you can fire the metric.observe right here (make it a global variable and set it here).

Member Author: I made incProvisionFailedTerminal a func because it was more than 1LOC and I needed to call it from multiple places.

}
}

return reconcile.Result{}, nil
31 changes: 10 additions & 21 deletions pkg/controller/clusterdeployment/clusterprovisions.go
@@ -52,14 +52,14 @@ func (r *ReconcileClusterDeployment) startNewProvision(

r.deleteStaleProvisions(existingProvisions, logger)

if cd.Status.InstallRestarts > 0 && cd.Annotations[tryInstallOnceAnnotation] == "true" {
logger.Debug("not creating new provision since the deployment is set to try install only once")
setProvisionStoppedTrue := func(reason, message string) (reconcile.Result, error) {
logger.Debug("not creating new provision: %s", message)
conditions, changed := controllerutils.SetClusterDeploymentConditionWithChangeCheck(
cd.Status.Conditions,
hivev1.ProvisionStoppedCondition,
corev1.ConditionTrue,
installOnlyOnceSetReason,
"Deployment is set to try install only once",
reason,
message,
controllerutils.UpdateConditionIfReasonOrMessageChange)
if changed {
cd.Status.Conditions = conditions
@@ -68,27 +68,16 @@
logger.WithError(err).Log(controllerutils.LogLevel(err), "failed to update cluster deployment status")
return reconcile.Result{}, err
}
incProvisionFailedTerminal(cd)
}
return reconcile.Result{}, nil
}

if cd.Status.InstallRestarts > 0 && cd.Annotations[tryInstallOnceAnnotation] == "true" {
return setProvisionStoppedTrue(installOnlyOnceSetReason, "Deployment is set to try install only once")
}
if cd.Spec.InstallAttemptsLimit != nil && cd.Status.InstallRestarts >= int(*cd.Spec.InstallAttemptsLimit) {
logger.Debug("not creating new provision since the install attempts limit has been reached")
conditions, changed := controllerutils.SetClusterDeploymentConditionWithChangeCheck(
cd.Status.Conditions,
hivev1.ProvisionStoppedCondition,
corev1.ConditionTrue,
installAttemptsLimitReachedReason,
"Install attempts limit reached",
controllerutils.UpdateConditionIfReasonOrMessageChange)
if changed {
cd.Status.Conditions = conditions
logger.Debugf("setting ProvisionStoppedCondition to %v", corev1.ConditionTrue)
if err := r.Status().Update(context.TODO(), cd); err != nil {
logger.WithError(err).Log(controllerutils.LogLevel(err), "failed to update cluster deployment status")
return reconcile.Result{}, err
}
}
return reconcile.Result{}, nil
return setProvisionStoppedTrue(installAttemptsLimitReachedReason, "Install attempts limit reached")
}

conditions, changed := controllerutils.SetClusterDeploymentConditionWithChangeCheck(
17 changes: 17 additions & 0 deletions pkg/controller/clusterdeployment/metrics.go
@@ -4,6 +4,8 @@ import (
"github.com/prometheus/client_golang/prometheus"

"sigs.k8s.io/controller-runtime/pkg/metrics"

hivev1 "github.com/openshift/hive/apis/hive/v1"
)

var (
@@ -61,8 +63,22 @@ var (
Buckets: []float64{10, 30, 60, 300, 600, 1200, 1800},
},
)
metricProvisionFailedTerminal = prometheus.NewCounterVec(prometheus.CounterOpts{
Name: "hive_cluster_deployments_provision_failed_terminal_total",
Help: "Counter incremented when a cluster provision has failed and won't be retried.",
},
[]string{"clusterpool_namespacedname"},
Contributor: Wouldn't we want to clear the metric when provision succeeds?

Member Author: The provision will not succeed. We set this when we've failed for the last time.

Contributor: Given how you're logging this metric, it would be deleted only when the hive controller restarts, since it is not attached to anything else. So it should be cleared in the clusterpool controller for the relevant clusterpool.

Member Author: I'm still not following this. I don't see why we should ever clear this metric. I definitely don't see why controller restart should be a significant event. Why should this metric behave any differently from e.g. hive_cluster_deployments_installed_total?

)
)

func incProvisionFailedTerminal(cd *hivev1.ClusterDeployment) {
Contributor: If for every CD the counter is incremented, then it should also be decremented if the relevant clusterdeployment is deleted. I think you should make this metric a gauge and report it every time for how many provisions failed. Note that prometheus doesn't fire a change in metric until it is actually changed, so firing the metrics.Set multiple times for the same value is fine.

Member Author:

> If for every CD the counter is incremented, then it should also be decremented if the relevant clusterdeployment is deleted.

We're using a counter because we want to track how many times pool CDs failed to provision, ever. We'll use prometheus to report things like the rate at which these failures are occurring.

> I think you should make this metric a gauge

We talked about metrics when doing stale CD replacement, which is very similar to this. We discussed using a (separate) gauge metric to indicate the number of stale CDs in a pool at any given time, but ended up deciding YAGNI.

Contributor: Ah! This clarifies things - I had misjudged what the metric was used for.

poolNSName := ""
if poolRef := cd.Spec.ClusterPoolRef; poolRef != nil {
poolNSName = poolRef.Namespace + "/" + poolRef.PoolName
}
metricProvisionFailedTerminal.WithLabelValues(poolNSName).Inc()
}

func init() {
metrics.Registry.MustRegister(metricInstallJobDuration)
metrics.Registry.MustRegister(metricCompletedInstallJobRestarts)
@@ -72,4 +88,5 @@ func init() {
metrics.Registry.MustRegister(metricClustersInstalled)
metrics.Registry.MustRegister(metricClustersDeleted)
metrics.Registry.MustRegister(metricDNSDelaySeconds)
metrics.Registry.MustRegister(metricProvisionFailedTerminal)
}
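For reference, a standalone sketch (not part of this PR) contrasting the CounterVec shape that was merged with the GaugeVec alternative raised in review. With the counter, rate-style questions are answered at query time (e.g. PromQL rate()); a gauge would instead be re-Set each reconcile and drop back down as broken clusters are deleted. The gauge metric name below is hypothetical.

    package main

    import "github.com/prometheus/client_golang/prometheus"

    var (
        // The shape merged here: a monotonic count of terminal provision failures per pool.
        failedTerminalTotal = prometheus.NewCounterVec(prometheus.CounterOpts{
            Name: "hive_cluster_deployments_provision_failed_terminal_total",
            Help: "Counter incremented when a cluster provision has failed and won't be retried.",
        }, []string{"clusterpool_namespacedname"})

        // The reviewer's alternative: a gauge tracking the current number of broken clusters.
        // (Hypothetical metric name; not part of the PR.)
        brokenCurrent = prometheus.NewGaugeVec(prometheus.GaugeOpts{
            Name: "hive_cluster_pool_broken_clusters",
            Help: "Current number of unclaimed cluster deployments considered broken.",
        }, []string{"clusterpool_namespacedname"})
    )

    func main() {
        prometheus.MustRegister(failedTerminalTotal, brokenCurrent)
        failedTerminalTotal.WithLabelValues("hive/my-pool").Inc() // once per terminal failure; only ever goes up
        brokenCurrent.WithLabelValues("hive/my-pool").Set(2)      // re-Set each reconcile; can go back down
    }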
40 changes: 31 additions & 9 deletions pkg/controller/clusterpool/clusterpool_controller.go
@@ -301,7 +301,7 @@ func (r *ReconcileClusterPool) Reconcile(ctx context.Context, request reconcile.
cds.SyncClaimAssignments(r.Client, claims, logger)

origStatus := clp.Status.DeepCopy()
clp.Status.Size = int32(len(cds.Installing()) + len(cds.Assignable()))
clp.Status.Size = int32(len(cds.Installing()) + len(cds.Assignable()) + len(cds.Broken()))
clp.Status.Ready = int32(len(cds.Assignable()))
if !reflect.DeepEqual(origStatus, &clp.Status) {
if err := r.Status().Update(context.Background(), clp); err != nil {
@@ -313,7 +313,7 @@ func (r *ReconcileClusterPool) Reconcile(ctx context.Context, request reconcile.
availableCapacity := math.MaxInt32
if clp.Spec.MaxSize != nil {
availableCapacity = int(*clp.Spec.MaxSize) - cds.Total()
numUnassigned := len(cds.Installing()) + len(cds.Assignable())
numUnassigned := len(cds.Installing()) + len(cds.Assignable()) + len(cds.Broken())
numAssigned := cds.NumAssigned()
if availableCapacity <= 0 {
logger.WithFields(log.Fields{
@@ -329,7 +329,7 @@ func (r *ReconcileClusterPool) Reconcile(ctx context.Context, request reconcile.
}

// reserveSize is the number of clusters that the pool currently has in reserve
reserveSize := len(cds.Installing()) + len(cds.Assignable()) - len(claims.Unassigned())
reserveSize := len(cds.Installing()) + len(cds.Assignable()) + len(cds.Broken()) - len(claims.Unassigned())

if err := assignClustersToClaims(r.Client, claims, cds, logger); err != nil {
logger.WithError(err).Error("error assigning clusters <=> claims")
@@ -364,12 +364,6 @@ func (r *ReconcileClusterPool) Reconcile(ctx context.Context, request reconcile.
"MaxConcurrent": *clp.Spec.MaxConcurrent,
"Available": availableCurrent,
}).Info("Cannot create/delete clusters as max concurrent quota exceeded.")
// If too many, delete some.
case drift > 0:
toDel := minIntVarible(drift, availableCurrent)
if err := r.deleteExcessClusters(cds, toDel, logger); err != nil {
return reconcile.Result{}, err
}
// If too few, create new InstallConfig and ClusterDeployment.
case drift < 0:
if availableCapacity <= 0 {
@@ -383,6 +377,19 @@ func (r *ReconcileClusterPool) Reconcile(ctx context.Context, request reconcile.
log.WithError(err).Error("error adding clusters")
return reconcile.Result{}, err
}
// Delete broken CDs. We put this case after the "addClusters" case because we would rather
// consume our maxConcurrent with additions than deletions. But we put it before the
// "deleteExcessClusters" case because we would rather trim broken clusters than viable ones.
case len(cds.Broken()) > 0:
if err := r.deleteBrokenClusters(cds, availableCurrent, logger); err != nil {
return reconcile.Result{}, err
}
// If too many, delete some.
case drift > 0:
toDel := minIntVarible(drift, availableCurrent)
if err := r.deleteExcessClusters(cds, toDel, logger); err != nil {
return reconcile.Result{}, err
}
Member Author (on lines +387 to +392): Note to reviewers: this block moved down unchanged. Since drift > 0 and drift < 0 are disjoint, this didn't affect existing logic; but it was necessary to be able to inject the new case in the right spot logically.
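A toy illustration of the ordering point (hypothetical values, not Hive code): in an expressionless Go switch only the first true case runs, so the disjoint drift cases can be reordered safely, while the broken-cluster case, which can overlap with drift > 0, must sit above it to take precedence:

    package main

    import "fmt"

    func main() {
        drift, numBroken := 2, 1 // hypothetical: pool is both over size and has a broken cluster
        switch {
        case drift < 0:
            fmt.Println("add clusters") // disjoint with drift > 0, so moving that case below is safe
        case numBroken > 0:
            fmt.Println("delete broken clusters") // overlaps with drift > 0; listed first, so it wins
        case drift > 0:
            fmt.Println("delete excess clusters")
        }
    }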

// Special case for stale CDs: allow deleting one if all CDs are installed.
case drift == 0 && len(cds.Installing()) == 0 && len(cds.Stale()) > 0:
toDelete := cds.Stale()[0]
@@ -710,6 +717,21 @@ func (r *ReconcileClusterPool) deleteExcessClusters(
return nil
}

func (r *ReconcileClusterPool) deleteBrokenClusters(cds *cdCollection, maxToDelete int, logger log.FieldLogger) error {
numToDel := minIntVarible(maxToDelete, len(cds.Broken()))
logger.WithField("numberToDelete", numToDel).Info("deleting broken clusters")
clustersToDelete := make([]*hivev1.ClusterDeployment, numToDel)
copy(clustersToDelete, cds.Broken()[:numToDel])
for _, cd := range clustersToDelete {
logger.WithField("cluster", cd.Name).Info("deleting broken cluster deployment")
if err := cds.Delete(r.Client, cd.Name); err != nil {
logger.WithError(err).Error("error deleting cluster deployment")
return err
}
}
return nil
}

func (r *ReconcileClusterPool) reconcileDeletedPool(pool *hivev1.ClusterPool, logger log.FieldLogger) error {
if !controllerutils.HasFinalizer(pool, finalizer) {
return nil
61 changes: 59 additions & 2 deletions pkg/controller/clusterpool/clusterpool_controller_test.go
@@ -83,7 +83,6 @@ func TestReconcileClusterPool(t *testing.T) {
unclaimedCDBuilder := func(name string) testcd.Builder {
return cdBuilder(name).Options(
testcd.WithUnclaimedClusterPoolReference(testNamespace, testLeasePoolName),
testcd.WithPoolVersion(initialPoolVersion),
)
}

Expand Down Expand Up @@ -260,6 +259,64 @@ func TestReconcileClusterPool(t *testing.T) {
// more quickly. Possible subject of a future optimization.
expectedDeletedClusters: []string{"c1"},
},
{
name: "delete broken unclaimed clusters",
existing: []runtime.Object{
initializedPoolBuilder.Build(testcp.WithSize(4)),
unclaimedCDBuilder("c1").Build(testcd.Broken()),
// ProvisionStopped+Installed doesn't make sense IRL; but we're proving it would
// be counted as broken if it did.
unclaimedCDBuilder("c2").Build(testcd.Broken(), testcd.Installed()),
// Unbroken CDs don't get deleted
unclaimedCDBuilder("c3").Build(),
// ...including assignable ones
unclaimedCDBuilder("c4").Build(testcd.Installed()),
// Claimed CDs don't get deleted
cdBuilder("c5").Build(testcd.Installed(), testcd.WithClusterPoolReference(testNamespace, testLeasePoolName, "test1")),
// ...even if they're broken
cdBuilder("c6").Build(testcd.Broken(), testcd.WithClusterPoolReference(testNamespace, testLeasePoolName, "test2")),
},
expectedTotalClusters: 4,
expectedObservedSize: 4,
// The broken one doesn't get counted as Ready
expectedObservedReady: 1,
expectedDeletedClusters: []string{"c1", "c2"},
expectedAssignedCDs: 2,
},
{
name: "deleting broken clusters is bounded by maxConcurrent",
existing: []runtime.Object{
initializedPoolBuilder.Build(testcp.WithSize(4), testcp.WithMaxConcurrent(2)),
unclaimedCDBuilder("c1").Build(testcd.Broken()),
unclaimedCDBuilder("c2").Build(testcd.Broken()),
unclaimedCDBuilder("c3").Build(testcd.Broken()),
unclaimedCDBuilder("c4").Build(testcd.Broken()),
},
expectedTotalClusters: 2,
expectedObservedSize: 4,
// Note: We can't expectDeletedClusters because we delete broken clusters in arbitrary order
},
{
name: "adding takes precedence over deleting broken clusters",
existing: []runtime.Object{
initializedPoolBuilder.Build(testcp.WithSize(3)),
unclaimedCDBuilder("c1").Build(testcd.Broken()),
unclaimedCDBuilder("c2").Build(),
},
expectedTotalClusters: 3,
expectedObservedSize: 2,
},
{
name: "delete broken clusters first when reducing capacity",
existing: []runtime.Object{
initializedPoolBuilder.Build(testcp.WithSize(1)),
unclaimedCDBuilder("c1").Build(testcd.Broken()),
unclaimedCDBuilder("c2").Build(),
},
expectedTotalClusters: 1,
expectedObservedSize: 2,
expectedDeletedClusters: []string{"c1"},
},
{
name: "create all clusters",
existing: []runtime.Object{
@@ -1272,7 +1329,7 @@ func TestReconcileClusterPool(t *testing.T) {
assert.Equal(t, test.expectedAssignedCDs, actualAssignedCDs, "unexpected number of assigned CDs")
assert.Equal(t, test.expectedTotalClusters-test.expectedAssignedCDs, actualUnassignedCDs, "unexpected number of unassigned CDs")
assert.Equal(t, test.expectedRunning, actualRunning, "unexpected number of running CDs")
assert.Equal(t, test.expectedTotalClusters-test.expectedRunning, actualHibernating, "unexpected number of assigned CDs")
assert.Equal(t, test.expectedTotalClusters-test.expectedRunning, actualHibernating, "unexpected number of hibernating CDs")
Member Author: Latent copypasta.


pool := &hivev1.ClusterPool{}
err = fakeClient.Get(context.Background(), client.ObjectKey{Namespace: testNamespace, Name: testLeasePoolName}, pool)
30 changes: 28 additions & 2 deletions pkg/controller/clusterpool/collections.go
@@ -196,6 +196,8 @@ type cdCollection struct {
installing []*hivev1.ClusterDeployment
// Clusters with a DeletionTimestamp. Mutually exclusive with markedForDeletion.
deleting []*hivev1.ClusterDeployment
// Clusters we've declared unusable (e.g. provision failed terminally).
broken []*hivev1.ClusterDeployment
// Clusters with the ClusterClaimRemoveClusterAnnotation. Mutually exclusive with deleting.
markedForDeletion []*hivev1.ClusterDeployment
// Clusters with a missing or empty pool version annotation
@@ -208,6 +210,20 @@ type cdCollection struct {
byClaimName map[string]*hivev1.ClusterDeployment
}

func isBroken(cd *hivev1.ClusterDeployment) bool {
cond := controllerutils.FindClusterDeploymentCondition(cd.Status.Conditions, hivev1.ProvisionStoppedCondition)
if cond == nil {
// Since we should be initializing conditions, this probably means the CD is super fresh.
// Don't declare it broken yet -- give it a chance to come to life.
return false
}
if cond.Status == corev1.ConditionTrue {
return true
}
// Other causes of broken-ness to be added here later. E.g. resume failed.
return false
}

// getAllClusterDeploymentsForPool is the constructor for a cdCollection
// comprising all the ClusterDeployments created for the specified ClusterPool.
func getAllClusterDeploymentsForPool(c client.Client, pool *hivev1.ClusterPool, poolVersion string, logger log.FieldLogger) (*cdCollection, error) {
Expand All @@ -221,6 +237,7 @@ func getAllClusterDeploymentsForPool(c client.Client, pool *hivev1.ClusterPool,
assignable: make([]*hivev1.ClusterDeployment, 0),
installing: make([]*hivev1.ClusterDeployment, 0),
deleting: make([]*hivev1.ClusterDeployment, 0),
broken: make([]*hivev1.ClusterDeployment, 0),
unknownPoolVersion: make([]*hivev1.ClusterDeployment, 0),
mismatchedPoolVersion: make([]*hivev1.ClusterDeployment, 0),
byCDName: make(map[string]*hivev1.ClusterDeployment),
@@ -246,7 +263,9 @@ func getAllClusterDeploymentsForPool(c client.Client, pool *hivev1.ClusterPool,
// Do *not* double count "deleting" and "marked for deletion"
cdCol.markedForDeletion = append(cdCol.markedForDeletion, ref)
} else if claimName == "" {
if cd.Spec.Installed {
if isBroken(&cd) {
cdCol.broken = append(cdCol.broken, ref)
} else if cd.Spec.Installed {
cdCol.assignable = append(cdCol.assignable, ref)
} else {
cdCol.installing = append(cdCol.installing, ref)
@@ -306,6 +325,8 @@ func getAllClusterDeploymentsForPool(c client.Client, pool *hivev1.ClusterPool,
"deleting": len(cdCol.deleting),
"installing": len(cdCol.installing),
"unclaimed": len(cdCol.installing) + len(cdCol.assignable),
"stale": len(cdCol.unknownPoolVersion) + len(cdCol.mismatchedPoolVersion),
"broken": len(cdCol.broken),
}).Debug("found clusters for ClusterPool")
return &cdCol, nil
}
@@ -351,6 +372,11 @@ func (cds *cdCollection) Installing() []*hivev1.ClusterDeployment {
return cds.installing
}

// Broken returns the list of ClusterDeployments we've deemed unrecoverably broken.
func (cds *cdCollection) Broken() []*hivev1.ClusterDeployment {
return cds.broken
}

// UnknownPoolVersion returns the list of ClusterDeployments whose pool version annotation is
// missing or empty.
func (cds *cdCollection) UnknownPoolVersion() []*hivev1.ClusterDeployment {
@@ -455,7 +481,7 @@ func (cds *cdCollection) Delete(c client.Client, cdName string) error {
// Remove from any of the other lists it might be in
removeCDsFromSlice(&cds.assignable, cdName)
removeCDsFromSlice(&cds.installing, cdName)
removeCDsFromSlice(&cds.assignable, cdName)
removeCDsFromSlice(&cds.broken, cdName)
Member Author (@2uasimojo, Sep 1, 2021): Note: this was previously a dup of L482, mistake made in 3dc3c3c / #1484.

removeCDsFromSlice(&cds.unknownPoolVersion, cdName)
removeCDsFromSlice(&cds.mismatchedPoolVersion, cdName)
removeCDsFromSlice(&cds.markedForDeletion, cdName)
9 changes: 9 additions & 0 deletions pkg/test/clusterdeployment/clusterdeployment.go
@@ -3,6 +3,7 @@ package clusterdeployment
import (
"time"

v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"

Expand Down Expand Up @@ -116,6 +117,14 @@ func WithCondition(cond hivev1.ClusterDeploymentCondition) Option {
}
}

// Broken uses ProvisionStopped=True to make the CD be recognized as broken.
func Broken() Option {
return WithCondition(hivev1.ClusterDeploymentCondition{
Type: hivev1.ProvisionStoppedCondition,
Status: v1.ConditionTrue,
})
}

func WithUnclaimedClusterPoolReference(namespace, poolName string) Option {
return WithClusterPoolReference(namespace, poolName, "")
}