diff --git a/pkg/controller/clusterpool/clusterpool_controller.go b/pkg/controller/clusterpool/clusterpool_controller.go index da08c2781ca..ee87b4d1d62 100644 --- a/pkg/controller/clusterpool/clusterpool_controller.go +++ b/pkg/controller/clusterpool/clusterpool_controller.go @@ -332,6 +332,20 @@ func (r *ReconcileClusterPool) Reconcile(ctx context.Context, request reconcile. // reserveSize is the number of clusters that the pool currently has in reserve reserveSize := len(cds.Installing()) + len(cds.Assignable()) + len(cds.Broken()) - len(claims.Unassigned()) + // excessSize is the number of clusters in excess of the pool's capacity (Size) that we are + // creating to satisfy unassigned claims. For example: + // - In steady state, if Size=3 and we have 5 unassigned claims, excessSize is 2. + // - If Size=3, and we only have 2 clusters in the pool, and we have 6 unassigned claims, + // excessSize is 4: the two existing clusters satisfy two of the claims, leaving four; + // we need three to refill the pool and another four to fulfil the remaining unassigned + // claims. + // - Any time the number of unassigned claims is less than the number of clusters in the + // pool, we will be able to satisfy them from the pool, so excessSize is 0. + excessSize := 0 + if reserveSize < 0 { + excessSize = -reserveSize + } + if err := assignClustersToClaims(r.Client, claims, cds, logger); err != nil { logger.WithError(err).Error("error assigning clusters <=> claims") return reconcile.Result{}, err @@ -400,7 +414,7 @@ func (r *ReconcileClusterPool) Reconcile(ctx context.Context, request reconcile. metricStaleClusterDeploymentsDeleted.WithLabelValues(clp.Namespace, clp.Name).Inc() } - if err := r.reconcileRunningClusters(clp, cds, logger); err != nil { + if err := r.reconcileRunningClusters(clp, cds, excessSize, logger); err != nil { log.WithError(err).Error("error updating hibernating/running state") return reconcile.Result{}, err } @@ -426,9 +440,13 @@ func (r *ReconcileClusterPool) Reconcile(ctx context.Context, request reconcile. func (r *ReconcileClusterPool) reconcileRunningClusters( clp *hivev1.ClusterPool, cds *cdCollection, + excessCount int, logger log.FieldLogger, ) error { - runningCount := int(clp.Spec.RunningCount) + // If we're creating excess clusters to satisfy unassigned claims, add that many + // to the runningCount. They'll get snatched up immediately, bringing the number + // of running clusters back down to runningCount once the pool reaches steady state. + runningCount := int(clp.Spec.RunningCount) + excessCount cdList := append(cds.Assignable(), cds.Installing()...) // Sort by age, oldest first sort.Slice( diff --git a/pkg/controller/clusterpool/clusterpool_controller_test.go b/pkg/controller/clusterpool/clusterpool_controller_test.go index 22af48b099b..3c4ee71db9e 100644 --- a/pkg/controller/clusterpool/clusterpool_controller_test.go +++ b/pkg/controller/clusterpool/clusterpool_controller_test.go @@ -1169,6 +1169,8 @@ func TestReconcileClusterPool(t *testing.T) { expectedTotalClusters: 5, // The assignments don't happen until a subsequent reconcile after the CDs are ready expectedUnassignedClaims: 3, + // Even though runningCount is zero, we run enough clusters to fulfill the excess claims + expectedRunning: 3, }, { name: "zero size pool", @@ -1179,6 +1181,8 @@ func TestReconcileClusterPool(t *testing.T) { expectedTotalClusters: 1, // The assignments don't happen until a subsequent reconcile after the CDs are ready expectedUnassignedClaims: 1, + // Even though runningCount is zero, we run enough clusters to fulfill the excess claims + expectedRunning: 1, }, { name: "no CDs match pool version", @@ -1364,10 +1368,8 @@ func TestReconcileClusterPool(t *testing.T) { expectedTotalClusters: 9, // The four original pool CDs got claimed. Two were already running; we started the other two. // We create five new CDs to satisfy the pool size plus the additional claim. - // Of those, we start two for runningCount, and hibernate the rest. - // (Intuitively, perhaps the one for the excess claim should start off in running state. - // But that's not how the logic works today. Should it?) - expectedRunning: 6, + // Of those, we start two for runningCount, and one more for the excess claim. + expectedRunning: 7, expectedAssignedCDs: 4, expectedAssignedClaims: 4, expectedUnassignedClaims: 1,