diff --git a/apis/hive/v1/clusterdeployment_types.go b/apis/hive/v1/clusterdeployment_types.go index f037b04f7f3..4d3ec7f6e90 100644 --- a/apis/hive/v1/clusterdeployment_types.go +++ b/apis/hive/v1/clusterdeployment_types.go @@ -252,6 +252,9 @@ type ClusterPoolReference struct { // ClaimName is the name of the ClusterClaim that claimed the cluster from the pool. // +optional ClaimName string `json:"claimName,omitempty"` + // ClaimedTimestamp is the time this cluster was assigned to a ClusterClaim. This is only used for + // ClusterDeployments belonging to ClusterPools. + ClaimedTimestamp *metav1.Time `json:"claimedTimestamp,omitempty"` } // ClusterMetadata contains metadata information about the installed cluster. diff --git a/apis/hive/v1/clusterpool_types.go b/apis/hive/v1/clusterpool_types.go index ffc8cc3a8bf..20fe500080d 100644 --- a/apis/hive/v1/clusterpool_types.go +++ b/apis/hive/v1/clusterpool_types.go @@ -21,6 +21,12 @@ type ClusterPoolSpec struct { // +required Size int32 `json:"size"` + // RunningCount is the number of clusters we should keep running. The remainder will be kept hibernated until claimed. + // By default no clusters will be kept running (all will be hibernated). + // +kubebuilder:validation:Minimum=0 + // +optional + RunningCount int32 `json:"runningCount,omitempty"` + // MaxSize is the maximum number of clusters that will be provisioned including clusters that have been claimed // and ones waiting to be used. // By default there is no limit. diff --git a/config/crds/hive.openshift.io_clusterdeployments.yaml b/config/crds/hive.openshift.io_clusterdeployments.yaml index 9e504a3df97..2ef6283444f 100644 --- a/config/crds/hive.openshift.io_clusterdeployments.yaml +++ b/config/crds/hive.openshift.io_clusterdeployments.yaml @@ -181,6 +181,12 @@ spec: description: ClaimName is the name of the ClusterClaim that claimed the cluster from the pool. type: string + claimedTimestamp: + description: ClaimedTimestamp is the time this cluster was assigned + to a ClusterClaim. This is only used for ClusterDeployments + belonging to ClusterPools. + format: date-time + type: string namespace: description: Namespace is the namespace where the ClusterPool resides. diff --git a/config/crds/hive.openshift.io_clusterpools.yaml b/config/crds/hive.openshift.io_clusterpools.yaml index 0ab5af062e3..f57d86053ed 100644 --- a/config/crds/hive.openshift.io_clusterpools.yaml +++ b/config/crds/hive.openshift.io_clusterpools.yaml @@ -463,6 +463,13 @@ spec: TODO: Add other useful fields. apiVersion, kind, uid?' type: string type: object + runningCount: + description: RunningCount is the number of clusters we should keep + running. The remainder will be kept hibernated until claimed. By + default no clusters will be kept running (all will be hibernated). + format: int32 + minimum: 0 + type: integer size: description: Size is the default number of clusters that we should keep provisioned and waiting for use. diff --git a/docs/clusterpools.md b/docs/clusterpools.md index 555fb2e92d3..8bb1da26e12 100644 --- a/docs/clusterpools.md +++ b/docs/clusterpools.md @@ -24,12 +24,12 @@ the pool. `ClusterClaims` must be created in the same namespace as their The user who claims a cluster can be given RBAC to their clusters namespace to prevent anyone else from being able to access it. -Presently once a `ClusterDeployment` is ready, it will be +By default once a `ClusterDeployment` is ready, it will be [hibernated](./hibernating-clusters.md) automatically. 
Once claimed it will be automatically resumed, meaning that the typical time to claim a cluster and be -ready to go is in the 2-5 minute range while the cluster starts up. An option -to keep the clusters in the pool always running so they can instantly be used -may be added in the future. +ready to go is in the 2-5 minute range while the cluster starts up. You can +keep a subset of clusters active by setting `ClusterPool.Spec.RunningCount`; +such clusters will be ready immediately when claimed. When done with a cluster, users can just delete their `ClusterClaim` and the `ClusterDeployment` will be automatically deprovisioned. An optional @@ -67,7 +67,8 @@ spec: region: us-east-1 pullSecretRef: name: hive-team-pull-secret - size: 1 + runningCount: 1 + size: 3 ``` ## Sample Cluster Claim diff --git a/hack/app-sre/saas-template.yaml b/hack/app-sre/saas-template.yaml index 128edfffe46..6d27a7bd0ef 100644 --- a/hack/app-sre/saas-template.yaml +++ b/hack/app-sre/saas-template.yaml @@ -434,6 +434,12 @@ objects: description: ClaimName is the name of the ClusterClaim that claimed the cluster from the pool. type: string + claimedTimestamp: + description: ClaimedTimestamp is the time this cluster was assigned + to a ClusterClaim. This is only used for ClusterDeployments + belonging to ClusterPools. + format: date-time + type: string namespace: description: Namespace is the namespace where the ClusterPool resides. @@ -2143,6 +2149,13 @@ objects: TODO: Add other useful fields. apiVersion, kind, uid?' type: string type: object + runningCount: + description: RunningCount is the number of clusters we should keep + running. The remainder will be kept hibernated until claimed. + By default no clusters will be kept running (all will be hibernated). + format: int32 + minimum: 0 + type: integer size: description: Size is the default number of clusters that we should keep provisioned and waiting for use. diff --git a/pkg/controller/clusterpool/clusterpool_controller.go b/pkg/controller/clusterpool/clusterpool_controller.go index 037646dc844..da08c2781ca 100644 --- a/pkg/controller/clusterpool/clusterpool_controller.go +++ b/pkg/controller/clusterpool/clusterpool_controller.go @@ -5,6 +5,7 @@ import ( "fmt" "math" "reflect" + "sort" "github.com/pkg/errors" log "github.com/sirupsen/logrus" @@ -370,10 +371,7 @@ func (r *ReconcileClusterPool) Reconcile(ctx context.Context, request reconcile. break } toAdd := minIntVarible(-drift, availableCapacity, availableCurrent) - // TODO: Do this in cdLookup so we can add the new CDs to Installing(). For now it's okay - // because the new CDs are guaranteed to have a matching PoolVersion, and that's the only - // thing we need to keep consistent. - if err := r.addClusters(clp, poolVersion, toAdd, logger); err != nil { + if err := r.addClusters(clp, poolVersion, cds, toAdd, logger); err != nil { log.WithError(err).Error("error adding clusters") return reconcile.Result{}, err } @@ -402,6 +400,11 @@ func (r *ReconcileClusterPool) Reconcile(ctx context.Context, request reconcile. metricStaleClusterDeploymentsDeleted.WithLabelValues(clp.Namespace, clp.Name).Inc() } + if err := r.reconcileRunningClusters(clp, cds, logger); err != nil { + log.WithError(err).Error("error updating hibernating/running state") + return reconcile.Result{}, err + } + // One more (possible) status update: wait until the end to detect whether all unassigned CDs // are current with the pool config, since assigning or deleting CDs may eliminate the last // one that isn't. 
@@ -418,6 +421,47 @@ func (r *ReconcileClusterPool) Reconcile(ctx context.Context, request reconcile. return reconcile.Result{}, nil } +// reconcileRunningClusters ensures the oldest pool.spec.runningCount unassigned clusters are +// set to running, and the remainder are set to hibernating. +func (r *ReconcileClusterPool) reconcileRunningClusters( + clp *hivev1.ClusterPool, + cds *cdCollection, + logger log.FieldLogger, +) error { + runningCount := int(clp.Spec.RunningCount) + cdList := append(cds.Assignable(), cds.Installing()...) + // Sort by age, oldest first + sort.Slice( + cdList, + func(i, j int) bool { + return cdList[i].CreationTimestamp.Before(&cdList[j].CreationTimestamp) + }, + ) + for i := 0; i < len(cdList); i++ { + cd := cdList[i] + var desiredPowerState hivev1.ClusterPowerState + if i < runningCount { + desiredPowerState = hivev1.RunningClusterPowerState + } else { + desiredPowerState = hivev1.HibernatingClusterPowerState + } + if cd.Spec.PowerState == desiredPowerState { + continue + } + cd.Spec.PowerState = desiredPowerState + contextLogger := logger.WithFields(log.Fields{ + "cluster": cd.Name, + "newPowerState": desiredPowerState, + }) + contextLogger.Info("Changing cluster powerState to satisfy runningCount") + if err := r.Update(context.Background(), cd); err != nil { + contextLogger.WithError(err).Error("could not update powerState") + return err + } + } + return nil +} + // calculatePoolVersion computes a hash of the important (to ClusterDeployments) fields of the // ClusterPool.Spec. This is annotated on ClusterDeployments when the pool creates them, which // subsequently allows us to tell whether all unclaimed CDs are up to date and set a condition @@ -546,6 +590,7 @@ func (r *ReconcileClusterPool) applyRoleBinding(desired, observed *rbacv1.RoleBi func (r *ReconcileClusterPool) addClusters( clp *hivev1.ClusterPool, poolVersion string, + cds *cdCollection, newClusterCount int, logger log.FieldLogger, ) error { @@ -585,9 +630,11 @@ func (r *ReconcileClusterPool) addClusters( } for i := 0; i < newClusterCount; i++ { - if err := r.createCluster(clp, cloudBuilder, pullSecret, installConfigTemplate, poolVersion, logger); err != nil { + cd, err := r.createCluster(clp, cloudBuilder, pullSecret, installConfigTemplate, poolVersion, logger) + if err != nil { return err } + cds.RegisterNewCluster(cd) } return nil @@ -600,11 +647,11 @@ func (r *ReconcileClusterPool) createCluster( installConfigTemplate string, poolVersion string, logger log.FieldLogger, -) error { +) (*hivev1.ClusterDeployment, error) { ns, err := r.createRandomNamespace(clp) if err != nil { logger.WithError(err).Error("error obtaining random namespace") - return err + return nil, err } logger.WithField("cluster", ns.Name).Info("Creating new cluster") @@ -637,20 +684,21 @@ func (r *ReconcileClusterPool) createCluster( objs, err := builder.Build() if err != nil { - return errors.Wrap(err, "error building resources") + return nil, errors.Wrap(err, "error building resources") } poolKey := types.NamespacedName{Namespace: clp.Namespace, Name: clp.Name}.String() r.expectations.ExpectCreations(poolKey, 1) + var cd *hivev1.ClusterDeployment // Add the ClusterPoolRef to the ClusterDeployment, and move it to the end of the slice. 
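+	// (Creating the ClusterDeployment last -- hence the swap to the end of objs below -- presumably
+	// ensures the secrets and other resources it references already exist by the time the
+	// clusterdeployment controller first sees it.)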
for i, obj := range objs { - cd, ok := obj.(*hivev1.ClusterDeployment) + var ok bool + cd, ok = obj.(*hivev1.ClusterDeployment) if !ok { continue } poolRef := poolReference(clp) cd.Spec.ClusterPoolRef = &poolRef - cd.Spec.PowerState = hivev1.HibernatingClusterPowerState lastIndex := len(objs) - 1 objs[i], objs[lastIndex] = objs[lastIndex], objs[i] } @@ -658,11 +706,11 @@ func (r *ReconcileClusterPool) createCluster( for _, obj := range objs { if err := r.Client.Create(context.Background(), obj.(client.Object)); err != nil { r.expectations.CreationObserved(poolKey) - return err + return nil, err } } - return nil + return cd, nil } func (r *ReconcileClusterPool) createRandomNamespace(clp *hivev1.ClusterPool) (*corev1.Namespace, error) { diff --git a/pkg/controller/clusterpool/clusterpool_controller_test.go b/pkg/controller/clusterpool/clusterpool_controller_test.go index 1903132d6f5..22af48b099b 100644 --- a/pkg/controller/clusterpool/clusterpool_controller_test.go +++ b/pkg/controller/clusterpool/clusterpool_controller_test.go @@ -1251,6 +1251,127 @@ func TestReconcileClusterPool(t *testing.T) { expectedObservedSize: 2, expectedTotalClusters: 3, }, + { + name: "runningCount < size", + existing: []runtime.Object{ + initializedPoolBuilder.Build( + testcp.WithSize(4), + testcp.WithRunningCount(2), + ), + }, + expectedTotalClusters: 4, + expectedRunning: 2, + }, + { + name: "runningCount == size", + existing: []runtime.Object{ + initializedPoolBuilder.Build( + testcp.WithSize(4), + testcp.WithRunningCount(4), + ), + }, + expectedTotalClusters: 4, + expectedRunning: 4, + }, + { + name: "runningCount > size", + existing: []runtime.Object{ + initializedPoolBuilder.Build( + testcp.WithSize(4), + testcp.WithRunningCount(400), + ), + }, + expectedTotalClusters: 4, + expectedRunning: 4, + }, + { + name: "runningCount restored after deletion", + existing: []runtime.Object{ + initializedPoolBuilder.Build( + testcp.WithSize(4), + testcp.WithRunningCount(2), + ), + unclaimedCDBuilder("c1").Build(testcd.Generic(testgeneric.Deleted()), testcd.WithPowerState(hivev1.RunningClusterPowerState)), + unclaimedCDBuilder("c2").Build(testcd.WithPowerState(hivev1.RunningClusterPowerState)), + unclaimedCDBuilder("c3").Build(), + unclaimedCDBuilder("c4").Build(), + }, + expectedObservedSize: 3, + expectedTotalClusters: 5, + expectedRunning: 3, + }, + { + name: "runningCount restored after claim", + existing: []runtime.Object{ + initializedPoolBuilder.Build( + testcp.WithSize(4), + testcp.WithRunningCount(2), + ), + unclaimedCDBuilder("c1").Build(testcd.WithPowerState(hivev1.RunningClusterPowerState), testcd.Installed()), + unclaimedCDBuilder("c2").Build(testcd.WithPowerState(hivev1.RunningClusterPowerState), testcd.Installed()), + unclaimedCDBuilder("c3").Build(testcd.Installed()), + unclaimedCDBuilder("c4").Build(testcd.Installed()), + testclaim.FullBuilder(testNamespace, "test-claim", scheme).Build(testclaim.WithPool(testLeasePoolName)), + }, + expectedObservedSize: 4, + expectedObservedReady: 4, + expectedTotalClusters: 5, + expectedRunning: 3, + expectedAssignedCDs: 1, + expectedAssignedClaims: 1, + }, + { + name: "pool size > number of claims > runningCount", + existing: []runtime.Object{ + initializedPoolBuilder.Build( + testcp.WithSize(4), + testcp.WithRunningCount(2), + ), + unclaimedCDBuilder("c1").Build(testcd.WithPowerState(hivev1.RunningClusterPowerState), testcd.Installed()), + unclaimedCDBuilder("c2").Build(testcd.WithPowerState(hivev1.RunningClusterPowerState), testcd.Installed()), + 
unclaimedCDBuilder("c3").Build(testcd.Installed()), + unclaimedCDBuilder("c4").Build(testcd.Installed()), + testclaim.FullBuilder(testNamespace, "test-claim1", scheme).Build(testclaim.WithPool(testLeasePoolName)), + testclaim.FullBuilder(testNamespace, "test-claim2", scheme).Build(testclaim.WithPool(testLeasePoolName)), + testclaim.FullBuilder(testNamespace, "test-claim3", scheme).Build(testclaim.WithPool(testLeasePoolName)), + }, + expectedObservedSize: 4, + expectedObservedReady: 4, + expectedTotalClusters: 7, + expectedRunning: 5, + expectedAssignedCDs: 3, + expectedAssignedClaims: 3, + }, + { + name: "number of claims > pool size > runningCount", + existing: []runtime.Object{ + initializedPoolBuilder.Build( + testcp.WithSize(4), + testcp.WithRunningCount(2), + ), + unclaimedCDBuilder("c1").Build(testcd.WithPowerState(hivev1.RunningClusterPowerState), testcd.Installed()), + unclaimedCDBuilder("c2").Build(testcd.WithPowerState(hivev1.RunningClusterPowerState), testcd.Installed()), + unclaimedCDBuilder("c3").Build(testcd.Installed()), + unclaimedCDBuilder("c4").Build(testcd.Installed()), + testclaim.FullBuilder(testNamespace, "test-claim1", scheme).Build(testclaim.WithPool(testLeasePoolName)), + testclaim.FullBuilder(testNamespace, "test-claim2", scheme).Build(testclaim.WithPool(testLeasePoolName)), + testclaim.FullBuilder(testNamespace, "test-claim3", scheme).Build(testclaim.WithPool(testLeasePoolName)), + testclaim.FullBuilder(testNamespace, "test-claim4", scheme).Build(testclaim.WithPool(testLeasePoolName)), + testclaim.FullBuilder(testNamespace, "test-claim5", scheme).Build(testclaim.WithPool(testLeasePoolName)), + }, + expectedObservedSize: 4, + expectedObservedReady: 4, + expectedTotalClusters: 9, + // The four original pool CDs got claimed. Two were already running; we started the other two. + // We create five new CDs to satisfy the pool size plus the additional claim. + // Of those, we start two for runningCount, and hibernate the rest. + // (Intuitively, perhaps the one for the excess claim should start off in running state. + // But that's not how the logic works today. Should it?) + expectedRunning: 6, + expectedAssignedCDs: 4, + expectedAssignedClaims: 4, + expectedUnassignedClaims: 1, + }, } for _, test := range tests { diff --git a/pkg/controller/clusterpool/collections.go b/pkg/controller/clusterpool/collections.go index d4add31178d..376e6c1b874 100644 --- a/pkg/controller/clusterpool/collections.go +++ b/pkg/controller/clusterpool/collections.go @@ -10,6 +10,7 @@ import ( log "github.com/sirupsen/logrus" corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" utilerrors "k8s.io/apimachinery/pkg/util/errors" "sigs.k8s.io/controller-runtime/pkg/client" @@ -224,6 +225,29 @@ func isBroken(cd *hivev1.ClusterDeployment) bool { return false } +func isRunning(cd *hivev1.ClusterDeployment) bool { + // TODO: Change this to use the Running condition + cond := controllerutils.FindClusterDeploymentCondition(cd.Status.Conditions, hivev1.ClusterHibernatingCondition) + if cond == nil { + // Since we should be initializing conditions, this probably means the CD is super fresh + // and quite unlikely to be running. (That said, this really should never happen, since we + // only check this for Installed clusters.) 
+ return false + } + return cond.Reason == hivev1.RunningHibernationReason +} + +func (cds *cdCollection) sortInstalling() { + // Sort installing CDs so we prioritize deleting those that are furthest away from completing + // their installation (prioritizing preserving those that will be assignable the soonest). + sort.Slice( + cds.installing, + func(i, j int) bool { + return cds.installing[i].CreationTimestamp.After(cds.installing[j].CreationTimestamp.Time) + }, + ) +} + // getAllClusterDeploymentsForPool is the constructor for a cdCollection // comprising all the ClusterDeployments created for the specified ClusterPool. func getAllClusterDeploymentsForPool(c client.Client, pool *hivev1.ClusterPool, poolVersion string, logger log.FieldLogger) (*cdCollection, error) { @@ -243,6 +267,7 @@ func getAllClusterDeploymentsForPool(c client.Client, pool *hivev1.ClusterPool, byCDName: make(map[string]*hivev1.ClusterDeployment), byClaimName: make(map[string]*hivev1.ClusterDeployment), } + running := 0 for i, cd := range cdList.Items { poolRef := cd.Spec.ClusterPoolRef if poolRef == nil || poolRef.Namespace != pool.Namespace || poolRef.PoolName != pool.Name { @@ -267,6 +292,9 @@ func getAllClusterDeploymentsForPool(c client.Client, pool *hivev1.ClusterPool, cdCol.broken = append(cdCol.broken, ref) } else if cd.Spec.Installed { cdCol.assignable = append(cdCol.assignable, ref) + if isRunning(&cd) { + running++ + } } else { cdCol.installing = append(cdCol.installing, ref) } @@ -297,14 +325,7 @@ func getAllClusterDeploymentsForPool(c client.Client, pool *hivev1.ClusterPool, return cdCol.assignable[i].CreationTimestamp.Before(&cdCol.assignable[j].CreationTimestamp) }, ) - // Sort installing CDs so we prioritize deleting those that are furthest away from completing - // their installation (prioritizing preserving those that will be assignable the soonest). - sort.Slice( - cdCol.installing, - func(i, j int) bool { - return cdCol.installing[i].CreationTimestamp.After(cdCol.installing[j].CreationTimestamp.Time) - }, - ) + cdCol.sortInstalling() // Sort stale CDs by age so we delete the oldest first sort.Slice( cdCol.unknownPoolVersion, @@ -325,6 +346,7 @@ func getAllClusterDeploymentsForPool(c client.Client, pool *hivev1.ClusterPool, "deleting": len(cdCol.deleting), "installing": len(cdCol.installing), "unclaimed": len(cdCol.installing) + len(cdCol.assignable), + "running": running, "stale": len(cdCol.unknownPoolVersion) + len(cdCol.mismatchedPoolVersion), "broken": len(cdCol.broken), }).Debug("found clusters for ClusterPool") @@ -395,6 +417,13 @@ func (cds *cdCollection) Stale() []*hivev1.ClusterDeployment { return append(cds.unknownPoolVersion, cds.mismatchedPoolVersion...) } +// RegisterNewCluster adds a freshly-created cluster to the cdCollection, assuming it is installing. +func (cds *cdCollection) RegisterNewCluster(cd *hivev1.ClusterDeployment) { + cds.byCDName[cd.Name] = cd + cds.installing = append(cds.installing, cd) + cds.sortInstalling() +} + // Assign assigns the specified ClusterDeployment to the specified claim, updating its spec on the // server. Errors from the update are bubbled up. Returns an error if the CD is already assigned // (to *any* claim). The CD must be from the Assignable() list; otherwise it is an error. 
@@ -407,6 +436,9 @@ func (cds *cdCollection) Assign(c client.Client, cd *hivev1.ClusterDeployment, c if cdi.Name == cd.Name { // Update the spec cdi.Spec.ClusterPoolRef.ClaimName = claim.Name + now := metav1.Now() + cdi.Spec.ClusterPoolRef.ClaimedTimestamp = &now + // This may be redundant if we already did it to satisfy runningCount; but no harm. cdi.Spec.PowerState = hivev1.RunningClusterPowerState if err := c.Update(context.Background(), cdi); err != nil { return err @@ -602,6 +634,7 @@ func assignClustersToClaims(c client.Client, claims *claimCollection, cds *cdCol claimList := make([]*hivev1.ClusterClaim, numToAssign) copy(claimList, claims.Unassigned()) cdList := make([]*hivev1.ClusterDeployment, numToAssign) + // Assignable is sorted by age, oldest first. copy(cdList, cds.Assignable()) var errs []error for i := 0; i < numToAssign; i++ { diff --git a/pkg/controller/hibernation/hibernation_controller.go b/pkg/controller/hibernation/hibernation_controller.go index 019a6ae8e6b..e01def02a93 100644 --- a/pkg/controller/hibernation/hibernation_controller.go +++ b/pkg/controller/hibernation/hibernation_controller.go @@ -224,21 +224,45 @@ func (r *hibernationReconciler) Reconcile(ctx context.Context, request reconcile } } - // Check if HibernateAfter is set, and if the cluster has been in running state for longer than this duration, put it to sleep. - if cd.Spec.HibernateAfter != nil && cd.Spec.PowerState != hivev1.HibernatingClusterPowerState { + // Check if HibernateAfter is set, and put it to sleep if necessary. + // As the baseline timestamp for determining whether HibernateAfter should trigger, we use the latest of: + // - When the cluster finished installing (status.installedTimestamp) + // - When the cluster was claimed (spec.clusterPoolRef.claimedTimestamp -- ClusterPool CDs only) + // - The last time the cluster resumed (status.conditions[Hibernating].lastTransitionTime if not hibernating (but see TODO)) + // BUT pool clusters wait until they're claimed for HibernateAfter to have effect. + poolRef := cd.Spec.ClusterPoolRef + isUnclaimedPoolCluster := poolRef != nil && poolRef.PoolName != "" && + // Upgrade note: If we hit this code path on a CD that was claimed before upgrading to + // where we introduced ClaimedTimestamp, then that CD was Hibernating when it was claimed + // (because that's the same time we introduced ClusterPool.RunningCount) so it's safe to + // just use installed/last-resumed as the baseline for hibernateAfter. + (poolRef.ClaimName == "" || poolRef.ClaimedTimestamp == nil) + if cd.Spec.HibernateAfter != nil && cd.Spec.PowerState != hivev1.HibernatingClusterPowerState && !isUnclaimedPoolCluster { hibernateAfterDur := cd.Spec.HibernateAfter.Duration - runningSince := cd.Status.InstalledTimestamp.Time - hibLog := cdLog.WithFields(log.Fields{ - "runningSince": runningSince, - "hibernateAfter": hibernateAfterDur, - }) + hibLog := cdLog.WithField("hibernateAfter", hibernateAfterDur) + + // The nil values of each of these will make them "earliest" and therefore unused. 
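+		// ("nil" above really means the time.Time zero value; since the zero value predates any real
+		// timestamp, an unset baseline can never win the latest-timestamp sort further down.)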
+ var installedSince, runningSince, claimedSince time.Time var isRunning bool + installedSince = cd.Status.InstalledTimestamp.Time + hibLog = hibLog.WithField("installedSince", installedSince) + + if poolRef != nil && poolRef.ClaimedTimestamp != nil { + claimedSince = poolRef.ClaimedTimestamp.Time + hibLog = hibLog.WithField("claimedSince", claimedSince) + } else { + // This means it's not a pool cluster (!isUnclaimedPoolCluster && poolRef == nil) + hibLog.Debug("cluster does not belong to a clusterpool") + } + cond := controllerutils.FindClusterDeploymentCondition(cd.Status.Conditions, hivev1.ClusterHibernatingCondition) - if cond.Status == corev1.ConditionUnknown { - hibLog.Debug("cluster has never been hibernated, using installed time") + if cond == nil || cond.Status == corev1.ConditionUnknown { + hibLog.Debug("cluster has never been hibernated") isRunning = true } else if cond.Status == corev1.ConditionFalse { + // TODO: This seems wrong. Shouldn't we start the clock from when we declared the + // cluster running, rather than when we first asked it to resume? runningSince = cond.LastTransitionTime.Time hibLog = hibLog.WithField("runningSince", runningSince) hibLog.WithField("reason", cond.Reason).Debug("hibernating condition false") @@ -246,7 +270,13 @@ func (r *hibernationReconciler) Reconcile(ctx context.Context, request reconcile } if isRunning { - expiry := runningSince.Add(hibernateAfterDur) + // Which timestamp should we use to calculate HibernateAfter from? + // Sort our timestamps in descending order and use the first (latest) one. + stamps := []time.Time{installedSince, claimedSince, runningSince} + sort.Slice(stamps, func(i, j int) bool { + return stamps[i].After(stamps[j]) + }) + expiry := stamps[0].Add(hibernateAfterDur) hibLog.Debugf("cluster should be hibernating after: %s", expiry) if time.Now().After(expiry) { hibLog.WithField("expiry", expiry).Debug("cluster has been running longer than hibernate-after duration, moving to hibernating powerState") diff --git a/pkg/test/clusterdeployment/clusterdeployment.go b/pkg/test/clusterdeployment/clusterdeployment.go index 4eb238d99f1..bec683a8c4f 100644 --- a/pkg/test/clusterdeployment/clusterdeployment.go +++ b/pkg/test/clusterdeployment/clusterdeployment.go @@ -136,6 +136,8 @@ func WithClusterPoolReference(namespace, poolName, claimName string) Option { PoolName: poolName, ClaimName: claimName, } + now := metav1.Now() + clusterDeployment.Spec.ClusterPoolRef.ClaimedTimestamp = &now } } diff --git a/pkg/test/clusterpool/clusterpool.go b/pkg/test/clusterpool/clusterpool.go index 6dcabc7570c..985a9a71426 100644 --- a/pkg/test/clusterpool/clusterpool.go +++ b/pkg/test/clusterpool/clusterpool.go @@ -172,3 +172,9 @@ func WithCondition(cond hivev1.ClusterPoolCondition) Option { clusterPool.Status.Conditions = append(clusterPool.Status.Conditions, cond) } } + +func WithRunningCount(size int) Option { + return func(clusterPool *hivev1.ClusterPool) { + clusterPool.Spec.RunningCount = int32(size) + } +} diff --git a/vendor/github.com/openshift/hive/apis/hive/v1/clusterdeployment_types.go b/vendor/github.com/openshift/hive/apis/hive/v1/clusterdeployment_types.go index f037b04f7f3..4d3ec7f6e90 100644 --- a/vendor/github.com/openshift/hive/apis/hive/v1/clusterdeployment_types.go +++ b/vendor/github.com/openshift/hive/apis/hive/v1/clusterdeployment_types.go @@ -252,6 +252,9 @@ type ClusterPoolReference struct { // ClaimName is the name of the ClusterClaim that claimed the cluster from the pool. 
// +optional ClaimName string `json:"claimName,omitempty"` + // ClaimedTimestamp is the time this cluster was assigned to a ClusterClaim. This is only used for + // ClusterDeployments belonging to ClusterPools. + ClaimedTimestamp *metav1.Time `json:"claimedTimestamp,omitempty"` } // ClusterMetadata contains metadata information about the installed cluster. diff --git a/vendor/github.com/openshift/hive/apis/hive/v1/clusterpool_types.go b/vendor/github.com/openshift/hive/apis/hive/v1/clusterpool_types.go index ffc8cc3a8bf..20fe500080d 100644 --- a/vendor/github.com/openshift/hive/apis/hive/v1/clusterpool_types.go +++ b/vendor/github.com/openshift/hive/apis/hive/v1/clusterpool_types.go @@ -21,6 +21,12 @@ type ClusterPoolSpec struct { // +required Size int32 `json:"size"` + // RunningCount is the number of clusters we should keep running. The remainder will be kept hibernated until claimed. + // By default no clusters will be kept running (all will be hibernated). + // +kubebuilder:validation:Minimum=0 + // +optional + RunningCount int32 `json:"runningCount,omitempty"` + // MaxSize is the maximum number of clusters that will be provisioned including clusters that have been claimed // and ones waiting to be used. // By default there is no limit.