3 changes: 3 additions & 0 deletions apis/hive/v1/clusterdeployment_types.go
@@ -252,6 +252,9 @@ type ClusterPoolReference struct {
// ClaimName is the name of the ClusterClaim that claimed the cluster from the pool.
// +optional
ClaimName string `json:"claimName,omitempty"`
// ClaimedTimestamp is the time this cluster was assigned to a ClusterClaim. This is only used for
// ClusterDeployments belonging to ClusterPools.
ClaimedTimestamp *metav1.Time `json:"claimedTimestamp,omitempty"`
}

// ClusterMetadata contains metadata information about the installed cluster.
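
The code path that populates `ClaimedTimestamp` is not included in this excerpt. As a rough illustration of the field's intent only — the `assignClaim` helper below is hypothetical, not part of this PR — the claim-assignment logic would be expected to stamp the time when it binds a pool cluster to a claim:

```go
package example

import (
	"context"

	hivev1 "github.com/openshift/hive/apis/hive/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

// assignClaim is a hypothetical sketch (not from this PR) showing the intent of
// ClaimedTimestamp: record when a pool cluster is handed to a ClusterClaim so
// later logic can measure time since assignment.
func assignClaim(ctx context.Context, c client.Client, cd *hivev1.ClusterDeployment, claim *hivev1.ClusterClaim) error {
	now := metav1.Now()
	cd.Spec.ClusterPoolRef.ClaimName = claim.Name
	cd.Spec.ClusterPoolRef.ClaimedTimestamp = &now
	return c.Update(ctx, cd)
}
```
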
6 changes: 6 additions & 0 deletions apis/hive/v1/clusterpool_types.go
@@ -21,6 +21,12 @@ type ClusterPoolSpec struct {
// +required
Size int32 `json:"size"`

// RunningCount is the number of clusters we should keep running. The remainder will be kept hibernated until claimed.
// By default no clusters will be kept running (all will be hibernated).
// +kubebuilder:validation:Minimum=0
// +optional
RunningCount int32 `json:"runningCount,omitempty"`

// MaxSize is the maximum number of clusters that will be provisioned including clusters that have been claimed
// and ones waiting to be used.
// By default there is no limit.
6 changes: 6 additions & 0 deletions config/crds/hive.openshift.io_clusterdeployments.yaml
@@ -181,6 +181,12 @@ spec:
description: ClaimName is the name of the ClusterClaim that claimed
the cluster from the pool.
type: string
claimedTimestamp:
description: ClaimedTimestamp is the time this cluster was assigned
to a ClusterClaim. This is only used for ClusterDeployments
belonging to ClusterPools.
format: date-time
type: string
namespace:
description: Namespace is the namespace where the ClusterPool
resides.
7 changes: 7 additions & 0 deletions config/crds/hive.openshift.io_clusterpools.yaml
@@ -463,6 +463,13 @@ spec:
TODO: Add other useful fields. apiVersion, kind, uid?'
type: string
type: object
runningCount:
description: RunningCount is the number of clusters we should keep
running. The remainder will be kept hibernated until claimed. By
default no clusters will be kept running (all will be hibernated).
format: int32
minimum: 0
type: integer
size:
description: Size is the default number of clusters that we should
keep provisioned and waiting for use.
11 changes: 6 additions & 5 deletions docs/clusterpools.md
@@ -24,12 +24,12 @@ the pool. `ClusterClaims` must be created in the same namespace as their
The user who claims a cluster can be given RBAC to their cluster's namespace to
prevent anyone else from being able to access it.

Presently once a `ClusterDeployment` is ready, it will be
By default once a `ClusterDeployment` is ready, it will be
[hibernated](./hibernating-clusters.md) automatically. Once claimed it will be
automatically resumed, meaning that the typical time to claim a cluster and be
ready to go is in the 2-5 minute range while the cluster starts up. An option
to keep the clusters in the pool always running so they can instantly be used
may be added in the future.
ready to go is in the 2-5 minute range while the cluster starts up. You can
keep a subset of clusters active by setting `ClusterPool.Spec.RunningCount`;
such clusters will be ready immediately when claimed.

When done with a cluster, users can just delete their `ClusterClaim` and the
`ClusterDeployment` will be automatically deprovisioned. An optional
@@ -67,7 +67,8 @@ spec:
region: us-east-1
pullSecretRef:
name: hive-team-pull-secret
size: 1
runningCount: 1
size: 3
```
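
As a usage sketch (not part of this diff; the pool name, namespace, and count below are placeholders), `runningCount` can be adjusted on an existing pool with a merge patch:

```bash
oc patch clusterpool.hive.openshift.io my-pool -n my-pool-namespace \
  --type=merge -p '{"spec":{"runningCount":2}}'
```

On the next reconcile the controller keeps the oldest unclaimed clusters running, up to the new count, and hibernates the rest (see `reconcileRunningClusters` in the controller changes below).
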

## Sample Cluster Claim
13 changes: 13 additions & 0 deletions hack/app-sre/saas-template.yaml
@@ -434,6 +434,12 @@ objects:
description: ClaimName is the name of the ClusterClaim that
claimed the cluster from the pool.
type: string
claimedTimestamp:
description: ClaimedTimestamp is the time this cluster was assigned
to a ClusterClaim. This is only used for ClusterDeployments
belonging to ClusterPools.
format: date-time
type: string
namespace:
description: Namespace is the namespace where the ClusterPool
resides.
@@ -2143,6 +2149,13 @@ objects:
TODO: Add other useful fields. apiVersion, kind, uid?'
type: string
type: object
runningCount:
description: RunningCount is the number of clusters we should keep
running. The remainder will be kept hibernated until claimed.
By default no clusters will be kept running (all will be hibernated).
format: int32
minimum: 0
type: integer
size:
description: Size is the default number of clusters that we should
keep provisioned and waiting for use.
72 changes: 60 additions & 12 deletions pkg/controller/clusterpool/clusterpool_controller.go
@@ -5,6 +5,7 @@ import (
"fmt"
"math"
"reflect"
"sort"

"github.com/pkg/errors"
log "github.com/sirupsen/logrus"
@@ -370,10 +371,7 @@ func (r *ReconcileClusterPool) Reconcile(ctx context.Context, request reconcile.
break
}
toAdd := minIntVarible(-drift, availableCapacity, availableCurrent)
// TODO: Do this in cdLookup so we can add the new CDs to Installing(). For now it's okay
// because the new CDs are guaranteed to have a matching PoolVersion, and that's the only
// thing we need to keep consistent.
if err := r.addClusters(clp, poolVersion, toAdd, logger); err != nil {
if err := r.addClusters(clp, poolVersion, cds, toAdd, logger); err != nil {
log.WithError(err).Error("error adding clusters")
return reconcile.Result{}, err
}
@@ -402,6 +400,11 @@ func (r *ReconcileClusterPool) Reconcile(ctx context.Context, request reconcile.
metricStaleClusterDeploymentsDeleted.WithLabelValues(clp.Namespace, clp.Name).Inc()
}

if err := r.reconcileRunningClusters(clp, cds, logger); err != nil {
log.WithError(err).Error("error updating hibernating/running state")
return reconcile.Result{}, err
}

// One more (possible) status update: wait until the end to detect whether all unassigned CDs
// are current with the pool config, since assigning or deleting CDs may eliminate the last
// one that isn't.
@@ -418,6 +421,47 @@ func (r *ReconcileClusterPool) Reconcile(ctx context.Context, request reconcile.
return reconcile.Result{}, nil
}

// reconcileRunningClusters ensures the oldest pool.spec.runningCount unassigned clusters are
// set to running, and the remainder are set to hibernating.
func (r *ReconcileClusterPool) reconcileRunningClusters(
clp *hivev1.ClusterPool,
cds *cdCollection,
logger log.FieldLogger,
) error {
runningCount := int(clp.Spec.RunningCount)
cdList := append(cds.Assignable(), cds.Installing()...)
// Sort by age, oldest first
sort.Slice(
cdList,
func(i, j int) bool {
return cdList[i].CreationTimestamp.Before(&cdList[j].CreationTimestamp)
},
)
for i := 0; i < len(cdList); i++ {
cd := cdList[i]
var desiredPowerState hivev1.ClusterPowerState
if i < runningCount {
desiredPowerState = hivev1.RunningClusterPowerState
} else {
desiredPowerState = hivev1.HibernatingClusterPowerState
}
if cd.Spec.PowerState == desiredPowerState {
continue
}
cd.Spec.PowerState = desiredPowerState
contextLogger := logger.WithFields(log.Fields{
"cluster": cd.Name,
"newPowerState": desiredPowerState,
})
contextLogger.Info("Changing cluster powerState to satisfy runningCount")
if err := r.Update(context.Background(), cd); err != nil {
contextLogger.WithError(err).Error("could not update powerState")
return err
}
}
return nil
}

// calculatePoolVersion computes a hash of the important (to ClusterDeployments) fields of the
// ClusterPool.Spec. This is annotated on ClusterDeployments when the pool creates them, which
// subsequently allows us to tell whether all unclaimed CDs are up to date and set a condition
@@ -546,6 +590,7 @@ func (r *ReconcileClusterPool) applyRoleBinding(desired, observed *rbacv1.RoleBi
func (r *ReconcileClusterPool) addClusters(
clp *hivev1.ClusterPool,
poolVersion string,
cds *cdCollection,
newClusterCount int,
logger log.FieldLogger,
) error {
@@ -585,9 +630,11 @@ func (r *ReconcileClusterPool) addClusters(
}

for i := 0; i < newClusterCount; i++ {
if err := r.createCluster(clp, cloudBuilder, pullSecret, installConfigTemplate, poolVersion, logger); err != nil {
cd, err := r.createCluster(clp, cloudBuilder, pullSecret, installConfigTemplate, poolVersion, logger)
if err != nil {
return err
}
cds.RegisterNewCluster(cd)
}

return nil
@@ -600,11 +647,11 @@ func (r *ReconcileClusterPool) createCluster(
installConfigTemplate string,
poolVersion string,
logger log.FieldLogger,
) error {
) (*hivev1.ClusterDeployment, error) {
ns, err := r.createRandomNamespace(clp)
if err != nil {
logger.WithError(err).Error("error obtaining random namespace")
return err
return nil, err
}
logger.WithField("cluster", ns.Name).Info("Creating new cluster")

@@ -637,32 +684,33 @@ func (r *ReconcileClusterPool) createCluster(

objs, err := builder.Build()
if err != nil {
return errors.Wrap(err, "error building resources")
return nil, errors.Wrap(err, "error building resources")
}

poolKey := types.NamespacedName{Namespace: clp.Namespace, Name: clp.Name}.String()
r.expectations.ExpectCreations(poolKey, 1)
var cd *hivev1.ClusterDeployment
// Add the ClusterPoolRef to the ClusterDeployment, and move it to the end of the slice.
for i, obj := range objs {
cd, ok := obj.(*hivev1.ClusterDeployment)
var ok bool
cd, ok = obj.(*hivev1.ClusterDeployment)
if !ok {
continue
}
poolRef := poolReference(clp)
cd.Spec.ClusterPoolRef = &poolRef
cd.Spec.PowerState = hivev1.HibernatingClusterPowerState
lastIndex := len(objs) - 1
objs[i], objs[lastIndex] = objs[lastIndex], objs[i]
}
// Create the resources.
for _, obj := range objs {
if err := r.Client.Create(context.Background(), obj.(client.Object)); err != nil {
r.expectations.CreationObserved(poolKey)
return err
return nil, err
}
}

return nil
return cd, nil
}

func (r *ReconcileClusterPool) createRandomNamespace(clp *hivev1.ClusterPool) (*corev1.Namespace, error) {
121 changes: 121 additions & 0 deletions pkg/controller/clusterpool/clusterpool_controller_test.go
@@ -1251,6 +1251,127 @@ func TestReconcileClusterPool(t *testing.T) {
expectedObservedSize: 2,
expectedTotalClusters: 3,
},
{
name: "runningCount < size",
existing: []runtime.Object{
initializedPoolBuilder.Build(
testcp.WithSize(4),
testcp.WithRunningCount(2),
),
},
expectedTotalClusters: 4,
expectedRunning: 2,
},
{
name: "runningCount == size",
existing: []runtime.Object{
initializedPoolBuilder.Build(
testcp.WithSize(4),
testcp.WithRunningCount(4),
),
},
expectedTotalClusters: 4,
expectedRunning: 4,
},
{
name: "runningCount > size",
existing: []runtime.Object{
initializedPoolBuilder.Build(
testcp.WithSize(4),
testcp.WithRunningCount(400),
),
},
expectedTotalClusters: 4,
expectedRunning: 4,
},
{
name: "runningCount restored after deletion",
existing: []runtime.Object{
initializedPoolBuilder.Build(
testcp.WithSize(4),
testcp.WithRunningCount(2),
),
unclaimedCDBuilder("c1").Build(testcd.Generic(testgeneric.Deleted()), testcd.WithPowerState(hivev1.RunningClusterPowerState)),
unclaimedCDBuilder("c2").Build(testcd.WithPowerState(hivev1.RunningClusterPowerState)),
unclaimedCDBuilder("c3").Build(),
unclaimedCDBuilder("c4").Build(),
},
expectedObservedSize: 3,
expectedTotalClusters: 5,
expectedRunning: 3,
},
{
name: "runningCount restored after claim",
existing: []runtime.Object{
initializedPoolBuilder.Build(
testcp.WithSize(4),
testcp.WithRunningCount(2),
),
unclaimedCDBuilder("c1").Build(testcd.WithPowerState(hivev1.RunningClusterPowerState), testcd.Installed()),
unclaimedCDBuilder("c2").Build(testcd.WithPowerState(hivev1.RunningClusterPowerState), testcd.Installed()),
unclaimedCDBuilder("c3").Build(testcd.Installed()),
unclaimedCDBuilder("c4").Build(testcd.Installed()),
testclaim.FullBuilder(testNamespace, "test-claim", scheme).Build(testclaim.WithPool(testLeasePoolName)),
},
expectedObservedSize: 4,
expectedObservedReady: 4,
expectedTotalClusters: 5,
expectedRunning: 3,
expectedAssignedCDs: 1,
expectedAssignedClaims: 1,
},
{
name: "pool size > number of claims > runningCount",
existing: []runtime.Object{
initializedPoolBuilder.Build(
testcp.WithSize(4),
testcp.WithRunningCount(2),
),
unclaimedCDBuilder("c1").Build(testcd.WithPowerState(hivev1.RunningClusterPowerState), testcd.Installed()),
unclaimedCDBuilder("c2").Build(testcd.WithPowerState(hivev1.RunningClusterPowerState), testcd.Installed()),
unclaimedCDBuilder("c3").Build(testcd.Installed()),
unclaimedCDBuilder("c4").Build(testcd.Installed()),
testclaim.FullBuilder(testNamespace, "test-claim1", scheme).Build(testclaim.WithPool(testLeasePoolName)),
testclaim.FullBuilder(testNamespace, "test-claim2", scheme).Build(testclaim.WithPool(testLeasePoolName)),
testclaim.FullBuilder(testNamespace, "test-claim3", scheme).Build(testclaim.WithPool(testLeasePoolName)),
},
expectedObservedSize: 4,
expectedObservedReady: 4,
expectedTotalClusters: 7,
expectedRunning: 5,
expectedAssignedCDs: 3,
expectedAssignedClaims: 3,
},
{
name: "number of claims > pool size > runningCount",
existing: []runtime.Object{
initializedPoolBuilder.Build(
testcp.WithSize(4),
testcp.WithRunningCount(2),
),
unclaimedCDBuilder("c1").Build(testcd.WithPowerState(hivev1.RunningClusterPowerState), testcd.Installed()),
unclaimedCDBuilder("c2").Build(testcd.WithPowerState(hivev1.RunningClusterPowerState), testcd.Installed()),
unclaimedCDBuilder("c3").Build(testcd.Installed()),
unclaimedCDBuilder("c4").Build(testcd.Installed()),
testclaim.FullBuilder(testNamespace, "test-claim1", scheme).Build(testclaim.WithPool(testLeasePoolName)),
testclaim.FullBuilder(testNamespace, "test-claim2", scheme).Build(testclaim.WithPool(testLeasePoolName)),
testclaim.FullBuilder(testNamespace, "test-claim3", scheme).Build(testclaim.WithPool(testLeasePoolName)),
testclaim.FullBuilder(testNamespace, "test-claim4", scheme).Build(testclaim.WithPool(testLeasePoolName)),
testclaim.FullBuilder(testNamespace, "test-claim5", scheme).Build(testclaim.WithPool(testLeasePoolName)),
},
expectedObservedSize: 4,
expectedObservedReady: 4,
expectedTotalClusters: 9,
// The four original pool CDs got claimed. Two were already running; we started the other two.
// We create five new CDs to satisfy the pool size plus the additional claim.
// Of those, we start two for runningCount, and hibernate the rest.
// (Intuitively, perhaps the one for the excess claim should start off in running state.
// But that's not how the logic works today. Should it?)
expectedRunning: 6,
expectedAssignedCDs: 4,
expectedAssignedClaims: 4,
expectedUnassignedClaims: 1,
},
}

for _, test := range tests {