3 changes: 3 additions & 0 deletions apis/hive/v1/clusterdeployment_types.go
@@ -252,6 +252,9 @@ type ClusterPoolReference struct {
// ClaimName is the name of the ClusterClaim that claimed the cluster from the pool.
// +optional
ClaimName string `json:"claimName,omitempty"`
// ClaimedTimestamp is the time this cluster was assigned to a ClusterClaim. This is only used for
// ClusterDeployments belonging to ClusterPools.
ClaimedTimestamp *metav1.Time `json:"claimedTimestamp,omitempty"`
}

// ClusterMetadata contains metadata information about the installed cluster.
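
The code path that populates `ClaimedTimestamp` is not included in this excerpt. As a rough illustration of the field's intent only — the `assignClaim` helper below is hypothetical, not part of this PR — the claim-assignment logic would be expected to stamp the time when it binds a pool cluster to a claim:

```go
package example

import (
	"context"

	hivev1 "github.com/openshift/hive/apis/hive/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

// assignClaim is a hypothetical sketch (not from this PR) showing the intent of
// ClaimedTimestamp: record when a pool cluster is handed to a ClusterClaim so
// later logic can measure time since assignment.
func assignClaim(ctx context.Context, c client.Client, cd *hivev1.ClusterDeployment, claim *hivev1.ClusterClaim) error {
	now := metav1.Now()
	cd.Spec.ClusterPoolRef.ClaimName = claim.Name
	cd.Spec.ClusterPoolRef.ClaimedTimestamp = &now
	return c.Update(ctx, cd)
}
```
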
6 changes: 6 additions & 0 deletions apis/hive/v1/clusterpool_types.go
@@ -21,6 +21,12 @@ type ClusterPoolSpec struct {
// +required
Size int32 `json:"size"`

// RunningCount is the number of clusters we should keep running. The remainder will be kept hibernated until claimed.
// By default no clusters will be kept running (all will be hibernated).
// +kubebuilder:validation:Minimum=0
// +optional
RunningCount int32 `json:"runningCount,omitempty"`

// MaxSize is the maximum number of clusters that will be provisioned including clusters that have been claimed
// and ones waiting to be used.
// By default there is no limit.
6 changes: 6 additions & 0 deletions config/crds/hive.openshift.io_clusterdeployments.yaml
@@ -181,6 +181,12 @@ spec:
description: ClaimName is the name of the ClusterClaim that claimed
the cluster from the pool.
type: string
claimedTimestamp:
description: ClaimedTimestamp is the time this cluster was assigned
to a ClusterClaim. This is only used for ClusterDeployments
belonging to ClusterPools.
format: date-time
type: string
namespace:
description: Namespace is the namespace where the ClusterPool
resides.
7 changes: 7 additions & 0 deletions config/crds/hive.openshift.io_clusterpools.yaml
@@ -463,6 +463,13 @@ spec:
TODO: Add other useful fields. apiVersion, kind, uid?'
type: string
type: object
runningCount:
description: RunningCount is the number of clusters we should keep
running. The remainder will be kept hibernated until claimed. By
default no clusters will be kept running (all will be hibernated).
format: int32
minimum: 0
type: integer
size:
description: Size is the default number of clusters that we should
keep provisioned and waiting for use.
11 changes: 6 additions & 5 deletions docs/clusterpools.md
@@ -24,12 +24,12 @@ the pool. `ClusterClaims` must be created in the same namespace as their
The user who claims a cluster can be given RBAC to their cluster's namespace to
prevent anyone else from being able to access it.

Presently once a `ClusterDeployment` is ready, it will be
By default once a `ClusterDeployment` is ready, it will be
[hibernated](./hibernating-clusters.md) automatically. Once claimed it will be
automatically resumed, meaning that the typical time to claim a cluster and be
ready to go is in the 2-5 minute range while the cluster starts up. An option
to keep the clusters in the pool always running so they can instantly be used
may be added in the future.
ready to go is in the 2-5 minute range while the cluster starts up. You can
keep a subset of clusters active by setting `ClusterPool.Spec.RunningCount`;
such clusters will be ready immediately when claimed.

When done with a cluster, users can just delete their `ClusterClaim` and the
`ClusterDeployment` will be automatically deprovisioned. An optional
@@ -67,7 +67,8 @@ spec:
region: us-east-1
pullSecretRef:
name: hive-team-pull-secret
size: 1
runningCount: 1
size: 3
```
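
As a usage sketch (not part of this diff; the pool name, namespace, and count below are placeholders), `runningCount` can be adjusted on an existing pool with a merge patch:

```bash
oc patch clusterpool.hive.openshift.io my-pool -n my-pool-namespace \
  --type=merge -p '{"spec":{"runningCount":2}}'
```

On the next reconcile the controller keeps the oldest unclaimed clusters running, up to the new count, and hibernates the rest (see `reconcileRunningClusters` in the controller changes below).
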

## Sample Cluster Claim
13 changes: 13 additions & 0 deletions hack/app-sre/saas-template.yaml
@@ -434,6 +434,12 @@ objects:
description: ClaimName is the name of the ClusterClaim that
claimed the cluster from the pool.
type: string
claimedTimestamp:
description: ClaimedTimestamp is the time this cluster was assigned
to a ClusterClaim. This is only used for ClusterDeployments
belonging to ClusterPools.
format: date-time
type: string
namespace:
description: Namespace is the namespace where the ClusterPool
resides.
@@ -2143,6 +2149,13 @@ objects:
TODO: Add other useful fields. apiVersion, kind, uid?'
type: string
type: object
runningCount:
description: RunningCount is the number of clusters we should keep
running. The remainder will be kept hibernated until claimed.
By default no clusters will be kept running (all will be hibernated).
format: int32
minimum: 0
type: integer
size:
description: Size is the default number of clusters that we should
keep provisioned and waiting for use.
72 changes: 60 additions & 12 deletions pkg/controller/clusterpool/clusterpool_controller.go
@@ -5,6 +5,7 @@ import (
"fmt"
"math"
"reflect"
"sort"

"github.com/pkg/errors"
log "github.com/sirupsen/logrus"
@@ -370,10 +371,7 @@ func (r *ReconcileClusterPool) Reconcile(ctx context.Context, request reconcile.
break
}
toAdd := minIntVarible(-drift, availableCapacity, availableCurrent)
// TODO: Do this in cdLookup so we can add the new CDs to Installing(). For now it's okay
// because the new CDs are guaranteed to have a matching PoolVersion, and that's the only
// thing we need to keep consistent.
if err := r.addClusters(clp, poolVersion, toAdd, logger); err != nil {
if err := r.addClusters(clp, poolVersion, cds, toAdd, logger); err != nil {
log.WithError(err).Error("error adding clusters")
return reconcile.Result{}, err
}
@@ -402,6 +400,11 @@ func (r *ReconcileClusterPool) Reconcile(ctx context.Context, request reconcile.
metricStaleClusterDeploymentsDeleted.WithLabelValues(clp.Namespace, clp.Name).Inc()
}

if err := r.reconcileRunningClusters(clp, cds, logger); err != nil {
log.WithError(err).Error("error updating hibernating/running state")
return reconcile.Result{}, err
}

// One more (possible) status update: wait until the end to detect whether all unassigned CDs
// are current with the pool config, since assigning or deleting CDs may eliminate the last
// one that isn't.
@@ -418,6 +421,47 @@ func (r *ReconcileClusterPool) Reconcile(ctx context.Context, request reconcile.
return reconcile.Result{}, nil
}

// reconcileRunningClusters ensures the oldest pool.spec.runningCount unassigned clusters are
// set to running, and the remainder are set to hibernating.
func (r *ReconcileClusterPool) reconcileRunningClusters(
clp *hivev1.ClusterPool,
cds *cdCollection,
logger log.FieldLogger,
) error {
runningCount := int(clp.Spec.RunningCount)
cdList := append(cds.Assignable(), cds.Installing()...)
// Sort by age, oldest first
sort.Slice(
cdList,
func(i, j int) bool {
return cdList[i].CreationTimestamp.Before(&cdList[j].CreationTimestamp)
},
)
for i := 0; i < len(cdList); i++ {
cd := cdList[i]
var desiredPowerState hivev1.ClusterPowerState
if i < runningCount {
desiredPowerState = hivev1.RunningClusterPowerState
} else {
desiredPowerState = hivev1.HibernatingClusterPowerState
}
if cd.Spec.PowerState == desiredPowerState {
continue
}
cd.Spec.PowerState = desiredPowerState
contextLogger := logger.WithFields(log.Fields{
"cluster": cd.Name,
"newPowerState": desiredPowerState,
})
contextLogger.Info("Changing cluster powerState to satisfy runningCount")
if err := r.Update(context.Background(), cd); err != nil {
contextLogger.WithError(err).Error("could not update powerState")
return err
}
}
return nil
}

// calculatePoolVersion computes a hash of the important (to ClusterDeployments) fields of the
// ClusterPool.Spec. This is annotated on ClusterDeployments when the pool creates them, which
// subsequently allows us to tell whether all unclaimed CDs are up to date and set a condition
@@ -546,6 +590,7 @@ func (r *ReconcileClusterPool) applyRoleBinding(desired, observed *rbacv1.RoleBi
func (r *ReconcileClusterPool) addClusters(
clp *hivev1.ClusterPool,
poolVersion string,
cds *cdCollection,
newClusterCount int,
logger log.FieldLogger,
) error {
@@ -585,9 +630,11 @@ func (r *ReconcileClusterPool) addClusters(
}

for i := 0; i < newClusterCount; i++ {
if err := r.createCluster(clp, cloudBuilder, pullSecret, installConfigTemplate, poolVersion, logger); err != nil {
cd, err := r.createCluster(clp, cloudBuilder, pullSecret, installConfigTemplate, poolVersion, logger)
if err != nil {
return err
}
cds.RegisterNewCluster(cd)
}

return nil
@@ -600,11 +647,11 @@ func (r *ReconcileClusterPool) createCluster(
installConfigTemplate string,
poolVersion string,
logger log.FieldLogger,
) error {
) (*hivev1.ClusterDeployment, error) {
ns, err := r.createRandomNamespace(clp)
if err != nil {
logger.WithError(err).Error("error obtaining random namespace")
return err
return nil, err
}
logger.WithField("cluster", ns.Name).Info("Creating new cluster")

@@ -637,32 +684,33 @@ func (r *ReconcileClusterPool) createCluster(

objs, err := builder.Build()
if err != nil {
return errors.Wrap(err, "error building resources")
return nil, errors.Wrap(err, "error building resources")
}

poolKey := types.NamespacedName{Namespace: clp.Namespace, Name: clp.Name}.String()
r.expectations.ExpectCreations(poolKey, 1)
var cd *hivev1.ClusterDeployment
// Add the ClusterPoolRef to the ClusterDeployment, and move it to the end of the slice.
for i, obj := range objs {
cd, ok := obj.(*hivev1.ClusterDeployment)
var ok bool
cd, ok = obj.(*hivev1.ClusterDeployment)
if !ok {
continue
}
poolRef := poolReference(clp)
cd.Spec.ClusterPoolRef = &poolRef
cd.Spec.PowerState = hivev1.HibernatingClusterPowerState
lastIndex := len(objs) - 1
objs[i], objs[lastIndex] = objs[lastIndex], objs[i]
}
// Create the resources.
for _, obj := range objs {
if err := r.Client.Create(context.Background(), obj.(client.Object)); err != nil {
r.expectations.CreationObserved(poolKey)
return err
return nil, err
}
}

return nil
return cd, nil
}

func (r *ReconcileClusterPool) createRandomNamespace(clp *hivev1.ClusterPool) (*corev1.Namespace, error) {
121 changes: 121 additions & 0 deletions pkg/controller/clusterpool/clusterpool_controller_test.go
@@ -1251,6 +1251,127 @@ func TestReconcileClusterPool(t *testing.T) {
expectedObservedSize: 2,
expectedTotalClusters: 3,
},
{
name: "runningCount < size",
existing: []runtime.Object{
initializedPoolBuilder.Build(
testcp.WithSize(4),
testcp.WithRunningCount(2),
),
},
expectedTotalClusters: 4,
expectedRunning: 2,
},
{
name: "runningCount == size",
existing: []runtime.Object{
initializedPoolBuilder.Build(
testcp.WithSize(4),
testcp.WithRunningCount(4),
),
},
expectedTotalClusters: 4,
expectedRunning: 4,
},
{
name: "runningCount > size",
existing: []runtime.Object{
initializedPoolBuilder.Build(
testcp.WithSize(4),
testcp.WithRunningCount(400),
),
},
expectedTotalClusters: 4,
expectedRunning: 4,
},
{
name: "runningCount restored after deletion",
existing: []runtime.Object{
initializedPoolBuilder.Build(
testcp.WithSize(4),
testcp.WithRunningCount(2),
),
unclaimedCDBuilder("c1").Build(testcd.Generic(testgeneric.Deleted()), testcd.WithPowerState(hivev1.RunningClusterPowerState)),
unclaimedCDBuilder("c2").Build(testcd.WithPowerState(hivev1.RunningClusterPowerState)),
unclaimedCDBuilder("c3").Build(),
unclaimedCDBuilder("c4").Build(),
},
expectedObservedSize: 3,
expectedTotalClusters: 5,
expectedRunning: 3,
},
{
name: "runningCount restored after claim",
existing: []runtime.Object{
initializedPoolBuilder.Build(
testcp.WithSize(4),
testcp.WithRunningCount(2),
),
unclaimedCDBuilder("c1").Build(testcd.WithPowerState(hivev1.RunningClusterPowerState), testcd.Installed()),
unclaimedCDBuilder("c2").Build(testcd.WithPowerState(hivev1.RunningClusterPowerState), testcd.Installed()),
unclaimedCDBuilder("c3").Build(testcd.Installed()),
unclaimedCDBuilder("c4").Build(testcd.Installed()),
testclaim.FullBuilder(testNamespace, "test-claim", scheme).Build(testclaim.WithPool(testLeasePoolName)),
},
expectedObservedSize: 4,
expectedObservedReady: 4,
expectedTotalClusters: 5,
expectedRunning: 3,
expectedAssignedCDs: 1,
expectedAssignedClaims: 1,
},
{
name: "pool size > number of claims > runningCount",
existing: []runtime.Object{
initializedPoolBuilder.Build(
testcp.WithSize(4),
testcp.WithRunningCount(2),
),
unclaimedCDBuilder("c1").Build(testcd.WithPowerState(hivev1.RunningClusterPowerState), testcd.Installed()),
unclaimedCDBuilder("c2").Build(testcd.WithPowerState(hivev1.RunningClusterPowerState), testcd.Installed()),
unclaimedCDBuilder("c3").Build(testcd.Installed()),
unclaimedCDBuilder("c4").Build(testcd.Installed()),
testclaim.FullBuilder(testNamespace, "test-claim1", scheme).Build(testclaim.WithPool(testLeasePoolName)),
testclaim.FullBuilder(testNamespace, "test-claim2", scheme).Build(testclaim.WithPool(testLeasePoolName)),
testclaim.FullBuilder(testNamespace, "test-claim3", scheme).Build(testclaim.WithPool(testLeasePoolName)),
},
expectedObservedSize: 4,
expectedObservedReady: 4,
expectedTotalClusters: 7,
expectedRunning: 5,
expectedAssignedCDs: 3,
expectedAssignedClaims: 3,
},
{
name: "number of claims > pool size > runningCount",
existing: []runtime.Object{
initializedPoolBuilder.Build(
testcp.WithSize(4),
testcp.WithRunningCount(2),
),
unclaimedCDBuilder("c1").Build(testcd.WithPowerState(hivev1.RunningClusterPowerState), testcd.Installed()),
unclaimedCDBuilder("c2").Build(testcd.WithPowerState(hivev1.RunningClusterPowerState), testcd.Installed()),
unclaimedCDBuilder("c3").Build(testcd.Installed()),
unclaimedCDBuilder("c4").Build(testcd.Installed()),
testclaim.FullBuilder(testNamespace, "test-claim1", scheme).Build(testclaim.WithPool(testLeasePoolName)),
testclaim.FullBuilder(testNamespace, "test-claim2", scheme).Build(testclaim.WithPool(testLeasePoolName)),
testclaim.FullBuilder(testNamespace, "test-claim3", scheme).Build(testclaim.WithPool(testLeasePoolName)),
testclaim.FullBuilder(testNamespace, "test-claim4", scheme).Build(testclaim.WithPool(testLeasePoolName)),
testclaim.FullBuilder(testNamespace, "test-claim5", scheme).Build(testclaim.WithPool(testLeasePoolName)),
},
expectedObservedSize: 4,
expectedObservedReady: 4,
expectedTotalClusters: 9,
// The four original pool CDs got claimed. Two were already running; we started the other two.
// We create five new CDs to satisfy the pool size plus the additional claim.
// Of those, we start two for runningCount, and hibernate the rest.
// (Intuitively, perhaps the one for the excess claim should start off in running state.
// But that's not how the logic works today. Should it?)
expectedRunning: 6,
expectedAssignedCDs: 4,
expectedAssignedClaims: 4,
expectedUnassignedClaims: 1,
},
}

for _, test := range tests {