Merged
2 changes: 1 addition & 1 deletion Makefile
@@ -33,7 +33,7 @@ else
-e "GO111MODULE=$(GO111MODULE)" \
-e "GOFLAGS=$(GOFLAGS)" \
-e "GOPROXY=$(GOPROXY)" \
-		openshift/origin-release:golang-1.18
+		registry.ci.openshift.org/openshift/release:golang-1.18
IMAGE_BUILD_CMD = $(ENGINE) build
endif

38 changes: 24 additions & 14 deletions pkg/autoscaler/autoscaler.go
@@ -46,7 +46,7 @@ func clusterAutoscalerResource(maxNodesTotal int) *caov1.ClusterAutoscaler {
// and that has high least common multiple to avoid a case
// when a node is considered to be empty even if there are
// pods already scheduled and running on the node.
-	unneededTimeString := "23s"
+	unneededTimeString := "60s"
return &caov1.ClusterAutoscaler{
ObjectMeta: metav1.ObjectMeta{
Name: "default",
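For orientation, here is a minimal sketch of where that string lands; the ScaleDown and ResourceLimits field names are assumptions based on the cluster-autoscaler-operator v1 (caov1) API, not copied from this PR. The autoscaler must see a node as unneeded for this whole duration before scaling it down, so the longer 60s window keeps nodes whose pods are scheduled but not yet running from being reclaimed.

	// Hedged sketch of the resource this helper builds; field names are
	// assumed from the caov1 API rather than shown in this diff.
	unneededTime := "60s"
	maxNodes := int32(maxNodesTotal)
	autoscaler := &caov1.ClusterAutoscaler{
		ObjectMeta: metav1.ObjectMeta{Name: "default"},
		Spec: caov1.ClusterAutoscalerSpec{
			ResourceLimits: &caov1.ResourceLimits{MaxNodesTotal: &maxNodes},
			ScaleDown: &caov1.ScaleDownConfig{
				Enabled:      true,
				UnneededTime: &unneededTime, // node must be unneeded this long before scale-down
			},
		},
	}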
@@ -293,8 +293,9 @@ var _ = Describe("[Feature:Machines] Autoscaler should", func() {
Expect(client.Create(ctx, asr)).Should(Succeed())
cleanupObjects[asr.GetName()] = asr

-			By(fmt.Sprintf("Creating scale-out workload: jobs: %v, memory: %s", expectedReplicas, workloadMemRequest.String()))
-			workload := framework.NewWorkLoad(expectedReplicas, workloadMemRequest, workloadJobName, autoscalingTestLabel, targetedNodeLabel, "")
+			uniqueJobName := fmt.Sprintf("%s-scale-from-zero", workloadJobName)
+			By(fmt.Sprintf("Creating scale-out workload %s: jobs: %v, memory: %s", uniqueJobName, expectedReplicas, workloadMemRequest.String()))
+			workload := framework.NewWorkLoad(expectedReplicas, workloadMemRequest, uniqueJobName, autoscalingTestLabel, targetedNodeLabel, "")
cleanupObjects[workload.GetName()] = workload
Expect(client.Create(ctx, workload)).Should(Succeed())

@@ -353,9 +354,10 @@ var _ = Describe("[Feature:Machines] Autoscaler should", func() {
}

jobReplicas := expectedReplicas * int32(2)
-			By(fmt.Sprintf("Creating scale-out workload: jobs: %v, memory: %s",
-				jobReplicas, workloadMemRequest.String()))
-			workload := framework.NewWorkLoad(jobReplicas, workloadMemRequest, workloadJobName, autoscalingTestLabel, targetedNodeLabel, "")
+			uniqueJobName := fmt.Sprintf("%s-cleanup-after-scale-down", workloadJobName)
+			By(fmt.Sprintf("Creating scale-out workload %s: jobs: %v, memory: %s",
+				uniqueJobName, jobReplicas, workloadMemRequest.String()))
+			workload := framework.NewWorkLoad(jobReplicas, workloadMemRequest, uniqueJobName, autoscalingTestLabel, targetedNodeLabel, "")
cleanupObjects[workload.GetName()] = workload
Expect(client.Create(ctx, workload)).Should(Succeed())

@@ -506,17 +508,21 @@ var _ = Describe("[Feature:Machines] Autoscaler should", func() {
// to the maximum MachineSet size this will create enough demand to
// grow the cluster to maximum size.
jobReplicas := maxMachineSetReplicas
-			By(fmt.Sprintf("Creating scale-out workload: jobs: %v, memory: %s",
-				jobReplicas, workloadMemRequest.String()))
-			workload := framework.NewWorkLoad(jobReplicas, workloadMemRequest, workloadJobName, autoscalingTestLabel, targetedNodeLabel, "")
+			uniqueJobName := fmt.Sprintf("%s-scale-to-maxnodestotal", workloadJobName)
+			By(fmt.Sprintf("Creating scale-out workload %s: jobs: %v, memory: %s",
+				uniqueJobName, jobReplicas, workloadMemRequest.String()))
+			workload := framework.NewWorkLoad(jobReplicas, workloadMemRequest, uniqueJobName, autoscalingTestLabel, targetedNodeLabel, "")
cleanupObjects[workload.GetName()] = workload
Expect(client.Create(ctx, workload)).Should(Succeed())

// At this point the autoscaler should be growing the cluster, we
// wait until the cluster has grown to reach MaxNodesTotal size.
+			// Because the autoscaler will ignore nodes that are not ready or unschedulable,
+			// we need to check against the number of ready nodes in the cluster since
+			// previous tests might have left nodes that are not ready or unschedulable.
By(fmt.Sprintf("Waiting for cluster to scale up to %d nodes", caMaxNodesTotal))
Eventually(func() (bool, error) {
-				nodes, err := framework.GetNodes(client)
+				nodes, err := framework.GetReadyAndSchedulableNodes(client)
return len(nodes) == caMaxNodesTotal, err
}, framework.WaitLong, pollingInterval).Should(BeTrue())

@@ -528,9 +534,12 @@

// Now that the cluster has reached maximum size, we want to ensure
// that it doesn't try to grow larger.
+			// Because the autoscaler will ignore nodes that are not ready or unschedulable,
+			// we need to check against the number of ready nodes in the cluster since
+			// previous tests might have left nodes that are not ready or unschedulable.
By("Watching Cluster node count to ensure it remains consistent")
Consistently(func() (bool, error) {
-				nodes, err := framework.GetNodes(client)
+				nodes, err := framework.GetReadyAndSchedulableNodes(client)
return len(nodes) == caMaxNodesTotal, err
}, framework.WaitShort, pollingInterval).Should(BeTrue())

@@ -599,9 +608,10 @@ var _ = Describe("[Feature:Machines] Autoscaler should", func() {
// expand its size by 2 nodes. the cluster autoscaler should
// place 1 node in each of the 2 MachineSets created.
jobReplicas := int32(4)
-			By(fmt.Sprintf("Creating scale-out workload: jobs: %v, memory: %s",
-				jobReplicas, workloadMemRequest.String()))
-			workload := framework.NewWorkLoad(jobReplicas, workloadMemRequest, workloadJobName, autoscalingTestLabel, targetedNodeLabel, "")
+			uniqueJobName := fmt.Sprintf("%s-balance-nodegroups", workloadJobName)
+			By(fmt.Sprintf("Creating scale-out workload %s: jobs: %v, memory: %s",
+				uniqueJobName, jobReplicas, workloadMemRequest.String()))
+			workload := framework.NewWorkLoad(jobReplicas, workloadMemRequest, uniqueJobName, autoscalingTestLabel, targetedNodeLabel, "")
cleanupObjects[workload.GetName()] = workload
Expect(client.Create(ctx, workload)).Should(Succeed())

13 changes: 7 additions & 6 deletions pkg/framework/framework.go
@@ -29,12 +29,13 @@ const (
RetryMedium = 5 * time.Second
// DefaultMachineSetReplicas is the default number of replicas of a machineset
// if MachineSet.Spec.Replicas field is set to nil
-	DefaultMachineSetReplicas = 0
-	MachinePhaseRunning       = "Running"
-	MachinePhaseFailed        = "Failed"
-	MachineRoleLabel          = "machine.openshift.io/cluster-api-machine-role"
-	MachineTypeLabel          = "machine.openshift.io/cluster-api-machine-type"
-	MachineAnnotationKey      = "machine.openshift.io/machine"
+	DefaultMachineSetReplicas  = 0
+	MachinePhaseRunning        = "Running"
+	MachinePhaseFailed         = "Failed"
+	MachineRoleLabel           = "machine.openshift.io/cluster-api-machine-role"
+	MachineTypeLabel           = "machine.openshift.io/cluster-api-machine-type"
+	MachineAnnotationKey       = "machine.openshift.io/machine"
+	ClusterAPIActuatorPkgTaint = "cluster-api-actuator-pkg"
)

var (
4 changes: 4 additions & 0 deletions pkg/framework/jobs.go
@@ -41,6 +41,10 @@ func NewWorkLoad(njobs int32, memoryRequest resource.Quantity, workloadJobName s
Key: "kubemark",
Operator: corev1.TolerationOpExists,
},
+				{
+					Key:    ClusterAPIActuatorPkgTaint,
+					Effect: corev1.TaintEffectPreferNoSchedule,
+				},
			},
		},
	},

Review comment (Contributor), on the TaintEffectPreferNoSchedule line:
I think we want a NoSchedule rather than prefer; prefer means we might still get random workloads on here, right? You said you had some issues with this, IIRC? Maybe you can expand on those here.

Reply (Contributor Author):
I tried NoSchedule and it did not work as I expected, which is why I backed off to PreferNoSchedule. I have a feeling something else is not properly tolerating NoSchedule and is causing the nodes to get deprovisioned. We should figure it out, but this current patch is an incremental improvement, IMO.
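Following up on that thread, here is a minimal, self-contained sketch of how the taint added in machinesets.go pairs with the toleration added here. The names mirror the diff; the notes on the two taint effects are general Kubernetes scheduling semantics, not something this PR changes.

package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
)

const clusterAPIActuatorPkgTaint = "cluster-api-actuator-pkg"

func main() {
	// Soft taint placed on test nodes via the MachineSet template:
	// PreferNoSchedule means the scheduler avoids placing non-tolerating
	// pods here but may still do so under pressure. NoSchedule (the
	// reviewer's suggestion) is a hard rule: pods without a matching
	// toleration are never scheduled onto the node.
	taint := corev1.Taint{
		Key:    clusterAPIActuatorPkgTaint,
		Effect: corev1.TaintEffectPreferNoSchedule,
	}

	// Matching toleration carried by the test workload. With no Operator
	// set it is evaluated as Equal against an empty value, which matches
	// the taint above since the taint sets no value either.
	toleration := corev1.Toleration{
		Key:    clusterAPIActuatorPkgTaint,
		Effect: corev1.TaintEffectPreferNoSchedule,
	}

	fmt.Println("workload tolerates test-node taint:", toleration.ToleratesTaint(&taint))
}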
9 changes: 9 additions & 0 deletions pkg/framework/machinesets.go
@@ -8,6 +8,7 @@ import (
. "github.com/onsi/gomega"

machinev1 "github.com/openshift/api/machine/v1beta1"
corev1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime/schema"
@@ -28,6 +29,7 @@ type MachineSetParams struct {
Name string
Replicas int32
Labels map[string]string
+	Taints       []corev1.Taint
ProviderSpec *machinev1.ProviderSpec
}

@@ -59,6 +61,12 @@ func BuildMachineSetParams(client runtimeclient.Client, replicas int) MachineSet
"e2e.openshift.io": uid.String(),
ClusterKey: clusterName,
},
+		Taints: []corev1.Taint{
+			corev1.Taint{
+				Key:    ClusterAPIActuatorPkgTaint,
+				Effect: corev1.TaintEffectPreferNoSchedule,
+			},
+		},
}
}

@@ -87,6 +95,7 @@ func CreateMachineSet(c client.Client, params MachineSetParams) (*machinev1.Mach
Labels: params.Labels,
},
ProviderSpec: *params.ProviderSpec,
+				Taints: params.Taints,
},
},
Replicas: pointer.Int32Ptr(params.Replicas),
34 changes: 33 additions & 1 deletion pkg/framework/nodes.go
@@ -26,7 +26,7 @@ func AddNodeCondition(c client.Client, node *corev1.Node, cond corev1.NodeCondit
return c.Status().Patch(context.Background(), nodeCopy, client.MergeFrom(node))
}

-// FilterReadyNodes fileter the list of nodes and returns the list with ready nodes
+// FilterReadyNodes filters the list of nodes and returns a list with ready nodes
func FilterReadyNodes(nodes []corev1.Node) []corev1.Node {
var readyNodes []corev1.Node
for _, n := range nodes {
@@ -37,6 +37,17 @@ func FilterReadyNodes(nodes []corev1.Node) []corev1.Node {
return readyNodes
}

+// FilterSchedulableNodes filters the list of nodes and returns a list with schedulable nodes
+func FilterSchedulableNodes(nodes []corev1.Node) []corev1.Node {
+	var schedulableNodes []corev1.Node
+	for _, n := range nodes {
+		if IsNodeSchedulable(&n) {
+			schedulableNodes = append(schedulableNodes, n)
+		}
+	}
+	return schedulableNodes
+}

// GetNodes gets a list of nodes from a running cluster
// Optionally, labels may be used to constrain listed nodes.
func GetNodes(c client.Client, selectors ...*metav1.LabelSelector) ([]corev1.Node, error) {
@@ -102,6 +113,19 @@ func GetNodeForMachine(c client.Client, m *machinev1.Machine) (*corev1.Node, err
return node, nil
}

+// GetReadyAndSchedulableNodes returns all the nodes that have the Ready condition and can schedule workloads
+func GetReadyAndSchedulableNodes(c client.Client) ([]corev1.Node, error) {
+	nodes, err := GetNodes(c)
+	if err != nil {
+		return nodes, err
+	}
+
+	nodes = FilterReadyNodes(nodes)
+	nodes = FilterSchedulableNodes(nodes)
+
+	return nodes, nil
+}
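A brief usage sketch, mirroring the Eventually poll from the autoscaler test diff above; the client, caMaxNodesTotal, and pollingInterval names come from that test's context rather than this file.

	// Count only nodes that are both Ready and uncordoned, so not-ready or
	// unschedulable stragglers from earlier tests do not skew the total.
	Eventually(func() (bool, error) {
		nodes, err := framework.GetReadyAndSchedulableNodes(client)
		return len(nodes) == caMaxNodesTotal, err
	}, framework.WaitLong, pollingInterval).Should(BeTrue())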

// GetWorkerNodes returns all nodes with the nodeWorkerRoleLabel label
func GetWorkerNodes(c client.Client) ([]corev1.Node, error) {
workerNodes := &corev1.NodeList{}
@@ -127,6 +151,14 @@ func IsNodeReady(node *corev1.Node) bool {
return false
}

+// IsNodeSchedulable returns true if the given node can schedule workloads.
+func IsNodeSchedulable(node *corev1.Node) bool {
+	if node.Spec.Unschedulable {
+		return false
+	}
+	return true
+}

// NodesAreReady returns true if an array of nodes are all ready
func NodesAreReady(nodes []*corev1.Node) bool {
// All nodes need to be ready