diff --git a/openshift-hack/e2e/annotate/generated/zz_generated.annotations.go b/openshift-hack/e2e/annotate/generated/zz_generated.annotations.go
index f81e211bb8b58..36a3f8875d451 100644
--- a/openshift-hack/e2e/annotate/generated/zz_generated.annotations.go
+++ b/openshift-hack/e2e/annotate/generated/zz_generated.annotations.go
@@ -1365,9 +1365,9 @@ var annotations = map[string]string{
 
 	"[Top Level] [sig-scheduling] Multi-AZ Cluster Volumes [sig-storage] should schedule pods in the same zones as statically provisioned PVs": "should schedule pods in the same zones as statically provisioned PVs [Suite:openshift/conformance/parallel] [Suite:k8s]",
 
-	"[Top Level] [sig-scheduling] Multi-AZ Clusters should spread the pods of a replication controller across zones": "should spread the pods of a replication controller across zones [Suite:openshift/conformance/parallel] [Suite:k8s]",
+	"[Top Level] [sig-scheduling] Multi-AZ Clusters should spread the pods of a replication controller across zones [Serial]": "should spread the pods of a replication controller across zones [Serial] [Suite:openshift/conformance/serial] [Suite:k8s]",
 
-	"[Top Level] [sig-scheduling] Multi-AZ Clusters should spread the pods of a service across zones": "should spread the pods of a service across zones [Serial] [Suite:openshift/conformance/serial] [Suite:k8s]",
+	"[Top Level] [sig-scheduling] Multi-AZ Clusters should spread the pods of a service across zones [Serial]": "should spread the pods of a service across zones [Serial] [Suite:openshift/conformance/serial] [Suite:k8s]",
 
 	"[Top Level] [sig-scheduling] SchedulerPredicates [Serial] PodTopologySpread Filtering validates 4 pods with MaxSkew=1 are evenly distributed into 2 nodes": "validates 4 pods with MaxSkew=1 are evenly distributed into 2 nodes [Suite:openshift/conformance/serial] [Suite:k8s]",
 
diff --git a/openshift-hack/e2e/annotate/rules.go b/openshift-hack/e2e/annotate/rules.go
index a08d6400686f7..b2a22b74168c0 100644
--- a/openshift-hack/e2e/annotate/rules.go
+++ b/openshift-hack/e2e/annotate/rules.go
@@ -121,8 +121,6 @@ var (
 
 		`Clean up pods on node`, // schedules up to max pods per node
 		`DynamicProvisioner should test that deleting a claim before the volume is provisioned deletes the volume`, // test is very disruptive to other tests
-		`Multi-AZ Clusters should spread the pods of a service across zones`, // spreading is a priority, not a predicate, and if the node is temporarily full the priority will be ignored
-
 		`Should be able to support the 1\.7 Sample API Server using the current Aggregator`, // down apiservices break other clients today https://bugzilla.redhat.com/show_bug.cgi?id=1623195
 
 		`\[Feature:HPA\] Horizontal pod autoscaling \(scale resource: CPU\) \[sig-autoscaling\] ReplicationController light Should scale from 1 pod to 2 pods`,
diff --git a/test/e2e/scheduling/priorities.go b/test/e2e/scheduling/priorities.go
index e3d7cc6cda144..a31f401cf0a92 100644
--- a/test/e2e/scheduling/priorities.go
+++ b/test/e2e/scheduling/priorities.go
@@ -55,6 +55,11 @@ type Resource struct {
 
 var balancePodLabel = map[string]string{"podname": "priority-balanced-memory"}
 
+// track min memory limit based on crio minimum. pods cannot set a limit lower than this
+// see: https://github.com/cri-o/cri-o/blob/29805b13e9a43d9d22628553db337ce1c1bec0a8/internal/config/cgmgr/cgmgr.go#L23
+// see: https://bugzilla.redhat.com/show_bug.cgi?id=1595256
+var crioMinMemLimit = 12 * 1024 * 1024
+
 var podRequestedResource = &v1.ResourceRequirements{
 	Limits: v1.ResourceList{
 		v1.ResourceMemory: resource.MustParse("100Mi"),
@@ -121,6 +126,19 @@ func removeAvoidPodsOffNode(c clientset.Interface, nodeName string) {
 	framework.ExpectNoError(err)
 }
 
+// nodesAreTooUtilized ensures that each node can support 2*crioMinMemLimit
+// We check for double because it needs to support at least the cri-o minimum
+// plus whatever delta between node usages (which could be up to or at least crioMinMemLimit)
+func nodesAreTooUtilized(cs clientset.Interface, nodeList *v1.NodeList) bool {
+	for _, node := range nodeList.Items {
+		_, memFraction, _, memAllocatable := computeCPUMemFraction(cs, node, podRequestedResource)
+		if float64(memAllocatable)-(memFraction*float64(memAllocatable)) < float64(2*crioMinMemLimit) {
+			return true
+		}
+	}
+	return false
+}
+
 // This test suite is used to verifies scheduler priority functions based on the default provider
 var _ = SIGDescribe("SchedulerPriorities [Serial]", func() {
 	var cs clientset.Interface
@@ -149,6 +167,12 @@ var _ = SIGDescribe("SchedulerPriorities [Serial]", func() {
 		framework.ExpectNoError(err)
 		err = e2epod.WaitForPodsRunningReady(cs, metav1.NamespaceSystem, int32(systemPodsNo), 0, framework.PodReadyBeforeTimeout, map[string]string{})
 		framework.ExpectNoError(err)
+
+		// skip if the most utilized node has less than the cri-o minMemLimit available
+		// otherwise we will not be able to run the test pod once all nodes are balanced
+		if nodesAreTooUtilized(cs, nodeList) {
+			ginkgo.Skip("nodes are too utilized to schedule test pods")
+		}
 	})
 
 	ginkgo.It("Pod should be scheduled to node that don't match the PodAntiAffinity terms", func() {
@@ -462,8 +486,9 @@ func createBalancedPodForNodes(f *framework.Framework, cs clientset.Interface, n
 	var maxCPUFraction, maxMemFraction float64 = ratio, ratio
 	var cpuFractionMap = make(map[string]float64)
 	var memFractionMap = make(map[string]float64)
+
 	for _, node := range nodes {
-		cpuFraction, memFraction := computeCPUMemFraction(cs, node, requestedResource)
+		cpuFraction, memFraction, _, _ := computeCPUMemFraction(cs, node, requestedResource)
 		cpuFractionMap[node.Name] = cpuFraction
 		memFractionMap[node.Name] = memFraction
 		if cpuFraction > maxCPUFraction {
@@ -473,6 +498,7 @@ func createBalancedPodForNodes(f *framework.Framework, cs clientset.Interface, n
 			maxMemFraction = memFraction
 		}
 	}
+
 	// we need the max one to keep the same cpu/mem use rate
 	ratio = math.Max(maxCPUFraction, maxMemFraction)
 	for _, node := range nodes {
@@ -489,7 +515,8 @@ func createBalancedPodForNodes(f *framework.Framework, cs clientset.Interface, n
 		memFraction := memFractionMap[node.Name]
 		needCreateResource[v1.ResourceCPU] = *resource.NewMilliQuantity(int64((ratio-cpuFraction)*float64(cpuAllocatableMil)), resource.DecimalSI)
 
-		needCreateResource[v1.ResourceMemory] = *resource.NewQuantity(int64((ratio-memFraction)*float64(memAllocatableVal)), resource.BinarySI)
+		// add crioMinMemLimit to ensure that all pods are setting at least that much for a limit, while keeping the same ratios
+		needCreateResource[v1.ResourceMemory] = *resource.NewQuantity(int64((ratio-memFraction)*float64(memAllocatableVal)+float64(crioMinMemLimit)), resource.BinarySI)
 
 		podConfig := &pausePodConfig{
 			Name: "",
@@ -529,7 +556,7 @@ func createBalancedPodForNodes(f *framework.Framework, cs clientset.Interface, n
 	return cleanUp, nil
 }
 
-func computeCPUMemFraction(cs clientset.Interface, node v1.Node, resource *v1.ResourceRequirements) (float64, float64) {
+func computeCPUMemFraction(cs clientset.Interface, node v1.Node, resource *v1.ResourceRequirements) (float64, float64, int64, int64) {
 	framework.Logf("ComputeCPUMemFraction for node: %v", node.Name)
 	totalRequestedCPUResource := resource.Requests.Cpu().MilliValue()
 	totalRequestedMemResource := resource.Requests.Memory().Value()
@@ -568,7 +595,7 @@ func computeCPUMemFraction(cs clientset.Interface, node v1.Node, resource *v1.Re
 	framework.Logf("Node: %v, totalRequestedCPUResource: %v, cpuAllocatableMil: %v, cpuFraction: %v", node.Name, totalRequestedCPUResource, cpuAllocatableMil, cpuFraction)
 	framework.Logf("Node: %v, totalRequestedMemResource: %v, memAllocatableVal: %v, memFraction: %v", node.Name, totalRequestedMemResource, memAllocatableVal, memFraction)
 
-	return cpuFraction, memFraction
+	return cpuFraction, memFraction, cpuAllocatableMil, memAllocatableVal
 }
 
 func getNonZeroRequests(pod *v1.Pod) Resource {
diff --git a/test/e2e/scheduling/ubernetes_lite.go b/test/e2e/scheduling/ubernetes_lite.go
index 0e019571c802e..6ba22c5407b55 100644
--- a/test/e2e/scheduling/ubernetes_lite.go
+++ b/test/e2e/scheduling/ubernetes_lite.go
@@ -20,6 +20,7 @@ import (
 	"context"
 	"fmt"
 	"math"
+	"time"
 
 	"github.com/onsi/ginkgo"
 	"github.com/onsi/gomega"
@@ -42,6 +43,7 @@ var _ = SIGDescribe("Multi-AZ Clusters", func() {
 	f := framework.NewDefaultFramework("multi-az")
 	var zoneCount int
 	var err error
+	var cleanUp func()
 	ginkgo.BeforeEach(func() {
 		e2eskipper.SkipUnlessProviderIs("gce", "gke", "aws")
 		if zoneCount <= 0 {
@@ -52,12 +54,26 @@ var _ = SIGDescribe("Multi-AZ Clusters", func() {
 		msg := fmt.Sprintf("Zone count is %d, only run for multi-zone clusters, skipping test", zoneCount)
 		e2eskipper.SkipUnlessAtLeast(zoneCount, 2, msg)
 		// TODO: SkipUnlessDefaultScheduler() // Non-default schedulers might not spread
+
+		cs := f.ClientSet
+		e2enode.WaitForTotalHealthy(cs, time.Minute)
+		nodeList, err := e2enode.GetReadySchedulableNodes(cs)
+		framework.ExpectNoError(err)
+
+		// make the nodes have balanced cpu,mem usage
+		cleanUp, err = createBalancedPodForNodes(f, cs, f.Namespace.Name, nodeList.Items, podRequestedResource, 0.0)
+		framework.ExpectNoError(err)
+	})
+	ginkgo.AfterEach(func() {
+		if cleanUp != nil {
+			cleanUp()
+		}
 	})
-	ginkgo.It("should spread the pods of a service across zones", func() {
+	ginkgo.It("should spread the pods of a service across zones [Serial]", func() {
 		SpreadServiceOrFail(f, 5*zoneCount, imageutils.GetPauseImageName())
 	})
 
-	ginkgo.It("should spread the pods of a replication controller across zones", func() {
+	ginkgo.It("should spread the pods of a replication controller across zones [Serial]", func() {
 		SpreadRCOrFail(f, int32(5*zoneCount), framework.ServeHostnameImage, []string{"serve-hostname"})
 	})
 })
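For reviewers, a minimal standalone sketch (not part of the patch) of the headroom arithmetic that the new `nodesAreTooUtilized` helper applies per node. The `tooUtilized` function name and the node sizes below are hypothetical, chosen only to illustrate the same formula used in the diff:

```go
package main

import "fmt"

// Mirrors the crioMinMemLimit value added in priorities.go: cri-o rejects
// container memory limits below 12Mi.
const crioMinMemLimit = 12 * 1024 * 1024

// tooUtilized reproduces the per-node check from nodesAreTooUtilized:
// the unrequested (free) memory must cover at least twice the cri-o minimum,
// since the balancing pod's limit gets crioMinMemLimit added on top of the
// usage delta it has to absorb.
func tooUtilized(memAllocatable int64, memFraction float64) bool {
	free := float64(memAllocatable) - memFraction*float64(memAllocatable)
	return free < float64(2*crioMinMemLimit)
}

func main() {
	// Hypothetical node with 4Gi allocatable memory.
	allocatable := int64(4 * 1024 * 1024 * 1024)

	fmt.Println(tooUtilized(allocatable, 0.90))  // false: ~410Mi free, plenty of headroom
	fmt.Println(tooUtilized(allocatable, 0.995)) // true: ~20Mi free, below 2*12Mi
}
```

The factor of two follows the comment in the patch: one `crioMinMemLimit` covers the floor that cri-o enforces on the balancing pod's memory limit, and the second covers the usage delta between nodes that the balancing pod has to absorb, so the test pod can still be scheduled once the nodes are balanced.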