From 60557963fa420e03fc917390b0c1afe8c2b318c6 Mon Sep 17 00:00:00 2001
From: Joel Speed
Date: Tue, 13 Oct 2020 17:23:05 +0100
Subject: [PATCH 1/2] Merge spot tests into a single test

This means that the test suite only needs to spin up a single machineset
for spot instances.

This is an easy alternative to having a `BeforeAll`, which will hopefully
come in Ginkgo V2; once it does, we can revert this and change the
`BeforeEach` to a `BeforeAll`.

My hope is that reducing the number of spot instances we need to spin up
will improve the reliability of this test on Azure, which seems to have
some issues bringing up spot instances.
---
 pkg/infra/spot.go | 210 +++++++++++++++++++++++-----------------------
 1 file changed, 106 insertions(+), 104 deletions(-)

diff --git a/pkg/infra/spot.go b/pkg/infra/spot.go
index a58cc6e02..e1d9bb073 100644
--- a/pkg/infra/spot.go
+++ b/pkg/infra/spot.go
@@ -71,119 +71,121 @@ var _ = Describe("[Feature:Machines] Running on Spot", func() {
 		Expect(deleteObjects(client, delObjects)).To(Succeed())
 	})

-	It("should label the Machine specs as interruptible", func() {
-		selector := machineSet.Spec.Selector
-		machines, err := framework.GetMachines(client, &selector)
-		Expect(err).ToNot(HaveOccurred())
-		Expect(machines).To(HaveLen(3))
-
-		for _, machine := range machines {
-			Expect(machine.Spec.ObjectMeta.Labels).To(HaveKeyWithValue(machinecontroller.MachineInterruptibleInstanceLabelName, ""))
-		}
-	})
-
-	It("should deploy a termination handler pod to each instance", func() {
-		nodes, err := framework.GetNodesFromMachineSet(client, machineSet)
-		Expect(err).ToNot(HaveOccurred())
-		Expect(nodes).To(HaveLen(3))
-
-		terminationLabels := map[string]string{
-			"api": "clusterapi",
-			"k8s-app": "termination-handler",
-		}
-
-		for _, node := range nodes {
-			By("Fetching termination Pods running on the Node")
-			pods := []corev1.Pod{}
-			Eventually(func() ([]corev1.Pod, error) {
-				podList := &corev1.PodList{}
-				err := client.List(context.Background(), podList, runtimeclient.MatchingLabels(terminationLabels))
-				if err != nil {
-					return podList.Items, err
-				}
-				for _, pod := range podList.Items {
-					if pod.Spec.NodeName == node.Name {
-						pods = append(pods, pod)
-					}
-				}
-				return pods, nil
-			}, framework.WaitLong, framework.RetryMedium).ShouldNot(BeEmpty())
-			// Termination Pods run in a DaemonSet, should only be 1 per node
-			Expect(pods).To(HaveLen(1))
-			podKey := runtimeclient.ObjectKey{Namespace: pods[0].Namespace, Name: pods[0].Name}
-
-			By("Ensuring the termination Pod is running and the containers are ready")
-			Eventually(func() (bool, error) {
-				pod := &corev1.Pod{}
-				err := client.Get(context.Background(), podKey, pod)
-				if err != nil {
-					return false, err
-				}
-				if pod.Status.Phase != corev1.PodRunning {
-					return false, nil
-				}
-
-				// Ensure all containers are ready
-				for _, condition := range pod.Status.Conditions {
-					if condition.Type == corev1.ContainersReady {
-						return condition.Status == corev1.ConditionTrue, nil
-					}
-				}
-
-				return false, nil
-			}, framework.WaitLong, framework.RetryMedium).Should(BeTrue())
-		}
-	})
-
-	It("should terminate a Machine if a termination event is observed", func() {
-		By("Deploying a mock metadata application", func() {
-			configMap, err := getMetadataMockConfigMap()
+	It("should handle the spot instances", func() {
+		By("should label the Machine specs as interruptible", func() {
+			selector := machineSet.Spec.Selector
+			machines, err := framework.GetMachines(client, &selector)
 			Expect(err).ToNot(HaveOccurred())
-			Expect(client.Create(ctx, 
configMap)).To(Succeed()) - delObjects[configMap.Name] = configMap - - service := getMetadataMockService() - Expect(client.Create(ctx, service)).To(Succeed()) - delObjects[service.Name] = service - - deployment := getMetadataMockDeployment(platform) - Expect(client.Create(ctx, deployment)).To(Succeed()) - delObjects[deployment.Name] = deployment + Expect(machines).To(HaveLen(3)) - Expect(framework.IsDeploymentAvailable(client, deployment.Name, deployment.Namespace)).To(BeTrue()) + for _, machine := range machines { + Expect(machine.Spec.ObjectMeta.Labels).To(HaveKeyWithValue(machinecontroller.MachineInterruptibleInstanceLabelName, "")) + } }) - var machine *mapiv1.Machine - By("Choosing a Machine to terminate", func() { - machines, err := framework.GetMachinesFromMachineSet(client, machineSet) + By("should deploy a termination handler pod to each instance", func() { + nodes, err := framework.GetNodesFromMachineSet(client, machineSet) Expect(err).ToNot(HaveOccurred()) - Expect(len(machines)).To(BeNumerically(">", 0)) - - rand.Seed(time.Now().Unix()) - machine = machines[rand.Intn(len(machines))] - Expect(machine.Status.NodeRef).ToNot(BeNil()) - }) - - By("Deploying a job to reroute metadata traffic to the mock", func() { - serviceAccount := getTerminationSimulatorServiceAccount() - Expect(client.Create(ctx, serviceAccount)).To(Succeed()) - delObjects[serviceAccount.Name] = serviceAccount - - role := getTerminationSimulatorRole() - Expect(client.Create(ctx, role)).To(Succeed()) - delObjects[role.Name] = role + Expect(nodes).To(HaveLen(3)) + + terminationLabels := map[string]string{ + "api": "clusterapi", + "k8s-app": "termination-handler", + } + + for _, node := range nodes { + By("Fetching termination Pods running on the Node") + pods := []corev1.Pod{} + Eventually(func() ([]corev1.Pod, error) { + podList := &corev1.PodList{} + err := client.List(context.Background(), podList, runtimeclient.MatchingLabels(terminationLabels)) + if err != nil { + return podList.Items, err + } + for _, pod := range podList.Items { + if pod.Spec.NodeName == node.Name { + pods = append(pods, pod) + } + } + return pods, nil + }, framework.WaitLong, framework.RetryMedium).ShouldNot(BeEmpty()) + // Termination Pods run in a DaemonSet, should only be 1 per node + Expect(pods).To(HaveLen(1)) + podKey := runtimeclient.ObjectKey{Namespace: pods[0].Namespace, Name: pods[0].Name} + + By("Ensuring the termination Pod is running and the containers are ready") + Eventually(func() (bool, error) { + pod := &corev1.Pod{} + err := client.Get(context.Background(), podKey, pod) + if err != nil { + return false, err + } + if pod.Status.Phase != corev1.PodRunning { + return false, nil + } - roleBinding := getTerminationSimulatorRoleBinding() - Expect(client.Create(ctx, roleBinding)).To(Succeed()) - delObjects[roleBinding.Name] = roleBinding + // Ensure all containers are ready + for _, condition := range pod.Status.Conditions { + if condition.Type == corev1.ContainersReady { + return condition.Status == corev1.ConditionTrue, nil + } + } - job := getTerminationSimulatorJob(machine.Status.NodeRef.Name) - Expect(client.Create(ctx, job)).To(Succeed()) - delObjects[job.Name] = job + return false, nil + }, framework.WaitLong, framework.RetryMedium).Should(BeTrue()) + } }) - // If the job deploys correctly, the Machine will go away - framework.WaitForMachinesDeleted(client, machine) + By("should terminate a Machine if a termination event is observed", func() { + By("Deploying a mock metadata application", func() { + configMap, err := 
getMetadataMockConfigMap() + Expect(err).ToNot(HaveOccurred()) + Expect(client.Create(ctx, configMap)).To(Succeed()) + delObjects[configMap.Name] = configMap + + service := getMetadataMockService() + Expect(client.Create(ctx, service)).To(Succeed()) + delObjects[service.Name] = service + + deployment := getMetadataMockDeployment(platform) + Expect(client.Create(ctx, deployment)).To(Succeed()) + delObjects[deployment.Name] = deployment + + Expect(framework.IsDeploymentAvailable(client, deployment.Name, deployment.Namespace)).To(BeTrue()) + }) + + var machine *mapiv1.Machine + By("Choosing a Machine to terminate", func() { + machines, err := framework.GetMachinesFromMachineSet(client, machineSet) + Expect(err).ToNot(HaveOccurred()) + Expect(len(machines)).To(BeNumerically(">", 0)) + + rand.Seed(time.Now().Unix()) + machine = machines[rand.Intn(len(machines))] + Expect(machine.Status.NodeRef).ToNot(BeNil()) + }) + + By("Deploying a job to reroute metadata traffic to the mock", func() { + serviceAccount := getTerminationSimulatorServiceAccount() + Expect(client.Create(ctx, serviceAccount)).To(Succeed()) + delObjects[serviceAccount.Name] = serviceAccount + + role := getTerminationSimulatorRole() + Expect(client.Create(ctx, role)).To(Succeed()) + delObjects[role.Name] = role + + roleBinding := getTerminationSimulatorRoleBinding() + Expect(client.Create(ctx, roleBinding)).To(Succeed()) + delObjects[roleBinding.Name] = roleBinding + + job := getTerminationSimulatorJob(machine.Status.NodeRef.Name) + Expect(client.Create(ctx, job)).To(Succeed()) + delObjects[job.Name] = job + }) + + // If the job deploys correctly, the Machine will go away + framework.WaitForMachinesDeleted(client, machine) + }) }) }) From a07315771e0d49e631fad0bba482c305ea1b589e Mon Sep 17 00:00:00 2001 From: Joel Speed Date: Wed, 14 Oct 2020 10:45:17 +0100 Subject: [PATCH 2/2] Make sure machine name is logged in spot test --- pkg/infra/spot.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pkg/infra/spot.go b/pkg/infra/spot.go index e1d9bb073..61266f92f 100644 --- a/pkg/infra/spot.go +++ b/pkg/infra/spot.go @@ -184,7 +184,9 @@ var _ = Describe("[Feature:Machines] Running on Spot", func() { }) // If the job deploys correctly, the Machine will go away - framework.WaitForMachinesDeleted(client, machine) + By(fmt.Sprintf("Waiting for machine %q to be deleted", machine.Name), func() { + framework.WaitForMachinesDeleted(client, machine) + }) }) }) })
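For reference, a minimal sketch of the revert that the first commit message
anticipates: once Ginkgo V2 ships `Ordered` containers and `BeforeAll`, the
three original `It` blocks could be split back out while still creating the
spot machineset only once. This is illustrative only, assuming the
`Ordered`/`BeforeAll` API that Ginkgo V2 is expected to provide; the
`setupSpotMachineSet` helper is a hypothetical stand-in for the setup
currently performed in the suite's `BeforeEach`.

    // Hypothetical post-Ginkgo-V2 layout: BeforeAll runs once per Ordered
    // container, so the machineset is created a single time for all specs.
    var _ = Describe("[Feature:Machines] Running on Spot", Ordered, func() {
    	var client runtimeclient.Client
    	var machineSet *mapiv1.MachineSet

    	BeforeAll(func() {
    		// Stand-in for the existing BeforeEach body, which creates the
    		// spot machineset and waits for its Machines to come up.
    		client, machineSet = setupSpotMachineSet()
    	})

    	It("should label the Machine specs as interruptible", func() {
    		selector := machineSet.Spec.Selector
    		machines, err := framework.GetMachines(client, &selector)
    		Expect(err).ToNot(HaveOccurred())
    		Expect(machines).To(HaveLen(3))
    	})

    	// ...the termination handler and termination event specs would be
    	// restored as separate Its here, sharing the same machineset.
    })

Until then, the merged `It` with `By` sections gives the same
one-machineset-per-run behaviour using only the `BeforeEach` available in
Ginkgo V1.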