-
Notifications
You must be signed in to change notification settings - Fork 472
add Eventually() to retryable k8s E2E operations #2123
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -188,7 +188,7 @@ func WaitForControlPlaneMachinesToExist(ctx context.Context, input WaitForContro | |
| ammpList := &infrav1exp.AzureManagedMachinePoolList{} | ||
|
|
||
| if err := input.Lister.List(ctx, ammpList, opt1, opt2); err != nil { | ||
| Logf("Failed to get machinePool: %+v", err) | ||
| LogWarningf("Failed to get machinePool: %+v", err) | ||
| return false | ||
| } | ||
|
|
||
|
|
@@ -202,7 +202,7 @@ func WaitForControlPlaneMachinesToExist(ctx context.Context, input WaitForContro | |
| ownerMachinePool := &clusterv1exp.MachinePool{} | ||
| if err := input.Getter.Get(ctx, types.NamespacedName{Namespace: input.Namespace, Name: ref.Name}, | ||
| ownerMachinePool); err != nil { | ||
| Logf("Failed to get machinePool: %+v", err) | ||
| LogWarningf("Failed to get machinePool: %+v", err) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this seems like an actual failure and it's not in a retry block, why log it as a warning?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There is no existent error equivalent of
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I see, Logf is actually Info which is even less acurrate. I thought we were decreasing severity. |
||
| return false | ||
| } | ||
| if len(ownerMachinePool.Status.NodeRefs) >= minReplicas.value(ownerMachinePool) { | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -123,10 +123,16 @@ func AzureLBSpec(ctx context.Context, inputGetter func() AzureLBSpecInput) { | |
| if !input.IPv6 { | ||
| By("creating an internal Load Balancer service") | ||
|
|
||
| ilbService := webDeployment.GetService(ports, deploymentBuilder.InternalLoadbalancer) | ||
| ilbService := webDeployment.CreateServiceResourceSpec(ports, deploymentBuilder.InternalLoadbalancer) | ||
| Log("starting to create an internal Load Balancer service") | ||
| _, err = servicesClient.Create(ctx, ilbService, metav1.CreateOptions{}) | ||
| Expect(err).NotTo(HaveOccurred()) | ||
| Eventually(func() error { | ||
| _, err := servicesClient.Create(ctx, ilbService, metav1.CreateOptions{}) | ||
| if err != nil { | ||
| LogWarningf("failed creating service (%s):%s\n", ilbService.Name, err.Error()) | ||
| return err | ||
| } | ||
| return nil | ||
| }, retryableOperationTimeout, retryableOperationSleepBetweenRetries).Should(Succeed()) | ||
| ilbSvcInput := WaitForServiceAvailableInput{ | ||
| Getter: servicesClientAdapter{client: servicesClient}, | ||
| Service: ilbService, | ||
|
|
@@ -136,14 +142,28 @@ func AzureLBSpec(ctx context.Context, inputGetter func() AzureLBSpecInput) { | |
|
|
||
| By("connecting to the internal LB service from a curl pod") | ||
|
|
||
| svc, err := servicesClient.Get(ctx, ilbService.Name, metav1.GetOptions{}) | ||
| Expect(err).NotTo(HaveOccurred()) | ||
| var svc *corev1.Service | ||
| Eventually(func() error { | ||
| var err error | ||
| svc, err = servicesClient.Get(ctx, ilbService.Name, metav1.GetOptions{}) | ||
| if err != nil { | ||
| LogWarningf("failed getting service (%s):%s\n", ilbService.Name, err.Error()) | ||
| return err | ||
| } | ||
| return nil | ||
| }, retryableOperationTimeout, retryableOperationSleepBetweenRetries).Should(Succeed()) | ||
| ilbIP := extractServiceIp(svc) | ||
|
|
||
| ilbJob := job.CreateCurlJob("curl-to-ilb-job", ilbIP) | ||
| ilbJob := job.CreateCurlJobResourceSpec("curl-to-ilb-job", ilbIP) | ||
| Log("starting to create a curl to ilb job") | ||
| _, err = jobsClient.Create(ctx, ilbJob, metav1.CreateOptions{}) | ||
| Expect(err).NotTo(HaveOccurred()) | ||
| Eventually(func() error { | ||
| _, err := jobsClient.Create(ctx, ilbJob, metav1.CreateOptions{}) | ||
| if err != nil { | ||
| LogWarningf("failed creating job (%s):%s\n", ilbJob.Name, err.Error()) | ||
| return err | ||
| } | ||
| return nil | ||
| }, retryableOperationTimeout, retryableOperationSleepBetweenRetries).Should(Succeed()) | ||
| ilbJobInput := WaitForJobCompleteInput{ | ||
| Getter: jobsClientAdapter{client: jobsClient}, | ||
| Job: ilbJob, | ||
|
|
@@ -154,19 +174,37 @@ func AzureLBSpec(ctx context.Context, inputGetter func() AzureLBSpecInput) { | |
| if !input.SkipCleanup { | ||
| By("deleting the ilb test resources") | ||
| Logf("deleting the ilb service: %s", ilbService.Name) | ||
| err = servicesClient.Delete(ctx, ilbService.Name, metav1.DeleteOptions{}) | ||
| Expect(err).NotTo(HaveOccurred()) | ||
| Eventually(func() error { | ||
| err := servicesClient.Delete(ctx, ilbService.Name, metav1.DeleteOptions{}) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. based on our previous conversation in slack, my understanding was that this Delete doesn't actually wait for the service to be deleted, it just waits for the client go Delete to return, which doesn't fully wait for the service to be gone. Is that correct? If so, waiting 20 minutes doesn't seem appropriate. We should change it to actually wait for the service to be gone so that we avoid running into flakes later due to cloud provider being in the middle of reconciling the deleted service when we try to create a new service.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's actually not clear (to me) in the client-go code if the delete operation blocks on success here... I don't see a "wait" or "do not wait" option here
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Based on the timing in logs it seems very likely that the delete is not blocking: We should add a check to make sure the service is gone before we proceed. We don't have to fix it in this PR but let's at least reduce the timeout, 20 minutes is too long if it's just for the action of starting the delete (and not the delete actually completing). |
||
| if err != nil { | ||
| LogWarningf("failed deleting service (%s):%s\n", ilbService.Name, err.Error()) | ||
| return err | ||
| } | ||
| return nil | ||
| }, deleteOperationTimeout, retryableOperationSleepBetweenRetries).Should(Succeed()) | ||
| Logf("deleting the ilb job: %s", ilbJob.Name) | ||
| err = jobsClient.Delete(ctx, ilbJob.Name, metav1.DeleteOptions{}) | ||
| Expect(err).NotTo(HaveOccurred()) | ||
| Eventually(func() error { | ||
| err := jobsClient.Delete(ctx, ilbJob.Name, metav1.DeleteOptions{}) | ||
| if err != nil { | ||
| LogWarningf("failed deleting job (%s):%s\n", ilbJob.Name, err.Error()) | ||
| return err | ||
| } | ||
| return nil | ||
| }, deleteOperationTimeout, retryableOperationSleepBetweenRetries).Should(Succeed()) | ||
| } | ||
| } | ||
|
|
||
| By("creating an external Load Balancer service") | ||
| elbService := webDeployment.GetService(ports, deploymentBuilder.ExternalLoadbalancer) | ||
| elbService := webDeployment.CreateServiceResourceSpec(ports, deploymentBuilder.ExternalLoadbalancer) | ||
| Log("starting to create an external Load Balancer service") | ||
| _, err = servicesClient.Create(ctx, elbService, metav1.CreateOptions{}) | ||
| Expect(err).NotTo(HaveOccurred()) | ||
| Eventually(func() error { | ||
| _, err := servicesClient.Create(ctx, elbService, metav1.CreateOptions{}) | ||
| if err != nil { | ||
| LogWarningf("failed creating service (%s):%s\n", elbService.Name, err.Error()) | ||
| return err | ||
| } | ||
| return nil | ||
| }, retryableOperationTimeout, retryableOperationSleepBetweenRetries).Should(Succeed()) | ||
| elbSvcInput := WaitForServiceAvailableInput{ | ||
| Getter: servicesClientAdapter{client: servicesClient}, | ||
| Service: elbService, | ||
|
|
@@ -175,14 +213,28 @@ func AzureLBSpec(ctx context.Context, inputGetter func() AzureLBSpecInput) { | |
| WaitForServiceAvailable(ctx, elbSvcInput, e2eConfig.GetIntervals(specName, "wait-service")...) | ||
|
|
||
| By("connecting to the external LB service from a curl pod") | ||
| svc, err := servicesClient.Get(ctx, elbService.Name, metav1.GetOptions{}) | ||
| Expect(err).NotTo(HaveOccurred()) | ||
| var svc *corev1.Service | ||
| Eventually(func() error { | ||
| var err error | ||
| svc, err = servicesClient.Get(ctx, elbService.Name, metav1.GetOptions{}) | ||
| if err != nil { | ||
| LogWarningf("failed getting service (%s):%s\n", elbService.Name, err.Error()) | ||
| return err | ||
| } | ||
| return nil | ||
| }, retryableOperationTimeout, retryableOperationSleepBetweenRetries).Should(Succeed()) | ||
|
|
||
| elbIP := extractServiceIp(svc) | ||
| Log("starting to create curl-to-elb job") | ||
| elbJob := job.CreateCurlJob("curl-to-elb-job"+util.RandomString(6), elbIP) | ||
| _, err = jobsClient.Create(ctx, elbJob, metav1.CreateOptions{}) | ||
| Expect(err).NotTo(HaveOccurred()) | ||
| elbJob := job.CreateCurlJobResourceSpec("curl-to-elb-job"+util.RandomString(6), elbIP) | ||
| Eventually(func() error { | ||
| _, err := jobsClient.Create(ctx, elbJob, metav1.CreateOptions{}) | ||
| if err != nil { | ||
| LogWarningf("failed creating job (%s):%s\n", elbJob.Name, err.Error()) | ||
| return err | ||
| } | ||
| return nil | ||
| }, retryableOperationTimeout, retryableOperationSleepBetweenRetries).Should(Succeed()) | ||
| elbJobInput := WaitForJobCompleteInput{ | ||
| Getter: jobsClientAdapter{client: jobsClient}, | ||
| Job: elbJob, | ||
|
|
@@ -208,14 +260,32 @@ func AzureLBSpec(ctx context.Context, inputGetter func() AzureLBSpecInput) { | |
| } | ||
| By("deleting the test resources") | ||
| Logf("starting to delete external LB service %s", elbService.Name) | ||
| err = servicesClient.Delete(ctx, elbService.Name, metav1.DeleteOptions{}) | ||
| Expect(err).NotTo(HaveOccurred()) | ||
| Eventually(func() error { | ||
| err := servicesClient.Delete(ctx, elbService.Name, metav1.DeleteOptions{}) | ||
| if err != nil { | ||
| LogWarningf("failed deleting service (%s):%s\n", elbService.Name, err.Error()) | ||
| return err | ||
| } | ||
| return nil | ||
| }, deleteOperationTimeout, retryableOperationSleepBetweenRetries).Should(Succeed()) | ||
| Logf("starting to delete deployment %s", deployment.Name) | ||
| err = webDeployment.Client(clientset).Delete(ctx, deployment.Name, metav1.DeleteOptions{}) | ||
| Expect(err).NotTo(HaveOccurred()) | ||
| Eventually(func() error { | ||
| err := webDeployment.Client(clientset).Delete(ctx, deployment.Name, metav1.DeleteOptions{}) | ||
| if err != nil { | ||
| LogWarningf("failed deleting deployment (%s):%s\n", deployment.Name, err.Error()) | ||
| return err | ||
| } | ||
| return nil | ||
| }, deleteOperationTimeout, retryableOperationSleepBetweenRetries).Should(Succeed()) | ||
| Logf("starting to delete job %s", elbJob.Name) | ||
| err = jobsClient.Delete(ctx, elbJob.Name, metav1.DeleteOptions{}) | ||
| Expect(err).NotTo(HaveOccurred()) | ||
| Eventually(func() error { | ||
| err := jobsClient.Delete(ctx, elbJob.Name, metav1.DeleteOptions{}) | ||
| if err != nil { | ||
| LogWarningf("failed deleting job (%s):%s\n", elbJob.Name, err.Error()) | ||
| return err | ||
| } | ||
| return nil | ||
| }, deleteOperationTimeout, retryableOperationSleepBetweenRetries).Should(Succeed()) | ||
| } | ||
|
|
||
| func extractServiceIp(svc *corev1.Service) string { | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -47,7 +47,11 @@ import ( | |
| ) | ||
|
|
||
| const ( | ||
| AzureMachinePoolDrainSpecName = "azure-mp-drain" | ||
| AzureMachinePoolDrainSpecName = "azure-mp-drain" | ||
| waitForDrainOperationTimeout = 5 * time.Minute | ||
| waitForDrainSleepBetweenRetries = 500 * time.Millisecond | ||
| waitforResourceOperationTimeout = 30 * time.Second | ||
| waitforResourceOperationSleepBetweenRetries = 3 * time.Second | ||
| ) | ||
|
|
||
| // AzureMachinePoolDrainSpecInput is the input for AzureMachinePoolDrainSpec. | ||
|
|
@@ -154,19 +158,21 @@ func testMachinePoolCordonAndDrain(ctx context.Context, mgmtClusterProxy, worklo | |
| Eventually(func() error { | ||
| helper, err := patch.NewHelper(owningMachinePool, mgmtClusterProxy.GetClient()) | ||
| if err != nil { | ||
| LogWarning(err.Error()) | ||
| return err | ||
| } | ||
|
|
||
| decreasedReplicas := *owningMachinePool.Spec.Replicas - int32(1) | ||
| owningMachinePool.Spec.Replicas = &decreasedReplicas | ||
| return helper.Patch(ctx, owningMachinePool) | ||
| }) | ||
| }, 3*time.Minute, 3*time.Second).Should(Succeed()) | ||
|
|
||
| By(fmt.Sprintf("checking for a machine to start draining for machine pool: %s/%s", amp.Namespace, amp.Name)) | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This test was never actually running, because the See https://kubernetes.slack.com/archives/CEX9HENG7/p1646686318723399
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Actually it's the "finished draining but hasn't yet deleted" window that is too small. |
||
| Eventually(func() error { | ||
| ampmls, err := getAzureMachinePoolMachines(ctx, mgmtClusterProxy, workloadClusterProxy, amp) | ||
| if err != nil { | ||
| return errors.Wrap(err, "failed to list the azure machine pool machines") | ||
| LogWarning(errors.Wrap(err, "failed to list the azure machine pool machines").Error()) | ||
| return err | ||
| } | ||
|
|
||
| for _, machine := range ampmls { | ||
|
|
@@ -176,35 +182,33 @@ func testMachinePoolCordonAndDrain(ctx context.Context, mgmtClusterProxy, worklo | |
| } | ||
|
|
||
| return errors.New("no machine has started to drain") | ||
| }) | ||
|
|
||
| By(fmt.Sprintf("checking for a machine to successfully complete draining for machine pool: %s/%s", amp.Namespace, amp.Name)) | ||
| Eventually(func() error { | ||
| ampmls, err := getAzureMachinePoolMachines(ctx, mgmtClusterProxy, workloadClusterProxy, amp) | ||
| if err != nil { | ||
| return errors.Wrap(err, "failed to list the azure machine pool machines") | ||
| } | ||
|
|
||
| for _, machine := range ampmls { | ||
| if conditions.Has(&machine, clusterv1.DrainingSucceededCondition) && conditions.IsTrue(&machine, clusterv1.DrainingSucceededCondition) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. if the issue is that the machine gets deleted too quickly, I wonder if we could add a finalizer to it before the test starts (in the setup above) so it doesn't get deleted until we've done all the checks and we're ready to let it delete
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What's the proper scope for this PR? Given that these existing drain tests don't actually work at all. Should we simply abandon any changes here and follow-up with targeted improvements to drain testing? |
||
| return nil // started draining the node prior to delete | ||
| } | ||
| } | ||
| }, waitForDrainOperationTimeout, waitForDrainSleepBetweenRetries).Should(Succeed()) | ||
|
|
||
| return errors.New("no machine has finished draining") | ||
| }) | ||
| // TODO setup a watcher to detect the terminal drain success state | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Rather than checking for the expected state transitions serially, we can improve this in the future by setting a watch on the machine in the process of deleting. I'm making the judgment that doing that work is out of scope for this PR (this PR is about standardizing the usage of Eventually() for retryable operations). Bottom line, this test coverage was never actually running due to the lack of
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can we open an issue for this rather than leaving a TODO in the code? Also does this mean #2120 isn't fully fixed?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The referenced issue documents a flake that should be addressed w/ the new |
||
| } | ||
|
|
||
| func labelNodesWithMachinePoolName(ctx context.Context, workloadClient client.Client, mpName string, ampms []infrav1exp.AzureMachinePoolMachine) { | ||
| for _, ampm := range ampms { | ||
| n := &corev1.Node{} | ||
| Expect(workloadClient.Get(ctx, client.ObjectKey{ | ||
| Name: ampm.Status.NodeRef.Name, | ||
| Namespace: ampm.Status.NodeRef.Namespace, | ||
| }, n)).ToNot(HaveOccurred()) | ||
| Eventually(func() error { | ||
| err := workloadClient.Get(ctx, client.ObjectKey{ | ||
| Name: ampm.Status.NodeRef.Name, | ||
| Namespace: ampm.Status.NodeRef.Namespace, | ||
| }, n) | ||
| if err != nil { | ||
| LogWarning(err.Error()) | ||
| } | ||
| return err | ||
| }, waitforResourceOperationTimeout, 3*time.Second).Should(Succeed()) | ||
| n.Labels[clusterv1.OwnerKindAnnotation] = "MachinePool" | ||
| n.Labels[clusterv1.OwnerNameAnnotation] = mpName | ||
| Expect(workloadClient.Update(ctx, n)).ToNot(HaveOccurred()) | ||
| Eventually(func() error { | ||
| err := workloadClient.Update(ctx, n) | ||
| if err != nil { | ||
| LogWarning(err.Error()) | ||
| } | ||
| return err | ||
| }, waitforResourceOperationTimeout, 3*time.Second).Should(Succeed()) | ||
| } | ||
| } | ||
|
|
||
|
|
@@ -231,10 +235,16 @@ func getOwnerMachinePool(ctx context.Context, c client.Client, obj metav1.Object | |
|
|
||
| if ref.Kind == "MachinePool" && gv.Group == clusterv1exp.GroupVersion.Group { | ||
| mp := &clusterv1exp.MachinePool{} | ||
| err := c.Get(ctx, client.ObjectKey{ | ||
| Name: ref.Name, | ||
| Namespace: obj.Namespace, | ||
| }, mp) | ||
| Eventually(func() error { | ||
| err := c.Get(ctx, client.ObjectKey{ | ||
| Name: ref.Name, | ||
| Namespace: obj.Namespace, | ||
| }, mp) | ||
| if err != nil { | ||
| LogWarning(err.Error()) | ||
| } | ||
| return err | ||
| }, waitforResourceOperationTimeout, 3*time.Second).Should(Succeed()) | ||
| return mp, err | ||
| } | ||
| } | ||
|
|
@@ -271,18 +281,14 @@ func deployHttpService(ctx context.Context, clientset *kubernetes.Clientset, isW | |
| webDeploymentBuilder.AddContainerPort("http", "http", 80, corev1.ProtocolTCP) | ||
|
|
||
| if isWindows { | ||
| var windowsVersion windows.OSVersion | ||
| Eventually(func() error { | ||
| version, err := node.GetWindowsVersion(ctx, clientset) | ||
| windowsVersion = version | ||
| return err | ||
| }, 300*time.Second, 5*time.Second).Should(Succeed()) | ||
| windowsVersion, err := node.GetWindowsVersion(ctx, clientset) | ||
| Expect(err).NotTo(HaveOccurred()) | ||
| iisImage := windows.GetWindowsImage(windows.Httpd, windowsVersion) | ||
| webDeploymentBuilder.SetImage(deploymentName, iisImage) | ||
| webDeploymentBuilder.AddWindowsSelectors() | ||
| } | ||
|
|
||
| elbService := webDeploymentBuilder.GetService(ports, deployments.ExternalLoadbalancer) | ||
| elbService := webDeploymentBuilder.CreateServiceResourceSpec(ports, deployments.ExternalLoadbalancer) | ||
|
|
||
| for _, opt := range opts { | ||
| opt(webDeploymentBuilder, elbService) | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.