Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions test/e2e/aks.go
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,7 @@ func WaitForControlPlaneMachinesToExist(ctx context.Context, input WaitForContro
ammpList := &infrav1exp.AzureManagedMachinePoolList{}

if err := input.Lister.List(ctx, ammpList, opt1, opt2); err != nil {
Logf("Failed to get machinePool: %+v", err)
LogWarningf("Failed to get machinePool: %+v", err)
Comment thread
CecileRobertMichon marked this conversation as resolved.
return false
}

Expand All @@ -202,7 +202,7 @@ func WaitForControlPlaneMachinesToExist(ctx context.Context, input WaitForContro
ownerMachinePool := &clusterv1exp.MachinePool{}
if err := input.Getter.Get(ctx, types.NamespacedName{Namespace: input.Namespace, Name: ref.Name},
ownerMachinePool); err != nil {
Logf("Failed to get machinePool: %+v", err)
LogWarningf("Failed to get machinePool: %+v", err)
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this seems like an actual failure and it's not in a retry block, why log it as a warning?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is no existent error equivalent of Logf Do we want to create one?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see, Logf is actually Info which is even less acurrate. I thought we were decreasing severity.

return false
}
if len(ownerMachinePool.Status.NodeRefs) >= minReplicas.value(ownerMachinePool) {
Expand Down
16 changes: 11 additions & 5 deletions test/e2e/azure_failuredomains.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,11 +62,17 @@ func AzureFailureDomainsSpec(ctx context.Context, inputGetter func() AzureFailur
By("Ensuring zones match CAPI failure domains")

// fetch updated cluster object to ensure Status.FailureDomains is up-to-date
err := input.BootstrapClusterProxy.GetClient().Get(ctx,
apimachinerytypes.NamespacedName{
Namespace: input.Namespace.Name,
Name: input.ClusterName,
}, input.Cluster)
Eventually(func() error {
err := input.BootstrapClusterProxy.GetClient().Get(ctx,
apimachinerytypes.NamespacedName{
Namespace: input.Namespace.Name,
Name: input.ClusterName,
}, input.Cluster)
if err != nil {
LogWarning(err.Error())
}
return err
}, retryableOperationTimeout, retryableOperationSleepBetweenRetries).Should(Succeed())
Expect(err).NotTo(HaveOccurred())
Expect(len(input.Cluster.Status.FailureDomains)).To(Equal(len(zones)))
for _, z := range zones {
Expand Down
13 changes: 9 additions & 4 deletions test/e2e/azure_gpu.go
Original file line number Diff line number Diff line change
Expand Up @@ -121,10 +121,15 @@ func AzureGPUSpec(ctx context.Context, inputGetter func() AzureGPUSpecInput) {
// getGPUOperatorPodLogs returns the logs of the Nvidia GPU operator pods.
func getGPUOperatorPodLogs(ctx context.Context, clientset *kubernetes.Clientset) string {
podsClient := clientset.CoreV1().Pods(corev1.NamespaceAll)
pods, err := podsClient.List(ctx, metav1.ListOptions{LabelSelector: "app.kubernetes.io/instance=gpu-operator"})
if err != nil {
return err.Error()
}
var pods *corev1.PodList
var err error
Eventually(func() error {
pods, err = podsClient.List(ctx, metav1.ListOptions{LabelSelector: "app.kubernetes.io/instance=gpu-operator"})
if err != nil {
LogWarning(err.Error())
}
return err
}, retryableOperationTimeout, retryableOperationSleepBetweenRetries).Should(Succeed())
b := strings.Builder{}
for _, pod := range pods.Items {
b.WriteString(fmt.Sprintf("\nLogs for pod %s:\n", pod.Name))
Expand Down
122 changes: 96 additions & 26 deletions test/e2e/azure_lb.go
Original file line number Diff line number Diff line change
Expand Up @@ -123,10 +123,16 @@ func AzureLBSpec(ctx context.Context, inputGetter func() AzureLBSpecInput) {
if !input.IPv6 {
By("creating an internal Load Balancer service")

ilbService := webDeployment.GetService(ports, deploymentBuilder.InternalLoadbalancer)
ilbService := webDeployment.CreateServiceResourceSpec(ports, deploymentBuilder.InternalLoadbalancer)
Log("starting to create an internal Load Balancer service")
_, err = servicesClient.Create(ctx, ilbService, metav1.CreateOptions{})
Expect(err).NotTo(HaveOccurred())
Eventually(func() error {
_, err := servicesClient.Create(ctx, ilbService, metav1.CreateOptions{})
if err != nil {
LogWarningf("failed creating service (%s):%s\n", ilbService.Name, err.Error())
return err
}
return nil
}, retryableOperationTimeout, retryableOperationSleepBetweenRetries).Should(Succeed())
ilbSvcInput := WaitForServiceAvailableInput{
Getter: servicesClientAdapter{client: servicesClient},
Service: ilbService,
Expand All @@ -136,14 +142,28 @@ func AzureLBSpec(ctx context.Context, inputGetter func() AzureLBSpecInput) {

By("connecting to the internal LB service from a curl pod")

svc, err := servicesClient.Get(ctx, ilbService.Name, metav1.GetOptions{})
Expect(err).NotTo(HaveOccurred())
var svc *corev1.Service
Eventually(func() error {
var err error
svc, err = servicesClient.Get(ctx, ilbService.Name, metav1.GetOptions{})
if err != nil {
LogWarningf("failed getting service (%s):%s\n", ilbService.Name, err.Error())
return err
}
return nil
}, retryableOperationTimeout, retryableOperationSleepBetweenRetries).Should(Succeed())
ilbIP := extractServiceIp(svc)

ilbJob := job.CreateCurlJob("curl-to-ilb-job", ilbIP)
ilbJob := job.CreateCurlJobResourceSpec("curl-to-ilb-job", ilbIP)
Log("starting to create a curl to ilb job")
_, err = jobsClient.Create(ctx, ilbJob, metav1.CreateOptions{})
Expect(err).NotTo(HaveOccurred())
Eventually(func() error {
_, err := jobsClient.Create(ctx, ilbJob, metav1.CreateOptions{})
if err != nil {
LogWarningf("failed creating job (%s):%s\n", ilbJob.Name, err.Error())
return err
}
return nil
}, retryableOperationTimeout, retryableOperationSleepBetweenRetries).Should(Succeed())
ilbJobInput := WaitForJobCompleteInput{
Getter: jobsClientAdapter{client: jobsClient},
Job: ilbJob,
Expand All @@ -154,19 +174,37 @@ func AzureLBSpec(ctx context.Context, inputGetter func() AzureLBSpecInput) {
if !input.SkipCleanup {
By("deleting the ilb test resources")
Logf("deleting the ilb service: %s", ilbService.Name)
err = servicesClient.Delete(ctx, ilbService.Name, metav1.DeleteOptions{})
Expect(err).NotTo(HaveOccurred())
Eventually(func() error {
err := servicesClient.Delete(ctx, ilbService.Name, metav1.DeleteOptions{})
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

based on our previous conversation in slack, my understanding was that this Delete doesn't actually wait for the service to be deleted, it just waits for the client go Delete to return, which doesn't fully wait for the service to be gone. Is that correct?

If so, waiting 20 minutes doesn't seem appropriate. We should change it to actually wait for the service to be gone so that we avoid running into flakes later due to cloud provider being in the middle of reconciling the deleted service when we try to create a new service.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's actually not clear (to me) in the client-go code if the delete operation blocks on success here...

https://github.com/kubernetes/client-go/blob/v0.23.0/kubernetes/typed/core/v1/service.go#L160

I don't see a "wait" or "do not wait" option here

https://github.com/kubernetes/apimachinery/blob/v0.23.0/pkg/apis/meta/v1/types.go#L482

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Based on the timing in logs it seems very likely that the delete is not blocking:

Mar  9 20:30:50.686: INFO: job default/curl-to-ilb-joboupqs is complete, took 10.069781536s
�[1mSTEP�[0m: deleting the ilb test resources
Mar  9 20:30:50.686: INFO: deleting the ilb service: webg7snfx-ilb
Mar  9 20:30:50.741: INFO: deleting the ilb job: curl-to-ilb-joboupqs
�[1mSTEP�[0m: creating an external Load Balancer service
Mar  9 20:30:50.776: INFO: starting to create an external Load Balancer service

We should add a check to make sure the service is gone before we proceed. We don't have to fix it in this PR but let's at least reduce the timeout, 20 minutes is too long if it's just for the action of starting the delete (and not the delete actually completing).

if err != nil {
LogWarningf("failed deleting service (%s):%s\n", ilbService.Name, err.Error())
return err
}
return nil
}, deleteOperationTimeout, retryableOperationSleepBetweenRetries).Should(Succeed())
Logf("deleting the ilb job: %s", ilbJob.Name)
err = jobsClient.Delete(ctx, ilbJob.Name, metav1.DeleteOptions{})
Expect(err).NotTo(HaveOccurred())
Eventually(func() error {
err := jobsClient.Delete(ctx, ilbJob.Name, metav1.DeleteOptions{})
if err != nil {
LogWarningf("failed deleting job (%s):%s\n", ilbJob.Name, err.Error())
return err
}
return nil
}, deleteOperationTimeout, retryableOperationSleepBetweenRetries).Should(Succeed())
}
}

By("creating an external Load Balancer service")
elbService := webDeployment.GetService(ports, deploymentBuilder.ExternalLoadbalancer)
elbService := webDeployment.CreateServiceResourceSpec(ports, deploymentBuilder.ExternalLoadbalancer)
Log("starting to create an external Load Balancer service")
_, err = servicesClient.Create(ctx, elbService, metav1.CreateOptions{})
Expect(err).NotTo(HaveOccurred())
Eventually(func() error {
_, err := servicesClient.Create(ctx, elbService, metav1.CreateOptions{})
if err != nil {
LogWarningf("failed creating service (%s):%s\n", elbService.Name, err.Error())
return err
}
return nil
}, retryableOperationTimeout, retryableOperationSleepBetweenRetries).Should(Succeed())
elbSvcInput := WaitForServiceAvailableInput{
Getter: servicesClientAdapter{client: servicesClient},
Service: elbService,
Expand All @@ -175,14 +213,28 @@ func AzureLBSpec(ctx context.Context, inputGetter func() AzureLBSpecInput) {
WaitForServiceAvailable(ctx, elbSvcInput, e2eConfig.GetIntervals(specName, "wait-service")...)

By("connecting to the external LB service from a curl pod")
svc, err := servicesClient.Get(ctx, elbService.Name, metav1.GetOptions{})
Expect(err).NotTo(HaveOccurred())
var svc *corev1.Service
Eventually(func() error {
var err error
svc, err = servicesClient.Get(ctx, elbService.Name, metav1.GetOptions{})
if err != nil {
LogWarningf("failed getting service (%s):%s\n", elbService.Name, err.Error())
return err
}
return nil
}, retryableOperationTimeout, retryableOperationSleepBetweenRetries).Should(Succeed())

elbIP := extractServiceIp(svc)
Log("starting to create curl-to-elb job")
elbJob := job.CreateCurlJob("curl-to-elb-job"+util.RandomString(6), elbIP)
_, err = jobsClient.Create(ctx, elbJob, metav1.CreateOptions{})
Expect(err).NotTo(HaveOccurred())
elbJob := job.CreateCurlJobResourceSpec("curl-to-elb-job"+util.RandomString(6), elbIP)
Eventually(func() error {
_, err := jobsClient.Create(ctx, elbJob, metav1.CreateOptions{})
if err != nil {
LogWarningf("failed creating job (%s):%s\n", elbJob.Name, err.Error())
return err
}
return nil
}, retryableOperationTimeout, retryableOperationSleepBetweenRetries).Should(Succeed())
elbJobInput := WaitForJobCompleteInput{
Getter: jobsClientAdapter{client: jobsClient},
Job: elbJob,
Expand All @@ -208,14 +260,32 @@ func AzureLBSpec(ctx context.Context, inputGetter func() AzureLBSpecInput) {
}
By("deleting the test resources")
Logf("starting to delete external LB service %s", elbService.Name)
err = servicesClient.Delete(ctx, elbService.Name, metav1.DeleteOptions{})
Expect(err).NotTo(HaveOccurred())
Eventually(func() error {
err := servicesClient.Delete(ctx, elbService.Name, metav1.DeleteOptions{})
if err != nil {
LogWarningf("failed deleting service (%s):%s\n", elbService.Name, err.Error())
return err
}
return nil
}, deleteOperationTimeout, retryableOperationSleepBetweenRetries).Should(Succeed())
Logf("starting to delete deployment %s", deployment.Name)
err = webDeployment.Client(clientset).Delete(ctx, deployment.Name, metav1.DeleteOptions{})
Expect(err).NotTo(HaveOccurred())
Eventually(func() error {
err := webDeployment.Client(clientset).Delete(ctx, deployment.Name, metav1.DeleteOptions{})
if err != nil {
LogWarningf("failed deleting deployment (%s):%s\n", deployment.Name, err.Error())
return err
}
return nil
}, deleteOperationTimeout, retryableOperationSleepBetweenRetries).Should(Succeed())
Logf("starting to delete job %s", elbJob.Name)
err = jobsClient.Delete(ctx, elbJob.Name, metav1.DeleteOptions{})
Expect(err).NotTo(HaveOccurred())
Eventually(func() error {
err := jobsClient.Delete(ctx, elbJob.Name, metav1.DeleteOptions{})
if err != nil {
LogWarningf("failed deleting job (%s):%s\n", elbJob.Name, err.Error())
return err
}
return nil
}, deleteOperationTimeout, retryableOperationSleepBetweenRetries).Should(Succeed())
}

func extractServiceIp(svc *corev1.Service) string {
Expand Down
76 changes: 41 additions & 35 deletions test/e2e/azure_machinepool_drain.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,11 @@ import (
)

const (
AzureMachinePoolDrainSpecName = "azure-mp-drain"
AzureMachinePoolDrainSpecName = "azure-mp-drain"
waitForDrainOperationTimeout = 5 * time.Minute
waitForDrainSleepBetweenRetries = 500 * time.Millisecond
waitforResourceOperationTimeout = 30 * time.Second
waitforResourceOperationSleepBetweenRetries = 3 * time.Second
)

// AzureMachinePoolDrainSpecInput is the input for AzureMachinePoolDrainSpec.
Expand Down Expand Up @@ -154,19 +158,21 @@ func testMachinePoolCordonAndDrain(ctx context.Context, mgmtClusterProxy, worklo
Eventually(func() error {
helper, err := patch.NewHelper(owningMachinePool, mgmtClusterProxy.GetClient())
if err != nil {
LogWarning(err.Error())
return err
}

decreasedReplicas := *owningMachinePool.Spec.Replicas - int32(1)
owningMachinePool.Spec.Replicas = &decreasedReplicas
return helper.Patch(ctx, owningMachinePool)
})
}, 3*time.Minute, 3*time.Second).Should(Succeed())

By(fmt.Sprintf("checking for a machine to start draining for machine pool: %s/%s", amp.Namespace, amp.Name))
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This test was never actually running, because the Eventually() was not actually running according to any success criteria. When I enabled the test using a retry + success criteria, it was failing consistently. After investigation, it seems pretty clear that evaluating for "started but hasn't yet finished draining" is a small enough window that it will regularly fail. Let's just get rid of this altogether and run the next test ("check for successful drain").

See https://kubernetes.slack.com/archives/CEX9HENG7/p1646686318723399

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually it's the "finished draining but hasn't yet deleted" window that is too small.

Eventually(func() error {
ampmls, err := getAzureMachinePoolMachines(ctx, mgmtClusterProxy, workloadClusterProxy, amp)
if err != nil {
return errors.Wrap(err, "failed to list the azure machine pool machines")
LogWarning(errors.Wrap(err, "failed to list the azure machine pool machines").Error())
return err
}

for _, machine := range ampmls {
Expand All @@ -176,35 +182,33 @@ func testMachinePoolCordonAndDrain(ctx context.Context, mgmtClusterProxy, worklo
}

return errors.New("no machine has started to drain")
})

By(fmt.Sprintf("checking for a machine to successfully complete draining for machine pool: %s/%s", amp.Namespace, amp.Name))
Eventually(func() error {
ampmls, err := getAzureMachinePoolMachines(ctx, mgmtClusterProxy, workloadClusterProxy, amp)
if err != nil {
return errors.Wrap(err, "failed to list the azure machine pool machines")
}

for _, machine := range ampmls {
if conditions.Has(&machine, clusterv1.DrainingSucceededCondition) && conditions.IsTrue(&machine, clusterv1.DrainingSucceededCondition) {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if the issue is that the machine gets deleted too quickly, I wonder if we could add a finalizer to it before the test starts (in the setup above) so it doesn't get deleted until we've done all the checks and we're ready to let it delete

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What's the proper scope for this PR? Given that these existing drain tests don't actually work at all. Should we simply abandon any changes here and follow-up with targeted improvements to drain testing?

return nil // started draining the node prior to delete
}
}
}, waitForDrainOperationTimeout, waitForDrainSleepBetweenRetries).Should(Succeed())

return errors.New("no machine has finished draining")
})
// TODO setup a watcher to detect the terminal drain success state
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Rather than checking for the expected state transitions serially, we can improve this in the future by setting a watch on the machine in the process of deleting. I'm making the judgment that doing that work is out of scope for this PR (this PR is about standardizing the usage of Eventually() for retryable operations).

Bottom line, this test coverage was never actually running due to the lack of .Should(Succeed()), so adding the "validate that drain begins after delete" is already net additive coverage.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can we open an issue for this rather than leaving a TODO in the code? Also does this mean #2120 isn't fully fixed?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The referenced issue documents a flake that should be addressed w/ the new Eventually() block in L191 below

}

func labelNodesWithMachinePoolName(ctx context.Context, workloadClient client.Client, mpName string, ampms []infrav1exp.AzureMachinePoolMachine) {
for _, ampm := range ampms {
n := &corev1.Node{}
Expect(workloadClient.Get(ctx, client.ObjectKey{
Name: ampm.Status.NodeRef.Name,
Namespace: ampm.Status.NodeRef.Namespace,
}, n)).ToNot(HaveOccurred())
Eventually(func() error {
err := workloadClient.Get(ctx, client.ObjectKey{
Name: ampm.Status.NodeRef.Name,
Namespace: ampm.Status.NodeRef.Namespace,
}, n)
if err != nil {
LogWarning(err.Error())
}
return err
}, waitforResourceOperationTimeout, 3*time.Second).Should(Succeed())
n.Labels[clusterv1.OwnerKindAnnotation] = "MachinePool"
n.Labels[clusterv1.OwnerNameAnnotation] = mpName
Expect(workloadClient.Update(ctx, n)).ToNot(HaveOccurred())
Eventually(func() error {
err := workloadClient.Update(ctx, n)
if err != nil {
LogWarning(err.Error())
}
return err
}, waitforResourceOperationTimeout, 3*time.Second).Should(Succeed())
}
}

Expand All @@ -231,10 +235,16 @@ func getOwnerMachinePool(ctx context.Context, c client.Client, obj metav1.Object

if ref.Kind == "MachinePool" && gv.Group == clusterv1exp.GroupVersion.Group {
mp := &clusterv1exp.MachinePool{}
err := c.Get(ctx, client.ObjectKey{
Name: ref.Name,
Namespace: obj.Namespace,
}, mp)
Eventually(func() error {
err := c.Get(ctx, client.ObjectKey{
Name: ref.Name,
Namespace: obj.Namespace,
}, mp)
if err != nil {
LogWarning(err.Error())
}
return err
}, waitforResourceOperationTimeout, 3*time.Second).Should(Succeed())
return mp, err
}
}
Expand Down Expand Up @@ -271,18 +281,14 @@ func deployHttpService(ctx context.Context, clientset *kubernetes.Clientset, isW
webDeploymentBuilder.AddContainerPort("http", "http", 80, corev1.ProtocolTCP)

if isWindows {
var windowsVersion windows.OSVersion
Eventually(func() error {
version, err := node.GetWindowsVersion(ctx, clientset)
windowsVersion = version
return err
}, 300*time.Second, 5*time.Second).Should(Succeed())
windowsVersion, err := node.GetWindowsVersion(ctx, clientset)
Expect(err).NotTo(HaveOccurred())
iisImage := windows.GetWindowsImage(windows.Httpd, windowsVersion)
webDeploymentBuilder.SetImage(deploymentName, iisImage)
webDeploymentBuilder.AddWindowsSelectors()
}

elbService := webDeploymentBuilder.GetService(ports, deployments.ExternalLoadbalancer)
elbService := webDeploymentBuilder.CreateServiceResourceSpec(ports, deployments.ExternalLoadbalancer)

for _, opt := range opts {
opt(webDeploymentBuilder, elbService)
Expand Down
Loading