OCPBUGS-2493: Fix TestUnmanagedDNSToManagedDNSInternal E2E test race conditions #845
@@ -5,17 +5,23 @@ package e2e

import (
	"context"
	"fmt"
	"reflect"
	"testing"
	"time"

	configv1 "github.com/openshift/api/config/v1"
	operatorv1 "github.com/openshift/api/operator/v1"
	iov1 "github.com/openshift/api/operatoringress/v1"
	"github.com/openshift/cluster-ingress-operator/pkg/operator/controller"
	ingresscontroller "github.com/openshift/cluster-ingress-operator/pkg/operator/controller/ingress"

	appsv1 "k8s.io/api/apps/v1"
	corev1 "k8s.io/api/core/v1"

	"k8s.io/apimachinery/pkg/api/errors"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/apimachinery/pkg/util/wait"
)

// TODO: Remove this once this condition is added to all e2e test
@@ -186,6 +192,23 @@ func TestManagedDNSToUnmanagedDNSIngressController(t *testing.T) {
func TestUnmanagedDNSToManagedDNSInternalIngressController(t *testing.T) {
	t.Parallel()

	if infraConfig.Status.PlatformStatus == nil {
		t.Skip("test skipped on nil platform")
	}
	platform := infraConfig.Status.PlatformStatus.Type

	supportedPlatforms := map[configv1.PlatformType]struct{}{
		configv1.AlibabaCloudPlatformType: {},
		configv1.AWSPlatformType:          {},
		configv1.AzurePlatformType:        {},
		configv1.GCPPlatformType:          {},
		configv1.IBMCloudPlatformType:     {},
		configv1.PowerVSPlatformType:      {},
	}
	if _, supported := supportedPlatforms[platform]; !supported {
		t.Skipf("test skipped on platform %q", platform)
	}

	name := types.NamespacedName{Namespace: operatorNamespace, Name: "unmanaged-migrated-internal"}
	ic := newLoadBalancerController(name, name.Name+"."+dnsConfig.Spec.BaseDomain)
	ic.Spec.EndpointPublishingStrategy.LoadBalancer = &operatorv1.LoadBalancerStrategy{

@@ -240,8 +263,35 @@ func TestUnmanagedDNSToManagedDNSInternalIngressController(t *testing.T) {
		t.Fatalf("failed to update ingresscontroller %s: %v", name, err)
	}
Contributor: Should this step wait for stability too?

Contributor (author): We do wait for Ingress/DNS stable conditions on line 300 before we do our verification test.

Contributor: I've seen (and written) e2e tests where, whenever there's an operation that has side effects, you call the stability wait afterward.

Contributor (author): I agree, but we do exactly that. It's not immediately below the IC update because we can go do some other things (like check for the lbService update) before we do the wait. Is that sufficient?

Contributor: It's a toss-up whether you wait for things to come back to normal after a change, or you wait and redo things over and over in a loop, adding time when you find you need to tweak the test.
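For reference, a minimal sketch of the "wait for stability after a mutating call" pattern being discussed, assuming the e2e package's controller-runtime client; the helper name and the condition types checked here (Available, DNSReady) are placeholders for illustration, not the helper the test actually uses.

```go
package e2e

import (
	"context"
	"testing"
	"time"

	operatorv1 "github.com/openshift/api/operator/v1"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/apimachinery/pkg/util/wait"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

// waitForStableIngressController polls the ingresscontroller until the listed
// condition types report the expected status, or the timeout expires.
func waitForStableIngressController(t *testing.T, cl client.Client, name types.NamespacedName, timeout time.Duration) error {
	// Condition types are illustrative; the real test checks its own set.
	expected := map[string]operatorv1.ConditionStatus{
		"Available": operatorv1.ConditionTrue,
		"DNSReady":  operatorv1.ConditionTrue,
	}
	return wait.PollImmediate(10*time.Second, timeout, func() (bool, error) {
		ic := &operatorv1.IngressController{}
		if err := cl.Get(context.TODO(), name, ic); err != nil {
			t.Logf("failed to get ingresscontroller %s: %v, retrying...", name, err)
			return false, nil
		}
		remaining := len(expected)
		for _, cond := range ic.Status.Conditions {
			if status, ok := expected[cond.Type]; ok && cond.Status == status {
				remaining--
			}
		}
		return remaining == 0, nil
	})
}
```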
	if err := kclient.Delete(context.TODO(), lbService); err != nil && !errors.IsNotFound(err) {
		t.Fatalf("failed to delete svc %s: %v", lbService.Name, err)
	var oldLoadBalancerStatus corev1.LoadBalancerStatus
	lbService.Status.LoadBalancer.DeepCopyInto(&oldLoadBalancerStatus)

	// Only delete the service on platforms that don't automatically update the service's scope.
	switch platform {
	case configv1.AlibabaCloudPlatformType, configv1.AWSPlatformType, configv1.IBMCloudPlatformType, configv1.PowerVSPlatformType:
		if err := kclient.Delete(context.TODO(), lbService); err != nil && !errors.IsNotFound(err) {
Contributor: Should we have multiple attempts here to mitigate transient failures? Somebody else updated the object, for example.

Contributor (author): Hmm, that's a good, but loaded question. It's never been an issue in the multiple times I have tried it. Part of me agrees with you, but the other part says "Well, if we do it here, we need to rewrite a ton of other Gets/Deletes/Updates in this e2e test and others," as this doesn't just apply here. I think we need standardized common wrapper functions for Delete/Get/Update if we want to mitigate this, as opposed to doing it as one-offs everywhere. We could have a standard "deleteAndRetry" function (I think we have updateAndRetry-type functions already?). This is probably a good shift-week-type refactoring; I'm hesitant to start doing one-off refactoring since this pattern shows up in many more places than this one, and I think the effort should be coordinated. Let me know if you disagree, or have an existing function I can use.

Contributor: +1 to what Grant said. We've added retries to E2E tests ad hoc in the past, and absent any hoc, it's reasonable to defer that to some more comprehensive effort to add retries to Kube API calls in E2E tests.
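As a rough illustration of the kind of standardized wrapper being proposed, here is a hypothetical deleteWithRetry helper; the name, intervals, and signature are assumptions, not an existing function in this repo.

```go
package e2e

import (
	"context"
	"testing"
	"time"

	"k8s.io/apimachinery/pkg/api/errors"
	"k8s.io/apimachinery/pkg/util/wait"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

// deleteWithRetry retries the delete on transient errors until the object is
// gone (or was never there) or the timeout expires.
func deleteWithRetry(t *testing.T, cl client.Client, obj client.Object, timeout time.Duration) error {
	return wait.PollImmediate(5*time.Second, timeout, func() (bool, error) {
		if err := cl.Delete(context.TODO(), obj); err != nil {
			if errors.IsNotFound(err) {
				// Already deleted; treat as success.
				return true, nil
			}
			t.Logf("failed to delete %s/%s: %v, retrying...", obj.GetNamespace(), obj.GetName(), err)
			return false, nil
		}
		return true, nil
	})
}
```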
			t.Fatalf("failed to delete svc %s: %v", lbService.Name, err)
		}
Contributor: Perhaps a wait for stability here too?

Contributor: There's already a wait after the polling loop. That should be enough. Is there a need to repeat one of those checks, or is there some missing check?

Contributor: If it doesn't matter whether the service was actually deleted or not, and it has no side effects, then you don't need to wait.

Contributor (author): Oh sorry, this comment was different; ignore my comment above. Edit: deleted comment. We do wait on line 278 to make sure the service was updated. We don't need to specifically wait for the service to be deleted, since we just want to trigger an update, and on some platforms that's how you need to do it.

Contributor: It still isn't clear to me whether there is a missing wait or an extra wait. If there is a missing wait, it isn't clear to me what it would be or why the test is passing now. If there is an extra wait, I'm not too bothered by it since the test is passing now. Remember, the goal here is to get CI working again ASAP so that we can be confident that we haven't broken anything, and have time to fix anything that is broken before branching day; barring a major problem with the test (such as breaking it for some job or changing it in a way such that it doesn't cover the intended functionality at all), I'd rather not delay it for issues that can be addressed in a future clean-up.

Contributor: As an interim, lgtm.
	}

	// Ensure the service's load-balancer status changes.
	err := wait.PollImmediate(10*time.Second, 5*time.Minute, func() (bool, error) {
		lbService := &corev1.Service{}
		if err := kclient.Get(context.TODO(), controller.LoadBalancerServiceName(ic), lbService); err != nil {
			t.Logf("Get %q failed: %v, retrying ...", controller.LoadBalancerServiceName(ic), err)
			return false, nil
		}
		if reflect.DeepEqual(lbService.Status.LoadBalancer, oldLoadBalancerStatus) {
			t.Logf("Waiting for service %q to be updated", controller.LoadBalancerServiceName(ic))
			return false, nil
		} else if ingresscontroller.IsServiceInternal(lbService) {
Contributor: Moving this check seems odd - can it magically flip between internal and external?

Contributor (author): This check was moved just because we needed an up-to-date version of the service. It could be moved outside the loop, but that just requires another Get.
			// The service got updated, but is not external.
			return true, fmt.Errorf("load balancer %s is internal but should be external", lbService.Name)
		}
		return true, nil
	})
	if err != nil {
		t.Fatalf("error updating the %q service: %v", controller.LoadBalancerServiceName(ic), err)
	}

	t.Logf("Waiting for stable conditions on ingresscontroller %s after dnsManagementPolicy=Managed", ic.Name)

@@ -251,10 +301,6 @@ func TestUnmanagedDNSToManagedDNSInternalIngressController(t *testing.T) {
		t.Fatalf("failed to observe expected conditions: %v", err)
	}

	if !ingresscontroller.IsServiceInternal(lbService) {
Contributor: Is this an erroneous test, or is it already done elsewhere?

Contributor (author): Yep, just in this location it was the opposite of what it actually should have been. I checked other usages to make sure things weren't flipped.

Contributor: You mean, it was removed because it is erroneous?

Contributor (author): Sorry, I misunderstood. Yes, this was definitely incorrect; it was fixed and moved above with the other changes: https://github.com/openshift/cluster-ingress-operator/pull/845/files#diff-a660dcb9b4adcc19e854b3f294ec260efabed12bb55eb2f2b8fb03ff6eb92527R287
		t.Fatalf("load balancer %s is internal but should be external", lbService.Name)
	}

	// Ensure DNSRecord CR is present.
	if err := kclient.Get(context.TODO(), wildcardRecordName, wildcardRecord); err != nil {
		t.Fatalf("failed to get wildcard dnsrecord %s: %v", wildcardRecordName, err)
@@ -8,6 +8,7 @@ import (
	"bytes"
	"context"
	"fmt"
	"net"
	"net/http"
	"strings"
	"testing"

@@ -83,6 +84,7 @@ func waitForHTTPClientCondition(t *testing.T, httpClient *http.Client, req *http
	return wait.PollImmediate(interval, timeout, func() (done bool, err error) {
		resp, err := httpClient.Do(req)
		if err == nil {
			defer resp.Body.Close()
Contributor: Good find!
			return compareFunc(resp), nil
		} else {
			t.Logf("retrying client call due to: %+v", err)

@@ -127,6 +129,7 @@ func buildCurlPod(name, namespace, image, host, address string, extraArgs ...str
			Namespace: namespace,
		},
		Spec: corev1.PodSpec{
			TerminationGracePeriodSeconds: pointer.Int64(0),
			Containers: []corev1.Container{
				{
					Name: "curl",

@@ -472,17 +475,31 @@ func verifyExternalIngressController(t *testing.T, name types.NamespacedName, ho
		}
	}()

	// If we have a DNS as an external IP address, make sure we can resolve it before moving on.
	// This just limits the number of "could not resolve host" errors which can be confusing.
	if net.ParseIP(address) == nil {
		if err := wait.PollImmediate(10*time.Second, 5*time.Minute, func() (bool, error) {
Contributor: That's a long wait in an e2e test, can we get away with 1m?

Contributor (author): It's not long considering DNS can very well take up to 5 minutes to propagate on AWS. Keep in mind, these times are dependent on external processes, not something we have control over. I think we want to give platforms as much time as they need.
			_, err := net.LookupIP(address)
			if err != nil {
				t.Logf("waiting for loadbalancer domain %s to resolve...", address)
				return false, nil
			}
			return true, nil
		}); err != nil {
			t.Fatalf("loadbalancer domain %s was unable to resolve:", address)
		}
	}

	req, err := http.NewRequest(http.MethodGet, fmt.Sprintf("http://%s", address), nil)
	if err != nil {
		t.Fatalf("failed to build client request: %v", err)
	}

	// we use HOST header to map to the domain associated on the ingresscontroller.
	// This ensures our http call is routed to the correct router.
	req.Host = hostname

	httpClient := http.Client{Timeout: 5 * time.Second}
	err = waitForHTTPClientCondition(t, &httpClient, req, 10*time.Second, 5*time.Minute, func(r *http.Response) bool {
	err = waitForHTTPClientCondition(t, &httpClient, req, 10*time.Second, 10*time.Minute, func(r *http.Response) bool {
Contributor: What do we gain with doubling the timeout? Does just adding 1m help?

Contributor (author): So yes, 10 minutes is pretty aggressive, but I've seen AWS, Azure, and GCP take over 5 minutes. Here's Miciah's answer to when I asked why it failed at 5 minutes: #845 (comment). It needs to be at least 7 minutes, since we've seen load balancers take 6 minutes to initialize, but we've been having issues with very long initialization periods for load balancers in this test. I could reduce it to 7 or 8 minutes, but I don't really see the point if it introduces even more flakes. There isn't much to gain by reducing the wait time here; it's parallel. I'm hesitant, but I can go down a minute or two if you think there is value.

Contributor: I think we should take a minute to consider changes that add 10-15 minutes to a single test. We're already waiting up to 3 hours on some tests due to things we can't control. Are customers willing to wait up to 10 minutes for a load balancer to initialize? If not, we need to find out why this activity fails to perform on time, not just give it more time to pass CI. Just my opinion.

Contributor (author): I think our E2E parallelization is not a general customer use case. We are initializing multiple load balancers at the same time; 10-20 load balancer creations could be happening at once. That's my theory on why we've had to crank these deadlines up and why these activities take so long. It's probably worth an investigation into how creating multiple load balancers at the same time affects initialization time, but I feel that's out of the scope of this PR.

Contributor: This only increases the job runtime if (a) this is the longest-running test and (b) the polling loop times out. If the LB is ready after 2 minutes, then it makes no difference whether we specify a timeout of 5 minutes or 10 minutes. If the LB takes 6 minutes to be ready (as is known to be possible on AWS), then setting the timeout below 10 minutes can increase the frequency of flakes, and then you get to wait for the whole E2E job to rerun. If there's a regression, it should be tracked as a separate bug. As Grant said, optimizing LB initialization is out of scope for this PR. The goal here is to get CI working again ASAP.
		if r.StatusCode == http.StatusOK {
Contributor: I think we should be draining the response buffer, ala: io.Copy(ioutil.Discard, r.Body)

Contributor (author): Yup, I agree, but ...
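A small sketch of the drain-before-close idea from the comment above; the helper name is made up, and io.Discard is the Go 1.16+ replacement for ioutil.Discard. It needs only the standard library "io" package.

```go
import "io"

// drainAndClose discards any unread bytes in the response body before closing
// it, so the HTTP client can reuse the underlying connection.
func drainAndClose(body io.ReadCloser) {
	io.Copy(io.Discard, body) // best effort; ignore the byte count and error
	body.Close()
}
```

In waitForHTTPClientCondition, the existing `defer resp.Body.Close()` could then become `defer drainAndClose(resp.Body)`.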
			t.Logf("verified connectivity with workload with req %v and response %v", req.URL, r.StatusCode)
			return true

@@ -545,20 +562,32 @@ func verifyInternalIngressController(t *testing.T, name types.NamespacedName, ho
		"--retry-delay", "20",
		"--max-time", "10",
	}
	clientPod := buildCurlPod("curl-"+name.Name, echoRoute.Namespace, image, address, echoRoute.Spec.Host, extraArgs...)
	clientPodName := types.NamespacedName{Namespace: name.Namespace, Name: "curl-" + name.Name}
	clientPodSpec := buildCurlPod(clientPodName.Name, clientPodName.Namespace, image, address, echoRoute.Spec.Host, extraArgs...)
	clientPod := clientPodSpec.DeepCopy()
	if err := kclient.Create(context.TODO(), clientPod); err != nil {
		t.Fatalf("failed to create pod %s/%s: %v", clientPod.Namespace, clientPod.Name, err)
		t.Fatalf("failed to create pod %q: %v", clientPodName, err)
	}
	defer func() {
		if err := kclient.Delete(context.TODO(), clientPod); err != nil {
			if errors.IsNotFound(err) {
				return
			}
			t.Fatalf("failed to delete pod %s/%s: %v", clientPod.Namespace, clientPod.Name, err)
			t.Fatalf("failed to delete pod %q: %v", clientPodName, err)
		}
	}()

	err = wait.PollImmediate(10*time.Second, 5*time.Minute, func() (bool, error) {
	var curlPodLogs string
	err = wait.PollImmediate(10*time.Second, 10*time.Minute, func() (bool, error) {
Contributor: Ditto about doubling the timeout here. Can we get away with just adding 1m?
		if err := kclient.Get(context.TODO(), clientPodName, clientPod); err != nil {
			t.Logf("error getting client pod %q: %v, retrying...", clientPodName, err)
			return false, nil
		}
		// First check if client curl pod is still starting or not running.
		if clientPod.Status.Phase == corev1.PodPending {
			t.Logf("waiting for client pod %q to start", clientPodName)
			return false, nil
		}
		readCloser, err := client.CoreV1().Pods(clientPod.Namespace).GetLogs(clientPod.Name, &corev1.PodLogOptions{
			Container: "curl",
			Follow:    false,

@@ -573,16 +602,39 @@ func verifyInternalIngressController(t *testing.T, name types.NamespacedName, ho
				t.Errorf("failed to close reader for pod %s: %v", clientPod.Name, err)
			}
		}()
		curlPodLogs = ""
Contributor: Can this become the declaration? If, after processing the scanner loop, you can't return because "HTTP/1.0 200 OK" was never matched, then print the logs.

Contributor (author): You mean ...? So yeah, I see what you're saying, but this happens a lot where 200 OK isn't matched after each scanner loop (usually DNS issues...). I think if we print every time, we are going to clog up the logs with the same repeated failures. Keep in mind, it's not additive: it scans the entire log each poll loop. So, with what you are suggesting, I believe you'd get something like this: ... I just chose to store the last log scan and only output it on failures to keep the logs clean. Let me know if this makes sense or if I'm not seeing what you are saying.

Contributor (author): Just for posterity, here's what it currently looks like (I intentionally broke it with an invalid IP):
		for scanner.Scan() {
			line := scanner.Text()
			curlPodLogs += line + "\n"
			if strings.Contains(line, "HTTP/1.0 200 OK") {
				t.Logf("verified connectivity with workload with address: %s with response %s", address, line)
				return true, nil
			}
		}
		// If failed or succeeded, the pod is stopped, but didn't provide us 200 response, let's try again.
		if clientPod.Status.Phase == corev1.PodFailed || clientPod.Status.Phase == corev1.PodSucceeded {
			t.Logf("client pod %q has stopped...restarting. Curl Pod Logs:\n%s", clientPodName, curlPodLogs)
			if err := kclient.Delete(context.TODO(), clientPod); err != nil && errors.IsNotFound(err) {
				t.Fatalf("failed to delete pod %q: %v", clientPodName, err)
			}
			// Wait for deletion to prevent a race condition. Use PollInfinite since we are already in a Poll.
			wait.PollInfinite(5*time.Second, func() (bool, error) {
				err = kclient.Get(context.TODO(), clientPodName, clientPod)
				if !errors.IsNotFound(err) {
					t.Logf("waiting for %q: to be deleted", clientPodName)
					return false, nil
				}
				return true, nil
			})
			clientPod = clientPodSpec.DeepCopy()
			if err := kclient.Create(context.TODO(), clientPod); err != nil {
				t.Fatalf("failed to create pod %q: %v", clientPodName, err)
			}
			return false, nil
		}
		return false, nil
	})
	if err != nil {
		t.Fatalf("failed to verify connectivity with workload with address: %s using internal curl client: %v", address, err)
		t.Fatalf("failed to verify connectivity with workload with address: %s using internal curl client. Curl Pod Logs:\n%s", address, curlPodLogs)
	}
}
I feel like over time, this will cause problems. When a new platform is added, we will have to remember to add it here, or risk not running this test on new platforms. If there is a specific platform that we don't want to run this on, you could do this:
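(The snippet originally suggested in this comment isn't preserved on this page; below is a rough sketch of the opt-out idea it describes, with placeholder platform types chosen only for illustration.)

```go
// Hypothetical opt-out variant: skip only platforms known not to support
// LoadBalancer-type services; everything else runs the test by default.
unsupportedPlatforms := map[configv1.PlatformType]struct{}{
	configv1.BareMetalPlatformType: {}, // placeholder entries for illustration
	configv1.NonePlatformType:      {},
}
if _, unsupported := unsupportedPlatforms[platform]; unsupported {
	t.Skipf("test skipped on platform %q", platform)
}
```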
Yea, I see your point: opt-out vs. opt-in. @Miciah wrote this, and I think he wanted to be explicit with an "opt-in" approach to this E2E test. @Miciah, what are your thoughts since you added this? What platforms are we really trying to skip here (Nutanix? Libvirt? OpenStack?...), and can we even run the e2e tests on them?

We're trying to skip platforms that don't use or support LoadBalancer-type services. We have the same or similar logic repeated in several tests. This should probably be refactored as part of a general cleanup of how we handle platform-specific skips.

@Miciah Yea, I think your point is that this is a larger issue than just this single E2E test. Should we just remove this logic for the time being in anticipation of a more comprehensive effort? (Kind of the same debate as Andy's comment on retries: https://github.com/openshift/cluster-ingress-operator/pull/845/files#r999688986)

I don't see a reason to remove this logic; it is strictly better to skip this test on platforms on which we know the test would fail than to run the test and fail on those platforms.

I'm ambivalent about it, especially if we think we should refactor it in the future anyway. Consistency always wins in my book, even sometimes if I don't necessarily agree with the way it's done. I'd rather everything break in the same way than a million things breaking in different ways.

Before I commented, I searched for uses of this pattern, and I didn't find any in c-i-o, except in the case where we knew we only wanted to run the test on a single platform. https://github.com/openshift/cluster-ingress-operator/search?q=supportedplatforms

GitHub's search is unreliable. The pattern appears in several tests:
TestAllowedSourceRanges: cluster-ingress-operator/test/e2e/allowed_source_ranges_test.go, lines 28 to 41 in 9456d9f
TestAllowedSourceRangesStatus: cluster-ingress-operator/test/e2e/allowed_source_ranges_test.go, lines 127 to 140 in 9456d9f
TestSourceRangesProgressingAndEvaluationConditionsDetectedStatuses: cluster-ingress-operator/test/e2e/allowed_source_ranges_test.go, lines 246 to 259 in 9456d9f
TestInternalLoadBalancer: cluster-ingress-operator/test/e2e/operator_test.go, lines 935 to 954 in 9456d9f
TestInternalLoadBalancerGlobalAccessGCP: cluster-ingress-operator/test/e2e/operator_test.go, lines 1029 to 1039 in 9456d9f
TestScopeChange: cluster-ingress-operator/test/e2e/operator_test.go, lines 1299 to 1315 in 9456d9f
TestLocalWithFallbackOverrideForLoadBalancerService: cluster-ingress-operator/test/e2e/operator_test.go, lines 2879 to 2890 in 9456d9f