41 changes: 37 additions & 4 deletions test/e2e/operator_test.go
@@ -777,8 +777,8 @@ func TestRouteAdmissionPolicy(t *testing.T) {
// The updated ingresscontroller deployment may take a few minutes to
// roll out, so make sure that it is updated, and then make sure that it
// has finished rolling out before checking the route.
deployment := &appsv1.Deployment{}
err := wait.PollImmediate(1*time.Second, 1*time.Minute, func() (bool, error) {
deployment := &appsv1.Deployment{}
if err := kclient.Get(context.TODO(), controller.RouterDeploymentName(ic), deployment); err != nil {
return false, err
}
@@ -792,7 +792,7 @@ func TestRouteAdmissionPolicy(t *testing.T) {
if err != nil {
t.Fatalf("failed to observe ROUTER_DISABLE_NAMESPACE_OWNERSHIP_CHECK=true: %v", err)
}
if err := waitForIngressControllerCondition(kclient, 3*time.Minute, icName, conditions...); err != nil {
if err := waitForDeploymentComplete(t, kclient, deployment, 3*time.Minute); err != nil {
t.Fatalf("failed to observe expected conditions: %v", err)
}

@@ -841,7 +841,6 @@ func TestRouteAdmissionPolicy(t *testing.T) {
t.Fatalf("failed to update ingresscontroller: %v", err)
}
err = wait.PollImmediate(1*time.Second, 1*time.Minute, func() (bool, error) {
deployment := &appsv1.Deployment{}
if err := kclient.Get(context.TODO(), controller.RouterDeploymentName(ic), deployment); err != nil {
return false, err
}
@@ -855,7 +854,7 @@ func TestRouteAdmissionPolicy(t *testing.T) {
if err != nil {
t.Fatalf("failed to observe ROUTER_ALLOW_WILDCARD_ROUTES=true: %v", err)
}
if err := waitForIngressControllerCondition(kclient, 3*time.Minute, icName, conditions...); err != nil {
if err := waitForDeploymentComplete(t, kclient, deployment, 3*time.Minute); err != nil {
t.Fatalf("failed to observe expected conditions: %v", err)
}

@@ -1163,6 +1162,40 @@ func waitForAvailableReplicas(cl client.Client, ic *operatorv1.IngressController
return nil
}

// Wait for the provided deployment to complete its rollout.
func waitForDeploymentComplete(t *testing.T, cl client.Client, deployment *appsv1.Deployment, timeout time.Duration) error {
t.Helper()
replicas := int32(1)
name := types.NamespacedName{Namespace: deployment.Namespace, Name: deployment.Name}
deployment = &appsv1.Deployment{}
err := wait.PollImmediate(1*time.Second, timeout, func() (bool, error) {
if err := cl.Get(context.TODO(), name, deployment); err != nil {
t.Logf("error getting deployment %s: %v", name, err)
return false, nil
Contributor:
Should false, err be returned here, instead of false, nil?

Contributor Author:
Probably, yeah. Generally, I think we should either t.Logf(..., err) or return false, err. In this case, the deployment should already exist, and we're only watching for a mutation, so using return false, err would make sense; on the other hand, doing so would make the test more likely to break if there are transient network or API issues. Would it make most sense in general to t.Logf(..., err) and return false, nil to decrease the likelihood that tests fail due to networking or API failures unrelated to the test?

Contributor Author:
https://bugzilla.redhat.com/show_bug.cgi?id=1828618 is related to the general question (maybe it prompted your comment here).

Contributor:
Yup, that BZ definitely prompted my comment.

I definitely agree that we should either log the error responsibly and return false, nil, or just flat out fail by returning false, err in these situations.

To counter your point about transient network or API issues: if any of these errors were to persist for a length of time close to the timeout, returning false, err and failing the test immediately would be beneficial. Additionally, parts of the ingress operator's e2e tests already fail on unsuccessful API calls (for example, https://github.com/openshift/cluster-ingress-operator/blob/master/test/e2e/operator_test.go#L193). So I'm wondering how much we would really be decreasing the likelihood of the tests failing from transient issues by using the responsible logging and return false, nil strategy. For this reason, I am leaning towards returning false, err and outright failing. Thoughts?

Contributor Author:
> To counter your point about transient network or API issues: if any of these errors were to persist for a length of time close to the timeout, returning false, err and failing the test immediately would be beneficial.

I disagree: if networking or the API is broken, other tests should report that breakage. Having this component's tests fail due to other components' breakage makes it more difficult to determine which component is at fault and leads to misdirected Bugzilla reports.

> Additionally, parts of the ingress operator's e2e tests already fail on unsuccessful API calls. So I'm wondering how much we would really be decreasing the likelihood of the tests failing from transient issues by using the responsible logging and return false, nil strategy.

That is why I am speaking generally: I believe this component's tests should not fail on errors that are caused by other components, so I am suggesting that, as a general principle, we should log and retry on errors that are not caused by our component (and transient errors are very unlikely to be caused by our component). Does that make sense?

Contributor:
That does make sense, and I think you've convinced me. I agree that logging and retrying transient errors, when applicable, makes the most sense. Let's include this conversation in https://bugzilla.redhat.com/show_bug.cgi?id=1828618 and continue the discussion there. I'd like to hear the rest of the team's thoughts too, if that's OK.

}
if deployment.Generation != deployment.Status.ObservedGeneration {
return false, nil
}
if deployment.Spec.Replicas != nil {
replicas = *deployment.Spec.Replicas
}
if replicas != deployment.Status.Replicas {
return false, nil
}
if replicas != deployment.Status.AvailableReplicas {
return false, nil
}
if replicas != deployment.Status.UpdatedReplicas {
return false, nil
}
return true, nil
})
if err != nil {
return fmt.Errorf("failed to observe deployment rollout complete; deployment specifies %v replicas and has generation %v; last observed %v updated, %v available, %v total replicas, with observed generation %v", replicas, deployment.Generation, deployment.Status.UpdatedReplicas, deployment.Status.AvailableReplicas, deployment.Status.Replicas, deployment.Status.ObservedGeneration)
}
return nil
}
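
A minimal sketch of the log-and-retry approach the review thread above converges on, assuming the same imports operator_test.go already uses (context, fmt, testing, time, appsv1, types, wait, and the controller-runtime client). The helper name waitForObservedGeneration and the lastErr bookkeeping are illustrative assumptions, not code from this PR: the poller logs transient Get errors and keeps retrying, but still surfaces the last error once the timeout expires. It deliberately trims the replica checks from waitForDeploymentComplete to keep the focus on the error-handling pattern.

// Hypothetical sketch (not from this PR): log transient API errors and keep
// polling, but record the last error so it can be reported if the timeout expires.
func waitForObservedGeneration(t *testing.T, cl client.Client, name types.NamespacedName, timeout time.Duration) error {
	t.Helper()
	var lastErr error
	deployment := &appsv1.Deployment{}
	err := wait.PollImmediate(1*time.Second, timeout, func() (bool, error) {
		if err := cl.Get(context.TODO(), name, deployment); err != nil {
			// Log and retry: a transient API failure is unlikely to be this
			// component's fault, so don't fail the test for it.
			t.Logf("error getting deployment %s: %v", name, err)
			lastErr = err
			return false, nil
		}
		lastErr = nil
		// Done once the deployment controller has observed the latest spec.
		return deployment.Generation == deployment.Status.ObservedGeneration, nil
	})
	if err != nil {
		return fmt.Errorf("deployment %s did not report observed generation %d: %v (last error: %v)", name, deployment.Generation, err, lastErr)
	}
	return nil
}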

func clusterOperatorConditionMap(conditions ...configv1.ClusterOperatorStatusCondition) map[string]string {
conds := map[string]string{}
for _, cond := range conditions {