From 9e00c3b579df5c587ad5a23cd8fa14b0c9a6580c Mon Sep 17 00:00:00 2001 From: Petr Muller Date: Tue, 10 Jan 2023 15:16:51 +0100 Subject: [PATCH 1/4] upgrade/adminack: guarantee one admin ack check post-upgrade While looking into OCPBUGS-5505 I discovered that some 4.10->4.11 upgrade job runs perform an Admin Ack check, while some do not. 4.11 has a `ack-4.11-kube-1.25-api-removals-in-4.12` gate, so these upgrade jobs sometimes test that `Upgradeable` goes `false` after the ugprade, and sometimes they do not. This is only determined by the polling race condition: the check is executed once per 10 minutes, and we cancel the polling after upgrade is completed. This means that in some cases we are lucky and manage to run one check before the cancel, and sometimes we are not and only check while still on the base version. Add a guaranteed single check execution after the upgrade, so that admin ack is always checked at least once with the upgrade target version. Doing checks after `done` is signalled has prior art in the alert test. --- test/e2e/upgrade/adminack/adminack.go | 7 +++++++ .../util/openshift/clusterversionoperator/adminack.go | 5 ++++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/test/e2e/upgrade/adminack/adminack.go b/test/e2e/upgrade/adminack/adminack.go index 6a9198dc9286..990a759ed5d6 100644 --- a/test/e2e/upgrade/adminack/adminack.go +++ b/test/e2e/upgrade/adminack/adminack.go @@ -51,6 +51,13 @@ func (t *UpgradeTest) Test(f *framework.Framework, done <-chan struct{}, upgrade adminAckTest := &clusterversionoperator.AdminAckTest{Oc: t.oc, Config: t.config, Poll: 10 * time.Minute} adminAckTest.Test(ctx) + + // Perform one guaranteed check after the upgrade is complete. The polled check above can be + // cancelled on done signal (which means, so we never know whether the poll was lucky to run at least once + // since the version was bumped. + postUpdateCtx, postUpdateCtxCancel := context.WithTimeout(context.Background(), 15*time.Minute) + defer postUpdateCtxCancel() + (&clusterversionoperator.AdminAckTest{Oc: t.oc, Config: t.config}).Test(postUpdateCtx) } // Teardown cleans up any remaining objects. diff --git a/test/extended/util/openshift/clusterversionoperator/adminack.go b/test/extended/util/openshift/clusterversionoperator/adminack.go index 083e945f4e51..ca309da3dbac 100644 --- a/test/extended/util/openshift/clusterversionoperator/adminack.go +++ b/test/extended/util/openshift/clusterversionoperator/adminack.go @@ -40,7 +40,9 @@ var adminAckGateRegexp = regexp.MustCompile(adminAckGateFmt) // and verifies that the Upgradeable condition no longer complains about the ack. func (t *AdminAckTest) Test(ctx context.Context) { if t.Poll == 0 { - t.test(ctx, nil) + if err := t.test(ctx, nil); err != nil { + framework.Fail(err.Error()) + } return } @@ -97,6 +99,7 @@ func (t *AdminAckTest) test(ctx context.Context, exercisedGates map[string]struc framework.Failf("Configmap openshift-config-managed/admin-gates gate %s does not contain description.", k) } if !gateApplicableToCurrentVersion(ackVersion, currentVersion) { + framework.Logf("Gate %s not applicable to current version %s", ackVersion, currentVersion) continue } if ackCm.Data[k] == "true" { From 70909af141c7cf95ccc2e95dedf65bf560d726e1 Mon Sep 17 00:00:00 2001 From: Petr Muller Date: Thu, 12 Jan 2023 01:32:45 +0100 Subject: [PATCH 2/4] upgrade/adminack: optimize the post-upgrade check The `done` signal is either a timeout or "upgrade finished, stop testing". We do not need to perform the last check in the former case. Track versions that we check and when we get the signal, check whether the current version was checked at least once, and if not, check it before terminating. --- test/e2e/upgrade/adminack/adminack.go | 7 ---- .../clusterversionoperator/adminack.go | 35 +++++++++++++------ 2 files changed, 24 insertions(+), 18 deletions(-) diff --git a/test/e2e/upgrade/adminack/adminack.go b/test/e2e/upgrade/adminack/adminack.go index 990a759ed5d6..6a9198dc9286 100644 --- a/test/e2e/upgrade/adminack/adminack.go +++ b/test/e2e/upgrade/adminack/adminack.go @@ -51,13 +51,6 @@ func (t *UpgradeTest) Test(f *framework.Framework, done <-chan struct{}, upgrade adminAckTest := &clusterversionoperator.AdminAckTest{Oc: t.oc, Config: t.config, Poll: 10 * time.Minute} adminAckTest.Test(ctx) - - // Perform one guaranteed check after the upgrade is complete. The polled check above can be - // cancelled on done signal (which means, so we never know whether the poll was lucky to run at least once - // since the version was bumped. - postUpdateCtx, postUpdateCtxCancel := context.WithTimeout(context.Background(), 15*time.Minute) - defer postUpdateCtxCancel() - (&clusterversionoperator.AdminAckTest{Oc: t.oc, Config: t.config}).Test(postUpdateCtx) } // Teardown cleans up any remaining objects. diff --git a/test/extended/util/openshift/clusterversionoperator/adminack.go b/test/extended/util/openshift/clusterversionoperator/adminack.go index ca309da3dbac..f598d0ed534b 100644 --- a/test/extended/util/openshift/clusterversionoperator/adminack.go +++ b/test/extended/util/openshift/clusterversionoperator/adminack.go @@ -14,6 +14,7 @@ import ( corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/sets" "k8s.io/apimachinery/pkg/util/wait" restclient "k8s.io/client-go/rest" "k8s.io/kubernetes/test/e2e/framework" @@ -40,17 +41,18 @@ var adminAckGateRegexp = regexp.MustCompile(adminAckGateFmt) // and verifies that the Upgradeable condition no longer complains about the ack. func (t *AdminAckTest) Test(ctx context.Context) { if t.Poll == 0 { - if err := t.test(ctx, nil); err != nil { + if err := t.test(ctx, nil, nil); err != nil { framework.Fail(err.Error()) } return } - exercisedGates := map[string]struct{}{} + exercisedGates := sets.NewString() + exercisedVersions := sets.NewString() success := false var lastError error if err := wait.PollImmediateUntilWithContext(ctx, t.Poll, func(ctx context.Context) (bool, error) { - if err := t.test(ctx, exercisedGates); err != nil { + if err := t.test(ctx, exercisedGates, exercisedVersions); err != nil { framework.Logf("Retriable failure to evaluate admin acks: %v", err) lastError = err } else { @@ -66,11 +68,21 @@ func (t *AdminAckTest) Test(ctx context.Context) { if !success { framework.Failf("Never able to evaluate admin acks. Most recent failure: %v", lastError) } -} -func (t *AdminAckTest) test(ctx context.Context, exercisedGates map[string]struct{}) error { - exists := struct{}{} + // Perform one guaranteed check after the upgrade is complete. The polled check above is cancelled + // on done signal (so we never know whether the poll was lucky to run at least once since + // the version was bumped). + postUpdateCtx, postUpdateCtxCancel := context.WithTimeout(context.Background(), 15*time.Minute) + defer postUpdateCtxCancel() + if current := getCurrentVersion(postUpdateCtx, t.Config); current != "" && !exercisedVersions.Has(current) { + // We never saw the current version while polling, so lets check it now + if err := t.test(postUpdateCtx, exercisedGates, exercisedVersions); err != nil { + framework.Fail(err.Error()) + } + } +} +func (t *AdminAckTest) test(ctx context.Context, exercisedGates, exercisedVersions sets.String) error { gateCm, err := getAdminGatesConfigMap(ctx, t.Oc) if err != nil { return err @@ -85,11 +97,12 @@ func (t *AdminAckTest) test(ctx context.Context, exercisedGates map[string]struc return err } currentVersion := getCurrentVersion(ctx, t.Config) + if exercisedVersions != nil { + exercisedVersions.Insert(currentVersion) + } for k, v := range gateCm.Data { - if exercisedGates != nil { - if _, ok := exercisedGates[k]; ok { - continue - } + if exercisedGates != nil && exercisedGates.Has(k) { + continue } ackVersion := adminAckGateRegexp.FindString(k) if ackVersion == "" { @@ -127,7 +140,7 @@ func (t *AdminAckTest) test(ctx context.Context, exercisedGates map[string]struc framework.Fail(err.Error()) } if exercisedGates != nil { - exercisedGates[k] = exists + exercisedGates.Insert(k) } } framework.Logf("Admin Ack verified") From bed06fc821aea909e08e643130fea1f2890db133 Mon Sep 17 00:00:00 2001 From: Petr Muller Date: Mon, 23 Jan 2023 15:38:06 +0100 Subject: [PATCH 3/4] upgrade/adminack: simplify polling and unblock "guaranteed" post-upgrade check https://github.com/openshift/origin/pull/27645 intended to add a guaranteed post-upgrade check but I have overlooked how exactly the polling is implemented and terminated, leading to the post-upgrade check never actually execute. Previously the test used `PollImmediateWithContext` for the each-10-minutes check. The `ConditionFunc` never actually returned `true` or non-nil `err`, so the `PollImmediateWithContext` never terminated by the means of `ConditionFunc`: it was always terminated by the `ctx.Done()` that the framework does on finished upgrade (or a test timeout). This means that `PollImmediateWithContext` always terminated with `err=wait.ErrWaitTimeout` and the `Test` method immediately returned, so the "guaranteed" check code is never reached. Given our `ConditionFunc` never terminates the polling, we can simplify and use the `wait.UntilWithContext` instead, which is a simpler version that precisely implements the desired loop (poll until context is done). --- .../util/openshift/clusterversionoperator/adminack.go | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/test/extended/util/openshift/clusterversionoperator/adminack.go b/test/extended/util/openshift/clusterversionoperator/adminack.go index f598d0ed534b..bc33f4808d80 100644 --- a/test/extended/util/openshift/clusterversionoperator/adminack.go +++ b/test/extended/util/openshift/clusterversionoperator/adminack.go @@ -51,19 +51,14 @@ func (t *AdminAckTest) Test(ctx context.Context) { exercisedVersions := sets.NewString() success := false var lastError error - if err := wait.PollImmediateUntilWithContext(ctx, t.Poll, func(ctx context.Context) (bool, error) { + wait.UntilWithContext(ctx, func(ctx context.Context) { if err := t.test(ctx, exercisedGates, exercisedVersions); err != nil { framework.Logf("Retriable failure to evaluate admin acks: %v", err) lastError = err } else { success = true } - return false, nil - }); err == nil || err == wait.ErrWaitTimeout { - return - } else { - framework.Fail(err.Error()) - } + }, t.Poll) if !success { framework.Failf("Never able to evaluate admin acks. Most recent failure: %v", lastError) From d1f81bb09e353d0e6a4d1517afd6cbcf75022e97 Mon Sep 17 00:00:00 2001 From: Petr Muller Date: Tue, 24 Jan 2023 16:15:13 +0100 Subject: [PATCH 4/4] upgrade/adminack: wait up to 4m until gate propagates to upgradeable During testing of OCPBUGS-5505, it was discovered that even with shortening the CVO cache TTL, CVO may still only update `Upgradeable` in its sync interval, which may be as high as 4 minutes. Hence the tests needs to wait for that time (I added 5 second buffer on top of that). --- .../util/openshift/clusterversionoperator/adminack.go | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/test/extended/util/openshift/clusterversionoperator/adminack.go b/test/extended/util/openshift/clusterversionoperator/adminack.go index bc33f4808d80..b52a26dc326e 100644 --- a/test/extended/util/openshift/clusterversionoperator/adminack.go +++ b/test/extended/util/openshift/clusterversionoperator/adminack.go @@ -250,9 +250,14 @@ func setAdminGate(ctx context.Context, gateName string, gateValue string, oc *ex return nil } +// adminAckDeadline is the upper bound of time for CVO to notice a new adminack +// gate. CVO sync loop duration is nondeterministic 2-4m interval so we set this +// slightly above the worst case. +const adminAckDeadline = 4*time.Minute + 5*time.Second + func waitForAdminAckRequired(ctx context.Context, config *restclient.Config, message string) error { framework.Logf("Waiting for Upgradeable to be AdminAckRequired for %q ...", message) - if err := wait.PollImmediate(10*time.Second, 3*time.Minute, func() (bool, error) { + if err := wait.PollImmediate(10*time.Second, adminAckDeadline, func() (bool, error) { if adminAckRequiredWithMessage(ctx, config, message) { return true, nil }