From e2d6af511108caf5af17189124ec39a56e3c5c81 Mon Sep 17 00:00:00 2001 From: "W. Trevor King" Date: Tue, 19 Dec 2023 22:51:01 -0800 Subject: [PATCH] pkg/cvo/availableupdates: Only bump LastAttempt on Cincinnati pulls 965bfb2844 (pkg/cvo/availableupdates: Requeue risk evaluation on failure, 2023-09-18, #939) pivoted from "every syncAvailableUpdates round that does anything useful has a fresh Cincinnati pull" to "some syncAvailableUpdates rounds have a fresh Cincinnati pull, but others just re-eval some Recommended=Unknown conditional updates". Then syncAvailableUpdates calls setAvailableUpdates. However, until this commit, setAvailableUpdates had been bumping LastAttempt every time, even in the just-re-eval conditional updates" case. That meant we never tripped the: } else if !optrAvailableUpdates.RecentlyChanged(optr.minimumUpdateCheckInterval) { klog.V(2).Infof("Retrieving available updates again, because more than %s has elapsed since %s", optr.minimumUpdateCheckInterval, optrAvailableUpdates.LastAttempt.Format(time.RFC3339)) condition to trigger a fresh Cincinnati pull. Which could lead to deadlocks like: 1. Cincinnati serves vulnerable PromQL, like [1]. 2. Clusters pick up that broken PromQL, try to evaluate, and fail. Re-eval-and-fail loop continues. 3. Cincinnati PromQL fixed, like [2]. 4. Cases: a. Before 965bfb2844, and also after this commit, Clusters pick up the fixed PromQL, try to evaluate, and start succeeding. Hooray! b. Clusters with 965bfb2844 but without this commit say "it's been a long time since we pulled fresh Cincinanti information, but it has not been long since my last attempt to eval this broken PromQL, so let me skip the Cincinnati pull and re-eval that old PromQL", which fails. Re-eval-and-fail loop continues. To break out of 4.b, clusters on impacted releases can roll their CVO pod: $ oc -n openshift-cluster-version delete -l k8s-app=cluster-version-operator pod which will clear out LastAttempt and trigger a fresh Cincinnati pull. I'm not sure if there's another recovery method... [1]: https://github.com/openshift/cincinnati-graph-data/pull/4524 [2]: https://github.com/openshift/cincinnati-graph-data/pull/4528 --- pkg/cvo/availableupdates.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/cvo/availableupdates.go b/pkg/cvo/availableupdates.go index 32734da5f..6024e2410 100644 --- a/pkg/cvo/availableupdates.go +++ b/pkg/cvo/availableupdates.go @@ -112,6 +112,7 @@ func (optr *Operator) syncAvailableUpdates(ctx context.Context, config *configv1 upstream = "" } + optrAvailableUpdates.LastAttempt = time.Now() optrAvailableUpdates.Upstream = upstream optrAvailableUpdates.Channel = channel optrAvailableUpdates.Architecture = desiredArch @@ -201,7 +202,6 @@ func (u *availableUpdates) NeedsUpdate(original *configv1.ClusterVersion) *confi func (optr *Operator) setAvailableUpdates(u *availableUpdates) { success := false if u != nil { - u.LastAttempt = time.Now() if u.Condition.Type == configv1.RetrievedUpdates { success = u.Condition.Status == configv1.ConditionTrue } else {