From bd05174b4392af463af73acd31a474c2ff1f75a5 Mon Sep 17 00:00:00 2001 From: David Hurta Date: Tue, 2 Aug 2022 09:49:05 +0200 Subject: [PATCH] pkg/cvo/upgradeable.go: Sync Upgradeable status of the CVO more often Synchronize the CVO's Upgradeable status more often for a given period of time when and after the precondition checks start to fail. We don't want to check this more frequently forever in the case of the precondition checks failing because of a bigger problem that wasn't quickly solved at the start of the upgrade by itself. A precondition check can be failing for at least `optr.minimumUpdateCheckInterval` time because of `Upgradeable==false`. The `Upgradeable==false` could have been potentially already resolved (operator reporting `Upgradeable==false` just momentarily) during the duration resulting in necessary waiting for up to `optr.minimumUpdateCheckInterval` for the next synchronization. Synchronize the upgradeable status again in case of failing precondition checks to speed up initializing an upgrade stuck on a precondition check that potentially has been solved. We don't want to check this forever in case of precondition checks failing for a long time due to a bigger problem. This commit is part of a fix for the bug [1]. The bug was caused by slow syncing of the CVO's Upgradeable status when a precondition check fails and less frequent running of the precondition checks. The frequency of precondition checks was fixed by [2] fixing the Etcd backup halting an upgrade for a prolonged time [3]. The problem of the `Upgradeable==false` thanks to the `ErrorCheckingOperatorCompatibility` caused by the OLM operator [1] was fixed by the [3]. However, the main root cause of the `ErrorCheckingOperatorCompatibility` error probably still remained. [1] https://bugzilla.redhat.com/show_bug.cgi?id=2006611 [2] https://github.com/openshift/cluster-version-operator/pull/683 [3] https://bugzilla.redhat.com/show_bug.cgi?id=2072348 [4] https://github.com/openshift/cluster-version-operator/pull/766 --- pkg/cvo/cvo.go | 16 +++++++++++----- pkg/cvo/upgradeable.go | 38 ++++++++++++++++++++++++++++++++------ 2 files changed, 43 insertions(+), 11 deletions(-) diff --git a/pkg/cvo/cvo.go b/pkg/cvo/cvo.go index fda11e5a3..1c6bca50b 100644 --- a/pkg/cvo/cvo.go +++ b/pkg/cvo/cvo.go @@ -121,6 +121,11 @@ type Operator struct { upgradeable *upgradeable upgradeableChecks []upgradeableCheck + // minimumUpgradeableCheckInterval is the minimum duration before another + // synchronization of the upgradeable status can happen during failing + // precondition checks. + minimumUpgradeableCheckInterval time.Duration + // verifier, if provided, will be used to check an update before it is executed. // Any error will prevent an update payload from being accessed. verifier verify.Interface @@ -179,10 +184,11 @@ func New( Image: releaseImage, }, - statusInterval: 15 * time.Second, - minimumUpdateCheckInterval: minimumInterval, - payloadDir: overridePayloadDir, - defaultUpstreamServer: "https://api.openshift.com/api/upgrades_info/v1/graph", + statusInterval: 15 * time.Second, + minimumUpdateCheckInterval: minimumInterval, + minimumUpgradeableCheckInterval: 15 * time.Second, + payloadDir: overridePayloadDir, + defaultUpstreamServer: "https://api.openshift.com/api/upgrades_info/v1/graph", client: client, kubeClient: kubeClient, @@ -627,7 +633,7 @@ func (optr *Operator) upgradeableSync(ctx context.Context, key string) error { return nil } - return optr.syncUpgradeable() + return optr.syncUpgradeable(config) } // isOlderThanLastUpdate returns true if the cluster version is older than diff --git a/pkg/cvo/upgradeable.go b/pkg/cvo/upgradeable.go index 465a875a2..d42b7f318 100644 --- a/pkg/cvo/upgradeable.go +++ b/pkg/cvo/upgradeable.go @@ -33,12 +33,13 @@ const ( var adminAckGateRegexp = regexp.MustCompile(adminAckGateFmt) -// syncUpgradeable. The status is only checked if it has been more than -// the minimumUpdateCheckInterval since the last check. -func (optr *Operator) syncUpgradeable() error { - // updates are only checked at most once per minimumUpdateCheckInterval or if the generation changes +// syncUpgradeable synchronizes the upgradeable status only if it has been more than +// the minimumUpdateCheckInterval since the last synchronization or the precondition +// checks on the payload are failing for less than minimumUpdateCheckInterval, and it has +// been more than the minimumUpgradeableCheckInterval since the last synchronization. +func (optr *Operator) syncUpgradeable(config *configv1.ClusterVersion) error { u := optr.getUpgradeable() - if u != nil && u.RecentlyChanged(optr.minimumUpdateCheckInterval) { + if u != nil && u.RecentlyChanged(optr.minimumUpdateCheckInterval) && !shouldSyncUpgradeableDueToPreconditionChecks(optr, config, u) { klog.V(2).Infof("Upgradeable conditions were recently checked, will try later.") return nil } @@ -92,8 +93,14 @@ type upgradeable struct { Conditions []configv1.ClusterOperatorStatusCondition } +// hasPassedDurationSinceTime checks if a certain amount of time specified by duration +// has passed since time. Returns true if the duration has passed, returns false otherwise. +func hasPassedDurationSinceTime(t time.Time, duration time.Duration) bool { + return time.Now().After(t.Add(duration)) +} + func (u *upgradeable) RecentlyChanged(interval time.Duration) bool { - return u.At.After(time.Now().Add(-interval)) + return !hasPassedDurationSinceTime(u.At, interval) } func (u *upgradeable) NeedsUpdate(original *configv1.ClusterVersion) *configv1.ClusterVersion { @@ -442,3 +449,22 @@ func (optr *Operator) adminGatesEventHandler() cache.ResourceEventHandler { DeleteFunc: optr.deleteFunc, } } + +// shouldSyncUpgradeableDueToPreconditionChecks checks if the upgradeable status should +// be synchronized due to the precondition checks. It checks whether the precondition +// checks on the payload are failing for less than minimumUpdateCheckInterval, and it has +// been more than the minimumUpgradeableCheckInterval since the last synchronization. +// This means, upon precondition failure, we will synchronize the upgradeable status +// more frequently at beginning of an upgrade. +// +// shouldSyncUpgradeableDueToPreconditionChecks expects the parameters not to be nil. +// +// Function returns true if the synchronization should happen, returns false otherwise. +func shouldSyncUpgradeableDueToPreconditionChecks(optr *Operator, config *configv1.ClusterVersion, u *upgradeable) bool { + cond := resourcemerge.FindOperatorStatusCondition(config.Status.Conditions, DesiredReleaseAccepted) + if cond != nil && cond.Reason == "PreconditionChecks" && cond.Status == configv1.ConditionFalse && + hasPassedDurationSinceTime(u.At, optr.minimumUpgradeableCheckInterval) && !hasPassedDurationSinceTime(cond.LastTransitionTime.Time, optr.minimumUpdateCheckInterval) { + return true + } + return false +}