diff --git a/pkg/cvo/cvo.go b/pkg/cvo/cvo.go
index a8a7b7d371..a3729afc54 100644
--- a/pkg/cvo/cvo.go
+++ b/pkg/cvo/cvo.go
@@ -125,10 +125,9 @@ type Operator struct {
 	upgradeable       *upgradeable
 	upgradeableChecks []upgradeableCheck
 
-	// minimumUpgradeableCheckInterval is the minimum duration before another
-	// synchronization of the upgradeable status can happen during failing
-	// precondition checks.
-	minimumUpgradeableCheckInterval time.Duration
+	// upgradeableCheckIntervals drives the minimal intervals between Upgradeable status
+	// synchronizations
+	upgradeableCheckIntervals upgradeableCheckIntervals
 
 	// verifier, if provided, will be used to check an update before it is executed.
 	// Any error will prevent an update payload from being accessed.
@@ -188,11 +187,11 @@ func New(
 			Image: releaseImage,
 		},
 
-		statusInterval:                  15 * time.Second,
-		minimumUpdateCheckInterval:      minimumInterval,
-		minimumUpgradeableCheckInterval: 15 * time.Second,
-		payloadDir:                      overridePayloadDir,
-		defaultUpstreamServer:           "https://api.openshift.com/api/upgrades_info/v1/graph",
+		statusInterval:             15 * time.Second,
+		minimumUpdateCheckInterval: minimumInterval,
+		upgradeableCheckIntervals:  defaultUpgradeableCheckIntervals(),
+		payloadDir:                 overridePayloadDir,
+		defaultUpstreamServer:      "https://api.openshift.com/api/upgrades_info/v1/graph",
 
 		client:     client,
 		kubeClient: kubeClient,
diff --git a/pkg/cvo/upgradeable.go b/pkg/cvo/upgradeable.go
index 2e67a2832b..386550a781 100644
--- a/pkg/cvo/upgradeable.go
+++ b/pkg/cvo/upgradeable.go
@@ -33,15 +33,41 @@ const (
 
 var adminAckGateRegexp = regexp.MustCompile(adminAckGateFmt)
 
-// syncUpgradeable synchronizes the upgradeable status only if it has been more than
-// the minimumUpdateCheckInterval since the last synchronization or the precondition
-// checks on the payload are failing for less than minimumUpdateCheckInterval, and it has
-// been more than the minimumUpgradeableCheckInterval since the last synchronization.
-func (optr *Operator) syncUpgradeable(config *configv1.ClusterVersion) error {
-	u := optr.getUpgradeable()
-	if u != nil && u.RecentlyChanged(optr.minimumUpdateCheckInterval) && !shouldSyncUpgradeableDueToPreconditionChecks(optr, config, u) {
-		klog.V(2).Infof("Upgradeable conditions were recently checked, will try later.")
-		return nil
+// upgradeableCheckIntervals holds the time intervals that drive how often CVO checks for upgradeability
+type upgradeableCheckIntervals struct {
+	// min is the base minimum interval between upgradeability checks, applied under normal circumstances
+	min time.Duration
+
+	// minOnFailedPreconditions is the minimum interval between upgradeability checks when precondition checks are
+	// failing and changed recently (see afterPreconditionsFailed). This should be lower than min because we want CVO
+	// to check upgradeability more often
+	minOnFailedPreconditions time.Duration
+
+	// afterPreconditionsFailed is the period of time after preconditions failed when minOnFailedPreconditions is
+	// applied instead of min
+	afterPreconditionsFailed time.Duration
+}
+
+func defaultUpgradeableCheckIntervals() upgradeableCheckIntervals {
+	return upgradeableCheckIntervals{
+		// 2 minutes is here because it is a lower bound of the previously nondeterministically chosen interval
+		// TODO (OTA-860): Investigate our options of reducing this interval. We will need to investigate
+		// the API usage patterns of the underlying checks, there is anecdotal evidence that they hit
+		// the apiserver instead of using the local informer cache
+		min:                      2 * time.Minute,
+		minOnFailedPreconditions: 15 * time.Second,
+		afterPreconditionsFailed: 2 * time.Minute,
+	}
+}
+
+// syncUpgradeable synchronizes the upgradeable status only if sufficient time has passed since its last update. This
+// throttling period is dynamic and is driven by upgradeableCheckIntervals.
+func (optr *Operator) syncUpgradeable(cv *configv1.ClusterVersion) error {
+	if u := optr.getUpgradeable(); u != nil {
+		if u.RecentlyChanged(optr.upgradeableCheckIntervals.throttlePeriod(cv)) {
+			klog.V(2).Infof("Upgradeable conditions were recently checked, will try later.")
+			return nil
+		}
 	}
 
 	optr.setUpgradeableConditions()
@@ -466,21 +492,19 @@ func (optr *Operator) adminGatesEventHandler() cache.ResourceEventHandler {
 	}
 }
 
-// shouldSyncUpgradeableDueToPreconditionChecks checks if the upgradeable status should
-// be synchronized due to the precondition checks. It checks whether the precondition
-// checks on the payload are failing for less than minimumUpdateCheckInterval, and it has
-// been more than the minimumUpgradeableCheckInterval since the last synchronization.
-// This means, upon precondition failure, we will synchronize the upgradeable status
-// more frequently at beginning of an upgrade.
-//
-// shouldSyncUpgradeableDueToPreconditionChecks expects the parameters not to be nil.
+// throttlePeriod returns the duration for which the upgradeable status should be considered recent
+// enough and unnecessary to update. The baseline duration is min. When the precondition checks
+// on the payload have been failing for less than afterPreconditionsFailed, we want to synchronize
+// the upgradeable status more frequently at the beginning of an upgrade and return
+// minOnFailedPreconditions, which is expected to be lower than min.
 //
-// Function returns true if the synchronization should happen, returns false otherwise.
-func shouldSyncUpgradeableDueToPreconditionChecks(optr *Operator, config *configv1.ClusterVersion, u *upgradeable) bool {
-	cond := resourcemerge.FindOperatorStatusCondition(config.Status.Conditions, DesiredReleaseAccepted)
-	if cond != nil && cond.Reason == "PreconditionChecks" && cond.Status == configv1.ConditionFalse &&
-		hasPassedDurationSinceTime(u.At, optr.minimumUpgradeableCheckInterval) && !hasPassedDurationSinceTime(cond.LastTransitionTime.Time, optr.minimumUpdateCheckInterval) {
-		return true
-	}
-	return false
+// The cv parameter is expected to be non-nil.
+func (intervals *upgradeableCheckIntervals) throttlePeriod(cv *configv1.ClusterVersion) time.Duration {
+	if cond := resourcemerge.FindOperatorStatusCondition(cv.Status.Conditions, DesiredReleaseAccepted); cond != nil {
+		if cond.Reason == "PreconditionChecks" && cond.Status == configv1.ConditionFalse &&
+			!hasPassedDurationSinceTime(cond.LastTransitionTime.Time, intervals.afterPreconditionsFailed) {
+			return intervals.minOnFailedPreconditions
+		}
+	}
+	return intervals.min
 }
diff --git a/pkg/cvo/upgradeable_test.go b/pkg/cvo/upgradeable_test.go
new file mode 100644
index 0000000000..ec9e41264f
--- /dev/null
+++ b/pkg/cvo/upgradeable_test.go
@@ -0,0 +1,59 @@
+package cvo
+
+import (
+	"testing"
+	"time"
+
+	configv1 "github.com/openshift/api/config/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+)
+
+func TestUpgradeableCheckIntervalsThrottlePeriod(t *testing.T) {
+	intervals := defaultUpgradeableCheckIntervals()
+	testCases := []struct {
+		name      string
+		condition *configv1.ClusterOperatorStatusCondition
+		expected  time.Duration
+	}{
+		{
+			name:     "no condition",
+			expected: intervals.min,
+		},
+		{
+			name:      "passing preconditions",
+			condition: &configv1.ClusterOperatorStatusCondition{Type: DesiredReleaseAccepted, Reason: "PreconditionChecks", Status: configv1.ConditionTrue, LastTransitionTime: metav1.Now()},
+			expected:  intervals.min,
+		},
+		{
+			name:      "failing but not precondition",
+			condition: &configv1.ClusterOperatorStatusCondition{Type: DesiredReleaseAccepted, Reason: "NotPreconditionChecks", Status: configv1.ConditionFalse, LastTransitionTime: metav1.Now()},
+			expected:  intervals.min,
+		},
+		{
+			name: "failing preconditions but too long ago",
+			condition: &configv1.ClusterOperatorStatusCondition{
+				Type:               DesiredReleaseAccepted,
+				Reason:             "PreconditionChecks",
+				Status:             configv1.ConditionFalse,
+				LastTransitionTime: metav1.NewTime(time.Now().Add(-(intervals.afterPreconditionsFailed + time.Hour))),
+			},
+			expected: intervals.min,
+		},
+		{
+			name:      "failing preconditions recently",
+			condition: &configv1.ClusterOperatorStatusCondition{Type: DesiredReleaseAccepted, Reason: "PreconditionChecks", Status: configv1.ConditionFalse, LastTransitionTime: metav1.Now()},
+			expected:  intervals.minOnFailedPreconditions,
+		},
+	}
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			cv := &configv1.ClusterVersion{Status: configv1.ClusterVersionStatus{Conditions: []configv1.ClusterOperatorStatusCondition{}}}
+			if tc.condition != nil {
+				cv.Status.Conditions = append(cv.Status.Conditions, *tc.condition)
+			}
+			if actual := intervals.throttlePeriod(cv); actual != tc.expected {
+				t.Errorf("throttlePeriod() = %v, want %v", actual, tc.expected)
+			}
+		})
+	}
+}