From bd5cc2aea27f8d8b813452a8bab13de4935c4e6c Mon Sep 17 00:00:00 2001
From: Petr Muller
Date: Mon, 9 Jan 2023 15:53:12 +0100
Subject: [PATCH 1/2] OCPBUGS-5505: Set upgradeability check throttling period to 2m

Previously, the throttling reused the `minimumUpdateCheckInterval` value
which is derived from the full CVO minimum sync period. This value is set
between 2m and 4m at CVO startup. This period is unnecessarily long and bad
for UX: changes propagate with a delay, and our own testcase expects
upgradeability to be propagated within 3 minutes at worst.

Hardcode the throttling to 2m (the lower bound of the previous behavior) to
prevent flapping on bursts of changes while still allowing changes to
propagate deterministically faster. We will still get a bit of
non-determinism from sync periods and requeueing, so this change should not
cause any periodic API-hammering.
---
 pkg/cvo/upgradeable.go | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/pkg/cvo/upgradeable.go b/pkg/cvo/upgradeable.go
index d42b7f318..4830c4a1a 100644
--- a/pkg/cvo/upgradeable.go
+++ b/pkg/cvo/upgradeable.go
@@ -29,17 +29,18 @@ import (
 const (
 	adminAckGateFmt string = "^ack-[4-5][.]([0-9]{1,})-[^-]"
 	upgradeableAdminAckRequired = configv1.ClusterStatusConditionType("UpgradeableAdminAckRequired")
+	checkThrottlePeriod = 2 * time.Minute
 )
 
 var adminAckGateRegexp = regexp.MustCompile(adminAckGateFmt)
 
 // syncUpgradeable synchronizes the upgradeable status only if it has been more than
-// the minimumUpdateCheckInterval since the last synchronization or the precondition
+// the checkThrottlePeriod since the last synchronization or the precondition
 // checks on the payload are failing for less than minimumUpdateCheckInterval, and it has
 // been more than the minimumUpgradeableCheckInterval since the last synchronization.
 func (optr *Operator) syncUpgradeable(config *configv1.ClusterVersion) error {
 	u := optr.getUpgradeable()
-	if u != nil && u.RecentlyChanged(optr.minimumUpdateCheckInterval) && !shouldSyncUpgradeableDueToPreconditionChecks(optr, config, u) {
+	if u != nil && u.RecentlyChanged(checkThrottlePeriod) && !shouldSyncUpgradeableDueToPreconditionChecks(optr, config, u) {
 		klog.V(2).Infof("Upgradeable conditions were recently checked, will try later.")
 		return nil
 	}

From 2247cf2fdd2a8e0713c3afd4f087c58368fa810d Mon Sep 17 00:00:00 2001
From: Petr Muller
Date: Wed, 11 Jan 2023 20:49:47 +0100
Subject: [PATCH 2/2] pkg/cvo/upgradeable: refactor throttling

Refactor the code that handles throttling upgradeability checks. Create a
new method that computes the duration for which the existing `Upgradeable`
status is considered recent enough to not be synced, and simply pass this
duration to the `RecentlyChanged` method. The new method is now unit
tested, too.

Upgradeable-related intervals are now decoupled from unrelated sync
intervals and are grouped in a new struct.
---
 pkg/cvo/cvo.go              | 17 ++++-----
 pkg/cvo/upgradeable.go      | 76 ++++++++++++++++++++++++-------------
 pkg/cvo/upgradeable_test.go | 59 ++++++++++++++++++++++++++++
 3 files changed, 117 insertions(+), 35 deletions(-)
 create mode 100644 pkg/cvo/upgradeable_test.go

diff --git a/pkg/cvo/cvo.go b/pkg/cvo/cvo.go
index 82538be81..a70b73eed 100644
--- a/pkg/cvo/cvo.go
+++ b/pkg/cvo/cvo.go
@@ -125,10 +125,9 @@ type Operator struct {
 	upgradeable       *upgradeable
 	upgradeableChecks []upgradeableCheck
 
-	// minimumUpgradeableCheckInterval is the minimum duration before another
-	// synchronization of the upgradeable status can happen during failing
-	// precondition checks.
-	minimumUpgradeableCheckInterval time.Duration
+	// upgradeableCheckIntervals drives the minimal intervals between Upgradeable status
+	// synchronizations
+	upgradeableCheckIntervals upgradeableCheckIntervals
 
 	// verifier, if provided, will be used to check an update before it is executed.
 	// Any error will prevent an update payload from being accessed.
@@ -188,11 +187,11 @@ func New(
 			Image: releaseImage,
 		},
 
-		statusInterval:                  15 * time.Second,
-		minimumUpdateCheckInterval:      minimumInterval,
-		minimumUpgradeableCheckInterval: 15 * time.Second,
-		payloadDir:                      overridePayloadDir,
-		defaultUpstreamServer:           "https://api.openshift.com/api/upgrades_info/v1/graph",
+		statusInterval:             15 * time.Second,
+		minimumUpdateCheckInterval: minimumInterval,
+		upgradeableCheckIntervals:  defaultUpgradeableCheckIntervals(),
+		payloadDir:                 overridePayloadDir,
+		defaultUpstreamServer:      "https://api.openshift.com/api/upgrades_info/v1/graph",
 
 		client:     client,
 		kubeClient: kubeClient,
diff --git a/pkg/cvo/upgradeable.go b/pkg/cvo/upgradeable.go
index 4830c4a1a..0074e2a54 100644
--- a/pkg/cvo/upgradeable.go
+++ b/pkg/cvo/upgradeable.go
@@ -29,20 +29,45 @@ import (
 const (
 	adminAckGateFmt string = "^ack-[4-5][.]([0-9]{1,})-[^-]"
 	upgradeableAdminAckRequired = configv1.ClusterStatusConditionType("UpgradeableAdminAckRequired")
-	checkThrottlePeriod = 2 * time.Minute
 )
 
 var adminAckGateRegexp = regexp.MustCompile(adminAckGateFmt)
 
-// syncUpgradeable synchronizes the upgradeable status only if it has been more than
-// the checkThrottlePeriod since the last synchronization or the precondition
-// checks on the payload are failing for less than minimumUpdateCheckInterval, and it has
-// been more than the minimumUpgradeableCheckInterval since the last synchronization.
-func (optr *Operator) syncUpgradeable(config *configv1.ClusterVersion) error {
-	u := optr.getUpgradeable()
-	if u != nil && u.RecentlyChanged(checkThrottlePeriod) && !shouldSyncUpgradeableDueToPreconditionChecks(optr, config, u) {
-		klog.V(2).Infof("Upgradeable conditions were recently checked, will try later.")
-		return nil
+// upgradeableCheckIntervals holds the time intervals that drive how often CVO checks for upgradeability
+type upgradeableCheckIntervals struct {
+	// min is the base minimum interval between upgradeability checks, applied under normal circumstances
+	min time.Duration
+
+	// minOnFailedPreconditions is the minimum interval between upgradeability checks when precondition checks are
+	// failing and changed recently (see afterPreconditionsFailed). This should be lower than min because we want CVO
+	// to check upgradeability more often
+	minOnFailedPreconditions time.Duration
+
+	// afterPreconditionsFailed is the period of time after preconditions failed when minOnFailedPreconditions is
+	// applied instead of min
+	afterPreconditionsFailed time.Duration
+}
+
+func defaultUpgradeableCheckIntervals() upgradeableCheckIntervals {
+	return upgradeableCheckIntervals{
+		// 2 minutes is used here because it is the lower bound of the previously nondeterministically chosen interval
+		// TODO (OTA-860): Investigate our options of reducing this interval. We will need to investigate
+		// the API usage patterns of the underlying checks, there is anecdotal evidence that they hit
+		// apiserver instead of using local informer cache
+		min:                      2 * time.Minute,
+		minOnFailedPreconditions: 15 * time.Second,
+		afterPreconditionsFailed: 2 * time.Minute,
+	}
+}
+
+// syncUpgradeable synchronizes the upgradeable status only if sufficient time has passed since its last update.
+// This throttling period is dynamic and is driven by upgradeableCheckIntervals.
+func (optr *Operator) syncUpgradeable(cv *configv1.ClusterVersion) error {
+	if u := optr.getUpgradeable(); u != nil {
+		if u.RecentlyChanged(optr.upgradeableCheckIntervals.throttlePeriod(cv)) {
+			klog.V(2).Infof("Upgradeable conditions were recently checked, will try later.")
+			return nil
+		}
 	}
 
 	optr.setUpgradeableConditions()
@@ -451,21 +476,20 @@ func (optr *Operator) adminGatesEventHandler() cache.ResourceEventHandler {
 	}
 }
 
-// shouldSyncUpgradeableDueToPreconditionChecks checks if the upgradeable status should
-// be synchronized due to the precondition checks. It checks whether the precondition
-// checks on the payload are failing for less than minimumUpdateCheckInterval, and it has
-// been more than the minimumUpgradeableCheckInterval since the last synchronization.
-// This means, upon precondition failure, we will synchronize the upgradeable status
-// more frequently at beginning of an upgrade.
+// throttlePeriod returns the duration for which the upgradeable status should be considered recent
+// enough and unnecessary to update. The baseline duration is min. When the precondition checks
+// on the payload have been failing for less than afterPreconditionsFailed, we want to synchronize
+// the upgradeable status more frequently at the beginning of an upgrade, and so return
+// minOnFailedPreconditions, which is expected to be lower than min.
 //
-// shouldSyncUpgradeableDueToPreconditionChecks expects the parameters not to be nil.
-//
-// Function returns true if the synchronization should happen, returns false otherwise.
-func shouldSyncUpgradeableDueToPreconditionChecks(optr *Operator, config *configv1.ClusterVersion, u *upgradeable) bool {
-	cond := resourcemerge.FindOperatorStatusCondition(config.Status.Conditions, DesiredReleaseAccepted)
-	if cond != nil && cond.Reason == "PreconditionChecks" && cond.Status == configv1.ConditionFalse &&
-		hasPassedDurationSinceTime(u.At, optr.minimumUpgradeableCheckInterval) && !hasPassedDurationSinceTime(cond.LastTransitionTime.Time, optr.minimumUpdateCheckInterval) {
-		return true
-	}
-	return false
+// The cv parameter is expected to be non-nil.
+func (intervals *upgradeableCheckIntervals) throttlePeriod(cv *configv1.ClusterVersion) time.Duration {
+	if cond := resourcemerge.FindOperatorStatusCondition(cv.Status.Conditions, DesiredReleaseAccepted); cond != nil {
+		if cond.Reason == "PreconditionChecks" && cond.Status == configv1.ConditionFalse &&
+			!hasPassedDurationSinceTime(cond.LastTransitionTime.Time, intervals.afterPreconditionsFailed) {
+			return intervals.minOnFailedPreconditions
+		}
+	}
+	return intervals.min
 }
diff --git a/pkg/cvo/upgradeable_test.go b/pkg/cvo/upgradeable_test.go
new file mode 100644
index 000000000..ec9e41264
--- /dev/null
+++ b/pkg/cvo/upgradeable_test.go
@@ -0,0 +1,59 @@
+package cvo
+
+import (
+	"testing"
+	"time"
+
+	configv1 "github.com/openshift/api/config/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+)
+
+func TestUpgradeableCheckIntervalsThrottlePeriod(t *testing.T) {
+	intervals := defaultUpgradeableCheckIntervals()
+	testCases := []struct {
+		name      string
+		condition *configv1.ClusterOperatorStatusCondition
+		expected  time.Duration
+	}{
+		{
+			name:     "no condition",
+			expected: intervals.min,
+		},
+		{
+			name:      "passing preconditions",
+			condition: &configv1.ClusterOperatorStatusCondition{Type: DesiredReleaseAccepted, Reason: "PreconditionChecks", Status: configv1.ConditionTrue, LastTransitionTime: metav1.Now()},
+			expected:  intervals.min,
+		},
+		{
+			name:      "failing but not precondition",
+			condition: &configv1.ClusterOperatorStatusCondition{Type: DesiredReleaseAccepted, Reason: "NotPreconditionChecks", Status: configv1.ConditionFalse, LastTransitionTime: metav1.Now()},
+			expected:  intervals.min,
+		},
+		{
+			name: "failing preconditions but too long ago",
+			condition: &configv1.ClusterOperatorStatusCondition{
+				Type:               DesiredReleaseAccepted,
+				Reason:             "PreconditionChecks",
+				Status:             configv1.ConditionFalse,
+				LastTransitionTime: metav1.NewTime(time.Now().Add(-(intervals.afterPreconditionsFailed + time.Hour))),
+			},
+			expected: intervals.min,
+		},
+		{
+			name:      "failing preconditions recently",
+			condition: &configv1.ClusterOperatorStatusCondition{Type: DesiredReleaseAccepted, Reason: "PreconditionChecks", Status: configv1.ConditionFalse, LastTransitionTime: metav1.Now()},
+			expected:  intervals.minOnFailedPreconditions,
+		},
+	}
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			cv := &configv1.ClusterVersion{Status: configv1.ClusterVersionStatus{Conditions: []configv1.ClusterOperatorStatusCondition{}}}
+			if tc.condition != nil {
+				cv.Status.Conditions = append(cv.Status.Conditions, *tc.condition)
+			}
+			if actual := intervals.throttlePeriod(cv); actual != tc.expected {
+				t.Errorf("throttlePeriod() = %v, want %v", actual, tc.expected)
+			}
+		})
+	}
+}
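The following is a minimal, self-contained sketch of the throttling decision this series introduces, for readers who want to see the rule in isolation. It is not the repository's code: the intervals struct, throttlePeriod function, preconditionsFailing flag, and lastSync value below are simplified, illustrative stand-ins for the package-internal upgradeableCheckIntervals type, its throttlePeriod method, and the DesiredReleaseAccepted condition state; only the default durations and the decision rule mirror the patches.

package main

import (
	"fmt"
	"time"
)

// intervals is an illustrative stand-in for upgradeableCheckIntervals.
type intervals struct {
	min                      time.Duration
	minOnFailedPreconditions time.Duration
	afterPreconditionsFailed time.Duration
}

// throttlePeriod restates the decision rule: use the shorter interval only while
// precondition checks are failing and started failing recently.
func throttlePeriod(iv intervals, preconditionsFailing bool, lastTransition time.Time) time.Duration {
	if preconditionsFailing && time.Since(lastTransition) < iv.afterPreconditionsFailed {
		return iv.minOnFailedPreconditions
	}
	return iv.min
}

func main() {
	iv := intervals{
		min:                      2 * time.Minute,
		minOnFailedPreconditions: 15 * time.Second,
		afterPreconditionsFailed: 2 * time.Minute,
	}
	lastSync := 30 * time.Second // the Upgradeable status was refreshed 30s ago

	// Normal operation: 30s is within the 2m throttle, so the sync is skipped as "recently checked".
	fmt.Println("throttled (normal):", lastSync < throttlePeriod(iv, false, time.Time{}))

	// Preconditions started failing a minute ago: the 15s throttle applies and the sync proceeds.
	fmt.Println("throttled (preconditions failing since 1m):", lastSync < throttlePeriod(iv, true, time.Now().Add(-time.Minute)))
}

Under these defaults the status is refreshed at most every two minutes in steady state, but as often as every fifteen seconds during the first two minutes after preconditions start failing.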