From ea86b216750e7334d8de701b9f81841057a6776d Mon Sep 17 00:00:00 2001 From: Scott Dodson Date: Thu, 1 Oct 2020 13:32:09 -0400 Subject: [PATCH] pkg/cvo/status: Raise Operator leveling grace-period to 40 minutes Similar to #422, further tune things up so that we can ensure that our 90th percentile of clusters do not trip over momentary cluster upgrade failures whenever operators take longer than 20 minutes to roll out. --- docs/user/status.md | 2 +- pkg/cvo/status.go | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/user/status.md b/docs/user/status.md index 26a1875542..b6cbcdf73a 100644 --- a/docs/user/status.md +++ b/docs/user/status.md @@ -22,7 +22,7 @@ If this happens it is a CVO coding error, because clearing [`desiredUpdate`][api `ClusterOperatorNotAvailable` (or the consolidated `ClusterOperatorsNotAvailable`) is set when the CVO fails to retrieve the ClusterOperator from the cluster or when the retrieved ClusterOperator does not satisfy [the reconciliation conditions](reconciliation.md#clusteroperator). Unlike most manifest-reconciliation failures, this error does not immediately result in `Failing=True`. -Under some conditions during installs and updates, the CVO will treat this condition as a `Progressing=True` condition and give the operator up to twenty minutes to level before reporting `Failing=True`. +Under some conditions during installs and updates, the CVO will treat this condition as a `Progressing=True` condition and give the operator up to forty minutes to level before reporting `Failing=True`. ## RetrievedUpdates diff --git a/pkg/cvo/status.go b/pkg/cvo/status.go index 755f2fd55a..828f94560c 100644 --- a/pkg/cvo/status.go +++ b/pkg/cvo/status.go @@ -331,13 +331,13 @@ func (optr *Operator) syncStatus(original, config *configv1.ClusterVersion, stat // convertErrorToProgressing returns true if the provided status indicates a failure condition can be interpreted as // still making internal progress. 
The general error we try to suppress is an operator or operators still being -// unavailable AND the general payload task making progress towards its goal. An operator is given 20 minutes since +// unavailable AND the general payload task making progress towards its goal. An operator is given 40 minutes since // its last update to go ready, or an hour has elapsed since the update began, before the condition is ignored. func convertErrorToProgressing(history []configv1.UpdateHistory, now time.Time, status *SyncWorkerStatus) (reason string, message string, ok bool) { if len(history) == 0 || status.Failure == nil || status.Reconciling || status.LastProgress.IsZero() { return "", "", false } - if now.Sub(status.LastProgress) > 20*time.Minute || now.Sub(history[0].StartedTime.Time) > time.Hour { + if now.Sub(status.LastProgress) > 40*time.Minute || now.Sub(history[0].StartedTime.Time) > time.Hour { return "", "", false } uErr, ok := status.Failure.(*payload.UpdateError)