From 6c7cd992c64b7f2d421898d203528660cee9853c Mon Sep 17 00:00:00 2001
From: "W. Trevor King"
Date: Mon, 24 May 2021 14:26:01 -0700
Subject: [PATCH] pkg/payload/task: Fail fast for UpdateError

Things like API-server connection errors and patch conflicts deserve
some retries before we bubble them up into ClusterVersion conditions.
But when we are able to retrieve in-cluster objects and determine that
they are not happy, we should exit more quickly so we can complain
about the resource state and start in on the next sync cycle. For
example, see the recent e02d1489a5 (pkg/cvo/internal/operatorstatus:
Replace wait-for with single-shot "is it alive now?", 2021-05-12,
#560) and the older cc9292a061 (lib/resourcebuilder: Replace wait-for
with single-shot "is it alive now?", 2020-07-07, #400).

This commit uses the presence of an UpdateError as a marker for "fail
fast; no need to retry".

The install-time backoff is from fee2d06871 (sync: Completely
parallelize the initial payload, 2019-03-11, #136). I'm not sure if it
really wants the same cap as reconcile and update modes, but I've left
them the same for now, and future commits to pivot the backoff
settings can focus on motivating those pivots.

I'd tried dropping:

    backoff := st.Backoff

and passing st.Backoff directly to ExponentialBackoffWithContext, but
it turns out that Step() [1]:

    ... mutates the provided Backoff to update its Steps and Duration.

Luckily, Backoff has no pointer properties, so storing as a local
variable is sufficient to give us a fresh copy for the local
mutations.

[1]: https://pkg.go.dev/k8s.io/apimachinery/pkg/util/wait#Backoff.Step
---

A standalone sketch of the Backoff copy semantics and the fail-fast
behavior is appended after the diff.

 pkg/cvo/cvo.go                     |  1 +
 pkg/cvo/internal/operatorstatus.go |  8 +---
 pkg/cvo/sync_worker.go             |  2 +-
 pkg/payload/task.go                | 63 +++++++++++++-----------
 4 files changed, 31 insertions(+), 43 deletions(-)

diff --git a/pkg/cvo/cvo.go b/pkg/cvo/cvo.go
index ab4d540a2..9f7e1e3ab 100644
--- a/pkg/cvo/cvo.go
+++ b/pkg/cvo/cvo.go
@@ -262,6 +262,7 @@ func (optr *Operator) InitializeFromPayload(restConfig *rest.Config, burstRestCo
 			Duration: time.Second * 10,
 			Factor:   1.3,
 			Steps:    3,
+			Cap:      time.Second * 15,
 		},
 		optr.exclude,
 		optr.eventRecorder,
diff --git a/pkg/cvo/internal/operatorstatus.go b/pkg/cvo/internal/operatorstatus.go
index 5c76a5eac..4c0b76a7b 100644
--- a/pkg/cvo/internal/operatorstatus.go
+++ b/pkg/cvo/internal/operatorstatus.go
@@ -136,13 +136,7 @@ func checkOperatorHealth(ctx context.Context, client ClusterOperatorsGetter, exp
 
 	actual, err := client.Get(ctx, expected.Name)
 	if err != nil {
-		return &payload.UpdateError{
-			Nested:       err,
-			UpdateEffect: payload.UpdateEffectNone,
-			Reason:       "ClusterOperatorNotAvailable",
-			Message:      fmt.Sprintf("Cluster operator %s has not yet reported success", expected.Name),
-			Name:         expected.Name,
-		}
+		return err
 	}
 
 	// undone is a sorted slice of transition messages for incomplete operands.
diff --git a/pkg/cvo/sync_worker.go b/pkg/cvo/sync_worker.go
index 6c2d98fe0..169186fdd 100644
--- a/pkg/cvo/sync_worker.go
+++ b/pkg/cvo/sync_worker.go
@@ -678,7 +678,7 @@ func (w *SyncWorker) apply(ctx context.Context, payloadUpdate *payload.Update, w
 	var tasks []*payload.Task
 	backoff := w.backoff
 	if backoff.Steps > 1 && work.State == payload.InitializingPayload {
-		backoff = wait.Backoff{Steps: 4, Factor: 2, Duration: time.Second}
+		backoff = wait.Backoff{Steps: 4, Factor: 2, Duration: time.Second, Cap: 15 * time.Second}
 	}
 	for i := range payloadUpdate.Manifests {
 		tasks = append(tasks, &payload.Task{
diff --git a/pkg/payload/task.go b/pkg/payload/task.go
index 91bc3110a..e0a22691b 100644
--- a/pkg/payload/task.go
+++ b/pkg/payload/task.go
@@ -100,44 +100,37 @@ func (st *Task) String() string {
 func (st *Task) Run(ctx context.Context, version string, builder ResourceBuilder, state State) error {
 	var lastErr error
 	backoff := st.Backoff
-	maxDuration := 15 * time.Second // TODO: fold back into Backoff in 1.13
-	for {
-		// attempt the apply, waiting as long as necessary
-		err := builder.Apply(ctx, st.Manifest, state)
+	err := wait.ExponentialBackoffWithContext(ctx, backoff, func() (done bool, err error) {
+		err = builder.Apply(ctx, st.Manifest, state)
 		if err == nil {
-			return nil
+			return true, nil
+		}
+		if updateErr, ok := err.(*UpdateError); ok {
+			updateErr.Task = st.Copy()
+			return false, updateErr // failing fast for UpdateError
 		}
 		lastErr = err
 		utilruntime.HandleError(errors.Wrapf(err, "error running apply for %s", st))
 		metricPayloadErrors.WithLabelValues(version).Inc()
-
-		// TODO: this code will become easier in Kube 1.13 because Backoff now supports max
-		d := time.Duration(float64(backoff.Duration) * backoff.Factor)
-		if d > maxDuration {
-			d = maxDuration
-		}
-		d = wait.Jitter(d, backoff.Jitter)
-
-		// sleep or wait for cancellation
-		select {
-		case <-time.After(d):
-			continue
-		case <-ctx.Done():
-			if uerr, ok := lastErr.(*UpdateError); ok {
-				uerr.Task = st.Copy()
-				return uerr
-			}
-			reason, cause := reasonForPayloadSyncError(lastErr)
-			if len(cause) > 0 {
-				cause = ": " + cause
-			}
-			return &UpdateError{
-				Nested:  lastErr,
-				Reason:  reason,
-				Message: fmt.Sprintf("Could not update %s%s", st, cause),
-
-				Task: st.Copy(),
-			}
-		}
+		return false, nil
+	})
+	if err == nil {
+		return nil
+	}
+	if _, ok := err.(*UpdateError); ok {
+		return err
+	}
+	if lastErr != nil {
+		err = lastErr
+	}
+	reason, cause := reasonForPayloadSyncError(err)
+	if len(cause) > 0 {
+		cause = ": " + cause
+	}
+	return &UpdateError{
+		Nested:  err,
+		Reason:  reason,
+		Message: fmt.Sprintf("Could not update %s%s", st, cause),
+		Task:    st.Copy(),
 	}
 }
@@ -177,7 +170,7 @@ func (e *UpdateError) Cause() error {
 	return e.Nested
 }
 
-// reasonForUpdateError provides a succint explanation of a known error type for use in a human readable
+// reasonForPayloadSyncError provides a succinct explanation of a known error type for use in a human readable
 // message during update. Since all objects in the image should be successfully applied, messages
 // should direct the reader (likely a cluster administrator) to a possible cause in their own config.
 func reasonForPayloadSyncError(err error) (string, string) {
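
A minimal standalone sketch (not part of the commit) of the two
wait-package behaviors the message above leans on: plain assignment
copies wait.Backoff by value, so Step() only mutates the local copy,
and a non-nil error returned from the condition makes the
exponential-backoff helpers stop retrying immediately. The errNotHappy
value is a hypothetical stand-in for an UpdateError; the wait API shown
is k8s.io/apimachinery/pkg/util/wait as vendored at the time of this
patch.

    package main

    import (
        "errors"
        "fmt"
        "time"

        "k8s.io/apimachinery/pkg/util/wait"
    )

    // errNotHappy stands in for payload.UpdateError: a terminal "the
    // resource exists but is unhealthy" error that is not worth retrying.
    var errNotHappy = errors.New("resource exists but is not happy")

    func main() {
        stored := wait.Backoff{Duration: time.Second, Factor: 2, Steps: 4, Cap: 15 * time.Second}

        // Backoff has no pointer fields, so assignment is a full copy;
        // Step() decrements Steps and grows Duration only on the copy.
        local := stored
        local.Step()
        fmt.Println(stored.Steps, local.Steps) // 4 3

        // Returning a non-nil error from the condition ends the retries
        // immediately instead of burning through the remaining steps.
        attempts := 0
        err := wait.ExponentialBackoff(local, func() (bool, error) {
            attempts++
            return false, errNotHappy // analogous to the UpdateError branch in Task.Run
        })
        fmt.Println(attempts, err) // 1 resource exists but is not happy
    }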