From b6912175b37a57e43df773e2d12e3c7cf0cb2d27 Mon Sep 17 00:00:00 2001 From: "W. Trevor King" Date: Thu, 28 May 2020 00:11:27 -0700 Subject: [PATCH 1/2] pkg/cvo/sync_worker: Generalize CancelError to ContextError With this commit, I drop contextIsCancelled in favor of Context.Err(). From the docs [1]: If Done is not yet closed, Err returns nil. If Done is closed, Err returns a non-nil error explaining why: Canceled if the context was canceled or DeadlineExceeded if the context's deadline passed. After Err returns a non-nil error, successive calls to Err return the same error. I dunno why we'd been checking Done() instead, but contextIsCancelled dates back to 961873d66a (sync: Do config syncing in the background, 2019-01-11, #82). I've also generalized a number of *Cancel* helpers to be *Context* to remind folks that Context.Err() can be DeadlineExceeded as well as Canceled, and the CVO uses both WithCancel and WithTimeout. The new error messages will be either: update context deadline exceeded at 1 of 2 or: update context canceled at 1 of 2 Instead of always claiming: update was cancelled at 1 of 2 Cherry-picked from eea209284d (pkg/cvo/sync_worker: Generalize CancelError to ContextError, 2020-05-28, #378) and edited to resolve context conflicts because release-4.4 lacks 2a469e37c1 (cvo: When installing or upgrading, fast-fill cluster-operators, 2020-02-07, #318). 
[1]: https://golang.org/pkg/context/#Context --- pkg/cvo/sync_worker.go | 37 ++++++++++++++----------------------- 1 file changed, 14 insertions(+), 23 deletions(-) diff --git a/pkg/cvo/sync_worker.go b/pkg/cvo/sync_worker.go index 43dd25b79..bfd738241 100644 --- a/pkg/cvo/sync_worker.go +++ b/pkg/cvo/sync_worker.go @@ -571,8 +571,8 @@ func (w *SyncWorker) apply(ctx context.Context, payloadUpdate *payload.Update, w // update each object errs := payload.RunGraph(ctx, graph, maxWorkers, func(ctx context.Context, tasks []*payload.Task) error { for _, task := range tasks { - if contextIsCancelled(ctx) { - return cr.CancelError() + if err := ctx.Err(); err != nil { + return cr.ContextError(err) } cr.Update() @@ -616,11 +616,11 @@ func init() { ) } -type errCanceled struct { +type errContext struct { err error } -func (e errCanceled) Error() string { return e.err.Error() } +func (e errContext) Error() string { return e.err.Error() } // consistentReporter hides the details of calculating the status based on the progress // of the graph runner. 
@@ -657,7 +657,7 @@ func (r *consistentReporter) Error(err error) { copied := r.status copied.Step = "ApplyResources" copied.Fraction = float32(r.done) / float32(r.total) - if !isCancelledError(err) { + if !isContextError(err) { copied.Failure = err } r.reporter.Report(copied) @@ -678,10 +678,10 @@ func (r *consistentReporter) Errors(errs []error) error { return err } -func (r *consistentReporter) CancelError() error { +func (r *consistentReporter) ContextError(err error) error { r.lock.Lock() defer r.lock.Unlock() - return errCanceled{fmt.Errorf("update was cancelled at %d of %d", r.done, r.total)} + return errContext{fmt.Errorf("update %s at %d of %d", err, r.done, r.total)} } func (r *consistentReporter) Complete() { @@ -697,11 +697,11 @@ func (r *consistentReporter) Complete() { r.reporter.Report(copied) } -func isCancelledError(err error) bool { +func isContextError(err error) bool { if err == nil { return false } - _, ok := err.(errCanceled) + _, ok := err.(errContext) return ok } @@ -722,11 +722,12 @@ func isImageVerificationError(err error) bool { // not truly an error (cancellation). 
// TODO: take into account install vs upgrade func summarizeTaskGraphErrors(errs []error) error { - // we ignore cancellation errors since they don't provide good feedback to users and are an internal - // detail of the server - err := errors.FilterOut(errors.NewAggregate(errs), isCancelledError) + // we ignore context errors (canceled or timed out) since they don't + // provide good feedback to users and are an internal detail of the + // server + err := errors.FilterOut(errors.NewAggregate(errs), isContextError) if err == nil { - klog.V(4).Infof("All errors were cancellation errors: %v", errs) + klog.V(4).Infof("All errors were context errors: %v", errs) return nil } agg, ok := err.(errors.Aggregate) @@ -893,16 +894,6 @@ func ownerRefModifier(config *configv1.ClusterVersion) resourcebuilder.MetaV1Obj } } -// contextIsCancelled returns true if the provided context is cancelled. -func contextIsCancelled(ctx context.Context) bool { - select { - case <-ctx.Done(): - return true - default: - return false - } -} - // runThrottledStatusNotifier invokes fn every time ch is updated, but no more often than once // every interval. If bucket is non-zero then the channel is throttled like a rate limiter bucket. func runThrottledStatusNotifier(stopCh <-chan struct{}, interval time.Duration, bucket int, ch <-chan SyncWorkerStatus, fn func()) { From 9958a207271ebf55e0fcdca1aab1d788d15b5027 Mon Sep 17 00:00:00 2001 From: "W. Trevor King" Date: Thu, 28 May 2020 04:49:01 -0700 Subject: [PATCH 2/2] pkg/cvo/sync_worker: Do not treat "All errors were context errors..." as success For [1]. Before this commit, you could have a flow like: 1. SyncWorker.Start() 2. External code calls Update(), e.g. after noticing a ClusterVersion spec.desiredUpdate change. 3. Update sets w.work to point at the desired payload. 4. Start's Until loop is triggered via w.notify. 5. Start calls calculateNext, which notices the change and sets the state to UpdatingPayload. 6. 
Start calculates a syncTimeout and calls syncOnce. 7. syncOnce notices the new payload and loads it. 8. For whatever reason, payload retrieval takes a while. Blackholed signature-fetch attempts in a restricted network [2] are one example of something that could be slow here. Eventually the syncTimeout kicks in and signature fetching or other verification is canceled (which counts as failed verification). 9. Force is true, so syncOnce logs "Forcing past precondition failures..." but carries on to call apply. 10. apply computes the manifest graph, runs the ClusterOperator precreation (whose handlers return ContextError() right after spawning, because the context is already expired), and runs the main RunGraph (whose handlers do the same). 11. The main RunGraph returns a slice with a handful of context errors and nothing else. apply passes this on to consistentReporter.Errors. 12. consistentReporter.Errors calls summarizeTaskGraphErrors, which logs "All errors were context errors..." and returns nil to avoid alarming consistentReporter.Errors (we don't want to put this in our ClusterVersion status and alarm users). 13. apply returns the summarized nil to syncOnce. 14. syncOnce returns the summarized nil to Start. 15. Start logs "Sync succeeded..." and flops into ReconcilingPayload for the next round. 16. Start comes into the next round in reconciling mode despite never having attempted to apply any manifests in its Updating mode. The manifest graph gets flattened and shuffled and all kinds of terrible things could happen like the machine-config trying to roll out the newer machine-os-content and its 4.4 hyperkube binary before rolling out prerequisites like the 4.4 kube-apiserver operator. With this commit, the process is the same through 12, but ends with: 13. apply returns the first context error to syncOnce. 14. syncOnce returns that error to Start. 15. Start backs off and comes in again with a second attempt at UpdatingPayload. 16. 
Manifests get pushed in the intended order, and nothing explodes. The race fixed in this commit could also have come up without timing out the payload pull/verification, e.g. by having perfectly slow ClusterOperator preconditions. [1]: https://bugzilla.redhat.com/show_bug.cgi?id=1838497#c23 [2]: https://bugzilla.redhat.com/show_bug.cgi?id=1840343 --- pkg/cvo/sync_worker.go | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pkg/cvo/sync_worker.go b/pkg/cvo/sync_worker.go index bfd738241..02f312489 100644 --- a/pkg/cvo/sync_worker.go +++ b/pkg/cvo/sync_worker.go @@ -594,8 +594,10 @@ func (w *SyncWorker) apply(ctx context.Context, payloadUpdate *payload.Update, w return nil }) if len(errs) > 0 { - err := cr.Errors(errs) - return err + if err := cr.Errors(errs); err != nil { + return err + } + return errs[0] } // update the status