diff --git a/pkg/cvo/cvo.go b/pkg/cvo/cvo.go index 349aca033f..b3667a5905 100644 --- a/pkg/cvo/cvo.go +++ b/pkg/cvo/cvo.go @@ -281,6 +281,7 @@ func (optr *Operator) InitializeFromPayload(ctx context.Context, restConfig *res Duration: time.Second * 10, Factor: 1.3, Steps: 3, + Cap: time.Second * 15, }, optr.exclude, optr.includeTechPreview, diff --git a/pkg/cvo/cvo_scenarios_test.go b/pkg/cvo/cvo_scenarios_test.go index 118402dd24..f2f6a38f41 100644 --- a/pkg/cvo/cvo_scenarios_test.go +++ b/pkg/cvo/cvo_scenarios_test.go @@ -3554,15 +3554,13 @@ func TestCVO_ParallelError(t *testing.T) { worker := o.configSync.(*SyncWorker) b := &errorResourceBuilder{errors: map[string]error{ "0000_10_a_file.yaml": &payload.UpdateError{ - Reason: "ClusterOperatorNotAvailable", UpdateEffect: payload.UpdateEffectNone, - Name: "operator-1", + Message: "Failed to reconcile 10-a-file resource", }, "0000_20_a_file.yaml": nil, "0000_20_b_file.yaml": &payload.UpdateError{ - Reason: "ClusterOperatorNotAvailable", UpdateEffect: payload.UpdateEffectNone, - Name: "operator-2", + Message: "Failed to reconcile 20-b-file resource", }, }} worker.builder = b @@ -3640,9 +3638,6 @@ func TestCVO_ParallelError(t *testing.T) { // Step 3: Cancel after we've accumulated 2/3 errors // - time.Sleep(100 * time.Millisecond) - cancel() - // // verify we observe the remaining changes in the first sync for status := range worker.StatusCh() { if status.Failure == nil { @@ -3675,7 +3670,7 @@ func TestCVO_ParallelError(t *testing.T) { } err := status.Failure uErr, ok := err.(*payload.UpdateError) - if !ok || uErr.Reason != "ClusterOperatorsNotAvailable" || uErr.Message != "Some cluster operators are still updating: operator-1, operator-2" { + if !ok || uErr.Reason != "MultipleErrors" || uErr.Message != "Multiple errors are preventing progress:\n* Failed to reconcile 10-a-file resource\n* Failed to reconcile 20-b-file resource" { t.Fatalf("unexpected error: %v", err) } if status.LastProgress.IsZero() { @@ -3707,6 +3702,7 @@ func TestCVO_ParallelError(t *testing.T) { } break } + cancel() verifyAllStatus(t, worker.StatusCh()) client.ClearActions() @@ -3746,8 +3742,8 @@ func TestCVO_ParallelError(t *testing.T) { {Type: DesiredReleaseAccepted, Status: configv1.ConditionTrue, Reason: "PayloadLoaded", Message: "Payload loaded version=\"1.0.0-abc\" image=\"image/image:1\""}, {Type: configv1.OperatorAvailable, Status: configv1.ConditionFalse}, - {Type: ClusterStatusFailing, Status: configv1.ConditionFalse}, - {Type: configv1.OperatorProgressing, Status: configv1.ConditionTrue, Reason: "ClusterOperatorsNotAvailable", Message: "Working towards 1.0.0-abc: 1 of 3 done (33% complete), waiting on operator-1, operator-2"}, + {Type: ClusterStatusFailing, Status: configv1.ConditionTrue, Reason: "MultipleErrors", Message: "Multiple errors are preventing progress:\n* Failed to reconcile 10-a-file resource\n* Failed to reconcile 20-b-file resource"}, + {Type: configv1.OperatorProgressing, Status: configv1.ConditionTrue, Reason: "MultipleErrors", Message: "Unable to apply 1.0.0-abc: an unknown error has occurred: MultipleErrors"}, {Type: configv1.RetrievedUpdates, Status: configv1.ConditionFalse}, }, }, diff --git a/pkg/cvo/internal/operatorstatus.go b/pkg/cvo/internal/operatorstatus.go index cf7cb8057d..d013905d86 100644 --- a/pkg/cvo/internal/operatorstatus.go +++ b/pkg/cvo/internal/operatorstatus.go @@ -126,22 +126,18 @@ func (b *clusterOperatorBuilder) Do(ctx context.Context) error { func checkOperatorHealth(ctx context.Context, client ClusterOperatorsGetter, 
expected *configv1.ClusterOperator, mode resourcebuilder.Mode) error { if len(expected.Status.Versions) == 0 { return &payload.UpdateError{ - UpdateEffect: payload.UpdateEffectFail, - Reason: "ClusterOperatorNotAvailable", - Message: fmt.Sprintf("Cluster operator %s does not declare expected versions", expected.Name), - Name: expected.Name, + UpdateEffect: payload.UpdateEffectFail, + Reason: "ClusterOperatorNoVersions", + PluralReason: "ClusterOperatorsNoVersions", + Message: fmt.Sprintf("Cluster operator %s does not declare expected versions", expected.Name), + PluralMessageFormat: "Cluster operators %s do not declare expected versions", + Name: expected.Name, } } actual, err := client.Get(ctx, expected.Name) if err != nil { - return &payload.UpdateError{ - Nested: err, - UpdateEffect: payload.UpdateEffectNone, - Reason: "ClusterOperatorNotAvailable", - Message: fmt.Sprintf("Cluster operator %s has not yet reported success", expected.Name), - Name: expected.Name, - } + return err } // undone is a sorted slice of transition messages for incomplete operands. @@ -195,11 +191,13 @@ func checkOperatorHealth(ctx context.Context, client ClusterOperatorsGetter, exp nestedMessage = fmt.Errorf("cluster operator %s is %s=%s: %s: %s", actual.Name, availableCondition.Type, availableCondition.Status, availableCondition.Reason, availableCondition.Message) } return &payload.UpdateError{ - Nested: nestedMessage, - UpdateEffect: payload.UpdateEffectFail, - Reason: "ClusterOperatorNotAvailable", - Message: fmt.Sprintf("Cluster operator %s is not available", actual.Name), - Name: actual.Name, + Nested: nestedMessage, + UpdateEffect: payload.UpdateEffectFail, + Reason: "ClusterOperatorNotAvailable", + PluralReason: "ClusterOperatorsNotAvailable", + Message: fmt.Sprintf("Cluster operator %s is not available", actual.Name), + PluralMessageFormat: "Cluster operators %s are not available", + Name: actual.Name, } } @@ -209,11 +207,13 @@ func checkOperatorHealth(ctx context.Context, client ClusterOperatorsGetter, exp nestedMessage = fmt.Errorf("cluster operator %s is %s=%s: %s, %s", actual.Name, degradedCondition.Type, degradedCondition.Status, degradedCondition.Reason, degradedCondition.Message) } return &payload.UpdateError{ - Nested: nestedMessage, - UpdateEffect: payload.UpdateEffectFailAfterInterval, - Reason: "ClusterOperatorDegraded", - Message: fmt.Sprintf("Cluster operator %s is degraded", actual.Name), - Name: actual.Name, + Nested: nestedMessage, + UpdateEffect: payload.UpdateEffectFailAfterInterval, + Reason: "ClusterOperatorDegraded", + PluralReason: "ClusterOperatorsDegraded", + Message: fmt.Sprintf("Cluster operator %s is degraded", actual.Name), + PluralMessageFormat: "Cluster operators %s are degraded", + Name: actual.Name, } } @@ -221,11 +221,13 @@ func checkOperatorHealth(ctx context.Context, client ClusterOperatorsGetter, exp if len(undone) > 0 && mode != resourcebuilder.InitializingMode { nestedMessage = fmt.Errorf("cluster operator %s is available and not degraded but has not finished updating to target version", actual.Name) return &payload.UpdateError{ - Nested: nestedMessage, - UpdateEffect: payload.UpdateEffectNone, - Reason: "ClusterOperatorUpdating", - Message: fmt.Sprintf("Cluster operator %s is updating versions", actual.Name), - Name: actual.Name, + Nested: nestedMessage, + UpdateEffect: payload.UpdateEffectNone, + Reason: "ClusterOperatorUpdating", + PluralReason: "ClusterOperatorsUpdating", + Message: fmt.Sprintf("Cluster operator %s is updating versions", actual.Name), + 
PluralMessageFormat: "Cluster operators %s are updating versions", + Name: actual.Name, } } diff --git a/pkg/cvo/internal/operatorstatus_test.go b/pkg/cvo/internal/operatorstatus_test.go index 8877a2ecd0..6a714f0458 100644 --- a/pkg/cvo/internal/operatorstatus_test.go +++ b/pkg/cvo/internal/operatorstatus_test.go @@ -42,13 +42,7 @@ func Test_checkOperatorHealth(t *testing.T) { }}, }, }, - expErr: &payload.UpdateError{ - Nested: apierrors.NewNotFound(schema.GroupResource{Resource: "clusteroperator"}, "test-co"), - UpdateEffect: payload.UpdateEffectNone, - Reason: "ClusterOperatorNotAvailable", - Message: "Cluster operator test-co has not yet reported success", - Name: "test-co", - }, + expErr: apierrors.NewNotFound(schema.GroupResource{Resource: "clusteroperator"}, "test-co"), }, { name: "cluster operator reporting available=true and degraded=false, but no versions", actual: &configv1.ClusterOperator{ @@ -69,11 +63,13 @@ func Test_checkOperatorHealth(t *testing.T) { }, }, expErr: &payload.UpdateError{ - Nested: fmt.Errorf("cluster operator test-co is available and not degraded but has not finished updating to target version"), - UpdateEffect: payload.UpdateEffectNone, - Reason: "ClusterOperatorUpdating", - Message: "Cluster operator test-co is updating versions", - Name: "test-co", + Nested: fmt.Errorf("cluster operator test-co is available and not degraded but has not finished updating to target version"), + UpdateEffect: payload.UpdateEffectNone, + Reason: "ClusterOperatorUpdating", + PluralReason: "ClusterOperatorsUpdating", + Message: "Cluster operator test-co is updating versions", + PluralMessageFormat: "Cluster operators %s are updating versions", + Name: "test-co", }, }, { name: "cluster operator reporting available=true, degraded=false, but no versions, while expecting an operator version", @@ -97,11 +93,13 @@ func Test_checkOperatorHealth(t *testing.T) { }, }, expErr: &payload.UpdateError{ - Nested: fmt.Errorf("cluster operator test-co is available and not degraded but has not finished updating to target version"), - UpdateEffect: payload.UpdateEffectNone, - Reason: "ClusterOperatorUpdating", - Message: "Cluster operator test-co is updating versions", - Name: "test-co", + Nested: fmt.Errorf("cluster operator test-co is available and not degraded but has not finished updating to target version"), + UpdateEffect: payload.UpdateEffectNone, + Reason: "ClusterOperatorUpdating", + PluralReason: "ClusterOperatorsUpdating", + Message: "Cluster operator test-co is updating versions", + PluralMessageFormat: "Cluster operators %s are updating versions", + Name: "test-co", }, }, { name: "cluster operator reporting available=true and degraded=false, but no versions for operand", @@ -128,11 +126,13 @@ func Test_checkOperatorHealth(t *testing.T) { }, }, expErr: &payload.UpdateError{ - Nested: fmt.Errorf("cluster operator test-co is available and not degraded but has not finished updating to target version"), - UpdateEffect: payload.UpdateEffectNone, - Reason: "ClusterOperatorUpdating", - Message: "Cluster operator test-co is updating versions", - Name: "test-co", + Nested: fmt.Errorf("cluster operator test-co is available and not degraded but has not finished updating to target version"), + UpdateEffect: payload.UpdateEffectNone, + Reason: "ClusterOperatorUpdating", + PluralReason: "ClusterOperatorsUpdating", + Message: "Cluster operator test-co is updating versions", + PluralMessageFormat: "Cluster operators %s are updating versions", + Name: "test-co", }, }, { name: "cluster operator reporting 
available=true and degraded=false, but old versions", @@ -161,11 +161,13 @@ func Test_checkOperatorHealth(t *testing.T) { }, }, expErr: &payload.UpdateError{ - Nested: fmt.Errorf("cluster operator test-co is available and not degraded but has not finished updating to target version"), - UpdateEffect: payload.UpdateEffectNone, - Reason: "ClusterOperatorUpdating", - Message: "Cluster operator test-co is updating versions", - Name: "test-co", + Nested: fmt.Errorf("cluster operator test-co is available and not degraded but has not finished updating to target version"), + UpdateEffect: payload.UpdateEffectNone, + Reason: "ClusterOperatorUpdating", + PluralReason: "ClusterOperatorsUpdating", + Message: "Cluster operator test-co is updating versions", + PluralMessageFormat: "Cluster operators %s are updating versions", + Name: "test-co", }, }, { name: "cluster operator reporting available=true and degraded=false, but mix of desired and old versions", @@ -194,11 +196,13 @@ func Test_checkOperatorHealth(t *testing.T) { }, }, expErr: &payload.UpdateError{ - Nested: fmt.Errorf("cluster operator test-co is available and not degraded but has not finished updating to target version"), - UpdateEffect: payload.UpdateEffectNone, - Reason: "ClusterOperatorUpdating", - Message: "Cluster operator test-co is updating versions", - Name: "test-co", + Nested: fmt.Errorf("cluster operator test-co is available and not degraded but has not finished updating to target version"), + UpdateEffect: payload.UpdateEffectNone, + Reason: "ClusterOperatorUpdating", + PluralReason: "ClusterOperatorsUpdating", + Message: "Cluster operator test-co is updating versions", + PluralMessageFormat: "Cluster operators %s are updating versions", + Name: "test-co", }, }, { name: "cluster operator reporting available=true, degraded=false, and desired operator, but old versions for 2 operands", @@ -231,11 +235,13 @@ func Test_checkOperatorHealth(t *testing.T) { }, }, expErr: &payload.UpdateError{ - Nested: fmt.Errorf("cluster operator test-co is available and not degraded but has not finished updating to target version"), - UpdateEffect: payload.UpdateEffectNone, - Reason: "ClusterOperatorUpdating", - Message: "Cluster operator test-co is updating versions", - Name: "test-co", + Nested: fmt.Errorf("cluster operator test-co is available and not degraded but has not finished updating to target version"), + UpdateEffect: payload.UpdateEffectNone, + Reason: "ClusterOperatorUpdating", + PluralReason: "ClusterOperatorsUpdating", + Message: "Cluster operator test-co is updating versions", + PluralMessageFormat: "Cluster operators %s are updating versions", + Name: "test-co", }, }, { name: "cluster operator reporting available=true, degraded=false, and desired operator, but mix of old and desired versions for 2 operands", @@ -268,11 +274,13 @@ func Test_checkOperatorHealth(t *testing.T) { }, }, expErr: &payload.UpdateError{ - Nested: fmt.Errorf("cluster operator test-co is available and not degraded but has not finished updating to target version"), - UpdateEffect: payload.UpdateEffectNone, - Reason: "ClusterOperatorUpdating", - Message: "Cluster operator test-co is updating versions", - Name: "test-co", + Nested: fmt.Errorf("cluster operator test-co is available and not degraded but has not finished updating to target version"), + UpdateEffect: payload.UpdateEffectNone, + Reason: "ClusterOperatorUpdating", + PluralReason: "ClusterOperatorsUpdating", + Message: "Cluster operator test-co is updating versions", + PluralMessageFormat: "Cluster 
operators %s are updating versions", + Name: "test-co", }, }, { name: "cluster operator reporting desired versions and no conditions", @@ -297,11 +305,13 @@ func Test_checkOperatorHealth(t *testing.T) { }, }, expErr: &payload.UpdateError{ - Nested: fmt.Errorf("cluster operator test-co: available=false, progressing=true, degraded=true, undone="), - UpdateEffect: payload.UpdateEffectFail, - Reason: "ClusterOperatorNotAvailable", - Message: "Cluster operator test-co is not available", - Name: "test-co", + Nested: fmt.Errorf("cluster operator test-co: available=false, progressing=true, degraded=true, undone="), + UpdateEffect: payload.UpdateEffectFail, + Reason: "ClusterOperatorNotAvailable", + PluralReason: "ClusterOperatorsNotAvailable", + Message: "Cluster operator test-co is not available", + PluralMessageFormat: "Cluster operators %s are not available", + Name: "test-co", }, }, { name: "cluster operator reporting progressing=true", @@ -327,11 +337,13 @@ func Test_checkOperatorHealth(t *testing.T) { }, }, expErr: &payload.UpdateError{ - Nested: fmt.Errorf("cluster operator test-co: available=false, progressing=true, degraded=true, undone="), - UpdateEffect: payload.UpdateEffectFail, - Reason: "ClusterOperatorNotAvailable", - Message: "Cluster operator test-co is not available", - Name: "test-co", + Nested: fmt.Errorf("cluster operator test-co: available=false, progressing=true, degraded=true, undone="), + UpdateEffect: payload.UpdateEffectFail, + Reason: "ClusterOperatorNotAvailable", + PluralReason: "ClusterOperatorsNotAvailable", + Message: "Cluster operator test-co is not available", + PluralMessageFormat: "Cluster operators %s are not available", + Name: "test-co", }, }, { name: "cluster operator reporting available=false degraded=true", @@ -357,11 +369,13 @@ func Test_checkOperatorHealth(t *testing.T) { }, }, expErr: &payload.UpdateError{ - Nested: fmt.Errorf("cluster operator test-co: available=false, progressing=true, degraded=true, undone="), - UpdateEffect: payload.UpdateEffectFail, - Reason: "ClusterOperatorNotAvailable", - Message: "Cluster operator test-co is not available", - Name: "test-co", + Nested: fmt.Errorf("cluster operator test-co: available=false, progressing=true, degraded=true, undone="), + UpdateEffect: payload.UpdateEffectFail, + Reason: "ClusterOperatorNotAvailable", + PluralReason: "ClusterOperatorsNotAvailable", + Message: "Cluster operator test-co is not available", + PluralMessageFormat: "Cluster operators %s are not available", + Name: "test-co", }, }, { name: "cluster operator reporting available=true, degraded=false, and progressing=true", @@ -417,11 +431,13 @@ func Test_checkOperatorHealth(t *testing.T) { }, }, expErr: &payload.UpdateError{ - Nested: fmt.Errorf("cluster operator test-co is Degraded=True: RandomReason, random error"), - UpdateEffect: payload.UpdateEffectFailAfterInterval, - Reason: "ClusterOperatorDegraded", - Message: "Cluster operator test-co is degraded", - Name: "test-co", + Nested: fmt.Errorf("cluster operator test-co is Degraded=True: RandomReason, random error"), + UpdateEffect: payload.UpdateEffectFailAfterInterval, + Reason: "ClusterOperatorDegraded", + PluralReason: "ClusterOperatorsDegraded", + Message: "Cluster operator test-co is degraded", + PluralMessageFormat: "Cluster operators %s are degraded", + Name: "test-co", }, }, { name: "cluster operator reporting available=true progressing=true degraded=true", @@ -451,11 +467,13 @@ func Test_checkOperatorHealth(t *testing.T) { }, }, expErr: &payload.UpdateError{ - Nested: 
fmt.Errorf("cluster operator test-co is Degraded=True: RandomReason, random error"), - UpdateEffect: payload.UpdateEffectFailAfterInterval, - Reason: "ClusterOperatorDegraded", - Message: "Cluster operator test-co is degraded", - Name: "test-co", + Nested: fmt.Errorf("cluster operator test-co is Degraded=True: RandomReason, random error"), + UpdateEffect: payload.UpdateEffectFailAfterInterval, + Reason: "ClusterOperatorDegraded", + PluralReason: "ClusterOperatorsDegraded", + Message: "Cluster operator test-co is degraded", + PluralMessageFormat: "Cluster operators %s are degraded", + Name: "test-co", }, }, { name: "cluster operator reporting available=true no progressing or degraded", @@ -481,11 +499,13 @@ func Test_checkOperatorHealth(t *testing.T) { }, }, expErr: &payload.UpdateError{ - Nested: fmt.Errorf("cluster operator test-co: available=true, progressing=true, degraded=true, undone="), - UpdateEffect: payload.UpdateEffectFailAfterInterval, - Reason: "ClusterOperatorDegraded", - Message: "Cluster operator test-co is degraded", - Name: "test-co", + Nested: fmt.Errorf("cluster operator test-co: available=true, progressing=true, degraded=true, undone="), + UpdateEffect: payload.UpdateEffectFailAfterInterval, + Reason: "ClusterOperatorDegraded", + PluralReason: "ClusterOperatorsDegraded", + Message: "Cluster operator test-co is degraded", + PluralMessageFormat: "Cluster operators %s are degraded", + Name: "test-co", }, }, { name: "cluster operator reporting available=true progressing=false degraded=false", diff --git a/pkg/cvo/status.go b/pkg/cvo/status.go index 17d3d3da8a..9f119dd000 100644 --- a/pkg/cvo/status.go +++ b/pkg/cvo/status.go @@ -427,7 +427,7 @@ func setDesiredReleaseAcceptedCondition(config *configv1.ClusterVersion, status // convertErrorToProgressing returns true if the provided status indicates a failure condition can be interpreted as // still making internal progress. The general error we try to suppress is an operator or operators still being -// unavailable AND the general payload task making progress towards its goal. The error's UpdateEffect determines +// progressing AND the general payload task making progress towards its goal. The error's UpdateEffect determines // whether an error should be considered a failure and, if so, whether the operator should be given up to 40 minutes // to recover from the error. 
func convertErrorToProgressing(history []configv1.UpdateHistory, now time.Time, status *SyncWorkerStatus) (reason string, message string, ok bool) { @@ -446,7 +446,11 @@ func convertErrorToProgressing(history []configv1.UpdateHistory, now time.Time, case payload.UpdateEffectFailAfterInterval: var exceeded []string threshold := now.Add(-(40 * time.Minute)) - for _, name := range strings.Split(uErr.Name, ", ") { + names := uErr.Names + if len(names) == 0 { + names = []string{uErr.Name} + } + for _, name := range names { if payload.COUpdateStartTimesGet(name).Before(threshold) { exceeded = append(exceeded, name) } diff --git a/pkg/cvo/sync_test.go b/pkg/cvo/sync_test.go index 721042f31f..5d97cfb103 100644 --- a/pkg/cvo/sync_test.go +++ b/pkg/cvo/sync_test.go @@ -43,6 +43,7 @@ func init() { func Test_SyncWorker_apply(t *testing.T) { tests := []struct { + name string manifests []string reactors map[action]error cancelAfter int @@ -50,6 +51,7 @@ func Test_SyncWorker_apply(t *testing.T) { check func(*testing.T, []action) wantErr bool }{{ + name: "successful creation", manifests: []string{ `{ "apiVersion": "test.cvo.io/v1", @@ -89,6 +91,7 @@ func Test_SyncWorker_apply(t *testing.T) { } }, }, { + name: "unknown resource failures", manifests: []string{ `{ "apiVersion": "test.cvo.io/v1", @@ -124,8 +127,11 @@ func Test_SyncWorker_apply(t *testing.T) { t.Fatalf("unexpected %d actions", len(actions)) } - if got, exp := actions[0], (newAction(schema.GroupVersionKind{Group: "test.cvo.io", Version: "v1", Kind: "TestA"}, "default", "testa")); !reflect.DeepEqual(got, exp) { - t.Fatalf("%s", diff.ObjectReflectDiff(exp, got)) + exp := newAction(schema.GroupVersionKind{Group: "test.cvo.io", Version: "v1", Kind: "TestA"}, "default", "testa") + for i, got := range actions { + if !reflect.DeepEqual(got, exp) { + t.Fatalf("unexpected action %d: %s", i, diff.ObjectReflectDiff(exp, got)) + } } }, }, { @@ -151,8 +157,8 @@ func Test_SyncWorker_apply(t *testing.T) { } }, }} - for idx, test := range tests { - t.Run(fmt.Sprintf("test#%d", idx), func(t *testing.T) { + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { var manifests []manifest.Manifest for _, s := range test.manifests { m := manifest.Manifest{} @@ -176,6 +182,7 @@ func Test_SyncWorker_apply(t *testing.T) { testMapper.AddToMap(resourcebuilder.Mapper) worker := &SyncWorker{eventRecorder: record.NewFakeRecorder(100)} + worker.backoff.Steps = 2 worker.builder = NewResourceBuilder(nil, nil, nil, nil) ctx, cancel := context.WithCancel(context.Background()) diff --git a/pkg/cvo/sync_worker.go b/pkg/cvo/sync_worker.go index 895ff36156..01e51e0b89 100644 --- a/pkg/cvo/sync_worker.go +++ b/pkg/cvo/sync_worker.go @@ -873,8 +873,11 @@ func (w *SyncWorker) apply(ctx context.Context, work *SyncWork, maxWorkers int, var tasks []*payload.Task backoff := w.backoff + if backoff.Steps == 0 { + return fmt.Errorf("SyncWorker requires at least one backoff step to apply any manifests") + } if backoff.Steps > 1 && work.State == payload.InitializingPayload { - backoff = wait.Backoff{Steps: 4, Factor: 2, Duration: time.Second} + backoff = wait.Backoff{Steps: 4, Factor: 2, Duration: time.Second, Cap: 15 * time.Second} } for i := range payloadUpdate.Manifests { @@ -1135,83 +1138,74 @@ func summarizeTaskGraphErrors(errs []error) error { } // collapse into a set of common errors where necessary + errs = condenseClusterOperators(errs) if len(errs) == 1 { return errs[0] } - // hide the generic "not available yet" when there are more specific errors present - if filtered 
:= filterErrors(errs, isClusterOperatorNotAvailable); len(filtered) > 0 { - return newMultipleError(filtered) - } - // if we're only waiting for operators, condense the error down to a singleton - if err := newClusterOperatorsNotAvailable(errs); err != nil { - return err - } return newMultipleError(errs) } -// filterErrors returns only the errors in errs which are false for all fns. -func filterErrors(errs []error, fns ...func(err error) bool) []error { - var filtered []error +// condenseClusterOperators unifies any ClusterOperator errors which +// share the same reason. +func condenseClusterOperators(errs []error) []error { + condensed := make([]error, 0, len(errs)) + clusterOperatorByReason := make(map[string][]*payload.UpdateError, len(errs)) + reasons := make([]string, 0, len(errs)) for _, err := range errs { - if errorMatches(err, fns...) { + uErr, ok := err.(*payload.UpdateError) + if !ok || uErr.Task == nil || uErr.Task.Manifest == nil || uErr.Task.Manifest.GVK != configv1.SchemeGroupVersion.WithKind("ClusterOperator") { + // error is not a ClusterOperator error, so pass it through + condensed = append(condensed, err) continue } - filtered = append(filtered, err) - } - return filtered -} - -func errorMatches(err error, fns ...func(err error) bool) bool { - for _, fn := range fns { - if fn(err) { - return true + if _, ok := clusterOperatorByReason[uErr.Reason]; !ok { + reasons = append(reasons, uErr.Reason) } + clusterOperatorByReason[uErr.Reason] = append(clusterOperatorByReason[uErr.Reason], uErr) } - return false -} -// isClusterOperatorNotAvailable returns true if this is a ClusterOperatorNotAvailable error -func isClusterOperatorNotAvailable(err error) bool { - uErr, ok := err.(*payload.UpdateError) - return ok && uErr != nil && uErr.Reason == "ClusterOperatorNotAvailable" -} - -// newClusterOperatorsNotAvailable unifies multiple ClusterOperatorNotAvailable errors into -// a single error. It returns nil if the provided errors are not of the same type. 
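The replacement logic in this hunk, condenseClusterOperators, groups ClusterOperator UpdateErrors by Reason and collapses each group into a single pluralized error using the new PluralReason and PluralMessageFormat fields. A stripped-down, self-contained sketch of that grouping (a stand-in struct rather than payload.UpdateError, example operator names, and none of the Nested aggregation, UpdateEffect escalation, or ClusterOperator-GVK filtering the real function also performs):

package main

import (
	"fmt"
	"sort"
	"strings"
)

// opError is a stand-in for the handful of payload.UpdateError fields the
// condensing step reads; the real code also tracks Nested and UpdateEffect.
type opError struct {
	Reason              string
	PluralReason        string
	PluralMessageFormat string
	Name                string
}

// condense groups errors that share a Reason and collapses each group into a
// single pluralized message, in the same spirit as condenseClusterOperators.
func condense(errs []opError) []string {
	byReason := map[string][]opError{}
	var reasons []string
	for _, e := range errs {
		if _, seen := byReason[e.Reason]; !seen {
			reasons = append(reasons, e.Reason)
		}
		byReason[e.Reason] = append(byReason[e.Reason], e)
	}
	sort.Strings(reasons)
	var out []string
	for _, reason := range reasons {
		group := byReason[reason]
		if len(group) == 1 {
			// A lone error keeps its singular reason and message.
			out = append(out, fmt.Sprintf("%s: %s", group[0].Reason, group[0].Name))
			continue
		}
		names := make([]string, 0, len(group))
		for _, e := range group {
			names = append(names, e.Name)
		}
		sort.Strings(names)
		out = append(out, fmt.Sprintf("%s: %s", group[0].PluralReason,
			fmt.Sprintf(group[0].PluralMessageFormat, strings.Join(names, ", "))))
	}
	return out
}

func main() {
	fmt.Println(condense([]opError{
		{"ClusterOperatorUpdating", "ClusterOperatorsUpdating", "Cluster operators %s are updating versions", "operator-b"},
		{"ClusterOperatorUpdating", "ClusterOperatorsUpdating", "Cluster operators %s are updating versions", "operator-a"},
		{"ClusterOperatorDegraded", "ClusterOperatorsDegraded", "Cluster operators %s are degraded", "operator-c"},
	}))
}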
-func newClusterOperatorsNotAvailable(errs []error) error { - updateEffect := payload.UpdateEffectNone - names := make([]string, 0, len(errs)) - for _, err := range errs { - uErr, ok := err.(*payload.UpdateError) - if !ok || uErr.Reason != "ClusterOperatorNotAvailable" { - return nil - } - if len(uErr.Name) > 0 { - names = append(names, uErr.Name) + sort.Strings(reasons) + for _, reason := range reasons { + reasonErrors := clusterOperatorByReason[reason] + if len(reasonErrors) == 1 { + condensed = append(condensed, reasonErrors[0]) + continue } - switch uErr.UpdateEffect { - case payload.UpdateEffectNone: - case payload.UpdateEffectFail: - updateEffect = payload.UpdateEffectFail - case payload.UpdateEffectFailAfterInterval: - if updateEffect != payload.UpdateEffectFail { - updateEffect = payload.UpdateEffectFailAfterInterval + nested := make([]error, 0, len(reasonErrors)) + names := make([]string, 0, len(reasonErrors)) + updateEffect := payload.UpdateEffectNone + for _, err := range reasonErrors { + nested = append(nested, err) + if len(err.Name) > 0 { + names = append(names, err.Name) + } + + switch err.UpdateEffect { + case payload.UpdateEffectNone: + case payload.UpdateEffectFail: + updateEffect = payload.UpdateEffectFail + case payload.UpdateEffectFailAfterInterval: + if updateEffect != payload.UpdateEffectFail { + updateEffect = payload.UpdateEffectFailAfterInterval + } } } - } - if len(names) == 0 { - return nil + sort.Strings(names) + name := strings.Join(names, ", ") + + condensed = append(condensed, &payload.UpdateError{ + Nested: apierrors.NewAggregate(nested), + UpdateEffect: updateEffect, + Reason: reasonErrors[0].PluralReason, + PluralReason: reasonErrors[0].PluralReason, + Message: fmt.Sprintf(reasonErrors[0].PluralMessageFormat, name), + PluralMessageFormat: reasonErrors[0].PluralMessageFormat, + Name: name, + Names: names, + }) } - sort.Strings(names) - name := strings.Join(names, ", ") - return &payload.UpdateError{ - Nested: apierrors.NewAggregate(errs), - UpdateEffect: updateEffect, - Reason: "ClusterOperatorsNotAvailable", - Message: fmt.Sprintf("Some cluster operators are still updating: %s", name), - Name: name, - } + return condensed } // uniqueStrings returns an array with all sequential identical items removed. 
It modifies the contents diff --git a/pkg/cvo/sync_worker_test.go b/pkg/cvo/sync_worker_test.go index 8a3de52a3c..45190285fb 100644 --- a/pkg/cvo/sync_worker_test.go +++ b/pkg/cvo/sync_worker_test.go @@ -7,9 +7,17 @@ import ( "testing" "time" + "github.com/davecgh/go-spew/spew" + configv1 "github.com/openshift/api/config/v1" + "github.com/openshift/library-go/pkg/manifest" + apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/apimachinery/pkg/util/diff" + "k8s.io/apimachinery/pkg/util/errors" "k8s.io/client-go/tools/record" - configv1 "github.com/openshift/api/config/v1" + "github.com/openshift/cluster-version-operator/pkg/payload" ) func Test_statusWrapper_ReportProgress(t *testing.T) { @@ -183,6 +191,197 @@ func Test_runThrottledStatusNotifier(t *testing.T) { } } +func task(name string, gvk schema.GroupVersionKind) *payload.Task { + return &payload.Task{ + Manifest: &manifest.Manifest{ + OriginalFilename: fmt.Sprintf("%s.yaml", name), + GVK: gvk, + Obj: &unstructured.Unstructured{ + Object: map[string]interface{}{ + "group": gvk.Group, + "apiVersion": gvk.Version, + "kind": gvk.Kind, + "metadata": map[string]interface{}{ + "name": name, + }, + }, + }, + }, + } +} + +func Test_condenseClusterOperators(t *testing.T) { + coADegradedNone := &payload.UpdateError{ + UpdateEffect: payload.UpdateEffectNone, + Reason: "ClusterOperatorDegraded", + PluralReason: "ClusterOperatorsDegraded", + Message: "Cluster operator test-co-A is degraded", + PluralMessageFormat: "Cluster operators %s are degraded", + Name: "test-co-A", + Task: task("test-co-A", configv1.SchemeGroupVersion.WithKind("ClusterOperator")), + } + coANotAvailable := &payload.UpdateError{ + UpdateEffect: payload.UpdateEffectFail, + Reason: "ClusterOperatorNotAvailable", + PluralReason: "ClusterOperatorsNotAvailable", + Message: "Cluster operator test-co-A is not available", + PluralMessageFormat: "Cluster operators %s are not available", + Name: "test-co-A", + Task: task("test-co-A", configv1.SchemeGroupVersion.WithKind("ClusterOperator")), + } + coAUpdating := &payload.UpdateError{ + UpdateEffect: payload.UpdateEffectNone, + Reason: "ClusterOperatorUpdating", + PluralReason: "ClusterOperatorsUpdating", + Message: "Cluster operator test-co-A is updating versions", + PluralMessageFormat: "Cluster operators %s are updating versions", + Name: "test-co-A", + Task: task("test-co-A", configv1.SchemeGroupVersion.WithKind("ClusterOperator")), + } + coBDegradedFail := &payload.UpdateError{ + UpdateEffect: payload.UpdateEffectFail, + Reason: "ClusterOperatorDegraded", + PluralReason: "ClusterOperatorsDegraded", + Message: "Cluster operator test-co-B is degraded", + PluralMessageFormat: "Cluster operators %s are degraded", + Name: "test-co-B", + Task: task("test-co-B", configv1.SchemeGroupVersion.WithKind("ClusterOperator")), + } + coBUpdating := &payload.UpdateError{ + UpdateEffect: payload.UpdateEffectNone, + Reason: "ClusterOperatorUpdating", + PluralReason: "ClusterOperatorsUpdating", + Message: "Cluster operator test-co-B is updating versions", + PluralMessageFormat: "Cluster operators %s are updating versions", + Name: "test-co-B", + Task: task("test-co-B", configv1.SchemeGroupVersion.WithKind("ClusterOperator")), + } + coCDegradedFailAfterInterval := &payload.UpdateError{ + UpdateEffect: payload.UpdateEffectFailAfterInterval, + Reason: "ClusterOperatorDegraded", + PluralReason: "ClusterOperatorsDegraded", + Message: "Cluster operator 
test-co-C is degraded", + PluralMessageFormat: "Cluster operators %s are degraded", + Name: "test-co-C", + Task: task("test-co-C", configv1.SchemeGroupVersion.WithKind("ClusterOperator")), + } + + tests := []struct { + name string + input []error + expected []error + }{{ + name: "no errors", + expected: []error{}, + }, { + name: "one ClusterOperator, one API", + input: []error{ + coAUpdating, + apierrors.NewNotFound(schema.GroupResource{Group: "", Resource: "clusteroperator"}, "test-co-A"), + }, + expected: []error{ + apierrors.NewNotFound(schema.GroupResource{Group: "", Resource: "clusteroperator"}, "test-co-A"), + coAUpdating, + }, + }, { + name: "two ClusterOperator with different reasons, one API", + input: []error{ + coBUpdating, + coANotAvailable, + apierrors.NewNotFound(schema.GroupResource{Group: "", Resource: "clusteroperator"}, "test-co"), + }, + expected: []error{ + apierrors.NewNotFound(schema.GroupResource{Group: "", Resource: "clusteroperator"}, "test-co"), + coANotAvailable, // NotAvailable sorts before Updating + coBUpdating, + }, + }, { + name: "two ClusterOperator with the same reason", + input: []error{ + coBUpdating, + coAUpdating, + }, + expected: []error{ + &payload.UpdateError{ + Nested: errors.NewAggregate([]error{coBUpdating, coAUpdating}), + UpdateEffect: payload.UpdateEffectNone, + Reason: "ClusterOperatorsUpdating", + PluralReason: "ClusterOperatorsUpdating", + Message: "Cluster operators test-co-A, test-co-B are updating versions", + PluralMessageFormat: "Cluster operators %s are updating versions", + Name: "test-co-A, test-co-B", + Names: []string{"test-co-A", "test-co-B"}, + }, + }, + }, { + name: "two ClusterOperator with the same reason, one different", + input: []error{ + coBUpdating, + coAUpdating, + coCDegradedFailAfterInterval, + }, + expected: []error{ + coCDegradedFailAfterInterval, // Degraded sorts before Updating + &payload.UpdateError{ + Nested: errors.NewAggregate([]error{coBUpdating, coAUpdating}), + UpdateEffect: payload.UpdateEffectNone, + Reason: "ClusterOperatorsUpdating", + PluralReason: "ClusterOperatorsUpdating", + Message: "Cluster operators test-co-A, test-co-B are updating versions", + PluralMessageFormat: "Cluster operators %s are updating versions", + Name: "test-co-A, test-co-B", + Names: []string{"test-co-A", "test-co-B"}, + }, + }, + }, { + name: "there ClusterOperator with the same reason but different effects", + input: []error{ + coBDegradedFail, + coADegradedNone, + coCDegradedFailAfterInterval, + }, + expected: []error{ + &payload.UpdateError{ + Nested: errors.NewAggregate([]error{coBDegradedFail, coADegradedNone, coCDegradedFailAfterInterval}), + UpdateEffect: payload.UpdateEffectFail, + Reason: "ClusterOperatorsDegraded", + PluralReason: "ClusterOperatorsDegraded", + Message: "Cluster operators test-co-A, test-co-B, test-co-C are degraded", + PluralMessageFormat: "Cluster operators %s are degraded", + Name: "test-co-A, test-co-B, test-co-C", + Names: []string{"test-co-A", "test-co-B", "test-co-C"}, + }, + }, + }, { + name: "to ClusterOperator with the same reason but None and FailAfterInterval effects", + input: []error{ + coADegradedNone, + coCDegradedFailAfterInterval, + }, + expected: []error{ + &payload.UpdateError{ + Nested: errors.NewAggregate([]error{coADegradedNone, coCDegradedFailAfterInterval}), + UpdateEffect: payload.UpdateEffectFailAfterInterval, + Reason: "ClusterOperatorsDegraded", + PluralReason: "ClusterOperatorsDegraded", + Message: "Cluster operators test-co-A, test-co-C are degraded", + 
PluralMessageFormat: "Cluster operators %s are degraded", + Name: "test-co-A, test-co-C", + Names: []string{"test-co-A", "test-co-C"}, + }, + }, + }} + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + actual := condenseClusterOperators(test.input) + if !reflect.DeepEqual(test.expected, actual) { + spew.Config.DisableMethods = true + t.Fatalf("Incorrect value returned -\ndiff: %s\nexpected: %s\nreturned: %s", diff.ObjectReflectDiff(test.expected, actual), spew.Sdump(test.expected), spew.Sdump(actual)) + } + }) + } +} + func Test_equalDigest(t *testing.T) { for _, testCase := range []struct { name string diff --git a/pkg/payload/task.go b/pkg/payload/task.go index 5d914f4316..8600db1c17 100644 --- a/pkg/payload/task.go +++ b/pkg/payload/task.go @@ -100,51 +100,46 @@ func (st *Task) String() string { return fmt.Sprintf("%s (%d of %d)", manId, st.Index, st.Total) } -// Run attempts to create the provided object until it succeeds or context is cancelled. It returns the -// last error if context is cancelled. +// Run attempts to create the provided object until it: +// +// * Succeeds, or +// * Fails with an UpdateError, because these are unlikely to improve quickly on retry, or +// * The context is canceled, in which case it returns the most recent error. func (st *Task) Run(ctx context.Context, version string, builder ResourceBuilder, state State) error { var lastErr error backoff := st.Backoff - maxDuration := 15 * time.Second // TODO: fold back into Backoff in 1.13 - for { - // attempt the apply, waiting as long as necessary - err := builder.Apply(ctx, st.Manifest, state) + err := wait.ExponentialBackoffWithContext(ctx, backoff, func() (done bool, err error) { + err = builder.Apply(ctx, st.Manifest, state) if err == nil { - return nil + return true, nil } - lastErr = err utilruntime.HandleError(errors.Wrapf(err, "error running apply for %s", st)) metricPayloadErrors.WithLabelValues(version).Inc() - - // TODO: this code will become easier in Kube 1.13 because Backoff now supports max - d := time.Duration(float64(backoff.Duration) * backoff.Factor) - if d > maxDuration { - d = maxDuration - } - d = wait.Jitter(d, backoff.Jitter) - - // sleep or wait for cancellation - select { - case <-time.After(d): - continue - case <-ctx.Done(): - if uerr, ok := lastErr.(*UpdateError); ok { - uerr.Task = st.Copy() - return uerr - } - reason, cause := reasonForPayloadSyncError(lastErr) - if len(cause) > 0 { - cause = ": " + cause - } - return &UpdateError{ - Nested: lastErr, - Reason: reason, - Message: fmt.Sprintf("Could not update %s%s", st, cause), - - Task: st.Copy(), - } + if updateErr, ok := err.(*UpdateError); ok { + updateErr.Task = st.Copy() + return false, updateErr // failing fast for UpdateError } + return false, nil + }) + if err == nil { + return nil + } + if lastErr != nil { + err = lastErr + } + if _, ok := err.(*UpdateError); ok { + return err + } + reason, cause := reasonForPayloadSyncError(err) + if len(cause) > 0 { + cause = ": " + cause + } + return &UpdateError{ + Nested: err, + Reason: reason, + Message: fmt.Sprintf("Could not update %s%s", st, cause), + Task: st.Copy(), } } @@ -165,11 +160,14 @@ const ( // UpdateError is a wrapper for errors that occur during a payload sync. 
type UpdateError struct { - Nested error - UpdateEffect UpdateEffectType - Reason string - Message string - Name string + Nested error + UpdateEffect UpdateEffectType + Reason string + PluralReason string + Message string + PluralMessageFormat string + Name string + Names []string Task *Task } @@ -192,7 +190,7 @@ func (e *UpdateError) Unwrap() error { return e.Nested } -// reasonForUpdateError provides a succint explanation of a known error type for use in a human readable +// reasonForPayloadSyncError provides a succint explanation of a known error type for use in a human readable // message during update. Since all objects in the image should be successfully applied, messages // should direct the reader (likely a cluster administrator) to a possible cause in their own config. func reasonForPayloadSyncError(err error) (string, string) { @@ -264,11 +262,16 @@ func SummaryForReason(reason, name string) string { return "a cluster operator is degraded" case "ClusterOperatorNotAvailable": if len(name) > 0 { - return fmt.Sprintf("the cluster operator %s has not yet successfully rolled out", name) + return fmt.Sprintf("the cluster operator %s is not available", name) } - return "a cluster operator has not yet rolled out" + return "a cluster operator is not available" case "ClusterOperatorsNotAvailable": - return "some cluster operators have not yet rolled out" + return "some cluster operators are not available" + case "ClusterOperatorNoVersions": + if len(name) > 0 { + return fmt.Sprintf("the cluster operator %s does not declare expected versions", name) + } + return "a cluster operator does not declare expected versions" case "WorkloadNotAvailable": if len(name) > 0 { return fmt.Sprintf("the workload %s has not yet successfully rolled out", name)
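For reference, the reason strings introduced or renamed above feed the human-readable summary used in the Progressing condition. A trimmed sketch of how SummaryForReason maps them, covering only the cases shown in this diff; the assumed fallback mirrors the "an unknown error has occurred: MultipleErrors" text expected by the scenario test earlier in the patch:

package main

import "fmt"

// summaryForReason sketches the shape of payload.SummaryForReason for the
// reasons touched in this diff; the real function covers many more reasons.
func summaryForReason(reason, name string) string {
	switch reason {
	case "ClusterOperatorNotAvailable":
		if len(name) > 0 {
			return fmt.Sprintf("the cluster operator %s is not available", name)
		}
		return "a cluster operator is not available"
	case "ClusterOperatorsNotAvailable":
		return "some cluster operators are not available"
	case "ClusterOperatorNoVersions":
		if len(name) > 0 {
			return fmt.Sprintf("the cluster operator %s does not declare expected versions", name)
		}
		return "a cluster operator does not declare expected versions"
	}
	// Reasons without a dedicated summary fall through to a generic message,
	// which is why the scenario test expects
	// "an unknown error has occurred: MultipleErrors" for the condensed error.
	return fmt.Sprintf("an unknown error has occurred: %s", reason)
}

func main() {
	fmt.Println(summaryForReason("ClusterOperatorNotAvailable", "operator-a"))
	fmt.Println(summaryForReason("MultipleErrors", ""))
}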