From 1c5b1f18b6c8939107d50c982dc664957f29767d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Ry=C5=A1av=C3=BD?= Date: Thu, 19 Dec 2024 15:07:10 +0100 Subject: [PATCH 1/2] fix: Fix calculating SelfHealBackOff delay when exceeding maximum (#20976) (#20978) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * test: fix TestSelfHealExponentialBackoff to test exceeding Backoff.Cap Signed-off-by: Michal Ryšavý * fix: fix calculating SelfHealBackOff delay when exceeding maximum Signed-off-by: Michal Ryšavý Signed-off-by: Michal Ryšavý --------- Signed-off-by: Michal Ryšavý Co-authored-by: Michal Ryšavý --- controller/appcontroller.go | 3 ++- controller/appcontroller_test.go | 17 ++++++++++++++++- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/controller/appcontroller.go b/controller/appcontroller.go index 44d1a49398cf1..1aa721529f6b4 100644 --- a/controller/appcontroller.go +++ b/controller/appcontroller.go @@ -2236,7 +2236,8 @@ func (ctrl *ApplicationController) shouldSelfHeal(app *appv1.Application) (bool, backOff := *ctrl.selfHealBackOff backOff.Steps = int(app.Status.OperationState.Operation.Sync.SelfHealAttemptsCount) var delay time.Duration - for backOff.Steps > 0 { + steps := backOff.Steps + for i := 0; i < steps; i++ { delay = backOff.Step() } if app.Status.OperationState.FinishedAt == nil { diff --git a/controller/appcontroller_test.go b/controller/appcontroller_test.go index e627f37bd82a8..e5d63801dbeaf 100644 --- a/controller/appcontroller_test.go +++ b/controller/appcontroller_test.go @@ -2533,7 +2533,7 @@ func TestSelfHealExponentialBackoff(t *testing.T) { ctrl.selfHealBackOff = &wait.Backoff{ Factor: 3, Duration: 2 * time.Second, - Cap: 5 * time.Minute, + Cap: 2 * time.Minute, } app := &v1alpha1.Application{ @@ -2571,6 +2571,21 @@ func TestSelfHealExponentialBackoff(t *testing.T) { finishedAt: nil, expectedDuration: 18 * time.Second, shouldSelfHeal: false, + }, { + attempts: 4, + finishedAt: nil, + expectedDuration: 54 * time.Second, + shouldSelfHeal: false, + }, { + attempts: 5, + finishedAt: nil, + expectedDuration: 120 * time.Second, + shouldSelfHeal: false, + }, { + attempts: 6, + finishedAt: nil, + expectedDuration: 120 * time.Second, + shouldSelfHeal: false, }} for i := range testCases { From bf976cc303d92a394c5d9abd69018d469b56d6a1 Mon Sep 17 00:00:00 2001 From: Blake Pettersson Date: Sun, 2 Mar 2025 12:46:39 -1000 Subject: [PATCH 2/2] fix(appcontroller): selfhealattemptscount needs to be reset at times (#22095) Signed-off-by: Blake Pettersson --- controller/appcontroller.go | 29 +++++++++++++-------- controller/appcontroller_test.go | 44 +++++++++++++++++++++++++++++++- 2 files changed, 62 insertions(+), 11 deletions(-) diff --git a/controller/appcontroller.go b/controller/appcontroller.go index 1aa721529f6b4..4787457fbf821 100644 --- a/controller/appcontroller.go +++ b/controller/appcontroller.go @@ -2103,9 +2103,7 @@ func (ctrl *ApplicationController) autoSync(app *appv1.Application, syncStatus * InitiatedBy: appv1.OperationInitiator{Automated: true}, Retry: appv1.RetryStrategy{Limit: 5}, } - if app.Status.OperationState != nil && app.Status.OperationState.Operation.Sync != nil { - op.Sync.SelfHealAttemptsCount = app.Status.OperationState.Operation.Sync.SelfHealAttemptsCount - } + if app.Spec.SyncPolicy.Retry != nil { op.Retry = *app.Spec.SyncPolicy.Retry } @@ -2121,8 +2119,18 @@ func (ctrl *ApplicationController) autoSync(app *appv1.Application, syncStatus * } logCtx.Infof("Skipping auto-sync: most recent sync already to %s", desiredCommitSHA) return nil, 0 - } else if alreadyAttempted && selfHeal { - if shouldSelfHeal, retryAfter := ctrl.shouldSelfHeal(app); shouldSelfHeal { + } else if selfHeal { + shouldSelfHeal, retryAfter := ctrl.shouldSelfHeal(app, alreadyAttempted) + if app.Status.OperationState != nil && app.Status.OperationState.Operation.Sync != nil { + op.Sync.SelfHealAttemptsCount = app.Status.OperationState.Operation.Sync.SelfHealAttemptsCount + } + + if alreadyAttempted { + if !shouldSelfHeal { + logCtx.Infof("Skipping auto-sync: already attempted sync to %s with timeout %v (retrying in %v)", desiredCommitSHA, ctrl.selfHealTimeout, retryAfter) + ctrl.requestAppRefresh(app.QualifiedName(), CompareWithLatest.Pointer(), &retryAfter) + return nil, 0 + } op.Sync.SelfHealAttemptsCount++ for _, resource := range resources { if resource.Status != appv1.SyncStatusCodeSynced { @@ -2133,10 +2141,6 @@ func (ctrl *ApplicationController) autoSync(app *appv1.Application, syncStatus * }) } } - } else { - logCtx.Infof("Skipping auto-sync: already attempted sync to %s with timeout %v (retrying in %v)", desiredCommitSHA, ctrl.selfHealTimeout, retryAfter) - ctrl.requestAppRefresh(app.QualifiedName(), CompareWithLatest.Pointer(), &retryAfter) - return nil, 0 } } ts.AddCheckpoint("already_attempted_check_ms") @@ -2220,11 +2224,16 @@ func alreadyAttemptedSync(app *appv1.Application, commitSHA string, commitSHAsMS } } -func (ctrl *ApplicationController) shouldSelfHeal(app *appv1.Application) (bool, time.Duration) { +func (ctrl *ApplicationController) shouldSelfHeal(app *appv1.Application, alreadyAttempted bool) (bool, time.Duration) { if app.Status.OperationState == nil { return true, time.Duration(0) } + // Reset counter if the prior sync was successful OR if the revision has changed + if !alreadyAttempted || app.Status.Sync.Status == appv1.SyncStatusCodeSynced { + app.Status.OperationState.Operation.Sync.SelfHealAttemptsCount = 0 + } + var retryAfter time.Duration if ctrl.selfHealBackOff == nil { if app.Status.OperationState.FinishedAt == nil { diff --git a/controller/appcontroller_test.go b/controller/appcontroller_test.go index e5d63801dbeaf..cd3f84559984e 100644 --- a/controller/appcontroller_test.go +++ b/controller/appcontroller_test.go @@ -2548,44 +2548,84 @@ func TestSelfHealExponentialBackoff(t *testing.T) { testCases := []struct { attempts int64 + expectedAttempts int64 finishedAt *metav1.Time expectedDuration time.Duration shouldSelfHeal bool + alreadyAttempted bool + syncStatus v1alpha1.SyncStatusCode }{{ attempts: 0, finishedAt: ptr.To(metav1.Now()), expectedDuration: 0, shouldSelfHeal: true, + alreadyAttempted: true, + expectedAttempts: 0, + syncStatus: v1alpha1.SyncStatusCodeOutOfSync, }, { attempts: 1, finishedAt: ptr.To(metav1.Now()), expectedDuration: 2 * time.Second, shouldSelfHeal: false, + alreadyAttempted: true, + expectedAttempts: 1, + syncStatus: v1alpha1.SyncStatusCodeOutOfSync, }, { attempts: 2, finishedAt: ptr.To(metav1.Now()), expectedDuration: 6 * time.Second, shouldSelfHeal: false, + alreadyAttempted: true, + expectedAttempts: 2, + syncStatus: v1alpha1.SyncStatusCodeOutOfSync, }, { attempts: 3, finishedAt: nil, expectedDuration: 18 * time.Second, shouldSelfHeal: false, + alreadyAttempted: true, + expectedAttempts: 3, + syncStatus: v1alpha1.SyncStatusCodeOutOfSync, }, { attempts: 4, finishedAt: nil, expectedDuration: 54 * time.Second, shouldSelfHeal: false, + alreadyAttempted: true, + expectedAttempts: 4, + syncStatus: v1alpha1.SyncStatusCodeOutOfSync, }, { attempts: 5, finishedAt: nil, expectedDuration: 120 * time.Second, shouldSelfHeal: false, + alreadyAttempted: true, + expectedAttempts: 5, + syncStatus: v1alpha1.SyncStatusCodeOutOfSync, }, { attempts: 6, finishedAt: nil, expectedDuration: 120 * time.Second, shouldSelfHeal: false, + alreadyAttempted: true, + expectedAttempts: 6, + syncStatus: v1alpha1.SyncStatusCodeOutOfSync, + }, { + attempts: 6, + finishedAt: nil, + expectedDuration: 0, + shouldSelfHeal: true, + alreadyAttempted: false, + expectedAttempts: 0, + syncStatus: v1alpha1.SyncStatusCodeOutOfSync, + }, { + attempts: 6, + finishedAt: nil, + expectedDuration: 0, + shouldSelfHeal: true, + alreadyAttempted: true, + expectedAttempts: 0, + syncStatus: v1alpha1.SyncStatusCodeSynced, }} for i := range testCases { @@ -2593,8 +2633,10 @@ func TestSelfHealExponentialBackoff(t *testing.T) { t.Run(fmt.Sprintf("test case %d", i), func(t *testing.T) { app.Status.OperationState.Operation.Sync.SelfHealAttemptsCount = tc.attempts app.Status.OperationState.FinishedAt = tc.finishedAt - ok, duration := ctrl.shouldSelfHeal(app) + app.Status.Sync.Status = tc.syncStatus + ok, duration := ctrl.shouldSelfHeal(app, tc.alreadyAttempted) require.Equal(t, ok, tc.shouldSelfHeal) + require.Equal(t, tc.expectedAttempts, app.Status.OperationState.Operation.Sync.SelfHealAttemptsCount) assertDurationAround(t, tc.expectedDuration, duration) }) }