diff --git a/cmd/argocd-application-controller/commands/argocd_application_controller.go b/cmd/argocd-application-controller/commands/argocd_application_controller.go index 49a71f746b394..d9f3e2d9cba2b 100644 --- a/cmd/argocd-application-controller/commands/argocd_application_controller.go +++ b/cmd/argocd-application-controller/commands/argocd_application_controller.go @@ -68,6 +68,7 @@ func NewCommand() *cobra.Command { selfHealBackoffTimeoutSeconds int selfHealBackoffFactor int selfHealBackoffCapSeconds int + selfHealBackoffCooldownSeconds int syncTimeout int statusProcessors int operationProcessors int @@ -201,6 +202,7 @@ func NewCommand() *cobra.Command { time.Duration(appResyncJitter)*time.Second, time.Duration(selfHealTimeoutSeconds)*time.Second, selfHealBackoff, + time.Duration(selfHealBackoffCooldownSeconds)*time.Second, time.Duration(syncTimeout)*time.Second, time.Duration(repoErrorGracePeriod)*time.Second, metricsPort, @@ -272,6 +274,7 @@ func NewCommand() *cobra.Command { command.Flags().IntVar(&selfHealBackoffTimeoutSeconds, "self-heal-backoff-timeout-seconds", env.ParseNumFromEnv("ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_BACKOFF_TIMEOUT_SECONDS", 2, 0, math.MaxInt32), "Specifies initial timeout of exponential backoff between self heal attempts") command.Flags().IntVar(&selfHealBackoffFactor, "self-heal-backoff-factor", env.ParseNumFromEnv("ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_BACKOFF_FACTOR", 3, 0, math.MaxInt32), "Specifies factor of exponential timeout between application self heal attempts") command.Flags().IntVar(&selfHealBackoffCapSeconds, "self-heal-backoff-cap-seconds", env.ParseNumFromEnv("ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_BACKOFF_CAP_SECONDS", 300, 0, math.MaxInt32), "Specifies max timeout of exponential backoff between application self heal attempts") + command.Flags().IntVar(&selfHealBackoffCooldownSeconds, "self-heal-backoff-cooldown-seconds", env.ParseNumFromEnv("ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_BACKOFF_COOLDOWN_SECONDS", 
330, 0, math.MaxInt32), "Specifies period of time the app needs to stay synced before the self heal backoff can reset") command.Flags().IntVar(&syncTimeout, "sync-timeout", env.ParseNumFromEnv("ARGOCD_APPLICATION_CONTROLLER_SYNC_TIMEOUT", 0, 0, math.MaxInt32), "Specifies the timeout after which a sync would be terminated. 0 means no timeout (default 0).") command.Flags().Int64Var(&kubectlParallelismLimit, "kubectl-parallelism-limit", env.ParseInt64FromEnv("ARGOCD_APPLICATION_CONTROLLER_KUBECTL_PARALLELISM_LIMIT", 20, 0, math.MaxInt64), "Number of allowed concurrent kubectl fork/execs. Any value less than 1 means no limit.") command.Flags().BoolVar(&repoServerPlaintext, "repo-server-plaintext", env.ParseBoolFromEnv("ARGOCD_APPLICATION_CONTROLLER_REPO_SERVER_PLAINTEXT", false), "Disable TLS on connections to repo server") diff --git a/controller/appcontroller.go b/controller/appcontroller.go index deea6d7656087..eb973c5a68dfa 100644 --- a/controller/appcontroller.go +++ b/controller/appcontroller.go @@ -41,6 +41,7 @@ import ( "k8s.io/client-go/kubernetes" "k8s.io/client-go/tools/cache" "k8s.io/client-go/util/workqueue" + "k8s.io/utils/ptr" commitclient "github.com/argoproj/argo-cd/v3/commitserver/apiclient" "github.com/argoproj/argo-cd/v3/common" @@ -133,6 +134,7 @@ type ApplicationController struct { statusRefreshJitter time.Duration selfHealTimeout time.Duration selfHealBackOff *wait.Backoff + selfHealBackoffCooldown time.Duration syncTimeout time.Duration db db.ArgoDB settingsMgr *settings_util.SettingsManager @@ -168,6 +170,7 @@ func NewApplicationController( appResyncJitter time.Duration, selfHealTimeout time.Duration, selfHealBackoff *wait.Backoff, + selfHealBackoffCooldown time.Duration, syncTimeout time.Duration, repoErrorGracePeriod time.Duration, metricsPort int, @@ -214,6 +217,7 @@ func NewApplicationController( settingsMgr: settingsMgr, selfHealTimeout: selfHealTimeout, selfHealBackOff: selfHealBackoff, + selfHealBackoffCooldown: selfHealBackoffCooldown, 
syncTimeout: syncTimeout, clusterSharding: clusterSharding, projByNameCache: sync.Map{}, @@ -2241,17 +2245,22 @@ func (ctrl *ApplicationController) shouldSelfHeal(app *appv1.Application, alread return true, time.Duration(0) } - // Reset counter if the prior sync was successful OR if the revision has changed - if !alreadyAttempted || app.Status.Sync.Status == appv1.SyncStatusCodeSynced { + var timeSinceOperation *time.Duration + if app.Status.OperationState.FinishedAt != nil { + timeSinceOperation = ptr.To(time.Since(app.Status.OperationState.FinishedAt.Time)) + } + + // Reset counter if the prior sync was successful and the cooldown period is over OR if the revision has changed + if !alreadyAttempted || (timeSinceOperation != nil && *timeSinceOperation >= ctrl.selfHealBackoffCooldown && app.Status.Sync.Status == appv1.SyncStatusCodeSynced) { app.Status.OperationState.Operation.Sync.SelfHealAttemptsCount = 0 } var retryAfter time.Duration if ctrl.selfHealBackOff == nil { - if app.Status.OperationState.FinishedAt == nil { + if timeSinceOperation == nil { retryAfter = ctrl.selfHealTimeout } else { - retryAfter = ctrl.selfHealTimeout - time.Since(app.Status.OperationState.FinishedAt.Time) + retryAfter = ctrl.selfHealTimeout - *timeSinceOperation } } else { backOff := *ctrl.selfHealBackOff @@ -2261,10 +2270,11 @@ func (ctrl *ApplicationController) shouldSelfHeal(app *appv1.Application, alread for i := 0; i < steps; i++ { delay = backOff.Step() } - if app.Status.OperationState.FinishedAt == nil { + + if timeSinceOperation == nil { retryAfter = delay } else { - retryAfter = delay - time.Since(app.Status.OperationState.FinishedAt.Time) + retryAfter = delay - *timeSinceOperation } } return retryAfter <= 0, retryAfter diff --git a/controller/appcontroller_test.go b/controller/appcontroller_test.go index 9e464643fbb36..939983044039a 100644 --- a/controller/appcontroller_test.go +++ b/controller/appcontroller_test.go @@ -172,6 +172,7 @@ func newFakeControllerWithResync(data 
*fakeData, appResyncPeriod time.Duration, time.Second, time.Minute, nil, + time.Minute, 0, time.Second*10, common.DefaultPortArgoCDMetrics, @@ -2642,10 +2643,18 @@ func TestSelfHealExponentialBackoff(t *testing.T) { alreadyAttempted: false, expectedAttempts: 0, syncStatus: v1alpha1.SyncStatusCodeOutOfSync, - }, { + }, { // backoff will not reset as finished time isn't >= cooldown attempts: 6, - finishedAt: nil, - expectedDuration: 0, + finishedAt: ptr.To(metav1.Now()), + expectedDuration: 120 * time.Second, + shouldSelfHeal: false, + alreadyAttempted: true, + expectedAttempts: 6, + syncStatus: v1alpha1.SyncStatusCodeSynced, + }, { // backoff will reset as finished time is >= cooldown + attempts: 40, + finishedAt: &metav1.Time{Time: time.Now().Add(-(1 * time.Minute))}, + expectedDuration: -60 * time.Second, shouldSelfHeal: true, alreadyAttempted: true, expectedAttempts: 0, diff --git a/docs/operator-manual/server-commands/argocd-application-controller.md b/docs/operator-manual/server-commands/argocd-application-controller.md index 852c62456666c..29c6b95d7f1e6 100644 --- a/docs/operator-manual/server-commands/argocd-application-controller.md +++ b/docs/operator-manual/server-commands/argocd-application-controller.md @@ -71,6 +71,7 @@ argocd-application-controller [flags] --repo-server-timeout-seconds int Repo server RPC call timeout seconds. (default 60) --request-timeout string The length of time to wait before giving up on a single server request. Non-zero values should contain a corresponding time unit (e.g. 1s, 2m, 3h). A value of zero means don't timeout requests. 
(default "0") --self-heal-backoff-cap-seconds int Specifies max timeout of exponential backoff between application self heal attempts (default 300) + --self-heal-backoff-cooldown-seconds int Specifies period of time the app needs to stay synced before the self heal backoff can reset (default 330) --self-heal-backoff-factor int Specifies factor of exponential timeout between application self heal attempts (default 3) --self-heal-backoff-timeout-seconds int Specifies initial timeout of exponential backoff between self heal attempts (default 2) --self-heal-timeout-seconds int Specifies timeout between application self heal attempts diff --git a/manifests/base/application-controller-deployment/argocd-application-controller-deployment.yaml b/manifests/base/application-controller-deployment/argocd-application-controller-deployment.yaml index ea40f9c9a4c40..bf65aed35bb2f 100644 --- a/manifests/base/application-controller-deployment/argocd-application-controller-deployment.yaml +++ b/manifests/base/application-controller-deployment/argocd-application-controller-deployment.yaml @@ -121,6 +121,12 @@ spec: name: argocd-cmd-params-cm key: controller.self.heal.backoff.cap.seconds optional: true + - name: ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_BACKOFF_COOLDOWN_SECONDS + valueFrom: + configMapKeyRef: + name: argocd-cmd-params-cm + key: controller.self.heal.backoff.cooldown.seconds + optional: true - name: ARGOCD_APPLICATION_CONTROLLER_SYNC_TIMEOUT valueFrom: configMapKeyRef: diff --git a/manifests/base/application-controller/argocd-application-controller-statefulset.yaml b/manifests/base/application-controller/argocd-application-controller-statefulset.yaml index a595b75f84b5e..e33d3917f0f27 100644 --- a/manifests/base/application-controller/argocd-application-controller-statefulset.yaml +++ b/manifests/base/application-controller/argocd-application-controller-statefulset.yaml @@ -124,6 +124,12 @@ spec: name: argocd-cmd-params-cm key: controller.self.heal.backoff.cap.seconds 
optional: true + - name: ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_BACKOFF_COOLDOWN_SECONDS + valueFrom: + configMapKeyRef: + name: argocd-cmd-params-cm + key: controller.self.heal.backoff.cooldown.seconds + optional: true - name: ARGOCD_APPLICATION_CONTROLLER_SYNC_TIMEOUT valueFrom: configMapKeyRef: diff --git a/manifests/core-install-with-hydrator.yaml b/manifests/core-install-with-hydrator.yaml index a7e32c3acee8c..5d54bfb9ebb26 100644 --- a/manifests/core-install-with-hydrator.yaml +++ b/manifests/core-install-with-hydrator.yaml @@ -25454,6 +25454,12 @@ spec: key: controller.self.heal.backoff.cap.seconds name: argocd-cmd-params-cm optional: true + - name: ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_BACKOFF_COOLDOWN_SECONDS + valueFrom: + configMapKeyRef: + key: controller.self.heal.backoff.cooldown.seconds + name: argocd-cmd-params-cm + optional: true - name: ARGOCD_APPLICATION_CONTROLLER_SYNC_TIMEOUT valueFrom: configMapKeyRef: diff --git a/manifests/core-install.yaml b/manifests/core-install.yaml index 000aea56787a2..9a46112e0bfcb 100644 --- a/manifests/core-install.yaml +++ b/manifests/core-install.yaml @@ -25288,6 +25288,12 @@ spec: key: controller.self.heal.backoff.cap.seconds name: argocd-cmd-params-cm optional: true + - name: ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_BACKOFF_COOLDOWN_SECONDS + valueFrom: + configMapKeyRef: + key: controller.self.heal.backoff.cooldown.seconds + name: argocd-cmd-params-cm + optional: true - name: ARGOCD_APPLICATION_CONTROLLER_SYNC_TIMEOUT valueFrom: configMapKeyRef: diff --git a/manifests/ha/install-with-hydrator.yaml b/manifests/ha/install-with-hydrator.yaml index bacf71bf49d12..af90d98a8b5bb 100644 --- a/manifests/ha/install-with-hydrator.yaml +++ b/manifests/ha/install-with-hydrator.yaml @@ -27499,6 +27499,12 @@ spec: key: controller.self.heal.backoff.cap.seconds name: argocd-cmd-params-cm optional: true + - name: ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_BACKOFF_COOLDOWN_SECONDS + valueFrom: + configMapKeyRef: + key: 
controller.self.heal.backoff.cooldown.seconds + name: argocd-cmd-params-cm + optional: true - name: ARGOCD_APPLICATION_CONTROLLER_SYNC_TIMEOUT valueFrom: configMapKeyRef: diff --git a/manifests/ha/install.yaml b/manifests/ha/install.yaml index f3153be54bb5e..94c355060bd35 100644 --- a/manifests/ha/install.yaml +++ b/manifests/ha/install.yaml @@ -27335,6 +27335,12 @@ spec: key: controller.self.heal.backoff.cap.seconds name: argocd-cmd-params-cm optional: true + - name: ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_BACKOFF_COOLDOWN_SECONDS + valueFrom: + configMapKeyRef: + key: controller.self.heal.backoff.cooldown.seconds + name: argocd-cmd-params-cm + optional: true - name: ARGOCD_APPLICATION_CONTROLLER_SYNC_TIMEOUT valueFrom: configMapKeyRef: diff --git a/manifests/ha/namespace-install-with-hydrator.yaml b/manifests/ha/namespace-install-with-hydrator.yaml index 5ce09327ce029..251a8e6adba3d 100644 --- a/manifests/ha/namespace-install-with-hydrator.yaml +++ b/manifests/ha/namespace-install-with-hydrator.yaml @@ -3310,6 +3310,12 @@ spec: key: controller.self.heal.backoff.cap.seconds name: argocd-cmd-params-cm optional: true + - name: ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_BACKOFF_COOLDOWN_SECONDS + valueFrom: + configMapKeyRef: + key: controller.self.heal.backoff.cooldown.seconds + name: argocd-cmd-params-cm + optional: true - name: ARGOCD_APPLICATION_CONTROLLER_SYNC_TIMEOUT valueFrom: configMapKeyRef: diff --git a/manifests/ha/namespace-install.yaml b/manifests/ha/namespace-install.yaml index 6cdf81e20b1ad..db6d2f97e5892 100644 --- a/manifests/ha/namespace-install.yaml +++ b/manifests/ha/namespace-install.yaml @@ -3146,6 +3146,12 @@ spec: key: controller.self.heal.backoff.cap.seconds name: argocd-cmd-params-cm optional: true + - name: ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_BACKOFF_COOLDOWN_SECONDS + valueFrom: + configMapKeyRef: + key: controller.self.heal.backoff.cooldown.seconds + name: argocd-cmd-params-cm + optional: true - name: 
ARGOCD_APPLICATION_CONTROLLER_SYNC_TIMEOUT valueFrom: configMapKeyRef: diff --git a/manifests/install-with-hydrator.yaml b/manifests/install-with-hydrator.yaml index 44c8a50ab3161..d1cdc23c9ee3b 100644 --- a/manifests/install-with-hydrator.yaml +++ b/manifests/install-with-hydrator.yaml @@ -26543,6 +26543,12 @@ spec: key: controller.self.heal.backoff.cap.seconds name: argocd-cmd-params-cm optional: true + - name: ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_BACKOFF_COOLDOWN_SECONDS + valueFrom: + configMapKeyRef: + key: controller.self.heal.backoff.cooldown.seconds + name: argocd-cmd-params-cm + optional: true - name: ARGOCD_APPLICATION_CONTROLLER_SYNC_TIMEOUT valueFrom: configMapKeyRef: diff --git a/manifests/install.yaml b/manifests/install.yaml index 3f25c1287a6ca..d6e75b197c5e5 100644 --- a/manifests/install.yaml +++ b/manifests/install.yaml @@ -26377,6 +26377,12 @@ spec: key: controller.self.heal.backoff.cap.seconds name: argocd-cmd-params-cm optional: true + - name: ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_BACKOFF_COOLDOWN_SECONDS + valueFrom: + configMapKeyRef: + key: controller.self.heal.backoff.cooldown.seconds + name: argocd-cmd-params-cm + optional: true - name: ARGOCD_APPLICATION_CONTROLLER_SYNC_TIMEOUT valueFrom: configMapKeyRef: diff --git a/manifests/namespace-install-with-hydrator.yaml b/manifests/namespace-install-with-hydrator.yaml index 0dd6a114756c7..fb86245be7d69 100644 --- a/manifests/namespace-install-with-hydrator.yaml +++ b/manifests/namespace-install-with-hydrator.yaml @@ -2354,6 +2354,12 @@ spec: key: controller.self.heal.backoff.cap.seconds name: argocd-cmd-params-cm optional: true + - name: ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_BACKOFF_COOLDOWN_SECONDS + valueFrom: + configMapKeyRef: + key: controller.self.heal.backoff.cooldown.seconds + name: argocd-cmd-params-cm + optional: true - name: ARGOCD_APPLICATION_CONTROLLER_SYNC_TIMEOUT valueFrom: configMapKeyRef: diff --git a/manifests/namespace-install.yaml b/manifests/namespace-install.yaml 
index 85aee13abdc58..31c6c7e7291a0 100644 --- a/manifests/namespace-install.yaml +++ b/manifests/namespace-install.yaml @@ -2188,6 +2188,12 @@ spec: key: controller.self.heal.backoff.cap.seconds name: argocd-cmd-params-cm optional: true + - name: ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_BACKOFF_COOLDOWN_SECONDS + valueFrom: + configMapKeyRef: + key: controller.self.heal.backoff.cooldown.seconds + name: argocd-cmd-params-cm + optional: true - name: ARGOCD_APPLICATION_CONTROLLER_SYNC_TIMEOUT valueFrom: configMapKeyRef: