diff --git a/deploy/crds/metal3.io_baremetalhosts_crd.yaml b/deploy/crds/metal3.io_baremetalhosts_crd.yaml index 8f1ba48361..33d9a09358 100644 --- a/deploy/crds/metal3.io_baremetalhosts_crd.yaml +++ b/deploy/crds/metal3.io_baremetalhosts_crd.yaml @@ -297,6 +297,10 @@ spec: status: description: BareMetalHostStatus defines the observed state of BareMetalHost properties: + errorCount: + description: ErrorCount records how many times the host has encountered + an error + type: integer errorMessage: description: the last error message reported by the provisioning subsystem type: string @@ -689,6 +693,7 @@ spec: type: string type: object required: + - errorCount - errorMessage - hardwareProfile - operationHistory diff --git a/pkg/apis/metal3/v1alpha1/baremetalhost_types.go b/pkg/apis/metal3/v1alpha1/baremetalhost_types.go index f419850bca..c2126690d2 100644 --- a/pkg/apis/metal3/v1alpha1/baremetalhost_types.go +++ b/pkg/apis/metal3/v1alpha1/baremetalhost_types.go @@ -529,6 +529,9 @@ type BareMetalHostStatus struct { // OperationHistory holds information about operations performed // on this host. OperationHistory OperationHistory `json:"operationHistory"` + + // ErrorCount records how many times the host has encountered an error + ErrorCount int `json:"errorCount"` } // ProvisionStatus holds the state information for a single target. @@ -781,6 +784,11 @@ func (host *BareMetalHost) UpdateTriedCredentials(currentSecret corev1.Secret) { } } +// IncrementErrorCount increments the host's error count by one +func (host *BareMetalHost) IncrementErrorCount() { + host.Status.ErrorCount++ +} + // NewEvent creates a new event associated with the object and ready // to be published to the kubernetes API. 
func (host *BareMetalHost) NewEvent(reason, message string) corev1.Event { diff --git a/pkg/controller/baremetalhost/action_result.go b/pkg/controller/baremetalhost/action_result.go index 6466a4658b..003c805772 100644 --- a/pkg/controller/baremetalhost/action_result.go +++ b/pkg/controller/baremetalhost/action_result.go @@ -1,12 +1,17 @@ package baremetalhost import ( + "math" + metal3 "github.com/metal3-io/baremetal-operator/pkg/apis/metal3/v1alpha1" - "sigs.k8s.io/controller-runtime/pkg/reconcile" "time" + + "sigs.k8s.io/controller-runtime/pkg/reconcile" ) +const maxBackOff = time.Hour * 24 + // actionResult is an interface that encapsulates the result of a Reconcile // call, as returned by the action corresponding to the current state. type actionResult interface { @@ -90,11 +95,26 @@ func (r actionError) Dirty() bool { // actionFailed is a result indicating that the current action has failed, // and that the resource should be marked as in error. type actionFailed struct { - dirty bool - ErrorType metal3.ErrorType + dirty bool + ErrorType metal3.ErrorType + errorCount int +} + +// calculateBackoff returns the delay before the next retry: 2^errorCount +// seconds, capped at max. +func calculateBackoff(errorCount int, max time.Duration) time.Duration { + backOff := math.Exp2(float64(errorCount)) + // Compare in float seconds before building the Duration: from errorCount + // 34 upward, 2^errorCount seconds no longer fits in int64 nanoseconds and + // the product would wrap around to a negative delay. + if backOff > max.Seconds() { + return max + } + return time.Second * time.Duration(backOff) } func (r actionFailed) Result() (result reconcile.Result, err error) { + result.RequeueAfter = calculateBackoff(r.errorCount, maxBackOff) return } diff --git a/pkg/controller/baremetalhost/baremetalhost_controller.go b/pkg/controller/baremetalhost/baremetalhost_controller.go index 9590c1f371..6c4d9812b2 100644 --- a/pkg/controller/baremetalhost/baremetalhost_controller.go +++ b/pkg/controller/baremetalhost/baremetalhost_controller.go @@ -347,7 +347,8 @@ func recordActionFailure(info *reconcileInfo, errorType metal3v1alpha1.ErrorType info.publishEvent(eventType, errorMessage) } - return actionFailed{dirty: dirty, 
ErrorType: errorType} + errorCount := info.host.Status.ErrorCount + return actionFailed{dirty: dirty, ErrorType: errorType, errorCount: errorCount} } func (r *ReconcileBareMetalHost) credentialsErrorResult(err error, request reconcile.Request, host *metal3v1alpha1.BareMetalHost) (reconcile.Result, error) { @@ -495,6 +496,7 @@ func (r *ReconcileBareMetalHost) actionRegistering(prov provisioner.Provisioner, info.log.Info("response from validate", "provResult", provResult) if provResult.ErrorMessage != "" { + info.host.IncrementErrorCount() return recordActionFailure(info, metal3v1alpha1.RegistrationError, provResult.ErrorMessage) } @@ -532,6 +534,7 @@ func (r *ReconcileBareMetalHost) actionInspecting(prov provisioner.Provisioner, } if provResult.ErrorMessage != "" { + info.host.IncrementErrorCount() return recordActionFailure(info, metal3v1alpha1.InspectionError, provResult.ErrorMessage) } @@ -615,6 +618,7 @@ func (r *ReconcileBareMetalHost) actionProvisioning(prov provisioner.Provisioner if provResult.ErrorMessage != "" { info.log.Info("handling provisioning error in controller") + info.host.IncrementErrorCount() return recordActionFailure(info, metal3v1alpha1.ProvisioningError, provResult.ErrorMessage) } @@ -653,6 +657,7 @@ func (r *ReconcileBareMetalHost) actionDeprovisioning(prov provisioner.Provision } if provResult.ErrorMessage != "" { + info.host.IncrementErrorCount() return recordActionFailure(info, metal3v1alpha1.ProvisioningError, provResult.ErrorMessage) } @@ -687,6 +692,7 @@ func (r *ReconcileBareMetalHost) manageHostPower(prov provisioner.Provisioner, i } if provResult.ErrorMessage != "" { + info.host.IncrementErrorCount() return recordActionFailure(info, metal3v1alpha1.PowerManagementError, provResult.ErrorMessage) } @@ -738,6 +744,7 @@ func (r *ReconcileBareMetalHost) manageHostPower(prov provisioner.Provisioner, i } if provResult.ErrorMessage != "" { + info.host.IncrementErrorCount() return recordActionFailure(info, 
metal3v1alpha1.PowerManagementError, provResult.ErrorMessage) } @@ -774,6 +781,7 @@ func (r *ReconcileBareMetalHost) actionManageSteadyState(prov provisioner.Provis return actionError{err} } if provResult.ErrorMessage != "" { + info.host.IncrementErrorCount() return recordActionFailure(info, metal3v1alpha1.RegistrationError, provResult.ErrorMessage) } if provResult.Dirty { @@ -800,6 +808,7 @@ func (r *ReconcileBareMetalHost) actionManageReady(prov provisioner.Provisioner, return actionError{err} } if provResult.ErrorMessage != "" { + info.host.IncrementErrorCount() return recordActionFailure(info, metal3v1alpha1.RegistrationError, provResult.ErrorMessage) } if provResult.Dirty {