-
Notifications
You must be signed in to change notification settings - Fork 311
Always retry provisioning operations on failure #584
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,12 +1,17 @@ | ||
| package baremetalhost | ||
|
|
||
| import ( | ||
| "math" | ||
|
|
||
| metal3 "github.com/metal3-io/baremetal-operator/pkg/apis/metal3/v1alpha1" | ||
|
|
||
| "sigs.k8s.io/controller-runtime/pkg/reconcile" | ||
| "time" | ||
|
|
||
| "sigs.k8s.io/controller-runtime/pkg/reconcile" | ||
| ) | ||
|
|
||
| const maxBackOff = time.Hour * 24 | ||
|
|
||
| // actionResult is an interface that encapsulates the result of a Reconcile | ||
| // call, as returned by the action corresponding to the current state. | ||
| type actionResult interface { | ||
|
|
@@ -90,11 +95,22 @@ func (r actionError) Dirty() bool { | |
| // actionFailed is a result indicating that the current action has failed, | ||
| // and that the resource should be marked as in error. | ||
| type actionFailed struct { | ||
| dirty bool | ||
| ErrorType metal3.ErrorType | ||
| dirty bool | ||
| ErrorType metal3.ErrorType | ||
| errorCount int | ||
| } | ||
|
|
||
// calculateBackoff returns the delay to wait before retrying after
// errorCount consecutive failures. The delay is 2^errorCount seconds,
// truncated to whole seconds and capped at max.
//
// The cap is applied in floating-point seconds *before* the value is
// converted to a time.Duration. Converting first (as the previous version
// did) overflows the int64 nanosecond representation once errorCount
// reaches 34, yielding a wrapped, possibly negative duration that slips
// past the cap and is returned as-is.
//
// NOTE(review): reviewers on this change raised two points that are
// deliberately left open here: (1) a one-second base unit means the first
// retry happens after only 2s, which may be too short for long operations
// such as provisioning (using a minute was suggested); (2) fixed delays
// without jitter are prone to thundering-herd retries, although throttling
// the number of concurrent requests elsewhere was suggested as an
// alternative to adding jitter here.
func calculateBackoff(errorCount int, max time.Duration) time.Duration {
	// Floor mirrors the truncation the float -> Duration conversion would
	// perform, so in-range results are unchanged.
	seconds := math.Floor(math.Exp2(float64(errorCount)))

	// Exp2 saturates to +Inf for very large counts, which also lands here.
	if seconds > max.Seconds() {
		return max
	}

	// seconds <= max.Seconds(), so the product cannot overflow.
	return time.Second * time.Duration(seconds)
}
|
|
||
| func (r actionFailed) Result() (result reconcile.Result, err error) { | ||
| result.RequeueAfter = calculateBackoff(r.errorCount, maxBackOff) | ||
| return | ||
| } | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -347,7 +347,8 @@ func recordActionFailure(info *reconcileInfo, errorType metal3v1alpha1.ErrorType | |
|
|
||
| info.publishEvent(eventType, errorMessage) | ||
| } | ||
| return actionFailed{dirty: dirty, ErrorType: errorType} | ||
| errorCount := info.host.Status.ErrorCount | ||
| return actionFailed{dirty: dirty, ErrorType: errorType, errorCount: errorCount} | ||
| } | ||
|
|
||
| func (r *ReconcileBareMetalHost) credentialsErrorResult(err error, request reconcile.Request, host *metal3v1alpha1.BareMetalHost) (reconcile.Result, error) { | ||
|
|
@@ -495,6 +496,7 @@ func (r *ReconcileBareMetalHost) actionRegistering(prov provisioner.Provisioner, | |
| info.log.Info("response from validate", "provResult", provResult) | ||
|
|
||
| if provResult.ErrorMessage != "" { | ||
| info.host.IncrementErrorCount() | ||
|
Member
Any reason not to put this inside recordActionFailure? |
||
| return recordActionFailure(info, metal3v1alpha1.RegistrationError, provResult.ErrorMessage) | ||
| } | ||
|
|
||
|
|
@@ -532,6 +534,7 @@ func (r *ReconcileBareMetalHost) actionInspecting(prov provisioner.Provisioner, | |
| } | ||
|
|
||
| if provResult.ErrorMessage != "" { | ||
| info.host.IncrementErrorCount() | ||
| return recordActionFailure(info, metal3v1alpha1.InspectionError, provResult.ErrorMessage) | ||
| } | ||
|
|
||
|
|
@@ -615,6 +618,7 @@ func (r *ReconcileBareMetalHost) actionProvisioning(prov provisioner.Provisioner | |
|
|
||
| if provResult.ErrorMessage != "" { | ||
| info.log.Info("handling provisioning error in controller") | ||
| info.host.IncrementErrorCount() | ||
| return recordActionFailure(info, metal3v1alpha1.ProvisioningError, provResult.ErrorMessage) | ||
| } | ||
|
|
||
|
|
@@ -653,6 +657,7 @@ func (r *ReconcileBareMetalHost) actionDeprovisioning(prov provisioner.Provision | |
| } | ||
|
|
||
| if provResult.ErrorMessage != "" { | ||
| info.host.IncrementErrorCount() | ||
| return recordActionFailure(info, metal3v1alpha1.ProvisioningError, provResult.ErrorMessage) | ||
| } | ||
|
|
||
|
|
@@ -687,6 +692,7 @@ func (r *ReconcileBareMetalHost) manageHostPower(prov provisioner.Provisioner, i | |
| } | ||
|
|
||
| if provResult.ErrorMessage != "" { | ||
| info.host.IncrementErrorCount() | ||
| return recordActionFailure(info, metal3v1alpha1.PowerManagementError, provResult.ErrorMessage) | ||
| } | ||
|
|
||
|
|
@@ -738,6 +744,7 @@ func (r *ReconcileBareMetalHost) manageHostPower(prov provisioner.Provisioner, i | |
| } | ||
|
|
||
| if provResult.ErrorMessage != "" { | ||
| info.host.IncrementErrorCount() | ||
| return recordActionFailure(info, metal3v1alpha1.PowerManagementError, provResult.ErrorMessage) | ||
| } | ||
|
|
||
|
|
@@ -774,6 +781,7 @@ func (r *ReconcileBareMetalHost) actionManageSteadyState(prov provisioner.Provis | |
| return actionError{err} | ||
| } | ||
| if provResult.ErrorMessage != "" { | ||
| info.host.IncrementErrorCount() | ||
| return recordActionFailure(info, metal3v1alpha1.RegistrationError, provResult.ErrorMessage) | ||
| } | ||
| if provResult.Dirty { | ||
|
|
@@ -800,6 +808,7 @@ func (r *ReconcileBareMetalHost) actionManageReady(prov provisioner.Provisioner, | |
| return actionError{err} | ||
| } | ||
| if provResult.ErrorMessage != "" { | ||
| info.host.IncrementErrorCount() | ||
| return recordActionFailure(info, metal3v1alpha1.RegistrationError, provResult.ErrorMessage) | ||
| } | ||
| if provResult.Dirty { | ||
|
|
||
I feel like this could probably go as short as an hour or two.