Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions deploy/crds/metal3.io_baremetalhosts_crd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -297,6 +297,10 @@ spec:
status:
description: BareMetalHostStatus defines the observed state of BareMetalHost
properties:
errorCount:
description: ErrorCount records how many times the host has encoutered
an error
type: integer
errorMessage:
description: the last error message reported by the provisioning subsystem
type: string
Expand Down Expand Up @@ -689,6 +693,7 @@ spec:
type: string
type: object
required:
- errorCount
- errorMessage
- hardwareProfile
- operationHistory
Expand Down
8 changes: 8 additions & 0 deletions pkg/apis/metal3/v1alpha1/baremetalhost_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -529,6 +529,9 @@ type BareMetalHostStatus struct {
// OperationHistory holds information about operations performed
// on this host.
OperationHistory OperationHistory `json:"operationHistory"`

// ErrorCount records how many times the host has encoutered an error
ErrorCount int `json:"errorCount"`
}

// ProvisionStatus holds the state information for a single target.
Expand Down Expand Up @@ -781,6 +784,11 @@ func (host *BareMetalHost) UpdateTriedCredentials(currentSecret corev1.Secret) {
}
}

// IncrementErrorCount increments the error number
func (host *BareMetalHost) IncrementErrorCount() {
host.Status.ErrorCount++
}

// NewEvent creates a new event associated with the object and ready
// to be published to the kubernetes API.
func (host *BareMetalHost) NewEvent(reason, message string) corev1.Event {
Expand Down
22 changes: 19 additions & 3 deletions pkg/controller/baremetalhost/action_result.go
Original file line number Diff line number Diff line change
@@ -1,12 +1,17 @@
package baremetalhost

import (
"math"

metal3 "github.com/metal3-io/baremetal-operator/pkg/apis/metal3/v1alpha1"

"sigs.k8s.io/controller-runtime/pkg/reconcile"
"time"

"sigs.k8s.io/controller-runtime/pkg/reconcile"
)

const maxBackOff = time.Hour * 24
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I feel like this could probably go as short as an hour or two.


// actionResult is an interface that encapsulates the result of a Reconcile
// call, as returned by the action corresponding to the current state.
type actionResult interface {
Expand Down Expand Up @@ -90,11 +95,22 @@ func (r actionError) Dirty() bool {
// actionFailed is a result indicating that the current action has failed,
// and that the resource should be marked as in error.
type actionFailed struct {
dirty bool
ErrorType metal3.ErrorType
dirty bool
ErrorType metal3.ErrorType
errorCount int
}

func calculateBackoff(errorCount int, max time.Duration) time.Duration {
backOff := math.Exp2(float64(errorCount))
backOffDuration := time.Second * time.Duration(backOff)
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

2s is a very short back-off to start with for an operation as long as e.g. provisioning. The user may not even have time to notice that it has failed. Maybe s/Second/Minute/ here?

if backOffDuration.Milliseconds() > max.Milliseconds() {
return max
}
return backOffDuration
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Having fixed delays (even exponentially increasing ones) is prone to causing thundering herd problems. We should at least add some jitter on top. (We could go as far as to implement exponential backoff in the CSMA sense, where we wait for a random interval between 0 and backOffDuration, but resource contention isn't our primary reason for backing off here so my instinct is that that would be overkill.)

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We have a separate need to throttle the number of things we ask ironic to do at once anyway, so maybe we can solve the herd problem that way to avoid complicating the logic here?

}

func (r actionFailed) Result() (result reconcile.Result, err error) {
result.RequeueAfter = calculateBackoff(r.errorCount, maxBackOff)
return
}

Expand Down
11 changes: 10 additions & 1 deletion pkg/controller/baremetalhost/baremetalhost_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -347,7 +347,8 @@ func recordActionFailure(info *reconcileInfo, errorType metal3v1alpha1.ErrorType

info.publishEvent(eventType, errorMessage)
}
return actionFailed{dirty: dirty, ErrorType: errorType}
errorCount := info.host.Status.ErrorCount
return actionFailed{dirty: dirty, ErrorType: errorType, errorCount: errorCount}
}

func (r *ReconcileBareMetalHost) credentialsErrorResult(err error, request reconcile.Request, host *metal3v1alpha1.BareMetalHost) (reconcile.Result, error) {
Expand Down Expand Up @@ -495,6 +496,7 @@ func (r *ReconcileBareMetalHost) actionRegistering(prov provisioner.Provisioner,
info.log.Info("response from validate", "provResult", provResult)

if provResult.ErrorMessage != "" {
info.host.IncrementErrorCount()
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Any reason not to put this inside recordActionFailure?

return recordActionFailure(info, metal3v1alpha1.RegistrationError, provResult.ErrorMessage)
}

Expand Down Expand Up @@ -532,6 +534,7 @@ func (r *ReconcileBareMetalHost) actionInspecting(prov provisioner.Provisioner,
}

if provResult.ErrorMessage != "" {
info.host.IncrementErrorCount()
return recordActionFailure(info, metal3v1alpha1.InspectionError, provResult.ErrorMessage)
}

Expand Down Expand Up @@ -615,6 +618,7 @@ func (r *ReconcileBareMetalHost) actionProvisioning(prov provisioner.Provisioner

if provResult.ErrorMessage != "" {
info.log.Info("handling provisioning error in controller")
info.host.IncrementErrorCount()
return recordActionFailure(info, metal3v1alpha1.ProvisioningError, provResult.ErrorMessage)
}

Expand Down Expand Up @@ -653,6 +657,7 @@ func (r *ReconcileBareMetalHost) actionDeprovisioning(prov provisioner.Provision
}

if provResult.ErrorMessage != "" {
info.host.IncrementErrorCount()
return recordActionFailure(info, metal3v1alpha1.ProvisioningError, provResult.ErrorMessage)
}

Expand Down Expand Up @@ -687,6 +692,7 @@ func (r *ReconcileBareMetalHost) manageHostPower(prov provisioner.Provisioner, i
}

if provResult.ErrorMessage != "" {
info.host.IncrementErrorCount()
return recordActionFailure(info, metal3v1alpha1.PowerManagementError, provResult.ErrorMessage)
}

Expand Down Expand Up @@ -738,6 +744,7 @@ func (r *ReconcileBareMetalHost) manageHostPower(prov provisioner.Provisioner, i
}

if provResult.ErrorMessage != "" {
info.host.IncrementErrorCount()
return recordActionFailure(info, metal3v1alpha1.PowerManagementError, provResult.ErrorMessage)
}

Expand Down Expand Up @@ -774,6 +781,7 @@ func (r *ReconcileBareMetalHost) actionManageSteadyState(prov provisioner.Provis
return actionError{err}
}
if provResult.ErrorMessage != "" {
info.host.IncrementErrorCount()
return recordActionFailure(info, metal3v1alpha1.RegistrationError, provResult.ErrorMessage)
}
if provResult.Dirty {
Expand All @@ -800,6 +808,7 @@ func (r *ReconcileBareMetalHost) actionManageReady(prov provisioner.Provisioner,
return actionError{err}
}
if provResult.ErrorMessage != "" {
info.host.IncrementErrorCount()
return recordActionFailure(info, metal3v1alpha1.RegistrationError, provResult.ErrorMessage)
}
if provResult.Dirty {
Expand Down