Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion apis/metal3.io/v1alpha1/baremetalhost_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,8 @@ const (
// OperationalStatusError is the status value for when the host
// has any sort of error.
OperationalStatusError OperationalStatus = "error"

// OperationalStatusDelayed is the status value for when an action on
// the host has been postponed because the provisioner currently has
// no provisioning capacity available.
OperationalStatusDelayed OperationalStatus = "delayed"
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm wondering if there is a better name for this, but I can't think of one.

)

// ErrorType indicates the class of problem that has caused the Host resource
Expand Down Expand Up @@ -523,7 +525,7 @@ type BareMetalHostStatus struct {
// after modifying this file

// OperationalStatus holds the status of the host
// +kubebuilder:validation:Enum="";OK;discovered;error
// +kubebuilder:validation:Enum="";OK;discovered;error;delayed
OperationalStatus OperationalStatus `json:"operationalStatus"`

// ErrorType indicates the type of failure encountered when the
Expand Down
1 change: 1 addition & 0 deletions config/crd/bases/metal3.io_baremetalhosts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -525,6 +525,7 @@ spec:
- OK
- discovered
- error
- delayed
type: string
poweredOn:
description: indicator for whether or not the host is powered on
Expand Down
1 change: 1 addition & 0 deletions config/render/capm3.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -523,6 +523,7 @@ spec:
- OK
- discovered
- error
- delayed
type: string
poweredOn:
description: indicator for whether or not the host is powered on
Expand Down
28 changes: 26 additions & 2 deletions controllers/metal3.io/action_result.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,9 @@ import (
"github.com/metal3-io/baremetal-operator/pkg/provisioner"
)

const maxBackOffCount = 10
// This is an upper limit for the ErrorCount, so that the max backoff
// timeout will not exceed (roughly) 8 hours
const maxBackOffCount = 9
Comment thread
andfasano marked this conversation as resolved.

func init() {
rand.Seed(time.Now().UTC().UnixNano())
Expand Down Expand Up @@ -54,6 +56,18 @@ func (r actionUpdate) Dirty() bool {
return true
}

// actionDelayed behaves like an actionUpdate, except that the requeue
// time is calculated using a fixed backoff with jitter.
type actionDelayed struct {
actionUpdate
}

// Result asks the controller to requeue the host after the backoff
// interval that calculateBackoff yields for an error count of 1.
// The returned error is always nil.
func (r actionDelayed) Result() (result reconcile.Result, err error) {
	result = reconcile.Result{
		Requeue:      true,
		RequeueAfter: calculateBackoff(1),
	}
	return result, nil
}

// actionComplete is a result indicating that the current action has completed,
// and that the resource should transition to the next state.
type actionComplete struct {
Expand Down Expand Up @@ -110,6 +124,16 @@ type actionFailed struct {
errorCount int
}

// Distribution sample for errorCount values:
// 1 [1m, 2m]
// 2 [2m, 4m]
// 3 [4m, 8m]
// 4 [8m, 16m]
// 5 [16m, 32m]
// 6 [32m, 1h4m]
// 7 [1h4m, 2h8m]
// 8 [2h8m, 4h16m]
// 9 [4h16m, 8h32m]
func calculateBackoff(errorCount int) time.Duration {

if errorCount > maxBackOffCount {
Expand All @@ -119,7 +143,7 @@ func calculateBackoff(errorCount int) time.Duration {
base := math.Exp2(float64(errorCount))
/* #nosec */
backOff := base - (rand.Float64() * base * 0.5)
backOffDuration := time.Minute * time.Duration(backOff)
backOffDuration := time.Duration(float64(time.Minute) * backOff)
return backOffDuration
}

Expand Down
2 changes: 1 addition & 1 deletion controllers/metal3.io/action_result_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ import (
func TestBackoffIncrements(t *testing.T) {

var backOff time.Duration
for i := 0; i < maxBackOffCount; i++ {
for i := 1; i <= maxBackOffCount; i++ {
prev := backOff
backOff = calculateBackoff(i)

Expand Down
13 changes: 9 additions & 4 deletions controllers/metal3.io/baremetalhost_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,6 @@ const (
rebootAnnotationPrefix = "reboot.metal3.io"
)

func init() {
}

// BareMetalHostReconciler reconciles a BareMetalHost object
type BareMetalHostReconciler struct {
client.Client
Expand Down Expand Up @@ -214,7 +211,6 @@ func (r *BareMetalHostReconciler) Reconcile(request ctrl.Request) (result ctrl.R
}

ready, err := prov.IsReady()

if err != nil {
return ctrl.Result{}, errors.Wrap(err, "failed to check services availability")
}
Expand Down Expand Up @@ -295,6 +291,15 @@ func recordActionFailure(info *reconcileInfo, errorType metal3v1alpha1.ErrorType
return actionFailed{dirty: true, ErrorType: errorType, errorCount: info.host.Status.ErrorCount}
}

// recordActionDelayed marks the host as delayed and returns an
// actionDelayed result. It also schedules an increment of the
// delayed-provisioning counter for this host as a post-save callback.
func recordActionDelayed(info *reconcileInfo) actionResult {
	labels := hostMetricLabels(info.request)
	delayedCounter := delayedProvisioningHostCounters.With(labels)

	// The counter is bumped only through the post-save callbacks.
	info.postSaveCallbacks = append(info.postSaveCallbacks, delayedCounter.Inc)

	info.host.SetOperationalStatus(metal3v1alpha1.OperationalStatusDelayed)

	return actionDelayed{}
}

func (r *BareMetalHostReconciler) credentialsErrorResult(err error, request ctrl.Request, host *metal3v1alpha1.BareMetalHost) (ctrl.Result, error) {
switch err.(type) {
// In the event a credential secret is defined, but we cannot find it
Expand Down
12 changes: 7 additions & 5 deletions controllers/metal3.io/baremetalhost_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1290,12 +1290,14 @@ func TestUpdateEventHandler(t *testing.T) {

// TestErrorCountIncrementsAlways verifies that every call to
// setErrorMessage increases the host ErrorCount by exactly one,
// whatever the error type being recorded.
func TestErrorCountIncrementsAlways(t *testing.T) {

	errorTypes := []metal3v1alpha1.ErrorType{metal3v1alpha1.RegistrationError, metal3v1alpha1.InspectionError, metal3v1alpha1.ProvisioningError, metal3v1alpha1.PowerManagementError}

	b := &metal3v1alpha1.BareMetalHost{}
	// assert.Equal takes (t, expected, actual); a fresh host starts at zero.
	assert.Equal(t, 0, b.Status.ErrorCount)

	for _, c := range errorTypes {
		before := b.Status.ErrorCount
		setErrorMessage(b, c, "An error message")
		assert.Equal(t, before+1, b.Status.ErrorCount)
	}
}
67 changes: 64 additions & 3 deletions controllers/metal3.io/host_state_machine.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
metal3v1alpha1 "github.com/metal3-io/baremetal-operator/apis/metal3.io/v1alpha1"
"github.com/metal3-io/baremetal-operator/pkg/provisioner"

"github.com/pkg/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

Expand Down Expand Up @@ -77,9 +78,33 @@ func recordStateEnd(info *reconcileInfo, host *metal3v1alpha1.BareMetalHost, sta
return
}

// ensureProvisioningCapacity asks the provisioner whether it has
// provisioning capacity available. It returns an actionError when the
// query fails, the result of recordActionDelayed when there is no
// capacity, and nil when the host is free to proceed.
func (hsm *hostStateMachine) ensureProvisioningCapacity(info *reconcileInfo) actionResult {
	available, err := hsm.Provisioner.HasProvisioningCapacity()

	switch {
	case err != nil:
		return actionError{errors.Wrap(err, "failed to get hosts currently being provisioned")}
	case !available:
		return recordActionDelayed(info)
	default:
		return nil
	}
}

func (hsm *hostStateMachine) updateHostStateFrom(initialState metal3v1alpha1.ProvisioningState,
info *reconcileInfo) {
info *reconcileInfo) actionResult {
if hsm.NextState != initialState {

// Check if there is a free slot available when trying to
// provision a host - if not, the action will be delayed.
// The check is limited to only the provisioning states to
// avoid putting excessive pressure on the provisioner
switch hsm.NextState {
case metal3v1alpha1.StateInspecting, metal3v1alpha1.StateProvisioning:
Comment thread
andfasano marked this conversation as resolved.
if actionRes := hsm.ensureProvisioningCapacity(info); actionRes != nil {
return actionRes
}
}

info.log.Info("changing provisioning state",
"old", initialState,
"new", hsm.NextState)
Expand Down Expand Up @@ -109,11 +134,47 @@ func (hsm *hostStateMachine) updateHostStateFrom(initialState metal3v1alpha1.Pro
}
}
}

return nil
}

func (hsm *hostStateMachine) ReconcileState(info *reconcileInfo) actionResult {
// checkDelayedHost applies the provisioning-capacity check before the
// host state is reconciled. It returns a non-nil actionResult when the
// current reconciliation must stop here, and nil when it can continue.
func (hsm *hostStateMachine) checkDelayedHost(info *reconcileInfo) actionResult {
	host := info.host

	// Hosts that have been previously delayed: check if a slot is free now.
	if host.Status.OperationalStatus == metal3v1alpha1.OperationalStatusDelayed {
		if blocked := hsm.ensureProvisioningCapacity(info); blocked != nil {
			return blocked
		}

		// A slot is available, let's clean up the status and retry
		clearError(host)
		return actionUpdate{}
	}

	// Make sure the check is re-applied when provisioning a
	// host not yet tracked by the provisioner
	state := host.Status.Provisioning.State
	if state == metal3v1alpha1.StateInspecting || state == metal3v1alpha1.StateProvisioning {
		return hsm.ensureProvisioningCapacity(info)
	}

	return nil
}

func (hsm *hostStateMachine) ReconcileState(info *reconcileInfo) (actionRes actionResult) {
initialState := hsm.Host.Status.Provisioning.State
defer hsm.updateHostStateFrom(initialState, info)

defer func() {
if overrideAction := hsm.updateHostStateFrom(initialState, info); overrideAction != nil {
actionRes = overrideAction
}
}()

if delayedResult := hsm.checkDelayedHost(info); delayedResult != nil {
return delayedResult
}

if hsm.checkInitiateDelete() {
info.log.Info("Initiating host deletion")
Expand Down
Loading