Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions deploy/crds/metal3.io_baremetalhosts_crd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -303,6 +303,10 @@ spec:
status:
description: BareMetalHostStatus defines the observed state of BareMetalHost
properties:
errorCount:
description: ErrorCount records how many times the host has encoutered
an error since the last successful operation
type: integer
errorMessage:
description: the last error message reported by the provisioning subsystem
type: string
Expand Down Expand Up @@ -702,6 +706,7 @@ spec:
type: string
type: object
required:
- errorCount
- errorMessage
- hardwareProfile
- operationHistory
Expand Down
29 changes: 13 additions & 16 deletions pkg/apis/metal3/v1alpha1/baremetalhost_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -545,6 +545,9 @@ type BareMetalHostStatus struct {
// OperationHistory holds information about operations performed
// on this host.
OperationHistory OperationHistory `json:"operationHistory"`

// ErrorCount records how many times the host has encoutered an error since the last successful operation
ErrorCount int `json:"errorCount"`
}

// ProvisionStatus holds the state information for a single target.
Expand Down Expand Up @@ -611,22 +614,12 @@ func (host *BareMetalHost) Available() bool {
}

// SetErrorMessage updates the ErrorMessage in the host Status struct
// when necessary and returns true when a change is made or false when
// no change is made.
func (host *BareMetalHost) SetErrorMessage(errType ErrorType, message string) (dirty bool) {
if host.Status.OperationalStatus != OperationalStatusError {
host.Status.OperationalStatus = OperationalStatusError
dirty = true
}
if host.Status.ErrorType != errType {
host.Status.ErrorType = errType
dirty = true
}
if host.Status.ErrorMessage != message {
host.Status.ErrorMessage = message
dirty = true
}
return dirty
// and increases the ErrorCount
func (host *BareMetalHost) SetErrorMessage(errType ErrorType, message string) {
host.Status.OperationalStatus = OperationalStatusError
host.Status.ErrorType = errType
host.Status.ErrorMessage = message
host.Status.ErrorCount++
}

// ClearError removes any existing error message.
Expand All @@ -641,6 +634,10 @@ func (host *BareMetalHost) ClearError() (dirty bool) {
host.Status.ErrorMessage = ""
dirty = true
}
if host.Status.ErrorCount != 0 {
host.Status.ErrorCount = 0
dirty = true
}
return dirty
}

Expand Down
36 changes: 36 additions & 0 deletions pkg/apis/metal3/v1alpha1/baremetalhost_types_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -554,3 +554,39 @@ func TestBootMode(t *testing.T) {
})
}
}

func TestErrorCountIncrementsAlways(t *testing.T) {

b := &BareMetalHost{}
assert.Equal(t, b.Status.ErrorCount, 0)

b.SetErrorMessage(RegistrationError, "An error message")
assert.Equal(t, b.Status.ErrorCount, 1)

b.SetErrorMessage(InspectionError, "Another error message")
assert.Equal(t, b.Status.ErrorCount, 2)
}

func TestClearErrorCount(t *testing.T) {

b := &BareMetalHost{
Status: BareMetalHostStatus{
ErrorCount: 5,
},
}

assert.True(t, b.ClearError())
assert.Equal(t, 0, b.Status.ErrorCount)
}

func TestClearErrorCountOnlyIfNotZero(t *testing.T) {

b := &BareMetalHost{
Status: BareMetalHostStatus{
ErrorCount: 5,
},
}

assert.True(t, b.ClearError())
assert.False(t, b.ClearError())
}
31 changes: 28 additions & 3 deletions pkg/controller/baremetalhost/action_result.go
Original file line number Diff line number Diff line change
@@ -1,12 +1,22 @@
package baremetalhost

import (
"math"
"math/rand"

metal3 "github.com/metal3-io/baremetal-operator/pkg/apis/metal3/v1alpha1"

"sigs.k8s.io/controller-runtime/pkg/reconcile"
"time"

"sigs.k8s.io/controller-runtime/pkg/reconcile"
)

const maxBackOffCount = 10

func init() {
rand.Seed(time.Now().UTC().UnixNano())
}

// actionResult is an interface that encapsulates the result of a Reconcile
// call, as returned by the action corresponding to the current state.
type actionResult interface {
Expand Down Expand Up @@ -90,11 +100,26 @@ func (r actionError) Dirty() bool {
// actionFailed is a result indicating that the current action has failed,
// and that the resource should be marked as in error.
type actionFailed struct {
dirty bool
ErrorType metal3.ErrorType
dirty bool
ErrorType metal3.ErrorType
errorCount int
}

func calculateBackoff(errorCount int) time.Duration {

if errorCount > maxBackOffCount {
errorCount = maxBackOffCount
}

base := math.Exp2(float64(errorCount))
/* #nosec */
backOff := base - (rand.Float64() * base * 0.5)
backOffDuration := time.Minute * time.Duration(backOff)
return backOffDuration
}

func (r actionFailed) Result() (result reconcile.Result, err error) {
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FWIW it would be fine to change the signature of Result() to pass the error count if it were more convenient for us to not calculate it until later.

result.RequeueAfter = calculateBackoff(r.errorCount)
return
}

Expand Down
30 changes: 30 additions & 0 deletions pkg/controller/baremetalhost/action_result_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
package baremetalhost

import (
"math"
"testing"
"time"

"github.com/stretchr/testify/assert"
)

func TestBackoffIncrements(t *testing.T) {

var backOff time.Duration
for i := 0; i < maxBackOffCount; i++ {
prev := backOff
backOff = calculateBackoff(i)

assert.GreaterOrEqual(t, backOff.Milliseconds(), prev.Milliseconds())
}

}

func TestMaxBackoffDuration(t *testing.T) {

maxBackOffDuration := (time.Minute * time.Duration(math.Exp2(float64(maxBackOffCount)))).Milliseconds()

assert.LessOrEqual(t, calculateBackoff(maxBackOffCount-1).Milliseconds(), maxBackOffDuration)
assert.LessOrEqual(t, calculateBackoff(maxBackOffCount+1).Milliseconds(), maxBackOffDuration)
assert.LessOrEqual(t, calculateBackoff(maxBackOffCount+100).Milliseconds(), maxBackOffDuration)
}
74 changes: 42 additions & 32 deletions pkg/controller/baremetalhost/baremetalhost_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,11 @@ import (
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/controller"
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
"sigs.k8s.io/controller-runtime/pkg/event"
"sigs.k8s.io/controller-runtime/pkg/handler"
logf "sigs.k8s.io/controller-runtime/pkg/log"
"sigs.k8s.io/controller-runtime/pkg/manager"
"sigs.k8s.io/controller-runtime/pkg/predicate"
"sigs.k8s.io/controller-runtime/pkg/reconcile"
"sigs.k8s.io/controller-runtime/pkg/source"
)
Expand Down Expand Up @@ -115,7 +117,18 @@ func add(mgr manager.Manager, r reconcile.Reconciler) error {

// Watch for changes to primary resource BareMetalHost
err = c.Watch(&source.Kind{Type: &metal3v1alpha1.BareMetalHost{}},
&handler.EnqueueRequestForObject{})
&handler.EnqueueRequestForObject{}, predicate.Funcs{
UpdateFunc: func(e event.UpdateEvent) bool {
oldHost := e.ObjectOld.(*metal3v1alpha1.BareMetalHost)
newHost := e.ObjectNew.(*metal3v1alpha1.BareMetalHost)

if oldHost.Status.ErrorCount != newHost.Status.ErrorCount {
Comment thread
andfasano marked this conversation as resolved.
//skip reconcile loop
return false
}
return true
},
})
if err != nil {
return err
}
Expand Down Expand Up @@ -351,21 +364,22 @@ func logResult(info *reconcileInfo, result reconcile.Result) {
}

func recordActionFailure(info *reconcileInfo, errorType metal3v1alpha1.ErrorType, errorMessage string) actionFailed {
dirty := info.host.SetErrorMessage(errorType, errorMessage)
if dirty {
eventType := map[metal3v1alpha1.ErrorType]string{
metal3v1alpha1.RegistrationError: "RegistrationError",
metal3v1alpha1.InspectionError: "InspectionError",
metal3v1alpha1.ProvisioningError: "ProvisioningError",
metal3v1alpha1.PowerManagementError: "PowerManagementError",
}[errorType]

counter := actionFailureCounters.WithLabelValues(eventType)
info.postSaveCallbacks = append(info.postSaveCallbacks, counter.Inc)
info.host.SetErrorMessage(errorType, errorMessage)

info.publishEvent(eventType, errorMessage)
}
return actionFailed{dirty: dirty, ErrorType: errorType}
eventType := map[metal3v1alpha1.ErrorType]string{
metal3v1alpha1.RegistrationError: "RegistrationError",
metal3v1alpha1.InspectionError: "InspectionError",
metal3v1alpha1.ProvisioningError: "ProvisioningError",
metal3v1alpha1.PowerManagementError: "PowerManagementError",
}[errorType]

counter := actionFailureCounters.WithLabelValues(eventType)
info.postSaveCallbacks = append(info.postSaveCallbacks, counter.Inc)

info.publishEvent(eventType, errorMessage)

return actionFailed{dirty: true, ErrorType: errorType, errorCount: info.host.Status.ErrorCount}
}

func (r *ReconcileBareMetalHost) credentialsErrorResult(err error, request reconcile.Request, host *metal3v1alpha1.BareMetalHost) (reconcile.Result, error) {
Expand All @@ -375,15 +389,12 @@ func (r *ReconcileBareMetalHost) credentialsErrorResult(err error, request recon
// at some point in the future.
case *ResolveBMCSecretRefError:
credentialsMissing.Inc()
changed, saveErr := r.setErrorCondition(request, host, metal3v1alpha1.RegistrationError, err.Error())
saveErr := r.setErrorCondition(request, host, metal3v1alpha1.RegistrationError, err.Error())
if saveErr != nil {
return reconcile.Result{Requeue: true}, saveErr
}
if changed {
// Only publish the event if we do not have an error
// after saving so that we only publish one time.
r.publishEvent(request, host.NewEvent("BMCCredentialError", err.Error()))
}
r.publishEvent(request, host.NewEvent("BMCCredentialError", err.Error()))

return reconcile.Result{Requeue: true, RequeueAfter: hostErrorRetryDelay}, nil
// If a managed Host is missing a BMC address or secret, or
// we have found the secret but it is missing the required fields,
Expand All @@ -394,7 +405,7 @@ func (r *ReconcileBareMetalHost) credentialsErrorResult(err error, request recon
case *EmptyBMCAddressError, *EmptyBMCSecretError,
*bmc.CredentialsValidationError, *bmc.UnknownBMCTypeError:
credentialsInvalid.Inc()
_, saveErr := r.setErrorCondition(request, host, metal3v1alpha1.RegistrationError, err.Error())
saveErr := r.setErrorCondition(request, host, metal3v1alpha1.RegistrationError, err.Error())
if saveErr != nil {
return reconcile.Result{Requeue: true}, saveErr
}
Expand Down Expand Up @@ -877,20 +888,19 @@ func (r *ReconcileBareMetalHost) getHostStatusFromAnnotation(host *metal3v1alpha
return objStatus, nil
}

func (r *ReconcileBareMetalHost) setErrorCondition(request reconcile.Request, host *metal3v1alpha1.BareMetalHost, errType metal3v1alpha1.ErrorType, message string) (changed bool, err error) {
func (r *ReconcileBareMetalHost) setErrorCondition(request reconcile.Request, host *metal3v1alpha1.BareMetalHost, errType metal3v1alpha1.ErrorType, message string) (err error) {
reqLogger := log.WithValues("Request.Namespace",
request.Namespace, "Request.Name", request.Name)

changed = host.SetErrorMessage(errType, message)
if changed {
reqLogger.Info(
"adding error message",
"message", message,
)
err = r.saveHostStatus(host)
if err != nil {
err = errors.Wrap(err, "failed to update error message")
}
host.SetErrorMessage(errType, message)

reqLogger.Info(
"adding error message",
"message", message,
)
err = r.saveHostStatus(host)
if err != nil {
err = errors.Wrap(err, "failed to update error message")
}

return
Expand Down
Loading