diff --git a/deploy/crds/metal3.io_baremetalhosts_crd.yaml b/deploy/crds/metal3.io_baremetalhosts_crd.yaml index 90c03b5b83..a7a785462c 100644 --- a/deploy/crds/metal3.io_baremetalhosts_crd.yaml +++ b/deploy/crds/metal3.io_baremetalhosts_crd.yaml @@ -303,6 +303,10 @@ spec: status: description: BareMetalHostStatus defines the observed state of BareMetalHost properties: + errorCount: + description: ErrorCount records how many times the host has encoutered + an error since the last successful operation + type: integer errorMessage: description: the last error message reported by the provisioning subsystem type: string @@ -702,6 +706,7 @@ spec: type: string type: object required: + - errorCount - errorMessage - hardwareProfile - operationHistory diff --git a/pkg/apis/metal3/v1alpha1/baremetalhost_types.go b/pkg/apis/metal3/v1alpha1/baremetalhost_types.go index b83a679dce..682c065a43 100644 --- a/pkg/apis/metal3/v1alpha1/baremetalhost_types.go +++ b/pkg/apis/metal3/v1alpha1/baremetalhost_types.go @@ -545,6 +545,9 @@ type BareMetalHostStatus struct { // OperationHistory holds information about operations performed // on this host. OperationHistory OperationHistory `json:"operationHistory"` + + // ErrorCount records how many times the host has encoutered an error since the last successful operation + ErrorCount int `json:"errorCount"` } // ProvisionStatus holds the state information for a single target. @@ -611,22 +614,12 @@ func (host *BareMetalHost) Available() bool { } // SetErrorMessage updates the ErrorMessage in the host Status struct -// when necessary and returns true when a change is made or false when -// no change is made. -func (host *BareMetalHost) SetErrorMessage(errType ErrorType, message string) (dirty bool) { - if host.Status.OperationalStatus != OperationalStatusError { - host.Status.OperationalStatus = OperationalStatusError - dirty = true - } - if host.Status.ErrorType != errType { - host.Status.ErrorType = errType - dirty = true - } - if host.Status.ErrorMessage != message { - host.Status.ErrorMessage = message - dirty = true - } - return dirty +// and increases the ErrorCount +func (host *BareMetalHost) SetErrorMessage(errType ErrorType, message string) { + host.Status.OperationalStatus = OperationalStatusError + host.Status.ErrorType = errType + host.Status.ErrorMessage = message + host.Status.ErrorCount++ } // ClearError removes any existing error message. @@ -641,6 +634,10 @@ func (host *BareMetalHost) ClearError() (dirty bool) { host.Status.ErrorMessage = "" dirty = true } + if host.Status.ErrorCount != 0 { + host.Status.ErrorCount = 0 + dirty = true + } return dirty } diff --git a/pkg/apis/metal3/v1alpha1/baremetalhost_types_test.go b/pkg/apis/metal3/v1alpha1/baremetalhost_types_test.go index c1957fbf30..dddb47b607 100644 --- a/pkg/apis/metal3/v1alpha1/baremetalhost_types_test.go +++ b/pkg/apis/metal3/v1alpha1/baremetalhost_types_test.go @@ -554,3 +554,39 @@ func TestBootMode(t *testing.T) { }) } } + +func TestErrorCountIncrementsAlways(t *testing.T) { + + b := &BareMetalHost{} + assert.Equal(t, b.Status.ErrorCount, 0) + + b.SetErrorMessage(RegistrationError, "An error message") + assert.Equal(t, b.Status.ErrorCount, 1) + + b.SetErrorMessage(InspectionError, "Another error message") + assert.Equal(t, b.Status.ErrorCount, 2) +} + +func TestClearErrorCount(t *testing.T) { + + b := &BareMetalHost{ + Status: BareMetalHostStatus{ + ErrorCount: 5, + }, + } + + assert.True(t, b.ClearError()) + assert.Equal(t, 0, b.Status.ErrorCount) +} + +func TestClearErrorCountOnlyIfNotZero(t *testing.T) { + + b := &BareMetalHost{ + Status: BareMetalHostStatus{ + ErrorCount: 5, + }, + } + + assert.True(t, b.ClearError()) + assert.False(t, b.ClearError()) +} diff --git a/pkg/controller/baremetalhost/action_result.go b/pkg/controller/baremetalhost/action_result.go index 6466a4658b..a60441b9e2 100644 --- a/pkg/controller/baremetalhost/action_result.go +++ b/pkg/controller/baremetalhost/action_result.go @@ -1,12 +1,22 @@ package baremetalhost import ( + "math" + "math/rand" + metal3 "github.com/metal3-io/baremetal-operator/pkg/apis/metal3/v1alpha1" - "sigs.k8s.io/controller-runtime/pkg/reconcile" "time" + + "sigs.k8s.io/controller-runtime/pkg/reconcile" ) +const maxBackOffCount = 10 + +func init() { + rand.Seed(time.Now().UTC().UnixNano()) +} + // actionResult is an interface that encapsulates the result of a Reconcile // call, as returned by the action corresponding to the current state. type actionResult interface { @@ -90,11 +100,26 @@ func (r actionError) Dirty() bool { // actionFailed is a result indicating that the current action has failed, // and that the resource should be marked as in error. type actionFailed struct { - dirty bool - ErrorType metal3.ErrorType + dirty bool + ErrorType metal3.ErrorType + errorCount int +} + +func calculateBackoff(errorCount int) time.Duration { + + if errorCount > maxBackOffCount { + errorCount = maxBackOffCount + } + + base := math.Exp2(float64(errorCount)) + /* #nosec */ + backOff := base - (rand.Float64() * base * 0.5) + backOffDuration := time.Minute * time.Duration(backOff) + return backOffDuration } func (r actionFailed) Result() (result reconcile.Result, err error) { + result.RequeueAfter = calculateBackoff(r.errorCount) return } diff --git a/pkg/controller/baremetalhost/action_result_test.go b/pkg/controller/baremetalhost/action_result_test.go new file mode 100644 index 0000000000..036ce646a1 --- /dev/null +++ b/pkg/controller/baremetalhost/action_result_test.go @@ -0,0 +1,30 @@ +package baremetalhost + +import ( + "math" + "testing" + "time" + + "github.com/stretchr/testify/assert" +) + +func TestBackoffIncrements(t *testing.T) { + + var backOff time.Duration + for i := 0; i < maxBackOffCount; i++ { + prev := backOff + backOff = calculateBackoff(i) + + assert.GreaterOrEqual(t, backOff.Milliseconds(), prev.Milliseconds()) + } + +} + +func TestMaxBackoffDuration(t *testing.T) { + + maxBackOffDuration := (time.Minute * time.Duration(math.Exp2(float64(maxBackOffCount)))).Milliseconds() + + assert.LessOrEqual(t, calculateBackoff(maxBackOffCount-1).Milliseconds(), maxBackOffDuration) + assert.LessOrEqual(t, calculateBackoff(maxBackOffCount+1).Milliseconds(), maxBackOffDuration) + assert.LessOrEqual(t, calculateBackoff(maxBackOffCount+100).Milliseconds(), maxBackOffDuration) +} diff --git a/pkg/controller/baremetalhost/baremetalhost_controller.go b/pkg/controller/baremetalhost/baremetalhost_controller.go index 69ab7e2fc7..58647c22dd 100644 --- a/pkg/controller/baremetalhost/baremetalhost_controller.go +++ b/pkg/controller/baremetalhost/baremetalhost_controller.go @@ -32,9 +32,11 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/controller" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" + "sigs.k8s.io/controller-runtime/pkg/event" "sigs.k8s.io/controller-runtime/pkg/handler" logf "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/controller-runtime/pkg/manager" + "sigs.k8s.io/controller-runtime/pkg/predicate" "sigs.k8s.io/controller-runtime/pkg/reconcile" "sigs.k8s.io/controller-runtime/pkg/source" ) @@ -115,7 +117,18 @@ func add(mgr manager.Manager, r reconcile.Reconciler) error { // Watch for changes to primary resource BareMetalHost err = c.Watch(&source.Kind{Type: &metal3v1alpha1.BareMetalHost{}}, - &handler.EnqueueRequestForObject{}) + &handler.EnqueueRequestForObject{}, predicate.Funcs{ + UpdateFunc: func(e event.UpdateEvent) bool { + oldHost := e.ObjectOld.(*metal3v1alpha1.BareMetalHost) + newHost := e.ObjectNew.(*metal3v1alpha1.BareMetalHost) + + if oldHost.Status.ErrorCount != newHost.Status.ErrorCount { + //skip reconcile loop + return false + } + return true + }, + }) if err != nil { return err } @@ -351,21 +364,22 @@ func logResult(info *reconcileInfo, result reconcile.Result) { } func recordActionFailure(info *reconcileInfo, errorType metal3v1alpha1.ErrorType, errorMessage string) actionFailed { - dirty := info.host.SetErrorMessage(errorType, errorMessage) - if dirty { - eventType := map[metal3v1alpha1.ErrorType]string{ - metal3v1alpha1.RegistrationError: "RegistrationError", - metal3v1alpha1.InspectionError: "InspectionError", - metal3v1alpha1.ProvisioningError: "ProvisioningError", - metal3v1alpha1.PowerManagementError: "PowerManagementError", - }[errorType] - counter := actionFailureCounters.WithLabelValues(eventType) - info.postSaveCallbacks = append(info.postSaveCallbacks, counter.Inc) + info.host.SetErrorMessage(errorType, errorMessage) - info.publishEvent(eventType, errorMessage) - } - return actionFailed{dirty: dirty, ErrorType: errorType} + eventType := map[metal3v1alpha1.ErrorType]string{ + metal3v1alpha1.RegistrationError: "RegistrationError", + metal3v1alpha1.InspectionError: "InspectionError", + metal3v1alpha1.ProvisioningError: "ProvisioningError", + metal3v1alpha1.PowerManagementError: "PowerManagementError", + }[errorType] + + counter := actionFailureCounters.WithLabelValues(eventType) + info.postSaveCallbacks = append(info.postSaveCallbacks, counter.Inc) + + info.publishEvent(eventType, errorMessage) + + return actionFailed{dirty: true, ErrorType: errorType, errorCount: info.host.Status.ErrorCount} } func (r *ReconcileBareMetalHost) credentialsErrorResult(err error, request reconcile.Request, host *metal3v1alpha1.BareMetalHost) (reconcile.Result, error) { @@ -375,15 +389,12 @@ func (r *ReconcileBareMetalHost) credentialsErrorResult(err error, request recon // at some point in the future. case *ResolveBMCSecretRefError: credentialsMissing.Inc() - changed, saveErr := r.setErrorCondition(request, host, metal3v1alpha1.RegistrationError, err.Error()) + saveErr := r.setErrorCondition(request, host, metal3v1alpha1.RegistrationError, err.Error()) if saveErr != nil { return reconcile.Result{Requeue: true}, saveErr } - if changed { - // Only publish the event if we do not have an error - // after saving so that we only publish one time. - r.publishEvent(request, host.NewEvent("BMCCredentialError", err.Error())) - } + r.publishEvent(request, host.NewEvent("BMCCredentialError", err.Error())) + return reconcile.Result{Requeue: true, RequeueAfter: hostErrorRetryDelay}, nil // If a managed Host is missing a BMC address or secret, or // we have found the secret but it is missing the required fields, @@ -394,7 +405,7 @@ func (r *ReconcileBareMetalHost) credentialsErrorResult(err error, request recon case *EmptyBMCAddressError, *EmptyBMCSecretError, *bmc.CredentialsValidationError, *bmc.UnknownBMCTypeError: credentialsInvalid.Inc() - _, saveErr := r.setErrorCondition(request, host, metal3v1alpha1.RegistrationError, err.Error()) + saveErr := r.setErrorCondition(request, host, metal3v1alpha1.RegistrationError, err.Error()) if saveErr != nil { return reconcile.Result{Requeue: true}, saveErr } @@ -877,20 +888,19 @@ func (r *ReconcileBareMetalHost) getHostStatusFromAnnotation(host *metal3v1alpha return objStatus, nil } -func (r *ReconcileBareMetalHost) setErrorCondition(request reconcile.Request, host *metal3v1alpha1.BareMetalHost, errType metal3v1alpha1.ErrorType, message string) (changed bool, err error) { +func (r *ReconcileBareMetalHost) setErrorCondition(request reconcile.Request, host *metal3v1alpha1.BareMetalHost, errType metal3v1alpha1.ErrorType, message string) (err error) { reqLogger := log.WithValues("Request.Namespace", request.Namespace, "Request.Name", request.Name) - changed = host.SetErrorMessage(errType, message) - if changed { - reqLogger.Info( - "adding error message", - "message", message, - ) - err = r.saveHostStatus(host) - if err != nil { - err = errors.Wrap(err, "failed to update error message") - } + host.SetErrorMessage(errType, message) + + reqLogger.Info( + "adding error message", + "message", message, + ) + err = r.saveHostStatus(host) + if err != nil { + err = errors.Wrap(err, "failed to update error message") } return diff --git a/pkg/controller/baremetalhost/host_state_machine_test.go b/pkg/controller/baremetalhost/host_state_machine_test.go index 98da856ff7..1883292098 100644 --- a/pkg/controller/baremetalhost/host_state_machine_test.go +++ b/pkg/controller/baremetalhost/host_state_machine_test.go @@ -6,7 +6,14 @@ import ( metal3v1alpha1 "github.com/metal3-io/baremetal-operator/pkg/apis/metal3/v1alpha1" "github.com/metal3-io/baremetal-operator/pkg/bmc" + "github.com/stretchr/testify/assert" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + "github.com/metal3-io/baremetal-operator/pkg/apis/metal3/v1alpha1" + "github.com/metal3-io/baremetal-operator/pkg/provisioner" + corev1 "k8s.io/api/core/v1" + logf "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/reconcile" ) func testStateMachine(host *metal3v1alpha1.BareMetalHost) *hostStateMachine { @@ -167,3 +174,213 @@ func TestProvisioningCancelled(t *testing.T) { }) } } + +func TestErrorCountIncreasedWhenProvisionerFails(t *testing.T) { + + tests := []struct { + Scenario string + Host *metal3v1alpha1.BareMetalHost + }{ + { + Scenario: "registering", + Host: host(metal3v1alpha1.StateRegistering).build(), + }, + { + Scenario: "inspecting", + Host: host(metal3v1alpha1.StateInspecting).build(), + }, + { + Scenario: "ready", + Host: host(metal3v1alpha1.StateReady).build(), + }, + { + Scenario: "deprovisioning", + Host: host(metal3v1alpha1.StateDeprovisioning).build(), + }, + { + Scenario: "provisioning", + Host: host(metal3v1alpha1.StateProvisioning).SetImageURL("imageSpecUrl").build(), + }, + { + Scenario: "externallyProvisioned", + Host: host(metal3v1alpha1.StateExternallyProvisioned).SetExternallyProvisioned().build(), + }, + } + for _, tt := range tests { + t.Run(tt.Scenario, func(t *testing.T) { + prov := &mockProvisioner{} + hsm := newHostStateMachine(tt.Host, &ReconcileBareMetalHost{}, prov) + info := makeDefaultReconcileInfo(tt.Host) + + prov.setNextError("some error") + result := hsm.ReconcileState(info) + + assert.Greater(t, tt.Host.Status.ErrorCount, 0) + assert.True(t, result.Dirty()) + }) + } +} + +func TestErrorCountCleared(t *testing.T) { + + tests := []struct { + Scenario string + Host *metal3v1alpha1.BareMetalHost + }{ + { + Scenario: "registering", + Host: host(metal3v1alpha1.StateRegistering).build(), + }, + { + Scenario: "inspecting", + Host: host(metal3v1alpha1.StateInspecting).build(), + }, + { + Scenario: "ready", + Host: host(metal3v1alpha1.StateReady).build(), + }, + { + Scenario: "deprovisioning", + Host: host(metal3v1alpha1.StateDeprovisioning).build(), + }, + { + Scenario: "provisioning", + Host: host(metal3v1alpha1.StateProvisioning).SetImageURL("imageSpecUrl").build(), + }, + { + Scenario: "externallyProvisioned", + Host: host(metal3v1alpha1.StateExternallyProvisioned).SetExternallyProvisioned().build(), + }, + } + for _, tt := range tests { + t.Run(tt.Scenario, func(t *testing.T) { + prov := &mockProvisioner{} + hsm := newHostStateMachine(tt.Host, &ReconcileBareMetalHost{}, prov) + info := makeDefaultReconcileInfo(tt.Host) + + info.host.Status.ErrorCount = 1 + prov.setNextResult(true) + result := hsm.ReconcileState(info) + + assert.Equal(t, tt.Host.Status.ErrorCount, 0) + assert.True(t, result.Dirty()) + }) + } +} + +type hostBuilder struct { + metal3v1alpha1.BareMetalHost +} + +func host(state metal3v1alpha1.ProvisioningState) *hostBuilder { + return &hostBuilder{ + metal3v1alpha1.BareMetalHost{ + Status: metal3v1alpha1.BareMetalHostStatus{ + Provisioning: metal3v1alpha1.ProvisionStatus{ + State: state, + BootMode: v1alpha1.DefaultBootMode, + }, + GoodCredentials: metal3v1alpha1.CredentialsStatus{ + Reference: &corev1.SecretReference{ + Name: "secretRefName", + Namespace: "secretNs", + }, + Version: "100", + }, + }, + }, + } +} + +func (hb *hostBuilder) build() *metal3v1alpha1.BareMetalHost { + return &hb.BareMetalHost +} + +func (hb *hostBuilder) SetTriedCredentials() *hostBuilder { + hb.Status.TriedCredentials = hb.Status.GoodCredentials + return hb +} + +func (hb *hostBuilder) SetExternallyProvisioned() *hostBuilder { + hb.Spec.ExternallyProvisioned = true + return hb +} + +func (hb *hostBuilder) SetImageURL(url string) *hostBuilder { + hb.Spec.Image = &metal3v1alpha1.Image{ + URL: url, + } + return hb +} + +func makeDefaultReconcileInfo(host *metal3v1alpha1.BareMetalHost) *reconcileInfo { + return &reconcileInfo{ + log: logf.Log.WithName("test"), + host: host, + request: reconcile.Request{}, + bmcCredsSecret: &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Name: host.Status.GoodCredentials.Reference.Name, + Namespace: host.Status.GoodCredentials.Reference.Namespace, + ResourceVersion: host.Status.GoodCredentials.Version, + }, + }, + } +} + +type mockProvisioner struct { + nextResult provisioner.Result +} + +func (m *mockProvisioner) setNextError(msg string) { + m.nextResult = provisioner.Result{ + ErrorMessage: msg, + } +} + +func (m *mockProvisioner) setNextResult(dirty bool) { + m.nextResult = provisioner.Result{ + Dirty: dirty, + ErrorMessage: "", + } +} + +func (m *mockProvisioner) ValidateManagementAccess(credentialsChanged bool) (result provisioner.Result, err error) { + return m.nextResult, err +} + +func (m *mockProvisioner) InspectHardware() (result provisioner.Result, details *metal3v1alpha1.HardwareDetails, err error) { + return m.nextResult, details, err +} + +func (m *mockProvisioner) UpdateHardwareState() (result provisioner.Result, err error) { + return m.nextResult, err +} + +func (m *mockProvisioner) Adopt() (result provisioner.Result, err error) { + return m.nextResult, err +} + +func (m *mockProvisioner) Provision(configData provisioner.HostConfigData) (result provisioner.Result, err error) { + return m.nextResult, err +} + +func (m *mockProvisioner) Deprovision() (result provisioner.Result, err error) { + return m.nextResult, err +} + +func (m *mockProvisioner) Delete() (result provisioner.Result, err error) { + return m.nextResult, err +} + +func (m *mockProvisioner) PowerOn() (result provisioner.Result, err error) { + return m.nextResult, err +} + +func (m *mockProvisioner) PowerOff() (result provisioner.Result, err error) { + return m.nextResult, err +} + +func (m *mockProvisioner) IsReady() (result bool, err error) { + return +}