diff --git a/apis/metal3.io/v1alpha1/baremetalhost_types.go b/apis/metal3.io/v1alpha1/baremetalhost_types.go index 74fbd55b1f..7ac672efbe 100644 --- a/apis/metal3.io/v1alpha1/baremetalhost_types.go +++ b/apis/metal3.io/v1alpha1/baremetalhost_types.go @@ -123,6 +123,9 @@ const ( type ErrorType string const ( + // ProvisionedRegistrationError is an error condition occurring when the controller + // is unable to re-register an already provisioned host. + ProvisionedRegistrationError ErrorType = "provisioned registration error" // RegistrationError is an error condition occurring when the // controller is unable to connect to the Host's baseboard management // controller. @@ -521,7 +524,7 @@ type BareMetalHostStatus struct { // ErrorType indicates the type of failure encountered when the // OperationalStatus is OperationalStatusError - // +kubebuilder:validation:Enum=registration error;inspection error;provisioning error;power management error + // +kubebuilder:validation:Enum=provisioned registration error;registration error;inspection error;provisioning error;power management error ErrorType ErrorType `json:"errorType,omitempty"` // LastUpdated identifies when this status was last observed. @@ -608,48 +611,6 @@ func (host *BareMetalHost) BootMode() BootMode { return mode } -// Available returns true if the host is available to be provisioned. -func (host *BareMetalHost) Available() bool { - if host.Spec.ConsumerRef != nil { - return false - } - if host.GetDeletionTimestamp() != nil { - return false - } - if host.HasError() { - return false - } - return true -} - -// SetErrorMessage updates the ErrorMessage in the host Status struct -// and increases the ErrorCount -func (host *BareMetalHost) SetErrorMessage(errType ErrorType, message string) { - host.Status.OperationalStatus = OperationalStatusError - host.Status.ErrorType = errType - host.Status.ErrorMessage = message - host.Status.ErrorCount++ -} - -// ClearError removes any existing error message. -func (host *BareMetalHost) ClearError() (dirty bool) { - dirty = host.SetOperationalStatus(OperationalStatusOK) - var emptyErrType ErrorType = "" - if host.Status.ErrorType != emptyErrType { - host.Status.ErrorType = emptyErrType - dirty = true - } - if host.Status.ErrorMessage != "" { - host.Status.ErrorMessage = "" - dirty = true - } - if host.Status.ErrorCount != 0 { - host.Status.ErrorCount = 0 - dirty = true - } - return dirty -} - // setLabel updates the given label when necessary and returns true // when a change is made or false when no change is made. func (host *BareMetalHost) setLabel(name, value string) bool { @@ -713,12 +674,6 @@ func (host *BareMetalHost) OperationalStatus() OperationalStatus { return host.Status.OperationalStatus } -// HasError returns a boolean indicating whether there is an error -// set for the host. -func (host *BareMetalHost) HasError() bool { - return host.Status.ErrorMessage != "" -} - // CredentialsKey returns a NamespacedName suitable for loading the // Secret containing the credentials associated with the host. func (host *BareMetalHost) CredentialsKey() types.NamespacedName { diff --git a/apis/metal3.io/v1alpha1/baremetalhost_types_test.go b/apis/metal3.io/v1alpha1/baremetalhost_types_test.go index dddb47b607..5a005d5d8b 100644 --- a/apis/metal3.io/v1alpha1/baremetalhost_types_test.go +++ b/apis/metal3.io/v1alpha1/baremetalhost_types_test.go @@ -2,7 +2,6 @@ package v1alpha1 import ( "testing" - "time" "github.com/stretchr/testify/assert" @@ -10,71 +9,6 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) -func TestHostAvailable(t *testing.T) { - hostWithError := BareMetalHost{ - ObjectMeta: metav1.ObjectMeta{ - Name: "myhost", - Namespace: "myns", - }, - } - hostWithError.SetErrorMessage(RegistrationError, "oops something went wrong") - - testCases := []struct { - Host BareMetalHost - Expected bool - FailMessage string - }{ - { - Host: BareMetalHost{ - ObjectMeta: metav1.ObjectMeta{ - Name: "myhost", - Namespace: "myns", - }, - }, - Expected: true, - FailMessage: "available host returned not available", - }, - { - Host: hostWithError, - Expected: false, - FailMessage: "host with error returned as available", - }, - { - Host: BareMetalHost{ - ObjectMeta: metav1.ObjectMeta{ - Name: "myhost", - Namespace: "myns", - }, - Spec: BareMetalHostSpec{ - ConsumerRef: &corev1.ObjectReference{ - Name: "mymachine", - Namespace: "myns", - }, - }, - }, - Expected: false, - FailMessage: "host with consumerref returned as available", - }, - { - Host: BareMetalHost{ - ObjectMeta: metav1.ObjectMeta{ - Name: "myhost", - Namespace: "myns", - DeletionTimestamp: &metav1.Time{Time: time.Now()}, - }, - }, - Expected: false, - FailMessage: "deleted host returned as available", - }, - } - - for _, tc := range testCases { - if tc.Host.Available() != tc.Expected { - t.Error(tc.FailMessage) - } - } -} - func TestHostNeedsHardwareInspection(t *testing.T) { testCases := []struct { @@ -554,39 +488,3 @@ func TestBootMode(t *testing.T) { }) } } - -func TestErrorCountIncrementsAlways(t *testing.T) { - - b := &BareMetalHost{} - assert.Equal(t, b.Status.ErrorCount, 0) - - b.SetErrorMessage(RegistrationError, "An error message") - assert.Equal(t, b.Status.ErrorCount, 1) - - b.SetErrorMessage(InspectionError, "Another error message") - assert.Equal(t, b.Status.ErrorCount, 2) -} - -func TestClearErrorCount(t *testing.T) { - - b := &BareMetalHost{ - Status: BareMetalHostStatus{ - ErrorCount: 5, - }, - } - - assert.True(t, b.ClearError()) - assert.Equal(t, 0, b.Status.ErrorCount) -} - -func TestClearErrorCountOnlyIfNotZero(t *testing.T) { - - b := &BareMetalHost{ - Status: BareMetalHostStatus{ - ErrorCount: 5, - }, - } - - assert.True(t, b.ClearError()) - assert.False(t, b.ClearError()) -} diff --git a/config/crd/bases/metal3.io_baremetalhosts.yaml b/config/crd/bases/metal3.io_baremetalhosts.yaml index 767b31856a..fe52ca031c 100644 --- a/config/crd/bases/metal3.io_baremetalhosts.yaml +++ b/config/crd/bases/metal3.io_baremetalhosts.yaml @@ -262,6 +262,7 @@ spec: errorType: description: ErrorType indicates the type of failure encountered when the OperationalStatus is OperationalStatusError enum: + - provisioned registration error - registration error - inspection error - provisioning error diff --git a/config/render/capm3.yaml b/config/render/capm3.yaml index e99fc20eca..d10547a31c 100644 --- a/config/render/capm3.yaml +++ b/config/render/capm3.yaml @@ -260,6 +260,7 @@ spec: errorType: description: ErrorType indicates the type of failure encountered when the OperationalStatus is OperationalStatusError enum: + - provisioned registration error - registration error - inspection error - provisioning error diff --git a/controllers/metal3.io/action_result.go b/controllers/metal3.io/action_result.go index 5f3a9717e3..a746f05c98 100644 --- a/controllers/metal3.io/action_result.go +++ b/controllers/metal3.io/action_result.go @@ -25,17 +25,6 @@ type actionResult interface { Dirty() bool } -// actionContinueNoWrite is a result indicating that the current action is still -// in progress, and that the resource should remain in the same provisioning -// state without writing the status -type actionContinueNoWrite struct { - actionContinue -} - -func (r actionContinueNoWrite) Dirty() bool { - return false -} - // actionContinue is a result indicating that the current action is still // in progress, and that the resource should remain in the same provisioning // state. @@ -51,6 +40,17 @@ func (r actionContinue) Result() (result reconcile.Result, err error) { } func (r actionContinue) Dirty() bool { + return false +} + +// actionUpdate is a result indicating that the current action is still +// in progress, and that the resource should remain in the same provisioning +// state but write new Status data. +type actionUpdate struct { + actionContinue +} + +func (r actionUpdate) Dirty() bool { return true } diff --git a/controllers/metal3.io/baremetalhost_controller.go b/controllers/metal3.io/baremetalhost_controller.go index dfc243609a..d5967ce282 100644 --- a/controllers/metal3.io/baremetalhost_controller.go +++ b/controllers/metal3.io/baremetalhost_controller.go @@ -20,6 +20,7 @@ import ( "encoding/json" "fmt" "os" + "reflect" "strconv" "strings" "time" @@ -207,7 +208,7 @@ func (r *BareMetalHostReconciler) Reconcile(request ctrl.Request) (result ctrl.R request: request, bmcCredsSecret: bmcCredsSecret, } - prov, err := r.ProvisionerFactory(host, *bmcCreds, info.publishEvent) + prov, err := r.ProvisionerFactory(*host, *bmcCreds, info.publishEvent) if err != nil { return ctrl.Result{}, errors.Wrap(err, "failed to create provisioner") } @@ -276,13 +277,14 @@ func logResult(info *reconcileInfo, result ctrl.Result) { func recordActionFailure(info *reconcileInfo, errorType metal3v1alpha1.ErrorType, errorMessage string) actionFailed { - info.host.SetErrorMessage(errorType, errorMessage) + setErrorMessage(info.host, errorType, errorMessage) eventType := map[metal3v1alpha1.ErrorType]string{ - metal3v1alpha1.RegistrationError: "RegistrationError", - metal3v1alpha1.InspectionError: "InspectionError", - metal3v1alpha1.ProvisioningError: "ProvisioningError", - metal3v1alpha1.PowerManagementError: "PowerManagementError", + metal3v1alpha1.ProvisionedRegistrationError: "ProvisionedRegistrationError", + metal3v1alpha1.RegistrationError: "RegistrationError", + metal3v1alpha1.InspectionError: "InspectionError", + metal3v1alpha1.ProvisioningError: "ProvisioningError", + metal3v1alpha1.PowerManagementError: "PowerManagementError", }[errorType] counter := actionFailureCounters.WithLabelValues(eventType) @@ -357,6 +359,30 @@ func clearRebootAnnotations(host *metal3v1alpha1.BareMetalHost) (dirty bool) { return } +// clearError removes any existing error message. +func clearError(host *metal3v1alpha1.BareMetalHost) (dirty bool) { + dirty = host.SetOperationalStatus(metal3v1alpha1.OperationalStatusOK) + var emptyErrType metal3v1alpha1.ErrorType = "" + if host.Status.ErrorType != emptyErrType { + host.Status.ErrorType = emptyErrType + dirty = true + } + if host.Status.ErrorMessage != "" { + host.Status.ErrorMessage = "" + dirty = true + } + return dirty +} + +// setErrorMessage updates the ErrorMessage in the host Status struct +// and increases the ErrorCount +func setErrorMessage(host *metal3v1alpha1.BareMetalHost, errType metal3v1alpha1.ErrorType, message string) { + host.Status.OperationalStatus = metal3v1alpha1.OperationalStatusError + host.Status.ErrorType = errType + host.Status.ErrorMessage = message + host.Status.ErrorCount++ +} + // Manage deletion of the host func (r *BareMetalHostReconciler) actionDeleting(prov provisioner.Provisioner, info *reconcileInfo) actionResult { info.log.Info( @@ -394,60 +420,80 @@ func (r *BareMetalHostReconciler) actionUnmanaged(prov provisioner.Provisioner, if info.host.HasBMCDetails() { return actionComplete{} } - return actionContinueNoWrite{actionContinue{unmanagedRetryDelay}} + return actionContinue{unmanagedRetryDelay} } // Test the credentials by connecting to the management controller. -func (r *BareMetalHostReconciler) actionRegistering(prov provisioner.Provisioner, info *reconcileInfo) actionResult { +func (r *BareMetalHostReconciler) registerHost(prov provisioner.Provisioner, info *reconcileInfo) actionResult { info.log.Info("registering and validating access to management controller", "credentials", info.host.Status.TriedCredentials) + dirty := false credsChanged := !info.host.Status.TriedCredentials.Match(*info.bmcCredsSecret) if credsChanged { info.log.Info("new credentials") info.host.UpdateTriedCredentials(*info.bmcCredsSecret) info.postSaveCallbacks = append(info.postSaveCallbacks, updatedCredentials.Inc) + dirty = true } - provResult, err := prov.ValidateManagementAccess(credsChanged) + provResult, provID, err := prov.ValidateManagementAccess(credsChanged, info.host.Status.ErrorType == metal3v1alpha1.RegistrationError) if err != nil { noManagementAccess.Inc() return actionError{errors.Wrap(err, "failed to validate BMC access")} } - info.log.Info("response from validate", "provResult", provResult) - if provResult.ErrorMessage != "" { return recordActionFailure(info, metal3v1alpha1.RegistrationError, provResult.ErrorMessage) } + if provID != "" && info.host.Status.Provisioning.ID != provID { + info.log.Info("setting provisioning id", "ID", provID) + info.host.Status.Provisioning.ID = provID + dirty = true + } + if provResult.Dirty { info.log.Info("host not ready", "wait", provResult.RequeueAfter) - info.host.ClearError() - return actionContinue{provResult.RequeueAfter} + result := actionContinue{provResult.RequeueAfter} + if clearError(info.host) { + dirty = true + } + if dirty { + return actionUpdate{result} + } + return result } // Reaching this point means the credentials are valid and worked, // so clear any previous error and record the success in the // status block. - info.log.Info("updating credentials success status fields") registeredNewCreds := !info.host.Status.GoodCredentials.Match(*info.bmcCredsSecret) - info.host.UpdateGoodCredentials(*info.bmcCredsSecret) - info.log.Info("clearing previous error message") - info.host.ClearError() - if registeredNewCreds { + info.log.Info("updating credentials success status fields") + info.host.UpdateGoodCredentials(*info.bmcCredsSecret) info.publishEvent("BMCAccessValidated", "Verified access to BMC") + dirty = true + } else { + info.log.Info("verified access to the BMC") } - return actionComplete{} + if info.host.Status.ErrorType == metal3v1alpha1.RegistrationError || registeredNewCreds { + info.log.Info("clearing previous error message") + dirty = clearError(info.host) + } + + if dirty { + return actionComplete{} + } + return nil } // Ensure we have the information about the hardware on the host. func (r *BareMetalHostReconciler) actionInspecting(prov provisioner.Provisioner, info *reconcileInfo) actionResult { info.log.Info("inspecting hardware") - provResult, details, err := prov.InspectHardware() + provResult, details, err := prov.InspectHardware(info.host.Status.ErrorType == metal3v1alpha1.InspectionError) if err != nil { return actionError{errors.Wrap(err, "hardware inspection failed")} } @@ -456,12 +502,15 @@ func (r *BareMetalHostReconciler) actionInspecting(prov provisioner.Provisioner, return recordActionFailure(info, metal3v1alpha1.InspectionError, provResult.ErrorMessage) } - info.host.ClearError() - if provResult.Dirty || details == nil { - return actionContinue{provResult.RequeueAfter} + result := actionContinue{provResult.RequeueAfter} + if clearError(info.host) { + return actionUpdate{result} + } + return result } + clearError(info.host) info.host.Status.HardwareDetails = details return actionComplete{} } @@ -506,7 +555,7 @@ func (r *BareMetalHostReconciler) actionMatchProfile(prov provisioner.Provisione info.publishEvent("ProfileSet", fmt.Sprintf("Hardware profile set: %s", hardwareProfile)) } - info.host.ClearError() + clearError(info.host) return actionComplete{} } @@ -523,7 +572,7 @@ func (r *BareMetalHostReconciler) actionProvisioning(prov provisioner.Provisione if err := r.Update(context.TODO(), info.host); err != nil { return actionError{errors.Wrap(err, "failed to remove reboot annotations from host")} } - return actionContinueNoWrite{} + return actionContinue{} } provResult, err := prov.Provision(hostConf) @@ -540,8 +589,11 @@ func (r *BareMetalHostReconciler) actionProvisioning(prov provisioner.Provisione // Go back into the queue and wait for the Provision() method // to return false, indicating that it has no more work to // do. - info.host.ClearError() - return actionContinue{provResult.RequeueAfter} + result := actionContinue{provResult.RequeueAfter} + if clearError(info.host) { + return actionUpdate{result} + } + return result } // If the provisioner had no work, ensure the image settings match. @@ -563,23 +615,28 @@ func clearHostProvisioningSettings(host *metal3v1alpha1.BareMetalHost) { } func (r *BareMetalHostReconciler) actionDeprovisioning(prov provisioner.Provisioner, info *reconcileInfo) actionResult { - // Adopt the host in case it has been re-registered during the - // deprovisioning process before it completed - provResult, err := prov.Adopt(info.host.Status.ErrorType == metal3v1alpha1.RegistrationError) - if err != nil { - return actionError{err} - } - if provResult.ErrorMessage != "" { - return recordActionFailure(info, metal3v1alpha1.RegistrationError, provResult.ErrorMessage) - } - if provResult.Dirty { - info.host.ClearError() - return actionContinue{provResult.RequeueAfter} + if info.host.Status.Provisioning.Image.URL != "" { + // Adopt the host in case it has been re-registered during the + // deprovisioning process before it completed + provResult, err := prov.Adopt(info.host.Status.ErrorType == metal3v1alpha1.ProvisionedRegistrationError) + if err != nil { + return actionError{err} + } + if provResult.ErrorMessage != "" { + return recordActionFailure(info, metal3v1alpha1.ProvisionedRegistrationError, provResult.ErrorMessage) + } + if provResult.Dirty { + result := actionContinue{provResult.RequeueAfter} + if clearError(info.host) { + return actionUpdate{result} + } + return result + } } info.log.Info("deprovisioning") - provResult, err = prov.Deprovision() + provResult, err := prov.Deprovision(info.host.Status.ErrorType == metal3v1alpha1.ProvisioningError) if err != nil { return actionError{errors.Wrap(err, "failed to deprovision")} } @@ -589,15 +646,18 @@ func (r *BareMetalHostReconciler) actionDeprovisioning(prov provisioner.Provisio } if provResult.Dirty { - info.host.ClearError() - return actionContinue{provResult.RequeueAfter} + result := actionContinue{provResult.RequeueAfter} + if clearError(info.host) { + return actionUpdate{result} + } + return result } if clearRebootAnnotations(info.host) { if err = r.Update(context.TODO(), info.host); err != nil { return actionError{errors.Wrap(err, "failed to remove reboot annotations from host")} } - return actionContinueNoWrite{} + return actionContinue{} } // After the provisioner is done, clear the provisioning settings @@ -613,18 +673,16 @@ func (r *BareMetalHostReconciler) manageHostPower(prov provisioner.Provisioner, var provResult provisioner.Result // Check the current status and save it before trying to update it. - provResult, err := prov.UpdateHardwareState() + hwState, err := prov.UpdateHardwareState() if err != nil { return actionError{errors.Wrap(err, "failed to update the host power status")} } - if provResult.ErrorMessage != "" { - return recordActionFailure(info, metal3v1alpha1.PowerManagementError, provResult.ErrorMessage) - } - - if provResult.Dirty { - info.host.ClearError() - return actionContinue{provResult.RequeueAfter} + if hwState.PoweredOn != nil && *hwState.PoweredOn != info.host.Status.PoweredOn { + info.log.Info("updating power status", "discovered", *hwState.PoweredOn) + info.host.Status.PoweredOn = *hwState.PoweredOn + clearError(info.host) + return actionUpdate{} } desiredPowerOnState := info.host.Spec.Online @@ -637,7 +695,7 @@ func (r *BareMetalHostReconciler) manageHostPower(prov provisioner.Provisioner, return actionError{errors.Wrap(err, "failed to remove reboot annotation from host")} } - return actionContinueNoWrite{} + return actionContinue{} } } @@ -652,7 +710,7 @@ func (r *BareMetalHostReconciler) manageHostPower(prov provisioner.Provisioner, // a delay. steadyStateResult := actionContinue{time.Second * 60} if info.host.Status.PoweredOn == desiredPowerOnState { - return actionContinueNoWrite{steadyStateResult} + return steadyStateResult } info.log.Info("power state change needed", @@ -683,15 +741,19 @@ func (r *BareMetalHostReconciler) manageHostPower(prov provisioner.Provisioner, } powerChangeAttempts.With(metricLabels).Inc() }) - info.host.ClearError() - return actionContinue{provResult.RequeueAfter} + result := actionContinue{provResult.RequeueAfter} + if clearError(info.host) { + return actionUpdate{result} + } + return result } // The provisioner did not have to do anything to change the power // state and there were no errors, so reflect the new state in the // host status field. info.host.Status.PoweredOn = info.host.Spec.Online - return steadyStateResult + info.host.Status.ErrorCount = 0 + return actionUpdate{steadyStateResult} } // A host reaching this action handler should be provisioned or externally @@ -699,16 +761,19 @@ func (r *BareMetalHostReconciler) manageHostPower(prov provisioner.Provisioner, // action. We use the Adopt() API to make sure that the provisioner is aware of // the provisioning details. Then we monitor its power status. func (r *BareMetalHostReconciler) actionManageSteadyState(prov provisioner.Provisioner, info *reconcileInfo) actionResult { - provResult, err := prov.Adopt(info.host.Status.ErrorType == metal3v1alpha1.RegistrationError) + provResult, err := prov.Adopt(info.host.Status.ErrorType == metal3v1alpha1.ProvisionedRegistrationError) if err != nil { return actionError{err} } if provResult.ErrorMessage != "" { - return recordActionFailure(info, metal3v1alpha1.RegistrationError, provResult.ErrorMessage) + return recordActionFailure(info, metal3v1alpha1.ProvisionedRegistrationError, provResult.ErrorMessage) } if provResult.Dirty { - info.host.ClearError() - return actionContinue{provResult.RequeueAfter} + result := actionContinue{provResult.RequeueAfter} + if clearError(info.host) { + return actionUpdate{result} + } + return result } return r.manageHostPower(prov, info) @@ -728,7 +793,7 @@ func (r *BareMetalHostReconciler) actionManageReady(prov provisioner.Provisioner if dirty { info.log.Info("updating host provisioning settings") } - info.host.ClearError() + clearError(info.host) return actionComplete{} } return r.manageHostPower(prov, info) @@ -791,7 +856,7 @@ func (r *BareMetalHostReconciler) getHostStatusFromAnnotation(host *metal3v1alph func (r *BareMetalHostReconciler) setErrorCondition(request ctrl.Request, host *metal3v1alpha1.BareMetalHost, errType metal3v1alpha1.ErrorType, message string) (err error) { reqLogger := r.Log.WithValues("baremetalhost", request.NamespacedName) - host.SetErrorMessage(errType, message) + setErrorMessage(host, errType, message) reqLogger.Info( "adding error message", @@ -909,6 +974,30 @@ func hostHasFinalizer(host *metal3v1alpha1.BareMetalHost) bool { return utils.StringInList(host.Finalizers, metal3v1alpha1.BareMetalHostFinalizer) } +func (r *BareMetalHostReconciler) updateEventHandler(e event.UpdateEvent) bool { + _, oldOK := e.ObjectOld.(*metal3v1alpha1.BareMetalHost) + _, newOK := e.ObjectNew.(*metal3v1alpha1.BareMetalHost) + if !(oldOK && newOK) { + // The thing that changed wasn't a host, so we + // need to assume that we must update. This + // happens when, for example, an owned Secret + // changes. + return true + } + + //If the update increased the resource Generation then let's process it + if e.MetaNew.GetGeneration() != e.MetaOld.GetGeneration() { + return true + } + + //Discard updates that did not increase the resource Generation (such as on Status.LastUpdated), except for the finalizers or annotations + if reflect.DeepEqual(e.MetaNew.GetFinalizers(), e.MetaOld.GetFinalizers()) && reflect.DeepEqual(e.MetaNew.GetAnnotations(), e.MetaOld.GetAnnotations()) { + return false + } + + return true +} + // SetupWithManager reigsters the reconciler to be run by the manager func (r *BareMetalHostReconciler) SetupWithManager(mgr ctrl.Manager) error { @@ -936,23 +1025,7 @@ func (r *BareMetalHostReconciler) SetupWithManager(mgr ctrl.Manager) error { For(&metal3v1alpha1.BareMetalHost{}). WithEventFilter( predicate.Funcs{ - UpdateFunc: func(e event.UpdateEvent) bool { - oldHost, oldOK := e.ObjectOld.(*metal3v1alpha1.BareMetalHost) - newHost, newOK := e.ObjectNew.(*metal3v1alpha1.BareMetalHost) - if !(oldOK && newOK) { - // The thing that changed wasn't a host, so we - // need to assume that we must update. This - // happens when, for example, an owned Secret - // changes. - return true - } - - if oldHost.Status.ErrorCount != newHost.Status.ErrorCount { - //skip reconcile loop - return false - } - return true - }, + UpdateFunc: r.updateEventHandler, }). WithOptions(opts). Owns(&corev1.Secret{}). diff --git a/controllers/metal3.io/baremetalhost_controller_test.go b/controllers/metal3.io/baremetalhost_controller_test.go index 25e28b9a22..e00d8e7dc4 100644 --- a/controllers/metal3.io/baremetalhost_controller_test.go +++ b/controllers/metal3.io/baremetalhost_controller_test.go @@ -19,11 +19,10 @@ import ( ctrl "sigs.k8s.io/controller-runtime" fakeclient "sigs.k8s.io/controller-runtime/pkg/client/fake" + "sigs.k8s.io/controller-runtime/pkg/event" "sigs.k8s.io/controller-runtime/pkg/reconcile" metal3v1alpha1 "github.com/metal3-io/baremetal-operator/apis/metal3.io/v1alpha1" - "github.com/metal3-io/baremetal-operator/pkg/bmc" - "github.com/metal3-io/baremetal-operator/pkg/provisioner" "github.com/metal3-io/baremetal-operator/pkg/provisioner/fixture" "github.com/metal3-io/baremetal-operator/pkg/utils" ) @@ -87,7 +86,7 @@ func newDefaultHost(t *testing.T) *metal3v1alpha1.BareMetalHost { return newDefaultNamedHost(t.Name(), t) } -func newTestReconcilerWithProvisionerFactory(factory provisioner.Factory, initObjs ...runtime.Object) *BareMetalHostReconciler { +func newTestReconcilerWithFixture(fix *fixture.Fixture, initObjs ...runtime.Object) *BareMetalHostReconciler { c := fakeclient.NewFakeClient(initObjs...) @@ -98,13 +97,14 @@ func newTestReconcilerWithProvisionerFactory(factory provisioner.Factory, initOb return &BareMetalHostReconciler{ Client: c, Scheme: scheme.Scheme, - ProvisionerFactory: factory, + ProvisionerFactory: fix.New, Log: ctrl.Log.WithName("controllers").WithName("BareMetalHost"), } } func newTestReconciler(initObjs ...runtime.Object) *BareMetalHostReconciler { - return newTestReconcilerWithProvisionerFactory(fixture.New, initObjs...) + fix := fixture.Fixture{} + return newTestReconcilerWithFixture(&fix, initObjs...) } type DoneFunc func(host *metal3v1alpha1.BareMetalHost, result reconcile.Result) bool @@ -168,7 +168,7 @@ func waitForError(t *testing.T, r *BareMetalHostReconciler, host *metal3v1alpha1 tryReconcile(t, r, host, func(host *metal3v1alpha1.BareMetalHost, result reconcile.Result) bool { t.Logf("Waiting for error: %q", host.Status.ErrorMessage) - return host.HasError() + return host.Status.ErrorMessage != "" }, ) } @@ -177,7 +177,7 @@ func waitForNoError(t *testing.T, r *BareMetalHostReconciler, host *metal3v1alph tryReconcile(t, r, host, func(host *metal3v1alpha1.BareMetalHost, result reconcile.Result) bool { t.Logf("Waiting for no error message: %q", host.Status.ErrorMessage) - return !host.HasError() + return host.Status.ErrorMessage == "" }, ) } @@ -1032,12 +1032,12 @@ func TestDeleteHost(t *testing.T) { host.DeletionTimestamp = &now host.Status.Provisioning.ID = "made-up-id" badSecret := newBMCCredsSecret("bmc-creds-no-user", "", "Pass") - r := newTestReconciler(host, badSecret) + fix := fixture.Fixture{} + r := newTestReconcilerWithFixture(&fix, host, badSecret) tryReconcile(t, r, host, func(host *metal3v1alpha1.BareMetalHost, result reconcile.Result) bool { - t.Logf("provisioning id: %q", host.Status.Provisioning.ID) - return host.Status.Provisioning.ID == "" + return fix.Deleted }, ) }) @@ -1155,22 +1155,147 @@ func TestUpdateRootDeviceHints(t *testing.T) { func TestProvisionerIsReady(t *testing.T) { host := newDefaultHost(t) - var prov provisioner.Provisioner - r := newTestReconcilerWithProvisionerFactory(func(host *metal3v1alpha1.BareMetalHost, bmcCreds bmc.Credentials, publisher provisioner.EventPublisher) (provisioner provisioner.Provisioner, err error) { - if prov == nil { - prov, err = fixture.NewMock(host, bmcCreds, publisher, 5) - } - return prov, err - }, host) + fix := fixture.Fixture{BecomeReadyCounter: 5} + r := newTestReconcilerWithFixture(&fix, host) tryReconcile(t, r, host, func(host *metal3v1alpha1.BareMetalHost, result reconcile.Result) bool { - if prov == nil { - return false - } - - ready, _ := prov.IsReady() - return ready + return host.Status.Provisioning.State != metal3v1alpha1.StateNone }, ) } + +func TestUpdateEventHandler(t *testing.T) { + cases := []struct { + name string + event event.UpdateEvent + expectedProcess bool + }{ + { + name: "process-non-bmh-events", + event: event.UpdateEvent{ + ObjectOld: &corev1.Secret{}, + ObjectNew: &corev1.Secret{}, + }, + expectedProcess: true, + }, + { + name: "process-generation-change", + event: event.UpdateEvent{ + ObjectOld: &metal3v1alpha1.BareMetalHost{}, + ObjectNew: &metal3v1alpha1.BareMetalHost{}, + MetaOld: &metav1.ObjectMeta{Generation: 0}, + MetaNew: &metav1.ObjectMeta{Generation: 1}, + }, + + expectedProcess: true, + }, + { + name: "skip-if-same-generation-finalizers-and-annotations", + event: event.UpdateEvent{ + ObjectOld: &metal3v1alpha1.BareMetalHost{}, + ObjectNew: &metal3v1alpha1.BareMetalHost{}, + MetaOld: &metav1.ObjectMeta{ + Generation: 0, + Finalizers: []string{metal3v1alpha1.BareMetalHostFinalizer}, + Annotations: map[string]string{ + metal3v1alpha1.PausedAnnotation: "true", + }, + }, + MetaNew: &metav1.ObjectMeta{ + Generation: 0, + Finalizers: []string{metal3v1alpha1.BareMetalHostFinalizer}, + Annotations: map[string]string{ + metal3v1alpha1.PausedAnnotation: "true", + }, + }, + }, + + expectedProcess: false, + }, + { + name: "process-same-generation-annotations-change", + event: event.UpdateEvent{ + ObjectOld: &metal3v1alpha1.BareMetalHost{}, + ObjectNew: &metal3v1alpha1.BareMetalHost{}, + MetaOld: &metav1.ObjectMeta{ + Generation: 0, + Finalizers: []string{metal3v1alpha1.BareMetalHostFinalizer}, + Annotations: map[string]string{}, + }, + MetaNew: &metav1.ObjectMeta{ + Generation: 0, + Finalizers: []string{metal3v1alpha1.BareMetalHostFinalizer}, + Annotations: map[string]string{ + metal3v1alpha1.PausedAnnotation: "true", + }, + }, + }, + + expectedProcess: true, + }, + { + name: "process-same-generation-finalizers-change", + event: event.UpdateEvent{ + ObjectOld: &metal3v1alpha1.BareMetalHost{}, + ObjectNew: &metal3v1alpha1.BareMetalHost{}, + MetaOld: &metav1.ObjectMeta{ + Generation: 0, + Finalizers: []string{}, + Annotations: map[string]string{ + metal3v1alpha1.PausedAnnotation: "true", + }, + }, + MetaNew: &metav1.ObjectMeta{ + Generation: 0, + Finalizers: []string{metal3v1alpha1.BareMetalHostFinalizer}, + Annotations: map[string]string{ + metal3v1alpha1.PausedAnnotation: "true", + }, + }, + }, + + expectedProcess: true, + }, + { + name: "process-same-generation-finalizers-and-annotation-change", + event: event.UpdateEvent{ + ObjectOld: &metal3v1alpha1.BareMetalHost{}, + ObjectNew: &metal3v1alpha1.BareMetalHost{}, + MetaOld: &metav1.ObjectMeta{ + Generation: 0, + Finalizers: []string{}, + Annotations: map[string]string{}, + }, + MetaNew: &metav1.ObjectMeta{ + Generation: 0, + Finalizers: []string{metal3v1alpha1.BareMetalHostFinalizer}, + Annotations: map[string]string{ + metal3v1alpha1.PausedAnnotation: "true", + }, + }, + }, + + expectedProcess: true, + }, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + r := newTestReconciler() + assert.Equal(t, tc.expectedProcess, r.updateEventHandler(tc.event)) + }) + } +} + +func TestErrorCountIncrementsAlways(t *testing.T) { + + b := &metal3v1alpha1.BareMetalHost{} + assert.Equal(t, b.Status.ErrorCount, 0) + + setErrorMessage(b, metal3v1alpha1.RegistrationError, "An error message") + assert.Equal(t, b.Status.ErrorCount, 1) + + setErrorMessage(b, metal3v1alpha1.InspectionError, "Another error message") + assert.Equal(t, b.Status.ErrorCount, 2) +} diff --git a/controllers/metal3.io/demo_test.go b/controllers/metal3.io/demo_test.go index ba68f96e22..0560323fd9 100644 --- a/controllers/metal3.io/demo_test.go +++ b/controllers/metal3.io/demo_test.go @@ -45,7 +45,7 @@ func TestDemoRegistrationError(t *testing.T) { host.Status.Provisioning.State, host.Status.ErrorMessage, ) - return host.HasError() + return host.Status.ErrorMessage != "" }, ) } @@ -168,7 +168,7 @@ func TestDemoValidationError(t *testing.T) { host.Status.Provisioning.State, host.Status.ErrorMessage, ) - return host.HasError() + return host.Status.ErrorMessage != "" }, ) } diff --git a/controllers/metal3.io/host_state_machine.go b/controllers/metal3.io/host_state_machine.go index 1835ca4761..45ba5bb096 100644 --- a/controllers/metal3.io/host_state_machine.go +++ b/controllers/metal3.io/host_state_machine.go @@ -63,7 +63,7 @@ func recordStateBegin(host *metal3v1alpha1.BareMetalHost, state metal3v1alpha1.P } } -func recordStateEnd(info *reconcileInfo, host *metal3v1alpha1.BareMetalHost, state metal3v1alpha1.ProvisioningState, time metav1.Time) { +func recordStateEnd(info *reconcileInfo, host *metal3v1alpha1.BareMetalHost, state metal3v1alpha1.ProvisioningState, time metav1.Time) (changed bool) { if prevMetric := host.OperationMetricForState(state); prevMetric != nil { if !prevMetric.Start.IsZero() && prevMetric.End.IsZero() { prevMetric.End = time @@ -71,8 +71,10 @@ func recordStateEnd(info *reconcileInfo, host *metal3v1alpha1.BareMetalHost, sta observer := stateTime[state].With(hostMetricLabels(info.request)) observer.Observe(prevMetric.Duration().Seconds()) }) + changed = true } } + return } func (hsm *hostStateMachine) updateHostStateFrom(initialState metal3v1alpha1.ProvisioningState, @@ -170,8 +172,6 @@ func (hsm *hostStateMachine) ensureRegistered(info *reconcileInfo) (result actio return } - needsReregister := false - switch hsm.NextState { case metal3v1alpha1.StateNone, metal3v1alpha1.StateUnmanaged: // We haven't yet reached the Registration state, so don't attempt @@ -182,30 +182,24 @@ func (hsm *hostStateMachine) ensureRegistered(info *reconcileInfo) (result actio return case metal3v1alpha1.StateRegistering: default: - needsReregister = (hsm.Host.Status.ErrorType == metal3v1alpha1.RegistrationError || - !hsm.Host.Status.GoodCredentials.Match(*info.bmcCredsSecret)) - if needsReregister { + if hsm.Host.Status.ErrorType == metal3v1alpha1.RegistrationError || + !hsm.Host.Status.GoodCredentials.Match(*info.bmcCredsSecret) { info.log.Info("Retrying registration") recordStateBegin(hsm.Host, metal3v1alpha1.StateRegistering, metav1.Now()) } } - result = hsm.Reconciler.actionRegistering(hsm.Provisioner, info) - if _, complete := result.(actionComplete); complete { - if hsm.NextState != metal3v1alpha1.StateRegistering { - recordStateEnd(info, hsm.Host, metal3v1alpha1.StateRegistering, metav1.Now()) - } - if needsReregister { - // Host was re-registered, so requeue and run the state machine on - // the next reconcile - result = actionContinue{} - } else { - // Allow the state machine to run, either because we were just - // reconfirming an existing registration, or because we are in the - // Registering state - result = nil + result = hsm.Reconciler.registerHost(hsm.Provisioner, info) + _, complete := result.(actionComplete) + if (result == nil || complete) && + hsm.NextState != metal3v1alpha1.StateRegistering { + if recordStateEnd(info, hsm.Host, metal3v1alpha1.StateRegistering, metav1.Now()) { + result = actionUpdate{} } } + if complete { + result = actionUpdate{} + } return } @@ -240,6 +234,7 @@ func (hsm *hostStateMachine) handleRegistering(info *reconcileInfo) actionResult } else { hsm.NextState = metal3v1alpha1.StateInspecting } + hsm.Host.Status.ErrorCount = 0 return actionComplete{} } @@ -247,6 +242,7 @@ func (hsm *hostStateMachine) handleInspecting(info *reconcileInfo) actionResult actResult := hsm.Reconciler.actionInspecting(hsm.Provisioner, info) if _, complete := actResult.(actionComplete); complete { hsm.NextState = metal3v1alpha1.StateMatchProfile + hsm.Host.Status.ErrorCount = 0 } return actResult } @@ -255,12 +251,14 @@ func (hsm *hostStateMachine) handleMatchProfile(info *reconcileInfo) actionResul actResult := hsm.Reconciler.actionMatchProfile(hsm.Provisioner, info) if _, complete := actResult.(actionComplete); complete { hsm.NextState = metal3v1alpha1.StateReady + hsm.Host.Status.ErrorCount = 0 } return actResult } func (hsm *hostStateMachine) handleExternallyProvisioned(info *reconcileInfo) actionResult { if hsm.Host.Spec.ExternallyProvisioned { + // ErrorCount is cleared when appropriate inside actionManageSteadyState return hsm.Reconciler.actionManageSteadyState(hsm.Provisioner, info) } @@ -281,8 +279,8 @@ func (hsm *hostStateMachine) handleReady(info *reconcileInfo) actionResult { return actionComplete{} } + // ErrorCount is cleared when appropriate inside actionManageReady actResult := hsm.Reconciler.actionManageReady(hsm.Provisioner, info) - if _, complete := actResult.(actionComplete); complete { hsm.NextState = metal3v1alpha1.StateProvisioning } @@ -290,7 +288,7 @@ func (hsm *hostStateMachine) handleReady(info *reconcileInfo) actionResult { } func (hsm *hostStateMachine) provisioningCancelled() bool { - if hsm.Host.HasError() { + if hsm.Host.Status.ErrorMessage != "" { return true } if hsm.Host.Spec.Image == nil { @@ -317,6 +315,7 @@ func (hsm *hostStateMachine) handleProvisioning(info *reconcileInfo) actionResul actResult := hsm.Reconciler.actionProvisioning(hsm.Provisioner, info) if _, complete := actResult.(actionComplete); complete { hsm.NextState = metal3v1alpha1.StateProvisioned + hsm.Host.Status.ErrorCount = 0 } return actResult } @@ -327,6 +326,7 @@ func (hsm *hostStateMachine) handleProvisioned(info *reconcileInfo) actionResult return actionComplete{} } + // ErrorCount is cleared when appropriate inside actionManageSteadyState return hsm.Reconciler.actionManageSteadyState(hsm.Provisioner, info) } @@ -336,6 +336,7 @@ func (hsm *hostStateMachine) handleDeprovisioning(info *reconcileInfo) actionRes if hsm.Host.DeletionTimestamp.IsZero() { if _, complete := actResult.(actionComplete); complete { hsm.NextState = metal3v1alpha1.StateReady + hsm.Host.Status.ErrorCount = 0 } } else { skipToDelete := func() actionResult { @@ -347,13 +348,13 @@ func (hsm *hostStateMachine) handleDeprovisioning(info *reconcileInfo) actionRes switch r := actResult.(type) { case actionComplete: hsm.NextState = metal3v1alpha1.StateDeleting + hsm.Host.Status.ErrorCount = 0 case actionFailed: // If the provisioner gives up deprovisioning and // deletion has been requested, continue to delete. - // Note that this is entirely theoretical, as the - // Ironic provisioner currently never gives up - // trying to deprovision. - return skipToDelete() + if hsm.Host.Status.ErrorCount > 3 { + return skipToDelete() + } case actionError: if r.NeedsRegistration() && !hsm.haveCreds { // If the host is not registered as a node in Ironic and we diff --git a/controllers/metal3.io/host_state_machine_test.go b/controllers/metal3.io/host_state_machine_test.go index 7f717658e7..58d9a3d233 100644 --- a/controllers/metal3.io/host_state_machine_test.go +++ b/controllers/metal3.io/host_state_machine_test.go @@ -18,7 +18,7 @@ import ( func testStateMachine(host *metal3v1alpha1.BareMetalHost) *hostStateMachine { r := newTestReconciler() - p, _ := r.ProvisionerFactory(host, bmc.Credentials{}, + p, _ := r.ProvisionerFactory(*host.DeepCopy(), bmc.Credentials{}, func(reason, message string) {}) return newHostStateMachine(host, r, p, true) } @@ -234,8 +234,9 @@ func TestErrorCountIncreasedWhenRegistrationFails(t *testing.T) { func TestErrorCountCleared(t *testing.T) { tests := []struct { - Scenario string - Host *metal3v1alpha1.BareMetalHost + Scenario string + Host *metal3v1alpha1.BareMetalHost + PreserveErrorCountOnComplete bool }{ { Scenario: "registering", @@ -246,8 +247,9 @@ func TestErrorCountCleared(t *testing.T) { Host: host(metal3v1alpha1.StateInspecting).build(), }, { - Scenario: "ready", - Host: host(metal3v1alpha1.StateReady).build(), + Scenario: "ready", + Host: host(metal3v1alpha1.StateReady).build(), + PreserveErrorCountOnComplete: true, }, { Scenario: "deprovisioning", @@ -258,8 +260,9 @@ func TestErrorCountCleared(t *testing.T) { Host: host(metal3v1alpha1.StateProvisioning).SetImageURL("imageSpecUrl").build(), }, { - Scenario: "externallyProvisioned", - Host: host(metal3v1alpha1.StateExternallyProvisioned).SetExternallyProvisioned().build(), + Scenario: "externallyProvisioned", + Host: host(metal3v1alpha1.StateExternallyProvisioned).SetExternallyProvisioned().build(), + PreserveErrorCountOnComplete: true, }, } for _, tt := range tests { @@ -272,8 +275,69 @@ func TestErrorCountCleared(t *testing.T) { prov.setNextResult(true) result := hsm.ReconcileState(info) - assert.Equal(t, tt.Host.Status.ErrorCount, 0) + assert.Equal(t, 1, tt.Host.Status.ErrorCount) assert.True(t, result.Dirty()) + + prov.setNextResult(false) + hsm.ReconcileState(info) + if tt.PreserveErrorCountOnComplete { + assert.Equal(t, 1, tt.Host.Status.ErrorCount) + } else { + assert.Equal(t, 0, tt.Host.Status.ErrorCount) + } + }) + } +} + +func TestErrorClean(t *testing.T) { + + tests := []struct { + Scenario string + Host *metal3v1alpha1.BareMetalHost + SecretName string + ExpectError bool + }{ + { + Scenario: "clean-after-registration-error", + Host: host(metal3v1alpha1.StateInspecting). + SetStatusError(metal3v1alpha1.OperationalStatusError, metal3v1alpha1.RegistrationError, "some error", 1). + build(), + }, + { + Scenario: "not-clean-after-provisioned-registration-error", + Host: host(metal3v1alpha1.StateInspecting). + SetStatusError(metal3v1alpha1.OperationalStatusError, metal3v1alpha1.ProvisionedRegistrationError, "some error", 1). + build(), + ExpectError: true, + }, + { + Scenario: "clean-after-creds-change", + Host: host(metal3v1alpha1.StateReady). + SetStatusError(metal3v1alpha1.OperationalStatusError, metal3v1alpha1.InspectionError, "some error", 1). + build(), + SecretName: "NewCreds", + }, + } + for _, tt := range tests { + t.Run(tt.Scenario, func(t *testing.T) { + prov := &mockProvisioner{} + hsm := newHostStateMachine(tt.Host, &BareMetalHostReconciler{}, prov, true) + + info := makeDefaultReconcileInfo(tt.Host) + if tt.SecretName != "" { + info.bmcCredsSecret.Name = tt.SecretName + } + + hsm.ReconcileState(info) + + if tt.ExpectError { + assert.Equal(t, tt.Host.Status.ErrorType, v1alpha1.ProvisionedRegistrationError) + assert.NotEmpty(t, tt.Host.Status.ErrorMessage) + } else { + assert.Equal(t, tt.Host.Status.OperationalStatus, v1alpha1.OperationalStatusOK) + assert.Empty(t, tt.Host.Status.ErrorType) + assert.Empty(t, tt.Host.Status.ErrorMessage) + } }) } } @@ -323,6 +387,15 @@ func (hb *hostBuilder) SetImageURL(url string) *hostBuilder { return hb } +func (hb *hostBuilder) SetStatusError(opStatus metal3v1alpha1.OperationalStatus, errType metal3v1alpha1.ErrorType, errMsg string, errCount int) *hostBuilder { + hb.Status.OperationalStatus = opStatus + hb.Status.ErrorType = errType + hb.Status.ErrorMessage = errMsg + hb.Status.ErrorCount = errCount + + return hb +} + func makeDefaultReconcileInfo(host *metal3v1alpha1.BareMetalHost) *reconcileInfo { return &reconcileInfo{ log: logf.Log.WithName("controllers").WithName("BareMetalHost").WithName("host_state_machine"), @@ -355,16 +428,17 @@ func (m *mockProvisioner) setNextResult(dirty bool) { } } -func (m *mockProvisioner) ValidateManagementAccess(credentialsChanged bool) (result provisioner.Result, err error) { - return m.nextResult, err +func (m *mockProvisioner) ValidateManagementAccess(credentialsChanged, force bool) (result provisioner.Result, provID string, err error) { + return m.nextResult, "", err } -func (m *mockProvisioner) InspectHardware() (result provisioner.Result, details *metal3v1alpha1.HardwareDetails, err error) { +func (m *mockProvisioner) InspectHardware(force bool) (result provisioner.Result, details *metal3v1alpha1.HardwareDetails, err error) { + details = &metal3v1alpha1.HardwareDetails{} return m.nextResult, details, err } -func (m *mockProvisioner) UpdateHardwareState() (result provisioner.Result, err error) { - return m.nextResult, err +func (m *mockProvisioner) UpdateHardwareState() (hwState provisioner.HardwareState, err error) { + return } func (m *mockProvisioner) Adopt(force bool) (result provisioner.Result, err error) { @@ -375,7 +449,7 @@ func (m *mockProvisioner) Provision(configData provisioner.HostConfigData) (resu return m.nextResult, err } -func (m *mockProvisioner) Deprovision() (result provisioner.Result, err error) { +func (m *mockProvisioner) Deprovision(force bool) (result provisioner.Result, err error) { return m.nextResult, err } diff --git a/main.go b/main.go index 1a8d477d58..e528383805 100644 --- a/main.go +++ b/main.go @@ -118,21 +118,24 @@ func main() { os.Exit(1) } - provisionerFactory := func(host *metal3iov1alpha1.BareMetalHost, bmcCreds bmc.Credentials, publish provisioner.EventPublisher) (provisioner.Provisioner, error) { + provisionerFactory := func(host metal3iov1alpha1.BareMetalHost, bmcCreds bmc.Credentials, publish provisioner.EventPublisher) (provisioner.Provisioner, error) { isUnmanaged := host.Spec.ExternallyProvisioned && !host.HasBMCDetails() + hostCopy := host.DeepCopy() + if runInTestMode { ctrl.Log.Info("using test provisioner") - return fixture.New(host, bmcCreds, publish) + fix := fixture.Fixture{} + return fix.New(*hostCopy, bmcCreds, publish) } else if runInDemoMode { ctrl.Log.Info("using demo provisioner") - return demo.New(host, bmcCreds, publish) + return demo.New(*hostCopy, bmcCreds, publish) } else if isUnmanaged { ctrl.Log.Info("using empty provisioner") - return empty.New(host, bmcCreds, publish) + return empty.New(*hostCopy, bmcCreds, publish) } ironic.LogStartup() - return ironic.New(host, bmcCreds, publish) + return ironic.New(*hostCopy, bmcCreds, publish) } if err = (&metal3iocontroller.BareMetalHostReconciler{ diff --git a/pkg/provisioner/demo/demo.go b/pkg/provisioner/demo/demo.go index f118dc27c3..606bb99aae 100644 --- a/pkg/provisioner/demo/demo.go +++ b/pkg/provisioner/demo/demo.go @@ -46,7 +46,7 @@ const ( // and uses Ironic to manage the host. type demoProvisioner struct { // the host to be managed by this provisioner - host *metal3v1alpha1.BareMetalHost + host metal3v1alpha1.BareMetalHost // the bmc credentials bmcCreds bmc.Credentials // a logger configured for this host @@ -56,7 +56,7 @@ type demoProvisioner struct { } // New returns a new Ironic Provisioner -func New(host *metal3v1alpha1.BareMetalHost, bmcCreds bmc.Credentials, publisher provisioner.EventPublisher) (provisioner.Provisioner, error) { +func New(host metal3v1alpha1.BareMetalHost, bmcCreds bmc.Credentials, publisher provisioner.EventPublisher) (provisioner.Provisioner, error) { p := &demoProvisioner{ host: host, bmcCreds: bmcCreds, @@ -68,7 +68,7 @@ func New(host *metal3v1alpha1.BareMetalHost, bmcCreds bmc.Credentials, publisher // ValidateManagementAccess tests the connection information for the // host to verify that the location and credentials work. -func (p *demoProvisioner) ValidateManagementAccess(credentialsChanged bool) (result provisioner.Result, err error) { +func (p *demoProvisioner) ValidateManagementAccess(credentialsChanged, force bool) (result provisioner.Result, provID string, err error) { p.log.Info("testing management access") hostName := p.host.ObjectMeta.Name @@ -88,21 +88,21 @@ func (p *demoProvisioner) ValidateManagementAccess(credentialsChanged bool) (res default: if p.host.Status.Provisioning.ID == "" { - p.host.Status.Provisioning.ID = p.host.ObjectMeta.Name + provID = p.host.ObjectMeta.Name p.log.Info("setting provisioning id", "provisioningID", p.host.Status.Provisioning.ID) result.Dirty = true } } - return result, nil + return } // InspectHardware updates the HardwareDetails field of the host with // details of devices discovered on the hardware. It may be called // multiple times, and should return true for its dirty flag until the // inspection is completed. -func (p *demoProvisioner) InspectHardware() (result provisioner.Result, details *metal3v1alpha1.HardwareDetails, err error) { +func (p *demoProvisioner) InspectHardware(force bool) (result provisioner.Result, details *metal3v1alpha1.HardwareDetails, err error) { p.log.Info("inspecting hardware", "status", p.host.OperationalStatus()) hostName := p.host.ObjectMeta.Name @@ -174,12 +174,10 @@ func (p *demoProvisioner) InspectHardware() (result provisioner.Result, details // UpdateHardwareState fetches the latest hardware state of the server // and updates the HardwareDetails field of the host with details. It // is expected to do this in the least expensive way possible, such as -// reading from a cache, and return dirty only if any state -// information has changed. -func (p *demoProvisioner) UpdateHardwareState() (result provisioner.Result, err error) { +// reading from a cache. +func (p *demoProvisioner) UpdateHardwareState() (hwState provisioner.HardwareState, err error) { p.log.Info("updating hardware state") - result.Dirty = false - return result, nil + return } // Adopt allows an externally-provisioned server to be adopted. @@ -218,7 +216,7 @@ func (p *demoProvisioner) Provision(hostConf provisioner.HostConfigData) (result // Deprovision removes the host from the image. It may be called // multiple times, and should return true for its dirty flag until the // deprovisioning operation is completed. -func (p *demoProvisioner) Deprovision() (result provisioner.Result, err error) { +func (p *demoProvisioner) Deprovision(force bool) (result provisioner.Result, err error) { hostName := p.host.ObjectMeta.Name switch hostName { diff --git a/pkg/provisioner/empty/empty.go b/pkg/provisioner/empty/empty.go index 3bfd226ec4..0fbc48802c 100644 --- a/pkg/provisioner/empty/empty.go +++ b/pkg/provisioner/empty/empty.go @@ -15,31 +15,30 @@ type emptyProvisioner struct { } // New returns a new Empty Provisioner -func New(host *metal3v1alpha1.BareMetalHost, bmcCreds bmc.Credentials, publisher provisioner.EventPublisher) (provisioner.Provisioner, error) { +func New(host metal3v1alpha1.BareMetalHost, bmcCreds bmc.Credentials, publisher provisioner.EventPublisher) (provisioner.Provisioner, error) { return &emptyProvisioner{}, nil } // ValidateManagementAccess tests the connection information for the // host to verify that the location and credentials work. -func (p *emptyProvisioner) ValidateManagementAccess(credentialsChanged bool) (provisioner.Result, error) { - return provisioner.Result{}, nil +func (p *emptyProvisioner) ValidateManagementAccess(credentialsChanged, force bool) (provisioner.Result, string, error) { + return provisioner.Result{}, "", nil } // InspectHardware updates the HardwareDetails field of the host with // details of devices discovered on the hardware. It may be called // multiple times, and should return true for its dirty flag until the // inspection is completed. -func (p *emptyProvisioner) InspectHardware() (provisioner.Result, *metal3v1alpha1.HardwareDetails, error) { +func (p *emptyProvisioner) InspectHardware(force bool) (provisioner.Result, *metal3v1alpha1.HardwareDetails, error) { return provisioner.Result{}, nil, nil } // UpdateHardwareState fetches the latest hardware state of the server // and updates the HardwareDetails field of the host with details. It // is expected to do this in the least expensive way possible, such as -// reading from a cache, and return dirty only if any state -// information has changed. -func (p *emptyProvisioner) UpdateHardwareState() (provisioner.Result, error) { - return provisioner.Result{}, nil +// reading from a cache. +func (p *emptyProvisioner) UpdateHardwareState() (provisioner.HardwareState, error) { + return provisioner.HardwareState{}, nil } // Adopt allows an externally-provisioned server to be adopted. @@ -57,7 +56,7 @@ func (p *emptyProvisioner) Provision(hostConf provisioner.HostConfigData) (provi // Deprovision removes the host from the image. It may be called // multiple times, and should return true for its dirty flag until the // deprovisioning operation is completed. -func (p *emptyProvisioner) Deprovision() (provisioner.Result, error) { +func (p *emptyProvisioner) Deprovision(force bool) (provisioner.Result, error) { return provisioner.Result{}, nil } diff --git a/pkg/provisioner/fixture/fixture.go b/pkg/provisioner/fixture/fixture.go index bcd44458f1..136264383e 100644 --- a/pkg/provisioner/fixture/fixture.go +++ b/pkg/provisioner/fixture/fixture.go @@ -45,60 +45,64 @@ func (cd *fixtureHostConfigData) MetaData() (string, error) { // and uses Ironic to manage the host. type fixtureProvisioner struct { // the host to be managed by this provisioner - host *metal3v1alpha1.BareMetalHost + host metal3v1alpha1.BareMetalHost // the bmc credentials bmcCreds bmc.Credentials // a logger configured for this host log logr.Logger // an event publisher for recording significant events publisher provisioner.EventPublisher + // state storage for the Host + state *Fixture +} + +type Fixture struct { + // counter to set the provisioner as ready + BecomeReadyCounter int + // state to manage deletion + Deleted bool // state to manage the two-step adopt process adopted bool - // counter to set the provisioner as ready - becomeReadyCounter int + // state to manage provisioning + image metal3v1alpha1.Image + // state to manage power + poweredOn bool } // New returns a new Ironic FixtureProvisioner -func New(host *metal3v1alpha1.BareMetalHost, bmcCreds bmc.Credentials, publisher provisioner.EventPublisher) (provisioner.Provisioner, error) { - return NewMock(host, bmcCreds, publisher, 0) -} - -// NewMock is used in tests to build a fixture provisioner and inject additional test parameters -func NewMock(host *metal3v1alpha1.BareMetalHost, bmcCreds bmc.Credentials, publisher provisioner.EventPublisher, becomeReadyCounter int) (provisioner.Provisioner, error) { +func (f *Fixture) New(host metal3v1alpha1.BareMetalHost, bmcCreds bmc.Credentials, publisher provisioner.EventPublisher) (provisioner.Provisioner, error) { p := &fixtureProvisioner{ - host: host, - bmcCreds: bmcCreds, - log: log.WithValues("host", host.Name), - publisher: publisher, - becomeReadyCounter: becomeReadyCounter, + host: *host.DeepCopy(), + bmcCreds: bmcCreds, + log: log.WithValues("host", host.Name), + publisher: publisher, + state: f, } return p, nil } // ValidateManagementAccess tests the connection information for the // host to verify that the location and credentials work. -func (p *fixtureProvisioner) ValidateManagementAccess(credentialsChanged bool) (result provisioner.Result, err error) { +func (p *fixtureProvisioner) ValidateManagementAccess(credentialsChanged, force bool) (result provisioner.Result, provID string, err error) { p.log.Info("testing management access") // Fill in the ID of the host in the provisioning system if p.host.Status.Provisioning.ID == "" { - p.host.Status.Provisioning.ID = "temporary-fake-id" - p.log.Info("setting provisioning id", - "provisioningID", p.host.Status.Provisioning.ID) + provID = "temporary-fake-id" result.Dirty = true result.RequeueAfter = time.Second * 5 p.publisher("Registered", "Registered new host") - return result, nil + return } - return result, nil + return } // InspectHardware updates the HardwareDetails field of the host with // details of devices discovered on the hardware. It may be called // multiple times, and should return true for its dirty flag until the // inspection is completed. -func (p *fixtureProvisioner) InspectHardware() (result provisioner.Result, details *metal3v1alpha1.HardwareDetails, err error) { +func (p *fixtureProvisioner) InspectHardware(force bool) (result provisioner.Result, details *metal3v1alpha1.HardwareDetails, err error) { p.log.Info("inspecting hardware", "status", p.host.OperationalStatus()) // The inspection is ongoing. We'll need to check the fixture @@ -159,21 +163,20 @@ func (p *fixtureProvisioner) InspectHardware() (result provisioner.Result, detai // UpdateHardwareState fetches the latest hardware state of the server // and updates the HardwareDetails field of the host with details. It // is expected to do this in the least expensive way possible, such as -// reading from a cache, and return dirty only if any state -// information has changed. -func (p *fixtureProvisioner) UpdateHardwareState() (result provisioner.Result, err error) { +// reading from a cache. +func (p *fixtureProvisioner) UpdateHardwareState() (hwState provisioner.HardwareState, err error) { if !p.host.NeedsProvisioning() { + hwState.PoweredOn = &p.state.poweredOn p.log.Info("updating hardware state") - result.Dirty = false } - return result, nil + return } // Adopt allows an externally-provisioned server to be adopted. func (p *fixtureProvisioner) Adopt(force bool) (result provisioner.Result, err error) { p.log.Info("adopting host") - if p.host.Spec.ExternallyProvisioned && !p.adopted { - p.adopted = true + if p.host.Spec.ExternallyProvisioned && !p.state.adopted { + p.state.adopted = true result.Dirty = true result.RequeueAfter = provisionRequeueDelay } @@ -187,10 +190,10 @@ func (p *fixtureProvisioner) Provision(hostConf provisioner.HostConfigData) (res p.log.Info("provisioning image to host", "state", p.host.Status.Provisioning.State) - if p.host.Status.Provisioning.Image.URL == "" { + if p.state.image.URL == "" { p.publisher("ProvisioningComplete", "Image provisioning completed") p.log.Info("moving to done") - p.host.Status.Provisioning.Image = *p.host.Spec.Image + p.state.image = *p.host.Spec.Image result.Dirty = true result.RequeueAfter = provisionRequeueDelay } @@ -201,7 +204,7 @@ func (p *fixtureProvisioner) Provision(hostConf provisioner.HostConfigData) (res // Deprovision removes the host from the image. It may be called // multiple times, and should return true for its dirty flag until the // deprovisioning operation is completed. -func (p *fixtureProvisioner) Deprovision() (result provisioner.Result, err error) { +func (p *fixtureProvisioner) Deprovision(force bool) (result provisioner.Result, err error) { p.log.Info("ensuring host is deprovisioned") result.RequeueAfter = deprovisionRequeueDelay @@ -211,10 +214,10 @@ func (p *fixtureProvisioner) Deprovision() (result provisioner.Result, err error // necessary once we really have Fixture doing the deprovisioning // and we can monitor it's status. - if p.host.Status.HardwareDetails != nil { + if p.state.image.URL != "" { p.publisher("DeprovisionStarted", "Image deprovisioning started") p.log.Info("clearing hardware details") - p.host.Status.HardwareDetails = nil + p.state.image = metal3v1alpha1.Image{} result.Dirty = true return result, nil } @@ -229,9 +232,9 @@ func (p *fixtureProvisioner) Deprovision() (result provisioner.Result, err error func (p *fixtureProvisioner) Delete() (result provisioner.Result, err error) { p.log.Info("deleting host") - if p.host.Status.Provisioning.ID != "" { + if !p.state.Deleted { p.log.Info("clearing provisioning id") - p.host.Status.Provisioning.ID = "" + p.state.Deleted = true result.Dirty = true return result, nil } @@ -244,10 +247,10 @@ func (p *fixtureProvisioner) Delete() (result provisioner.Result, err error) { func (p *fixtureProvisioner) PowerOn() (result provisioner.Result, err error) { p.log.Info("ensuring host is powered on") - if !p.host.Status.PoweredOn { + if !p.state.poweredOn { p.publisher("PowerOn", "Host powered on") p.log.Info("changing status") - p.host.Status.PoweredOn = true + p.state.poweredOn = true result.Dirty = true return result, nil } @@ -260,10 +263,10 @@ func (p *fixtureProvisioner) PowerOn() (result provisioner.Result, err error) { func (p *fixtureProvisioner) PowerOff() (result provisioner.Result, err error) { p.log.Info("ensuring host is powered off") - if p.host.Status.PoweredOn { + if p.state.poweredOn { p.publisher("PowerOff", "Host powered off") p.log.Info("changing status") - p.host.Status.PoweredOn = false + p.state.poweredOn = false result.Dirty = true return result, nil } @@ -275,9 +278,9 @@ func (p *fixtureProvisioner) PowerOff() (result provisioner.Result, err error) { func (p *fixtureProvisioner) IsReady() (result bool, err error) { p.log.Info("checking provisioner status") - if p.becomeReadyCounter > 0 { - p.becomeReadyCounter-- + if p.state.BecomeReadyCounter > 0 { + p.state.BecomeReadyCounter-- } - return p.becomeReadyCounter == 0, nil + return p.state.BecomeReadyCounter == 0, nil } diff --git a/pkg/provisioner/ironic/delete_test.go b/pkg/provisioner/ironic/delete_test.go index 16b8e43889..00a1eb9a9f 100644 --- a/pkg/provisioner/ironic/delete_test.go +++ b/pkg/provisioner/ironic/delete_test.go @@ -50,7 +50,7 @@ func TestDelete(t *testing.T) { }, ).DeleteError(nodeUUID, http.StatusConflict), expectedDirty: true, - expectedRequestAfter: 0, + expectedRequestAfter: provisionRequeueDelay, }, { name: "delete-host-not-found", @@ -122,7 +122,7 @@ func TestDelete(t *testing.T) { ).NodeUpdateError(nodeUUID, http.StatusConflict), expectedDirty: true, - expectedRequestAfter: 0, + expectedRequestAfter: provisionRequeueDelay, }, { name: "not-in-maintenance-update", diff --git a/pkg/provisioner/ironic/inspecthardware_test.go b/pkg/provisioner/ironic/inspecthardware_test.go index b094dffec3..7118cb7ccc 100644 --- a/pkg/provisioner/ironic/inspecthardware_test.go +++ b/pkg/provisioner/ironic/inspecthardware_test.go @@ -152,7 +152,7 @@ func TestInspectHardware(t *testing.T) { } prov.status.ID = nodeUUID - result, details, err := prov.InspectHardware() + result, details, err := prov.InspectHardware(false) assert.Equal(t, tc.expectedDirty, result.Dirty) assert.Equal(t, time.Second*time.Duration(tc.expectedRequestAfter), result.RequeueAfter) diff --git a/pkg/provisioner/ironic/ironic.go b/pkg/provisioner/ironic/ironic.go index 0e75823d7b..2674774321 100644 --- a/pkg/provisioner/ironic/ironic.go +++ b/pkg/provisioner/ironic/ironic.go @@ -105,7 +105,7 @@ func init() { // and uses Ironic to manage the host. type ironicProvisioner struct { // the host to be managed by this provisioner - host *metal3v1alpha1.BareMetalHost + host metal3v1alpha1.BareMetalHost // a shorter path to the provisioning status data structure status *metal3v1alpha1.ProvisionStatus // access parameters for the BMC @@ -137,7 +137,7 @@ func LogStartup() { // A private function to construct an ironicProvisioner (rather than a // Provisioner interface) in a consistent way for tests. -func newProvisionerWithSettings(host *metal3v1alpha1.BareMetalHost, bmcCreds bmc.Credentials, publisher provisioner.EventPublisher, ironicURL string, ironicAuthSettings clients.AuthConfig, inspectorURL string, inspectorAuthSettings clients.AuthConfig) (*ironicProvisioner, error) { +func newProvisionerWithSettings(host metal3v1alpha1.BareMetalHost, bmcCreds bmc.Credentials, publisher provisioner.EventPublisher, ironicURL string, ironicAuthSettings clients.AuthConfig, inspectorURL string, inspectorAuthSettings clients.AuthConfig) (*ironicProvisioner, error) { tlsConf := clients.TLSConfig{ TrustedCAFile: ironicTrustedCAFile, InsecureSkipVerify: ironicInsecure, @@ -156,7 +156,7 @@ func newProvisionerWithSettings(host *metal3v1alpha1.BareMetalHost, bmcCreds bmc clientIronic, clientInspector) } -func newProvisionerWithIronicClients(host *metal3v1alpha1.BareMetalHost, bmcCreds bmc.Credentials, publisher provisioner.EventPublisher, clientIronic *gophercloud.ServiceClient, clientInspector *gophercloud.ServiceClient) (*ironicProvisioner, error) { +func newProvisionerWithIronicClients(host metal3v1alpha1.BareMetalHost, bmcCreds bmc.Credentials, publisher provisioner.EventPublisher, clientIronic *gophercloud.ServiceClient, clientInspector *gophercloud.ServiceClient) (*ironicProvisioner, error) { bmcAccess, err := bmc.NewAccessDetails(host.Spec.BMC.Address, host.Spec.BMC.DisableCertificateVerification) if err != nil { @@ -182,7 +182,7 @@ func newProvisionerWithIronicClients(host *metal3v1alpha1.BareMetalHost, bmcCred // New returns a new Ironic Provisioner using the global configuration // for finding the Ironic services. -func New(host *metal3v1alpha1.BareMetalHost, bmcCreds bmc.Credentials, publisher provisioner.EventPublisher) (provisioner.Provisioner, error) { +func New(host metal3v1alpha1.BareMetalHost, bmcCreds bmc.Credentials, publisher provisioner.EventPublisher) (provisioner.Provisioner, error) { var err error if clientIronicSingleton == nil || clientInspectorSingleton == nil { tlsConf := clients.TLSConfig{ @@ -333,14 +333,15 @@ func (p *ironicProvisioner) findExistingHost() (ironicNode *nodes.Node, err erro // // FIXME(dhellmann): We should rename this method to describe what it // actually does. -func (p *ironicProvisioner) ValidateManagementAccess(credentialsChanged bool) (result provisioner.Result, err error) { +func (p *ironicProvisioner) ValidateManagementAccess(credentialsChanged, force bool) (result provisioner.Result, provID string, err error) { var ironicNode *nodes.Node p.log.Info("validating management access") ironicNode, err = p.findExistingHost() if err != nil { - return result, errors.Wrap(err, "failed to find existing host") + result, err = transientError(errors.Wrap(err, "failed to find existing host")) + return } // Some BMC types require a MAC address, so ensure we have one @@ -348,8 +349,8 @@ func (p *ironicProvisioner) ValidateManagementAccess(credentialsChanged bool) (r if p.bmcAccess.NeedsMAC() && p.host.Spec.BootMACAddress == "" { msg := fmt.Sprintf("BMC driver %s requires a BootMACAddress value", p.bmcAccess.Type()) p.log.Info(msg) - result.ErrorMessage = msg - return result, nil + result, err = operationFailed(msg) + return } driverInfo := p.bmcAccess.DriverInfo(p.bmcCreds) @@ -358,6 +359,8 @@ func (p *ironicProvisioner) ValidateManagementAccess(credentialsChanged bool) (r driverInfo["deploy_kernel"] = deployKernelURL driverInfo["deploy_ramdisk"] = deployRamdiskURL + result, err = operationComplete() + // If we have not found a node yet, we need to create one if ironicNode == nil { p.log.Info("registering host in ironic") @@ -380,15 +383,14 @@ func (p *ironicProvisioner) ValidateManagementAccess(credentialsChanged bool) (r }).Extract() // FIXME(dhellmann): Handle 409 and 503? errors here. if err != nil { - return result, errors.Wrap(err, "failed to register host in ironic") + result, err = transientError(errors.Wrap(err, "failed to register host in ironic")) + return } p.publisher("Registered", "Registered new host") // Store the ID so other methods can assume it is set and so // we can find the node again later. - p.status.ID = ironicNode.UUID - result.Dirty = true - p.log.Info("setting provisioning id", "ID", p.status.ID) + provID = ironicNode.UUID // If we know the MAC, create a port. Otherwise we will have // to do this after we run the introspection step. @@ -396,7 +398,7 @@ func (p *ironicProvisioner) ValidateManagementAccess(credentialsChanged bool) (r enable := true p.log.Info("creating port for node in ironic", "MAC", p.host.Spec.BootMACAddress) - _, err := ports.Create( + _, err = ports.Create( p.client, ports.CreateOpts{ NodeUUID: ironicNode.UUID, @@ -404,7 +406,8 @@ func (p *ironicProvisioner) ValidateManagementAccess(credentialsChanged bool) (r PXEEnabled: &enable, }).Extract() if err != nil { - return result, errors.Wrap(err, "failed to create port in ironic") + result, err = transientError(errors.Wrap(err, "failed to create port in ironic")) + return } } @@ -420,75 +423,24 @@ func (p *ironicProvisioner) ValidateManagementAccess(credentialsChanged bool) (r imageData = p.host.Spec.Image } - checksum, checksumType, ok := imageData.GetChecksum() - - if ok { - p.log.Info("setting instance info", - "image_source", imageData.URL, - "image_os_hash_value", checksum, - "image_os_hash_algo", checksumType, - ) - - updates := nodes.UpdateOpts{ - nodes.UpdateOperation{ - Op: nodes.AddOp, - Path: "/instance_info/image_source", - Value: imageData.URL, - }, - nodes.UpdateOperation{ - Op: nodes.AddOp, - Path: "/instance_info/image_os_hash_value", - Value: checksum, - }, - nodes.UpdateOperation{ - Op: nodes.AddOp, - Path: "/instance_info/image_os_hash_algo", - Value: checksumType, - }, - nodes.UpdateOperation{ - Op: nodes.ReplaceOp, - Path: "/instance_uuid", - Value: string(p.host.ObjectMeta.UID), - }, - } - - // image_checksum - // - // FIXME: For older versions of ironic that do not have - // https://review.opendev.org/#/c/711816/ failing to - // include the 'image_checksum' causes ironic to refuse to - // provision the image, even if the other hash value - // parameters are given. We only want to do that for MD5, - // however, because those versions of ironic only support - // MD5 checksums. - if checksumType == string(metal3v1alpha1.MD5) { - updates = append( - updates, - nodes.UpdateOperation{ - Op: nodes.AddOp, - Path: "/instance_info/image_checksum", - Value: checksum, - }, - ) - } - - if imageData.DiskFormat != nil { - updates = append(updates, nodes.UpdateOperation{ - Op: nodes.AddOp, - Path: "/instance_info/image_disk_format", - Value: *imageData.DiskFormat, - }) + if imageData != nil { + updates, optsErr := p.getImageUpdateOptsForNode(ironicNode, imageData) + if optsErr != nil { + result, err = transientError(errors.Wrap(optsErr, "Could not get Image options for node")) + return } - _, err = nodes.Update(p.client, ironicNode.UUID, updates).Extract() - switch err.(type) { - case nil: - case gophercloud.ErrDefault409: - p.log.Info("could not update host settings in ironic, busy") - result.Dirty = true - result.RequeueAfter = provisionRequeueDelay - return result, nil - default: - return result, errors.Wrap(err, "failed to update host settings in ironic") + if len(updates) != 0 { + _, err = nodes.Update(p.client, ironicNode.UUID, updates).Extract() + switch err.(type) { + case nil: + case gophercloud.ErrDefault409: + p.log.Info("could not update host settings in ironic, busy") + result, err = retryAfterDelay(provisionRequeueDelay) + return + default: + result, err = transientError(errors.Wrap(err, "failed to update host settings in ironic")) + return + } } } } else { @@ -496,13 +448,7 @@ func (p *ironicProvisioner) ValidateManagementAccess(credentialsChanged bool) (r // node in ironic by looking it up. We need to check its // settings against what we have in the host, and change them // if there are differences. - if p.status.ID != ironicNode.UUID { - // Store the ID so other methods can assume it is set and - // so we can find the node using that value next time. - p.status.ID = ironicNode.UUID - result.Dirty = true - p.log.Info("setting provisioning id", "ID", p.status.ID) - } + provID = ironicNode.UUID if ironicNode.Name == "" { updates := nodes.UpdateOpts{ @@ -517,11 +463,11 @@ func (p *ironicProvisioner) ValidateManagementAccess(credentialsChanged bool) (r case nil: case gophercloud.ErrDefault409: p.log.Info("could not update ironic node name, busy") - result.Dirty = true - result.RequeueAfter = provisionRequeueDelay - return result, nil + result, err = retryAfterDelay(provisionRequeueDelay) + return default: - return result, errors.Wrap(err, "failed to update ironc node name") + result, err = transientError(errors.Wrap(err, "failed to update ironc node name")) + return } p.log.Info("updated ironic node name") @@ -542,11 +488,11 @@ func (p *ironicProvisioner) ValidateManagementAccess(credentialsChanged bool) (r case nil: case gophercloud.ErrDefault409: p.log.Info("could not update host driver settings, busy") - result.Dirty = true - result.RequeueAfter = provisionRequeueDelay - return result, nil + result, err = retryAfterDelay(provisionRequeueDelay) + return default: - return result, errors.Wrap(err, "failed to update host driver settings") + result, err = transientError(errors.Wrap(err, "failed to update host driver settings")) + return } p.log.Info("updated host driver settings") // We don't return here because we also have to set the @@ -572,49 +518,48 @@ func (p *ironicProvisioner) ValidateManagementAccess(credentialsChanged bool) (r case nodes.Enroll: // If ironic is reporting an error, stop working on the node. - if ironicNode.LastError != "" && !credentialsChanged { - result.ErrorMessage = ironicNode.LastError - return result, nil + if ironicNode.LastError != "" && !(credentialsChanged || force) { + result, err = operationFailed(ironicNode.LastError) + return } if ironicNode.TargetProvisionState == string(nodes.TargetManage) { // We have already tried to manage the node and did not // get an error, so do nothing and keep trying. - result.Dirty = true - result.RequeueAfter = provisionRequeueDelay - return result, nil + result, err = operationContinuing(provisionRequeueDelay) + return } - return p.changeNodeProvisionState( + result, err = p.changeNodeProvisionState( ironicNode, nodes.ProvisionStateOpts{Target: nodes.TargetManage}, ) + return case nodes.Verifying: // If we're still waiting for the state to change in Ironic, // return true to indicate that we're dirty and need to be // reconciled again. - result.RequeueAfter = provisionRequeueDelay - result.Dirty = true - return result, nil + result, err = operationContinuing(provisionRequeueDelay) + return case nodes.Manageable: p.log.Info("have manageable host") - return result, nil + return case nodes.Available: // The host is fully registered (and probably wasn't cleanly // deleted previously) p.log.Info("have available host") - return result, nil + return case nodes.Active: // The host is already running, maybe it's a master? p.log.Info("have active host", "image_source", ironicNode.InstanceInfo["image_source"]) - return result, nil + return default: - return result, nil + return } } @@ -631,14 +576,15 @@ func (p *ironicProvisioner) tryChangeNodeProvisionState(ironicNode *nodes.Node, success = true case gophercloud.ErrDefault409: p.log.Info("could not change state of host, busy") + result, err = retryAfterDelay(provisionRequeueDelay) + return default: - err = errors.Wrap(changeResult.Err, - fmt.Sprintf("failed to change provisioning state to %q", opts.Target)) + result, err = transientError(errors.Wrap(changeResult.Err, + fmt.Sprintf("failed to change provisioning state to %q", opts.Target))) return } - result.Dirty = true - result.RequeueAfter = provisionRequeueDelay + result, err = operationContinuing(provisionRequeueDelay) return } @@ -651,16 +597,17 @@ func (p *ironicProvisioner) changeNodeProvisionState(ironicNode *nodes.Node, opt // details of devices discovered on the hardware. It may be called // multiple times, and should return true for its dirty flag until the // inspection is completed. -func (p *ironicProvisioner) InspectHardware() (result provisioner.Result, details *metal3v1alpha1.HardwareDetails, err error) { +func (p *ironicProvisioner) InspectHardware(force bool) (result provisioner.Result, details *metal3v1alpha1.HardwareDetails, err error) { p.log.Info("inspecting hardware", "status", p.host.OperationalStatus()) ironicNode, err := p.findExistingHost() if err != nil { - err = errors.Wrap(err, "failed to find existing host") + result, err = transientError(errors.Wrap(err, "failed to find existing host")) return } if ironicNode == nil { - return result, nil, provisioner.NeedsRegistration + result, err = transientError(provisioner.NeedsRegistration) + return } status, err := introspection.GetIntrospectionStatus(p.inspector, ironicNode.UUID).Extract() @@ -669,11 +616,18 @@ func (p *ironicProvisioner) InspectHardware() (result provisioner.Result, detail switch nodes.ProvisionState(ironicNode.ProvisionState) { case nodes.Inspecting, nodes.InspectWait: p.log.Info("inspection already started") - result.Dirty = true - result.RequeueAfter = introspectionRequeueDelay - err = nil + result, err = operationContinuing(introspectionRequeueDelay) return default: + if nodes.ProvisionState(ironicNode.ProvisionState) == nodes.InspectFail && !force { + p.log.Info("starting inspection failed", "error", status.Error) + if ironicNode.LastError == "" { + result.ErrorMessage = "Inspection failed" + } else { + result.ErrorMessage = ironicNode.LastError + } + err = nil + } p.log.Info("updating boot mode before hardware inspection") op, value := buildCapabilitiesValue(ironicNode, p.host.Status.Provisioning.BootMode) updates := nodes.UpdateOpts{ @@ -688,10 +642,10 @@ func (p *ironicProvisioner) InspectHardware() (result provisioner.Result, detail case nil: case gophercloud.ErrDefault409: p.log.Info("could not update host settings in ironic, busy") - result.Dirty = true + result, err = retryAfterDelay(provisionRequeueDelay) return default: - err = errors.Wrap(err, "failed to update host boot mode settings in ironic") + result, err = transientError(errors.Wrap(err, "failed to update host boot mode settings in ironic")) return } @@ -707,18 +661,17 @@ func (p *ironicProvisioner) InspectHardware() (result provisioner.Result, detail return } } - err = errors.Wrap(err, "failed to extract hardware inspection status") + result, err = transientError(errors.Wrap(err, "failed to extract hardware inspection status")) return } if !status.Finished { p.log.Info("inspection in progress", "started_at", status.StartedAt) - result.Dirty = true // make sure we check back - result.RequeueAfter = introspectionRequeueDelay + result, err = operationContinuing(introspectionRequeueDelay) return } if status.Error != "" { p.log.Info("inspection failed", "error", status.Error) - result.ErrorMessage = status.Error + result, err = operationFailed(status.Error) return } @@ -727,64 +680,62 @@ func (p *ironicProvisioner) InspectHardware() (result provisioner.Result, detail introData := introspection.GetIntrospectionData(p.inspector, ironicNode.UUID) data, err := introData.Extract() if err != nil { - err = errors.Wrap(err, "failed to retrieve hardware introspection data") + result, err = transientError(errors.Wrap(err, "failed to retrieve hardware introspection data")) return } p.log.Info("received introspection data", "data", introData.Body) details = hardwaredetails.GetHardwareDetails(data) p.publisher("InspectionComplete", "Hardware inspection completed") + result, err = operationComplete() return } // UpdateHardwareState fetches the latest hardware state of the server // and updates the HardwareDetails field of the host with details. It // is expected to do this in the least expensive way possible, such as -// reading from a cache, and return dirty only if any state -// information has changed. -func (p *ironicProvisioner) UpdateHardwareState() (result provisioner.Result, err error) { +// reading from a cache. +func (p *ironicProvisioner) UpdateHardwareState() (hwState provisioner.HardwareState, err error) { p.log.Info("updating hardware state") ironicNode, err := p.findExistingHost() if err != nil { - return result, errors.Wrap(err, "failed to find existing host") + err = errors.Wrap(err, "failed to find existing host") + return } if ironicNode == nil { - return result, provisioner.NeedsRegistration + err = provisioner.NeedsRegistration + return } - var discoveredVal bool switch ironicNode.PowerState { - case powerOn: - discoveredVal = true - case powerOff: - discoveredVal = false + case powerOn, powerOff: + discoveredVal := ironicNode.PowerState == powerOn + hwState.PoweredOn = &discoveredVal case powerNone: p.log.Info("could not determine power state", "value", ironicNode.PowerState) - return result, nil default: p.log.Info("unknown power state", "value", ironicNode.PowerState) - return result, nil - } - - if discoveredVal != p.host.Status.PoweredOn { - p.log.Info("updating power status", "discovered", discoveredVal) - p.host.Status.PoweredOn = discoveredVal - result.Dirty = true } - return result, nil + return } -func (p *ironicProvisioner) getUpdateOptsForNode(ironicNode *nodes.Node) (updates nodes.UpdateOpts, err error) { - - hwProf, err := hardware.GetProfile(p.host.HardwareProfile()) - - if err != nil { - return updates, errors.Wrap(err, - fmt.Sprintf("Could not start provisioning with bad hardware profile %s", - p.host.HardwareProfile())) +func (p *ironicProvisioner) getImageUpdateOptsForNode(ironicNode *nodes.Node, imageData *metal3v1alpha1.Image) (updates nodes.UpdateOpts, err error) { + checksum, checksumType, ok := p.host.GetImageChecksum() + if !ok { + p.log.Info("image/checksum not found for host") + return } - + // instance_uuid + p.log.Info("setting instance_uuid") + updates = append( + updates, + nodes.UpdateOperation{ + Op: nodes.ReplaceOp, + Path: "/instance_uuid", + Value: string(p.host.ObjectMeta.UID), + }, + ) // image_source var op nodes.UpdateOp if _, ok := ironicNode.InstanceInfo["image_source"]; !ok { @@ -799,12 +750,10 @@ func (p *ironicProvisioner) getUpdateOptsForNode(ironicNode *nodes.Node) (update nodes.UpdateOperation{ Op: op, Path: "/instance_info/image_source", - Value: p.host.Spec.Image.URL, + Value: imageData.URL, }, ) - checksum, checksumType, _ := p.host.GetImageChecksum() - // image_os_hash_algo if _, ok := ironicNode.InstanceInfo["image_os_hash_algo"]; !ok { op = nodes.AddOp @@ -865,30 +814,38 @@ func (p *ironicProvisioner) getUpdateOptsForNode(ironicNode *nodes.Node) (update ) } - if p.host.Spec.Image.DiskFormat != nil { + if imageData.DiskFormat != nil { updates = append(updates, nodes.UpdateOperation{ Op: nodes.AddOp, Path: "/instance_info/image_disk_format", - Value: *p.host.Spec.Image.DiskFormat, + Value: *imageData.DiskFormat, }) } + return updates, nil +} - // instance_uuid - p.log.Info("setting instance_uuid") - updates = append( - updates, - nodes.UpdateOperation{ - Op: nodes.ReplaceOp, - Path: "/instance_uuid", - Value: string(p.host.ObjectMeta.UID), - }, - ) +func (p *ironicProvisioner) getUpdateOptsForNode(ironicNode *nodes.Node) (updates nodes.UpdateOpts, err error) { + + hwProf, err := hardware.GetProfile(p.host.HardwareProfile()) + + if err != nil { + return updates, errors.Wrap(err, + fmt.Sprintf("Could not start provisioning with bad hardware profile %s", + p.host.HardwareProfile())) + } + + imageOpts, err := p.getImageUpdateOptsForNode(ironicNode, p.host.Spec.Image) + if err != nil { + return updates, errors.Wrap(err, "Could not get Image options for node") + } + updates = append(updates, imageOpts...) // root_gb // // FIXME(dhellmann): We have to provide something for the disk // size until https://storyboard.openstack.org/#!/story/2005165 is // fixed in ironic. + var op nodes.UpdateOp if _, ok := ironicNode.InstanceInfo["root_gb"]; !ok { op = nodes.AddOp p.log.Info("adding root_gb") @@ -1040,17 +997,16 @@ func (p *ironicProvisioner) startProvisioning(ironicNode *nodes.Node, hostConf p updates, err := p.getUpdateOptsForNode(ironicNode) if err != nil { - return result, errors.Wrap(err, "failed to update opts for node") + return transientError(errors.Wrap(err, "failed to update opts for node")) } _, err = nodes.Update(p.client, ironicNode.UUID, updates).Extract() switch err.(type) { case nil: case gophercloud.ErrDefault409: p.log.Info("could not update host settings in ironic, busy") - result.Dirty = true - return result, nil + return retryAfterDelay(provisionRequeueDelay) default: - return result, errors.Wrap(err, "failed to update host settings in ironic") + return transientError(errors.Wrap(err, "failed to update host settings in ironic")) } p.log.Info("validating host settings") @@ -1060,15 +1016,12 @@ func (p *ironicProvisioner) startProvisioning(ironicNode *nodes.Node, hostConf p case nil: case gophercloud.ErrDefault409: p.log.Info("could not validate host during registration, busy") - result.Dirty = true - return result, nil + return retryAfterDelay(provisionRequeueDelay) default: - return result, errors.Wrap(err, "failed to validate host during registration") + return transientError(errors.Wrap(err, "failed to validate host during registration")) } if errorMessage != "" { - result.ErrorMessage = errorMessage - result.Dirty = true // validateNode() would have set the errors - return result, nil + return operationFailed(errorMessage) } // If validation is successful we can start moving the host @@ -1089,18 +1042,16 @@ func (p *ironicProvisioner) Adopt(force bool) (result provisioner.Result, err er var ironicNode *nodes.Node if ironicNode, err = p.findExistingHost(); err != nil { - err = errors.Wrap(err, "could not find host to adpot") - return + return transientError(errors.Wrap(err, "could not find host to adpot")) } if ironicNode == nil { - err = provisioner.NeedsRegistration - return + return transientError(provisioner.NeedsRegistration) } switch nodes.ProvisionState(ironicNode.ProvisionState) { case nodes.Enroll, nodes.Verifying: - err = fmt.Errorf("Invalid state for adopt: %s", - ironicNode.ProvisionState) + return transientError(fmt.Errorf("Invalid state for adopt: %s", + ironicNode.ProvisionState)) case nodes.Manageable: _, hasImageSource := ironicNode.InstanceInfo["image_source"] _, hasBootISO := ironicNode.InstanceInfo["boot_iso"] @@ -1122,8 +1073,7 @@ func (p *ironicProvisioner) Adopt(force bool) (result provisioner.Result, err er }, ) case nodes.Adopting: - result.RequeueAfter = provisionRequeueDelay - result.Dirty = true + return operationContinuing(provisionRequeueDelay) case nodes.AdoptFail: if force { return p.changeNodeProvisionState( @@ -1133,13 +1083,13 @@ func (p *ironicProvisioner) Adopt(force bool) (result provisioner.Result, err er }, ) } else { - result.ErrorMessage = fmt.Sprintf("Host adoption failed: %s", - ironicNode.LastError) + return operationFailed(fmt.Sprintf("Host adoption failed: %s", + ironicNode.LastError)) } case nodes.Active: default: } - return + return operationComplete() } // Provision writes the image from the host spec to the host. It may @@ -1149,10 +1099,10 @@ func (p *ironicProvisioner) Provision(hostConf provisioner.HostConfigData) (resu var ironicNode *nodes.Node if ironicNode, err = p.findExistingHost(); err != nil { - return result, errors.Wrap(err, "could not find host to receive image") + return transientError(errors.Wrap(err, "could not find host to receive image")) } if ironicNode == nil { - return result, provisioner.NeedsRegistration + return transientError(provisioner.NeedsRegistration) } p.log.Info("provisioning image to host", "state", ironicNode.ProvisionState) @@ -1172,8 +1122,6 @@ func (p *ironicProvisioner) Provision(hostConf provisioner.HostConfigData) (resu "same", ironicHasSameImage, "provisionState", ironicNode.ProvisionState) - result.RequeueAfter = provisionRequeueDelay - // Ironic has the settings it needs, see if it finds any issues // with them. switch nodes.ProvisionState(ironicNode.ProvisionState) { @@ -1188,13 +1136,11 @@ func (p *ironicProvisioner) Provision(hostConf provisioner.HostConfigData) (resu // top of relational databases... if ironicNode.LastError == "" { p.log.Info("failed but error message not available") - result.Dirty = true - return result, nil + return retryAfterDelay(0) } p.log.Info("found error", "msg", ironicNode.LastError) - result.ErrorMessage = fmt.Sprintf("Image provisioning failed: %s", - ironicNode.LastError) - return result, nil + return operationFailed(fmt.Sprintf("Image provisioning failed: %s", + ironicNode.LastError)) } p.log.Info("recovering from previous failure") if provResult, err := p.startProvisioning(ironicNode, hostConf); err != nil || provResult.Dirty || provResult.ErrorMessage != "" { @@ -1220,17 +1166,17 @@ func (p *ironicProvisioner) Provision(hostConf provisioner.HostConfigData) (resu // Retrieve cloud-init user data userData, err := hostConf.UserData() if err != nil { - return result, errors.Wrap(err, "could not retrieve user data") + return transientError(errors.Wrap(err, "could not retrieve user data")) } // Retrieve cloud-init network_data.json. Default value is empty networkDataRaw, err := hostConf.NetworkData() if err != nil { - return result, errors.Wrap(err, "could not retrieve network data") + return transientError(errors.Wrap(err, "could not retrieve network data")) } var networkData map[string]interface{} if err = yaml.Unmarshal([]byte(networkDataRaw), &networkData); err != nil { - return result, errors.Wrap(err, "failed to unmarshal network_data.json from secret") + return transientError(errors.Wrap(err, "failed to unmarshal network_data.json from secret")) } // Retrieve cloud-init meta_data.json with falback to default @@ -1243,11 +1189,11 @@ func (p *ironicProvisioner) Provision(hostConf provisioner.HostConfigData) (resu } metaDataRaw, err := hostConf.MetaData() if err != nil { - return result, errors.Wrap(err, "could not retrieve metadata") + return transientError(errors.Wrap(err, "could not retrieve metadata")) } if metaDataRaw != "" { if err = yaml.Unmarshal([]byte(metaDataRaw), &metaData); err != nil { - return result, errors.Wrap(err, "failed to unmarshal metadata from secret") + return transientError(errors.Wrap(err, "failed to unmarshal metadata from secret")) } } @@ -1259,7 +1205,7 @@ func (p *ironicProvisioner) Provision(hostConf provisioner.HostConfigData) (resu NetworkData: networkData, } if err != nil { - return result, errors.Wrap(err, "failed to build config drive") + return transientError(errors.Wrap(err, "failed to build config drive")) } p.log.Info("triggering provisioning with config drive") } else { @@ -1279,15 +1225,14 @@ func (p *ironicProvisioner) Provision(hostConf provisioner.HostConfigData) (resu p.publisher("ProvisioningComplete", fmt.Sprintf("Image provisioning completed for %s", p.host.Spec.Image.URL)) p.log.Info("finished provisioning") - return result, nil + return operationComplete() default: // wait states like cleaning and clean wait p.log.Info("waiting for host to become available", "state", ironicNode.ProvisionState, "deploy step", ironicNode.DeployStep) - result.Dirty = true - return result, nil + return operationContinuing(provisionRequeueDelay) } } @@ -1307,25 +1252,25 @@ func (p *ironicProvisioner) setMaintenanceFlag(ironicNode *nodes.Node, value boo case nil: case gophercloud.ErrDefault409: p.log.Info("could not set host maintenance flag, busy") + return retryAfterDelay(provisionRequeueDelay) default: - return result, errors.Wrap(err, "failed to set host maintenance flag") + return transientError(errors.Wrap(err, "failed to set host maintenance flag")) } - result.Dirty = true - return result, nil + return operationContinuing(0) } // Deprovision removes the host from the image. It may be called // multiple times, and should return true for its dirty flag until the // deprovisioning operation is completed. -func (p *ironicProvisioner) Deprovision() (result provisioner.Result, err error) { +func (p *ironicProvisioner) Deprovision(force bool) (result provisioner.Result, err error) { p.log.Info("deprovisioning") ironicNode, err := p.findExistingHost() if err != nil { - return result, errors.Wrap(err, "failed to find existing host") + return transientError(errors.Wrap(err, "failed to find existing host")) } if ironicNode == nil { - return result, provisioner.NeedsRegistration + return transientError(provisioner.NeedsRegistration) } p.log.Info("deprovisioning host", @@ -1339,9 +1284,20 @@ func (p *ironicProvisioner) Deprovision() (result provisioner.Result, err error) switch nodes.ProvisionState(ironicNode.ProvisionState) { case nodes.Error: + if !force { + p.log.Info("deprovisioning failed") + if ironicNode.LastError == "" { + result.ErrorMessage = "Deprovisioning failed" + } else { + result.ErrorMessage = ironicNode.LastError + } + return result, nil + } + p.log.Info("retrying deprovisioning") + p.publisher("DeprovisioningStarted", "Image deprovisioning restarted") return p.changeNodeProvisionState( ironicNode, - nodes.ProvisionStateOpts{Target: nodes.TargetManage}, + nodes.ProvisionStateOpts{Target: nodes.TargetDeleted}, ) case nodes.CleanFail: @@ -1350,6 +1306,10 @@ func (p *ironicProvisioner) Deprovision() (result provisioner.Result, err error) p.log.Info("clearing maintenance flag") return p.setMaintenanceFlag(ironicNode, false) } + // This will return us to the manageable state without completing + // cleaning. Because cleaning happens in the process of moving from + // manageable to available, the node will still get cleaned before + // we provision it again. return p.changeNodeProvisionState( ironicNode, nodes.ProvisionStateOpts{Target: nodes.TargetManage}, @@ -1365,29 +1325,23 @@ func (p *ironicProvisioner) Deprovision() (result provisioner.Result, err error) case nodes.Available: p.publisher("DeprovisioningComplete", "Image deprovisioning completed") - return result, nil + return operationComplete() case nodes.Deleting: p.log.Info("deleting") // Transitions to Cleaning upon completion - result.Dirty = true - result.RequeueAfter = deprovisionRequeueDelay - return result, nil + return operationContinuing(deprovisionRequeueDelay) case nodes.Cleaning: p.log.Info("cleaning") // Transitions to Available upon completion - result.Dirty = true - result.RequeueAfter = deprovisionRequeueDelay - return result, nil + return operationContinuing(deprovisionRequeueDelay) case nodes.CleanWait: p.log.Info("cleaning") - result.Dirty = true - result.RequeueAfter = deprovisionRequeueDelay - return result, nil + return operationContinuing(deprovisionRequeueDelay) - case nodes.Active: + case nodes.Active, nodes.DeployFail: p.log.Info("starting deprovisioning") p.publisher("DeprovisioningStarted", "Image deprovisioning started") return p.changeNodeProvisionState( @@ -1396,7 +1350,8 @@ func (p *ironicProvisioner) Deprovision() (result provisioner.Result, err error) ) default: - return result, fmt.Errorf("Unhandled ironic state %s", ironicNode.ProvisionState) + // FIXME(zaneb): this error is unlikely to actually be transient + return transientError(fmt.Errorf("Unhandled ironic state %s", ironicNode.ProvisionState)) } } @@ -1407,11 +1362,11 @@ func (p *ironicProvisioner) Delete() (result provisioner.Result, err error) { ironicNode, err := p.findExistingHost() if err != nil { - return result, errors.Wrap(err, "failed to find existing host") + return transientError(errors.Wrap(err, "failed to find existing host")) } if ironicNode == nil { p.log.Info("no node found, already deleted") - return result, nil + return operationComplete() } p.log.Info("deleting host", @@ -1453,14 +1408,14 @@ func (p *ironicProvisioner) Delete() (result provisioner.Result, err error) { p.log.Info("removed") case gophercloud.ErrDefault409: p.log.Info("could not remove host, busy") + return retryAfterDelay(provisionRequeueDelay) case gophercloud.ErrDefault404: p.log.Info("did not find host to delete, OK") default: - return result, errors.Wrap(err, "failed to remove host") + return transientError(errors.Wrap(err, "failed to remove host")) } - result.Dirty = true - return result, nil + return operationContinuing(0) } func (p *ironicProvisioner) changePower(ironicNode *nodes.Node, target nodes.TargetPowerState) (result provisioner.Result, err error) { @@ -1471,9 +1426,7 @@ func (p *ironicProvisioner) changePower(ironicNode *nodes.Node, target nodes.Tar "state", ironicNode.ProvisionState, "target state", ironicNode.TargetProvisionState, ) - result.Dirty = true - result.RequeueAfter = powerRequeueDelay - return result, nil + return operationContinuing(powerRequeueDelay) } powerStateOpts := nodes.PowerStateOpts{ @@ -1490,12 +1443,11 @@ func (p *ironicProvisioner) changePower(ironicNode *nodes.Node, target nodes.Tar switch changeResult.Err.(type) { case nil: - result.Dirty = true p.log.Info("power change OK") + return operationContinuing(0) case gophercloud.ErrDefault409: p.log.Info("host is locked, trying again after delay", "delay", powerRequeueDelay) - result.Dirty = true - result.RequeueAfter = powerRequeueDelay + result, _ = retryAfterDelay(powerRequeueDelay) return result, HostLockedError{Address: p.host.Spec.BMC.Address} case gophercloud.ErrDefault400: // Error 400 Bad Request means target power state is not supported by vendor driver @@ -1503,10 +1455,8 @@ func (p *ironicProvisioner) changePower(ironicNode *nodes.Node, target nodes.Tar return result, SoftPowerOffUnsupportedError{Address: p.host.Spec.BMC.Address} default: p.log.Info("power change error", "message", changeResult.Err) - return result, errors.Wrap(changeResult.Err, "failed to change power state") + return transientError(errors.Wrap(changeResult.Err, "failed to change power state")) } - - return result, nil } // PowerOn ensures the server is powered on independently of any image @@ -1516,7 +1466,7 @@ func (p *ironicProvisioner) PowerOn() (result provisioner.Result, err error) { ironicNode, err := p.findExistingHost() if err != nil { - return result, errors.Wrap(err, "failed to find existing host") + return transientError(errors.Wrap(err, "failed to find existing host")) } p.log.Info("checking current state", @@ -1526,13 +1476,14 @@ func (p *ironicProvisioner) PowerOn() (result provisioner.Result, err error) { if ironicNode.PowerState != powerOn { if ironicNode.TargetPowerState == powerOn { p.log.Info("waiting for power status to change") - result.RequeueAfter = powerRequeueDelay - result.Dirty = true - return result, nil + return operationContinuing(powerRequeueDelay) } result, err = p.changePower(ironicNode, nodes.PowerOn) - if err != nil { - return result, errors.Wrap(err, "failed to power on host") + switch err.(type) { + case nil: + case HostLockedError: + default: + return transientError(errors.Wrap(err, "failed to power on host")) } p.publisher("PowerOn", "Host powered on") } @@ -1553,12 +1504,9 @@ func (p *ironicProvisioner) PowerOff() (result provisioner.Result, err error) { case SoftPowerOffUnsupportedError, SoftPowerOffFailed: return p.hardPowerOff() case HostLockedError: - result.RequeueAfter = powerRequeueDelay - result.Dirty = true - return result, nil + return retryAfterDelay(powerRequeueDelay) default: - result.RequeueAfter = powerRequeueDelay - return result, err + return transientError(err) } } return result, nil @@ -1570,24 +1518,23 @@ func (p *ironicProvisioner) hardPowerOff() (result provisioner.Result, err error ironicNode, err := p.findExistingHost() if err != nil { - return result, errors.Wrap(err, "failed to find existing host") + return transientError(errors.Wrap(err, "failed to find existing host")) } if ironicNode.PowerState != powerOff { if ironicNode.TargetPowerState == powerOff { p.log.Info("waiting for power status to change") - result.RequeueAfter = powerRequeueDelay - result.Dirty = true - return result, nil + return operationContinuing(powerRequeueDelay) } result, err = p.changePower(ironicNode, nodes.PowerOff) if err != nil { - return result, errors.Wrap(err, "failed to power off host") + return transientError(errors.Wrap(err, "failed to power off host")) } p.publisher("PowerOff", "Host powered off") + return result, err } - return result, nil + return operationComplete() } // softPowerOff sends 'soft power off' request to BM node. @@ -1600,7 +1547,7 @@ func (p *ironicProvisioner) softPowerOff() (result provisioner.Result, err error ironicNode, err := p.findExistingHost() if err != nil { - return result, errors.Wrap(err, "failed to find existing host") + return transientError(errors.Wrap(err, "failed to find existing host")) } if ironicNode.PowerState != powerOff { @@ -1608,9 +1555,7 @@ func (p *ironicProvisioner) softPowerOff() (result provisioner.Result, err error // If the target state is either powerOff or softPowerOff, then we should wait if targetState == powerOff || targetState == softPowerOff { p.log.Info("waiting for power status to change") - result.RequeueAfter = powerRequeueDelay - result.Dirty = true - return result, nil + return operationContinuing(powerRequeueDelay) } // If the target state is unset while the last error is set, // then the last execution of soft power off has failed. @@ -1619,8 +1564,7 @@ func (p *ironicProvisioner) softPowerOff() (result provisioner.Result, err error } result, err = p.changePower(ironicNode, nodes.SoftPowerOff) if err != nil { - result.RequeueAfter = powerRequeueDelay - return result, err + return transientError(err) } p.publisher("PowerOff", "Host soft powered off") } diff --git a/pkg/provisioner/ironic/ironic_test.go b/pkg/provisioner/ironic/ironic_test.go index fbf2cba020..864609a3a6 100644 --- a/pkg/provisioner/ironic/ironic_test.go +++ b/pkg/provisioner/ironic/ironic_test.go @@ -16,10 +16,10 @@ func init() { logf.SetLogger(logf.ZapLogger(true)) } -func makeHost() *metal3v1alpha1.BareMetalHost { +func makeHost() metal3v1alpha1.BareMetalHost { rotational := true - return &metal3v1alpha1.BareMetalHost{ + return metal3v1alpha1.BareMetalHost{ ObjectMeta: metav1.ObjectMeta{ Name: "myhost", Namespace: "myns", diff --git a/pkg/provisioner/ironic/power_test.go b/pkg/provisioner/ironic/power_test.go index 890fd19cf8..8aeda72434 100644 --- a/pkg/provisioner/ironic/power_test.go +++ b/pkg/provisioner/ironic/power_test.go @@ -73,7 +73,6 @@ func TestPowerOn(t *testing.T) { }).WithNodeStatesPower(nodeUUID, http.StatusConflict).WithNodeStatesPowerUpdate(nodeUUID, http.StatusConflict), expectedRequestAfter: 10, expectedDirty: true, - expectedError: true, }, } diff --git a/pkg/provisioner/ironic/provision_test.go b/pkg/provisioner/ironic/provision_test.go index 30156e5fd2..31ec7a5810 100644 --- a/pkg/provisioner/ironic/provision_test.go +++ b/pkg/provisioner/ironic/provision_test.go @@ -22,6 +22,7 @@ func TestProvision(t *testing.T) { ironic *testserver.IronicMock expectedDirty bool expectedError bool + expectedErrorMessage bool expectedRequestAfter int }{ { @@ -31,7 +32,8 @@ func TestProvision(t *testing.T) { UUID: nodeUUID, }), expectedRequestAfter: 0, - expectedDirty: true, + expectedDirty: false, + expectedErrorMessage: true, }, { name: "manageable state", @@ -49,7 +51,8 @@ func TestProvision(t *testing.T) { UUID: nodeUUID, }), expectedRequestAfter: 0, - expectedDirty: true, + expectedDirty: false, + expectedErrorMessage: true, }, { name: "active state", @@ -57,7 +60,7 @@ func TestProvision(t *testing.T) { ProvisionState: string(nodes.Active), UUID: nodeUUID, }), - expectedRequestAfter: 10, + expectedRequestAfter: 0, expectedDirty: false, }, { @@ -99,6 +102,11 @@ func TestProvision(t *testing.T) { assert.Equal(t, tc.expectedDirty, result.Dirty) assert.Equal(t, time.Second*time.Duration(tc.expectedRequestAfter), result.RequeueAfter) + if !tc.expectedErrorMessage { + assert.Equal(t, "", result.ErrorMessage) + } else { + assert.NotEqual(t, "", result.ErrorMessage) + } if !tc.expectedError { assert.NoError(t, err) } else { @@ -116,6 +124,7 @@ func TestDeprovision(t *testing.T) { ironic *testserver.IronicMock expectedDirty bool expectedError bool + expectedErrorMessage bool expectedRequestAfter int }{ { @@ -128,14 +137,22 @@ func TestDeprovision(t *testing.T) { expectedDirty: true, }, { - name: "error state", + name: "deploy failed state", ironic: testserver.NewIronic(t).WithDefaultResponses().Node(nodes.Node{ - ProvisionState: string(nodes.Error), + ProvisionState: string(nodes.DeployFail), UUID: nodeUUID, }), expectedRequestAfter: 10, expectedDirty: true, }, + { + name: "error state", + ironic: testserver.NewIronic(t).WithDefaultResponses().Node(nodes.Node{ + ProvisionState: string(nodes.Error), + UUID: nodeUUID, + }), + expectedErrorMessage: true, + }, { name: "available state", ironic: testserver.NewIronic(t).WithDefaultResponses().Node(nodes.Node{ @@ -215,9 +232,10 @@ func TestDeprovision(t *testing.T) { } prov.status.ID = nodeUUID - result, err := prov.Deprovision() + result, err := prov.Deprovision(false) assert.Equal(t, tc.expectedDirty, result.Dirty) + assert.Equal(t, tc.expectedErrorMessage, result.ErrorMessage != "") assert.Equal(t, time.Second*time.Duration(tc.expectedRequestAfter), result.RequeueAfter) if !tc.expectedError { assert.NoError(t, err) diff --git a/pkg/provisioner/ironic/result.go b/pkg/provisioner/ironic/result.go index 462085d605..cb9057e821 100644 --- a/pkg/provisioner/ironic/result.go +++ b/pkg/provisioner/ironic/result.go @@ -16,10 +16,6 @@ func retryAfterDelay(delay time.Duration) (provisioner.Result, error) { }, nil } -func hostUpdated() (provisioner.Result, error) { - return provisioner.Result{Dirty: true}, nil -} - func operationContinuing(delay time.Duration) (provisioner.Result, error) { return provisioner.Result{ Dirty: true, diff --git a/pkg/provisioner/ironic/updatehardwarestate_test.go b/pkg/provisioner/ironic/updatehardwarestate_test.go index 63944b8137..e4e89b0b80 100644 --- a/pkg/provisioner/ironic/updatehardwarestate_test.go +++ b/pkg/provisioner/ironic/updatehardwarestate_test.go @@ -3,7 +3,6 @@ package ironic import ( "net/http" "testing" - "time" "github.com/gophercloud/gophercloud/openstack/baremetal/v1/nodes" "github.com/stretchr/testify/assert" @@ -24,9 +23,7 @@ func TestUpdateHardwareState(t *testing.T) { hostCurrentlyPowered bool hostName string - expectedDirty bool - expectedRequestAfter int - expectedResultError string + expectUnreadablePower bool expectedPublish string expectedError string @@ -36,6 +33,7 @@ func TestUpdateHardwareState(t *testing.T) { ironic: testserver.NewIronic(t).Ready().Node(nodes.Node{ UUID: nodeUUID, }), + expectUnreadablePower: true, }, { name: "updated-power-on-state", @@ -52,8 +50,6 @@ func TestUpdateHardwareState(t *testing.T) { PowerState: "power on", }), hostCurrentlyPowered: false, - - expectedDirty: true, }, { name: "updated-power-off-state", @@ -70,8 +66,6 @@ func TestUpdateHardwareState(t *testing.T) { PowerState: "power off", }), hostCurrentlyPowered: true, - - expectedDirty: true, }, { name: "no-power", @@ -79,7 +73,8 @@ func TestUpdateHardwareState(t *testing.T) { UUID: nodeUUID, PowerState: "None", }), - hostCurrentlyPowered: true, + hostCurrentlyPowered: true, + expectUnreadablePower: true, }, { name: "node-not-found", @@ -88,6 +83,8 @@ func TestUpdateHardwareState(t *testing.T) { ironic: testserver.NewIronic(t).Ready().NodeError(nodeUUID, http.StatusGatewayTimeout), expectedError: "failed to find existing host: failed to find node by ID 33ce8659-7400-4c68-9535-d10766f07a58: Expected HTTP response code \\[200\\].*", + + expectUnreadablePower: true, }, { name: "node-not-found-by-name", @@ -96,12 +93,16 @@ func TestUpdateHardwareState(t *testing.T) { ironic: testserver.NewIronic(t).Ready().NoNode(nodeUUID).NodeError("myhost", http.StatusGatewayTimeout), expectedError: "failed to find existing host: failed to find node by name worker-0: EOF", + + expectUnreadablePower: true, }, { name: "not-ironic-node", ironic: testserver.NewIronic(t).Ready().NoNode(nodeUUID).NoNode("myhost"), expectedError: "Host not registered", + + expectUnreadablePower: true, }, } @@ -136,11 +137,9 @@ func TestUpdateHardwareState(t *testing.T) { t.Fatalf("could not create provisioner: %s", err) } - result, err := prov.UpdateHardwareState() + hwStatus, err := prov.UpdateHardwareState() - assert.Equal(t, tc.expectedDirty, result.Dirty) - assert.Equal(t, time.Second*time.Duration(tc.expectedRequestAfter), result.RequeueAfter) - assert.Equal(t, tc.expectedResultError, result.ErrorMessage) + assert.Equal(t, tc.expectUnreadablePower, hwStatus.PoweredOn == nil) assert.Equal(t, tc.expectedPublish, publishedMsg) if tc.expectedError == "" { diff --git a/pkg/provisioner/ironic/updateopts_test.go b/pkg/provisioner/ironic/updateopts_test.go index 88714db735..d712064f9e 100644 --- a/pkg/provisioner/ironic/updateopts_test.go +++ b/pkg/provisioner/ironic/updateopts_test.go @@ -83,7 +83,7 @@ func TestGetUpdateOptsForNodeWithRootHints(t *testing.T) { } func TestGetUpdateOptsForNodeVirtual(t *testing.T) { - host := &metal3v1alpha1.BareMetalHost{ + host := metal3v1alpha1.BareMetalHost{ ObjectMeta: metav1.ObjectMeta{ Name: "myhost", Namespace: "myns", @@ -188,7 +188,7 @@ func TestGetUpdateOptsForNodeVirtual(t *testing.T) { } func TestGetUpdateOptsForNodeDell(t *testing.T) { - host := &metal3v1alpha1.BareMetalHost{ + host := metal3v1alpha1.BareMetalHost{ ObjectMeta: metav1.ObjectMeta{ Name: "myhost", Namespace: "myns", diff --git a/pkg/provisioner/ironic/validatemanagementaccess_test.go b/pkg/provisioner/ironic/validatemanagementaccess_test.go index 522b0becd9..378d62ff63 100644 --- a/pkg/provisioner/ironic/validatemanagementaccess_test.go +++ b/pkg/provisioner/ironic/validatemanagementaccess_test.go @@ -33,7 +33,7 @@ func TestValidateManagementAccessNoMAC(t *testing.T) { t.Fatalf("could not create provisioner: %s", err) } - result, err := prov.ValidateManagementAccess(false) + result, _, err := prov.ValidateManagementAccess(false, false) if err != nil { t.Fatalf("error from ValidateManagementAccess: %s", err) } @@ -63,7 +63,7 @@ func TestValidateManagementAccessMACOptional(t *testing.T) { t.Fatalf("could not create provisioner: %s", err) } - result, err := prov.ValidateManagementAccess(false) + result, _, err := prov.ValidateManagementAccess(false, false) if err != nil { t.Fatalf("error from ValidateManagementAccess: %s", err) } @@ -95,13 +95,13 @@ func TestValidateManagementAccessCreateNode(t *testing.T) { t.Fatalf("could not create provisioner: %s", err) } - result, err := prov.ValidateManagementAccess(false) + result, provID, err := prov.ValidateManagementAccess(false, false) if err != nil { t.Fatalf("error from ValidateManagementAccess: %s", err) } assert.Equal(t, "", result.ErrorMessage) assert.NotEqual(t, "", createdNode.UUID) - assert.Equal(t, createdNode.UUID, host.Status.Provisioning.ID) + assert.Equal(t, createdNode.UUID, provID) } func TestValidateManagementAccessExistingNode(t *testing.T) { @@ -130,12 +130,12 @@ func TestValidateManagementAccessExistingNode(t *testing.T) { t.Fatalf("could not create provisioner: %s", err) } - result, err := prov.ValidateManagementAccess(false) + result, provID, err := prov.ValidateManagementAccess(false, false) if err != nil { t.Fatalf("error from ValidateManagementAccess: %s", err) } assert.Equal(t, "", result.ErrorMessage) - assert.Equal(t, "uuid", host.Status.Provisioning.ID) + assert.Equal(t, "uuid", provID) } func TestValidateManagementAccessExistingNodeContinue(t *testing.T) { @@ -193,7 +193,7 @@ func TestValidateManagementAccessExistingNodeContinue(t *testing.T) { t.Fatalf("could not create provisioner: %s", err) } - result, err := prov.ValidateManagementAccess(false) + result, _, err := prov.ValidateManagementAccess(false, false) if err != nil { t.Fatalf("error from ValidateManagementAccess: %s", err) } @@ -238,7 +238,7 @@ func TestValidateManagementAccessExistingNodeWaiting(t *testing.T) { t.Fatalf("could not create provisioner: %s", err) } - result, err := prov.ValidateManagementAccess(false) + result, _, err := prov.ValidateManagementAccess(false, false) if err != nil { t.Fatalf("error from ValidateManagementAccess: %s", err) } @@ -280,12 +280,12 @@ func TestValidateManagementAccessNewCredentials(t *testing.T) { t.Fatalf("could not create provisioner: %s", err) } - result, err := prov.ValidateManagementAccess(true) + result, provID, err := prov.ValidateManagementAccess(true, false) if err != nil { t.Fatalf("error from ValidateManagementAccess: %s", err) } assert.Equal(t, "", result.ErrorMessage) - assert.Equal(t, "uuid", host.Status.Provisioning.ID) + assert.Equal(t, "uuid", provID) updates := ironic.GetLastNodeUpdateRequestFor("uuid") newValues := updates[0].Value.(map[string]interface{}) @@ -327,12 +327,12 @@ func TestValidateManagementAccessLinkExistingIronicNodeByMAC(t *testing.T) { t.Fatalf("could not create provisioner: %s", err) } - result, err := prov.ValidateManagementAccess(false) + result, provID, err := prov.ValidateManagementAccess(false, false) if err != nil { t.Fatalf("error from ValidateManagementAccess: %s", err) } assert.Equal(t, "", result.ErrorMessage) - assert.NotEqual(t, "", host.Status.Provisioning.ID) + assert.NotEqual(t, "", provID) } func TestValidateManagementAccessExistingPortWithWrongUUID(t *testing.T) { @@ -370,7 +370,7 @@ func TestValidateManagementAccessExistingPortWithWrongUUID(t *testing.T) { t.Fatalf("could not create provisioner: %s", err) } - _, err = prov.ValidateManagementAccess(false) + _, _, err = prov.ValidateManagementAccess(false, false) assert.EqualError(t, err, "failed to find existing host: port exists but linked node doesn't random-wrong-id: Resource not found") } @@ -412,7 +412,7 @@ func TestValidateManagementAccessExistingPortButHasName(t *testing.T) { t.Fatalf("could not create provisioner: %s", err) } - _, err = prov.ValidateManagementAccess(false) + _, _, err = prov.ValidateManagementAccess(false, false) assert.EqualError(t, err, "failed to find existing host: node found by MAC but has a name: wrong-name") } @@ -450,10 +450,10 @@ func TestValidateManagementAccessAddTwoHostsWithSameMAC(t *testing.T) { t.Fatalf("could not create provisioner: %s", err) } - result, err := prov.ValidateManagementAccess(false) + result, provID, err := prov.ValidateManagementAccess(false, false) if err != nil { t.Fatalf("error from ValidateManagementAccess: %s", err) } assert.Equal(t, "", result.ErrorMessage) - assert.NotEqual(t, "", host.Status.Provisioning.ID) + assert.NotEqual(t, "", provID) } diff --git a/pkg/provisioner/provisioner.go b/pkg/provisioner/provisioner.go index 8fe303271e..70c9b38027 100644 --- a/pkg/provisioner/provisioner.go +++ b/pkg/provisioner/provisioner.go @@ -17,7 +17,7 @@ Package provisioning defines the API for talking to the provisioning backend. type EventPublisher func(reason, message string) // Factory is the interface for creating new Provisioner objects. -type Factory func(host *metal3v1alpha1.BareMetalHost, bmcCreds bmc.Credentials, publish EventPublisher) (Provisioner, error) +type Factory func(host metal3v1alpha1.BareMetalHost, bmcCreds bmc.Credentials, publish EventPublisher) (Provisioner, error) // HostConfigData retrieves host configuration data type HostConfigData interface { @@ -43,20 +43,19 @@ type Provisioner interface { // of credentials it has are different from the credentials it has // previously been using, without implying that either set of // credentials is correct. - ValidateManagementAccess(credentialsChanged bool) (result Result, err error) + ValidateManagementAccess(credentialsChanged, force bool) (result Result, provID string, err error) // InspectHardware updates the HardwareDetails field of the host with // details of devices discovered on the hardware. It may be called // multiple times, and should return true for its dirty flag until the // inspection is completed. - InspectHardware() (result Result, details *metal3v1alpha1.HardwareDetails, err error) + InspectHardware(force bool) (result Result, details *metal3v1alpha1.HardwareDetails, err error) // UpdateHardwareState fetches the latest hardware state of the // server and updates the HardwareDetails field of the host with // details. It is expected to do this in the least expensive way - // possible, such as reading from a cache, and return dirty only - // if any state information has changed. - UpdateHardwareState() (result Result, err error) + // possible, such as reading from a cache. + UpdateHardwareState() (hwState HardwareState, err error) // Adopt brings an externally-provisioned host under management by // the provisioner. @@ -70,11 +69,11 @@ type Provisioner interface { // Deprovision removes the host from the image. It may be called // multiple times, and should return true for its dirty flag until // the deprovisioning operation is completed. - Deprovision() (result Result, err error) + Deprovision(force bool) (result Result, err error) // Delete removes the host from the provisioning system. It may be // called multiple times, and should return true for its dirty - // flag until the deprovisioning operation is completed. + // flag until the deletion operation is completed. Delete() (result Result, err error) // PowerOn ensures the server is powered on independently of any image @@ -102,4 +101,11 @@ type Result struct { ErrorMessage string } +// HardwareState holds the response from an UpdateHardwareState call +type HardwareState struct { + // PoweredOn is a pointer to a bool indicating whether the Host is currently + // powered on. The value is nil if the power state cannot be determined. + PoweredOn *bool +} + var NeedsRegistration = errors.New("Host not registered")