diff --git a/apis/metal3.io/v1alpha1/baremetalhost_types.go b/apis/metal3.io/v1alpha1/baremetalhost_types.go index c92e92a25a..701037256e 100644 --- a/apis/metal3.io/v1alpha1/baremetalhost_types.go +++ b/apis/metal3.io/v1alpha1/baremetalhost_types.go @@ -208,6 +208,10 @@ const ( // learn about the hardware components available there StateInspecting ProvisioningState = "inspecting" + // StatePoweringOffBeforeDelete means we are in the process of + // powering off the node before it's deleted. + StatePoweringOffBeforeDelete ProvisioningState = "powering off before delete" + // StateDeleting means we are in the process of cleaning up the host // ready for deletion StateDeleting ProvisioningState = "deleting" diff --git a/controllers/metal3.io/baremetalhost_controller.go b/controllers/metal3.io/baremetalhost_controller.go index 734ab1fbd5..10b127dddc 100644 --- a/controllers/metal3.io/baremetalhost_controller.go +++ b/controllers/metal3.io/baremetalhost_controller.go @@ -485,6 +485,31 @@ func setErrorMessage(host *metal3api.BareMetalHost, errType metal3api.ErrorType, host.Status.ErrorCount++ } +func (r *BareMetalHostReconciler) actionPowerOffBeforeDeleting(prov provisioner.Provisioner, info *reconcileInfo) actionResult { + info.log.Info("host ready to be powered off") + provResult, err := prov.PowerOff( + metal3api.RebootModeHard, + info.host.Status.ErrorType == metal3api.PowerManagementError) + + if err != nil { + return actionError{errors.Wrap(err, "failed to power off before deleting node")} + } + + if provResult.ErrorMessage != "" { + return recordActionFailure(info, metal3api.PowerManagementError, provResult.ErrorMessage) + } + + if provResult.Dirty { + result := actionContinue{provResult.RequeueAfter} + if clearError(info.host) { + return actionUpdate{result} + } + return result + } + + return actionComplete{} +} + // Manage deletion of the host func (r *BareMetalHostReconciler) actionDeleting(prov provisioner.Provisioner, info *reconcileInfo) actionResult { 
info.log.Info( diff --git a/controllers/metal3.io/host_state_machine.go b/controllers/metal3.io/host_state_machine.go index d688c165da..9c383cae23 100644 --- a/controllers/metal3.io/host_state_machine.go +++ b/controllers/metal3.io/host_state_machine.go @@ -41,19 +41,20 @@ type stateHandler func(*reconcileInfo) actionResult func (hsm *hostStateMachine) handlers() map[metal3api.ProvisioningState]stateHandler { return map[metal3api.ProvisioningState]stateHandler{ - metal3api.StateNone: hsm.handleNone, - metal3api.StateUnmanaged: hsm.handleUnmanaged, - metal3api.StateRegistering: hsm.handleRegistering, - metal3api.StateInspecting: hsm.handleInspecting, - metal3api.StateExternallyProvisioned: hsm.handleExternallyProvisioned, - metal3api.StateMatchProfile: hsm.handleMatchProfile, // Backward compatibility, remove eventually - metal3api.StatePreparing: hsm.handlePreparing, - metal3api.StateAvailable: hsm.handleAvailable, - metal3api.StateReady: hsm.handleAvailable, - metal3api.StateProvisioning: hsm.handleProvisioning, - metal3api.StateProvisioned: hsm.handleProvisioned, - metal3api.StateDeprovisioning: hsm.handleDeprovisioning, - metal3api.StateDeleting: hsm.handleDeleting, + metal3api.StateNone: hsm.handleNone, + metal3api.StateUnmanaged: hsm.handleUnmanaged, + metal3api.StateRegistering: hsm.handleRegistering, + metal3api.StateInspecting: hsm.handleInspecting, + metal3api.StateExternallyProvisioned: hsm.handleExternallyProvisioned, + metal3api.StateMatchProfile: hsm.handleMatchProfile, // Backward compatibility, remove eventually + metal3api.StatePreparing: hsm.handlePreparing, + metal3api.StateAvailable: hsm.handleAvailable, + metal3api.StateReady: hsm.handleAvailable, + metal3api.StateProvisioning: hsm.handleProvisioning, + metal3api.StateProvisioned: hsm.handleProvisioned, + metal3api.StateDeprovisioning: hsm.handleDeprovisioning, + metal3api.StatePoweringOffBeforeDelete: hsm.handlePoweringOffBeforeDelete, + metal3api.StateDeleting: hsm.handleDeleting, } } @@ 
-223,7 +224,7 @@ func (hsm *hostStateMachine) checkInitiateDelete(log logr.Logger) bool { switch hsm.NextState { default: - hsm.NextState = metal3api.StateDeleting + hsm.NextState = metal3api.StatePoweringOffBeforeDelete case metal3api.StateProvisioning, metal3api.StateProvisioned: if hsm.Host.OperationalStatus() == metal3api.OperationalStatusDetached { if delayDeleteForDetachedHost(hsm.Host) { @@ -231,6 +232,7 @@ func (hsm *hostStateMachine) checkInitiateDelete(log logr.Logger) bool { deleteDelayedForDetached.Inc() return false } + // We cannot power off a detached host. Skip to delete. hsm.NextState = metal3api.StateDeleting } else { hsm.NextState = metal3api.StateDeprovisioning @@ -241,6 +243,9 @@ func (hsm *hostStateMachine) checkInitiateDelete(log logr.Logger) bool { case metal3api.StateDeleting: // Already in deleting state. Allow state machine to run. return false + case metal3api.StatePoweringOffBeforeDelete: + // Already in powering off state. Allow state machine to run. + return false } return true } @@ -322,7 +327,7 @@ func (hsm *hostStateMachine) ensureRegistered(info *reconcileInfo) (result actio case metal3api.StateMatchProfile: // Backward compatibility, remove eventually return - case metal3api.StateDeleting: + case metal3api.StateDeleting, metal3api.StatePoweringOffBeforeDelete: // In the deleting state the whole idea is to de-register the host return case metal3api.StateRegistering: @@ -561,6 +566,37 @@ func (hsm *hostStateMachine) handleDeprovisioning(info *reconcileInfo) actionRes return actResult } +func (hsm *hostStateMachine) handlePoweringOffBeforeDelete(info *reconcileInfo) actionResult { + actResult := hsm.Reconciler.actionPowerOffBeforeDeleting(hsm.Provisioner, info) + skipToDelete := func() actionResult { + hsm.NextState = metal3api.StateDeleting + info.postSaveCallbacks = append(info.postSaveCallbacks, deleteWithoutPowerOff.Inc) + return actionComplete{} + } + + switch r := actResult.(type) { + case actionComplete: + hsm.NextState = 
metal3api.StateDeleting + hsm.Host.Status.ErrorCount = 0 + hsm.Host.Status.PoweredOn = false + case actionFailed: + // If the provisioner gives up powering off and + // deletion has been requested, continue to delete. + if hsm.Host.Status.ErrorCount > 3 { + info.log.Info("Giving up on host power off after 3 attempts.") + return skipToDelete() + } + case actionError: + if r.NeedsRegistration() && !hsm.haveCreds { + // If the host is not registered as a node in Ironic and we + // lack the credentials to power it off, just continue to + // delete. + return skipToDelete() + } + } + return actResult +} + func (hsm *hostStateMachine) handleDeleting(info *reconcileInfo) actionResult { return hsm.Reconciler.actionDeleting(hsm.Provisioner, info) } diff --git a/controllers/metal3.io/metrics.go b/controllers/metal3.io/metrics.go index 1828d1a9b6..b926a9796c 100644 --- a/controllers/metal3.io/metrics.go +++ b/controllers/metal3.io/metrics.go @@ -118,6 +118,11 @@ var deleteWithoutDeprov = prometheus.NewCounter(prometheus.CounterOpts{ Help: "Number of times a host is deleted despite deprovisioning failing", }) +var deleteWithoutPowerOff = prometheus.NewCounter(prometheus.CounterOpts{ + Name: "metal3_delete_without_powering_off_total", + Help: "Number of times a host is deleted despite powering off failing", +}) + var provisionerNotReady = prometheus.NewCounter(prometheus.CounterOpts{ Name: "metal3_provisioner_not_ready_total", Help: "Number of times a host is not provision ready",