Skip to content
32 changes: 18 additions & 14 deletions apis/metal3.io/v1alpha1/baremetalhost_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -849,27 +849,31 @@ func (host *BareMetalHost) OperationMetricForState(operation ProvisioningState)

// GetImageChecksum returns the hash value and its algo.
func (host *BareMetalHost) GetImageChecksum() (string, string, bool) {
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would probably go ahead and remove this method, but that's not a reason to hold up this PR.

if host.Spec.Image == nil {
return "", "", false
}
return host.Spec.Image.GetChecksum()
}

checksum := host.Spec.Image.Checksum
checksumType := host.Spec.Image.ChecksumType
func (image *Image) GetChecksum() (checksum, checksumType string, ok bool) {
if image == nil {
return
}

if checksum == "" {
if image.Checksum == "" {
// Return empty if checksum is not provided
return "", "", false
return
}
if checksumType == "" {
// If only checksum is specified. Assume type is md5
return checksum, string(MD5), true
}
switch checksumType {

switch image.ChecksumType {
case "":
checksumType = string(MD5)
case MD5, SHA256, SHA512:
return checksum, string(checksumType), true
checksumType = string(image.ChecksumType)
default:
return "", "", false
return
}

checksum = image.Checksum
ok = true
return
}

// +kubebuilder:object:root=true
Expand Down
6 changes: 6 additions & 0 deletions controllers/metal3.io/action_result.go
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
package controllers

import (
"errors"
"math"
"math/rand"
"time"

"sigs.k8s.io/controller-runtime/pkg/reconcile"

metal3 "github.com/metal3-io/baremetal-operator/apis/metal3.io/v1alpha1"
"github.com/metal3-io/baremetal-operator/pkg/provisioner"
)

const maxBackOffCount = 10
Expand Down Expand Up @@ -96,6 +98,10 @@ func (r actionError) Dirty() bool {
return false
}

func (r actionError) NeedsRegistration() bool {
return errors.Is(r.err, provisioner.NeedsRegistration)
}

// actionFailed is a result indicating that the current action has failed,
// and that the resource should be marked as in error.
type actionFailed struct {
Expand Down
61 changes: 28 additions & 33 deletions controllers/metal3.io/baremetalhost_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,7 @@ func (r *BareMetalHostReconciler) Reconcile(request ctrl.Request) (result ctrl.R
// management controller.
var bmcCreds *bmc.Credentials
var bmcCredsSecret *corev1.Secret
haveCreds := false
switch host.Status.Provisioning.State {
case metal3v1alpha1.StateNone, metal3v1alpha1.StateUnmanaged:
bmcCreds = &bmc.Credentials{}
Expand All @@ -194,6 +195,8 @@ func (r *BareMetalHostReconciler) Reconcile(request ctrl.Request) (result ctrl.R
} else {
return r.credentialsErrorResult(err, request, host)
}
} else {
haveCreds = true
}
}

Expand All @@ -219,7 +222,7 @@ func (r *BareMetalHostReconciler) Reconcile(request ctrl.Request) (result ctrl.R
return ctrl.Result{Requeue: true, RequeueAfter: provisionerNotReadyRetryDelay}, nil
}

stateMachine := newHostStateMachine(host, r, prov)
stateMachine := newHostStateMachine(host, r, prov, haveCreds)
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: just to avoid passing another param to the host machine, couldn't this check be done via the Provisioner interface (verifying that bmcCreds is not empty)?

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We don't know what type of Provisioner we have, so we don't know where (or even if) that data is stored. Also I don't want to have to look at the contents of the creds (which are not passed as a pointer) to see if they're empty or not.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

bmCreds are already passed to the ProvisionerFactory, a plain wrapper having the same check in the Provisioner interface could avoid to store it directly in the state machine. But it's a minor point.

actResult := stateMachine.ReconcileState(info)
result, err = actResult.Result()

Expand Down Expand Up @@ -428,15 +431,13 @@ func (r *BareMetalHostReconciler) actionRegistering(prov provisioner.Provisioner
// so clear any previous error and record the success in the
// status block.
info.log.Info("updating credentials success status fields")
registeredNewCreds := !info.host.Status.GoodCredentials.Match(*info.bmcCredsSecret)
info.host.UpdateGoodCredentials(*info.bmcCredsSecret)
info.log.Info("clearing previous error message")
info.host.ClearError()

info.publishEvent("BMCAccessValidated", "Verified access to BMC")

if info.host.Spec.ExternallyProvisioned {
info.publishEvent("ExternallyProvisioned",
"Registered host that was externally provisioned")
if registeredNewCreds {
info.publishEvent("BMCAccessValidated", "Verified access to BMC")
}

return actionComplete{}
Expand Down Expand Up @@ -562,9 +563,23 @@ func clearHostProvisioningSettings(host *metal3v1alpha1.BareMetalHost) {
}

func (r *BareMetalHostReconciler) actionDeprovisioning(prov provisioner.Provisioner, info *reconcileInfo) actionResult {
// Adopt the host in case it has been re-registered during the
// deprovisioning process before it completed
provResult, err := prov.Adopt(info.host.Status.ErrorType == metal3v1alpha1.RegistrationError)
if err != nil {
return actionError{err}
}
if provResult.ErrorMessage != "" {
return recordActionFailure(info, metal3v1alpha1.RegistrationError, provResult.ErrorMessage)
}
if provResult.Dirty {
info.host.ClearError()
return actionContinue{provResult.RequeueAfter}
}

info.log.Info("deprovisioning")

provResult, err := prov.Deprovision()
provResult, err = prov.Deprovision()
if err != nil {
return actionError{errors.Wrap(err, "failed to deprovision")}
}
Expand Down Expand Up @@ -679,14 +694,12 @@ func (r *BareMetalHostReconciler) manageHostPower(prov provisioner.Provisioner,
return steadyStateResult
}

// A host reaching this action handler should be provisioned or
// externally provisioned -- a state that it will stay in until the
// user takes further action. Both of those states mean that it has
// been registered with the provisioner once, so we use the Adopt()
// API to ensure that is still true. Then we monitor its power status.
// A host reaching this action handler should be provisioned or externally
// provisioned -- a state that it will stay in until the user takes further
// action. We use the Adopt() API to make sure that the provisioner is aware of
// the provisioning details. Then we monitor its power status.
func (r *BareMetalHostReconciler) actionManageSteadyState(prov provisioner.Provisioner, info *reconcileInfo) actionResult {

provResult, err := prov.Adopt()
provResult, err := prov.Adopt(info.host.Status.ErrorType == metal3v1alpha1.RegistrationError)
if err != nil {
return actionError{err}
}
Expand All @@ -702,28 +715,10 @@ func (r *BareMetalHostReconciler) actionManageSteadyState(prov provisioner.Provi
}

// A host reaching this action handler should be ready -- a state that
// it will stay in until the user takes further action. It has been
// registered with the provisioner once, so we use
// ValidateManagementAccess() to ensure that is still true. We don't
// it will stay in until the user takes further action. We don't
// use Adopt() because we don't want Ironic to treat the host as
// having been provisioned. Then we monitor its power status.
func (r *BareMetalHostReconciler) actionManageReady(prov provisioner.Provisioner, info *reconcileInfo) actionResult {

// We always pass false for credentialsChanged because if they had
// changed we would have ended up in actionRegister() instead of
// here.
provResult, err := prov.ValidateManagementAccess(false)
if err != nil {
return actionError{err}
}
if provResult.ErrorMessage != "" {
return recordActionFailure(info, metal3v1alpha1.RegistrationError, provResult.ErrorMessage)
}
if provResult.Dirty {
info.host.ClearError()
return actionContinue{provResult.RequeueAfter}
}

if info.host.NeedsProvisioning() {
// Ensure the provisioning settings we're going to use are stored.
dirty, err := saveHostProvisioningSettings(info.host)
Expand Down
97 changes: 59 additions & 38 deletions controllers/metal3.io/host_state_machine.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,17 +16,20 @@ type hostStateMachine struct {
NextState metal3v1alpha1.ProvisioningState
Reconciler *BareMetalHostReconciler
Provisioner provisioner.Provisioner
haveCreds bool
}

func newHostStateMachine(host *metal3v1alpha1.BareMetalHost,
reconciler *BareMetalHostReconciler,
provisioner provisioner.Provisioner) *hostStateMachine {
provisioner provisioner.Provisioner,
haveCreds bool) *hostStateMachine {
currentState := host.Status.Provisioning.State
r := hostStateMachine{
Host: host,
NextState: currentState, // Remain in current state by default
Reconciler: reconciler,
Provisioner: provisioner,
haveCreds: haveCreds,
}
return &r
}
Expand Down Expand Up @@ -62,7 +65,7 @@ func recordStateBegin(host *metal3v1alpha1.BareMetalHost, state metal3v1alpha1.P

func recordStateEnd(info *reconcileInfo, host *metal3v1alpha1.BareMetalHost, state metal3v1alpha1.ProvisioningState, time metav1.Time) {
if prevMetric := host.OperationMetricForState(state); prevMetric != nil {
if !prevMetric.Start.IsZero() {
if !prevMetric.Start.IsZero() && prevMetric.End.IsZero() {
prevMetric.End = time
info.postSaveCallbacks = append(info.postSaveCallbacks, func() {
observer := stateTime[state].With(hostMetricLabels(info.request))
Expand Down Expand Up @@ -160,40 +163,48 @@ func (hsm *hostStateMachine) checkInitiateDelete() bool {
}

func (hsm *hostStateMachine) ensureRegistered(info *reconcileInfo) (result actionResult) {
if !hsm.haveCreds {
// If we are in the process of deletion (which may start with
// deprovisioning) and we have been unable to obtain any credentials,
// don't attempt to re-register the Host as this will always fail.
return
}

needsReregister := false

switch hsm.NextState {
case metal3v1alpha1.StateNone, metal3v1alpha1.StateUnmanaged:
// We haven't yet reached the Registration state, so don't attempt
// to register the Host.
return
}
if !hsm.Host.DeletionTimestamp.IsZero() {
// BUG(zaneb) We currently don't attempt to re-register the Host
// if we find it missing once a delete has been requested (in
// part because we are also willing to pass empty credentials
// after this point), but this means that we may not be able to
// deprovision a Host if the Ironic database pod is rescheduled
// during deprovisioning, or if the credentials need to be
// modified.
return
}

if hsm.Host.Status.GoodCredentials.Match(*info.bmcCredsSecret) {
// Credentials are unchanged since we verified them.
case metal3v1alpha1.StateDeleting:
// In the deleting state the whole idea is to de-register the host
return
}

recordStateBegin(hsm.Host, metal3v1alpha1.StateRegistering, metav1.Now())
if hsm.Host.Status.ErrorType == metal3v1alpha1.RegistrationError {
if hsm.Host.Status.TriedCredentials.Match(*info.bmcCredsSecret) {
// Already tried with these credentials; no point retrying
info.log.Info("Unmodified credentials; not retrying")
return actionFailed{ErrorType: metal3v1alpha1.RegistrationError}
case metal3v1alpha1.StateRegistering:
default:
needsReregister = (hsm.Host.Status.ErrorType == metal3v1alpha1.RegistrationError ||
!hsm.Host.Status.GoodCredentials.Match(*info.bmcCredsSecret))
if needsReregister {
info.log.Info("Retrying registration")
recordStateBegin(hsm.Host, metal3v1alpha1.StateRegistering, metav1.Now())
}
info.log.Info("Modified credentials detected; will retry registration")
}

result = hsm.Reconciler.actionRegistering(hsm.Provisioner, info)
if _, complete := result.(actionComplete); complete && hsm.Host.Status.Provisioning.State != metal3v1alpha1.StateRegistering {
recordStateEnd(info, hsm.Host, metal3v1alpha1.StateRegistering, metav1.Now())
if _, complete := result.(actionComplete); complete {
if hsm.NextState != metal3v1alpha1.StateRegistering {
recordStateEnd(info, hsm.Host, metal3v1alpha1.StateRegistering, metav1.Now())
}
if needsReregister {
// Host was re-registered, so requeue and run the state machine on
// the next reconcile
result = actionContinue{}
} else {
// Allow the state machine to run, either because we were just
// reconfirming an existing registration, or because we are in the
// Registering state
result = nil
}
}
return
}
Expand Down Expand Up @@ -224,7 +235,6 @@ func (hsm *hostStateMachine) handleRegistering(info *reconcileInfo) actionResult
// registered using the current BMC credentials, so we can move to the
// next state. We will not return to the Registering state, even
// if the credentials change and the Host must be re-registered.
hsm.Host.ClearError()
if hsm.Host.Spec.ExternallyProvisioned {
hsm.NextState = metal3v1alpha1.StateExternallyProvisioned
} else {
Expand Down Expand Up @@ -323,23 +333,34 @@ func (hsm *hostStateMachine) handleProvisioned(info *reconcileInfo) actionResult
func (hsm *hostStateMachine) handleDeprovisioning(info *reconcileInfo) actionResult {
actResult := hsm.Reconciler.actionDeprovisioning(hsm.Provisioner, info)

switch actResult.(type) {
case actionComplete:
if !hsm.Host.DeletionTimestamp.IsZero() {
hsm.NextState = metal3v1alpha1.StateDeleting
} else {
if hsm.Host.DeletionTimestamp.IsZero() {
if _, complete := actResult.(actionComplete); complete {
hsm.NextState = metal3v1alpha1.StateReady
}
case actionFailed:
if !hsm.Host.DeletionTimestamp.IsZero() {
} else {
skipToDelete := func() actionResult {
hsm.NextState = metal3v1alpha1.StateDeleting
info.postSaveCallbacks = append(info.postSaveCallbacks, deleteWithoutDeprov.Inc)
return actionComplete{}
}

switch r := actResult.(type) {
case actionComplete:
hsm.NextState = metal3v1alpha1.StateDeleting
case actionFailed:
// If the provisioner gives up deprovisioning and
// deletion has been requested, continue to delete.
// Note that this is entirely theoretical, as the
// Ironic provisioner currently never gives up
// trying to deprovision.
hsm.NextState = metal3v1alpha1.StateDeleting
info.postSaveCallbacks = append(info.postSaveCallbacks, deleteWithoutDeprov.Inc)
actResult = actionComplete{}
return skipToDelete()
case actionError:
if r.NeedsRegistration() && !hsm.haveCreds {
// If the host is not registered as a node in Ironic and we
// lack the credentials to deprovision it, just continue to
// delete.
return skipToDelete()
}
}
}
return actResult
Expand Down
10 changes: 5 additions & 5 deletions controllers/metal3.io/host_state_machine_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ func testStateMachine(host *metal3v1alpha1.BareMetalHost) *hostStateMachine {
r := newTestReconciler()
p, _ := r.ProvisionerFactory(host, bmc.Credentials{},
func(reason, message string) {})
return newHostStateMachine(host, r, p)
return newHostStateMachine(host, r, p, true)
}

func TestProvisioningCancelled(t *testing.T) {
Expand Down Expand Up @@ -205,7 +205,7 @@ func TestErrorCountIncreasedWhenProvisionerFails(t *testing.T) {
for _, tt := range tests {
t.Run(tt.Scenario, func(t *testing.T) {
prov := &mockProvisioner{}
hsm := newHostStateMachine(tt.Host, &BareMetalHostReconciler{}, prov)
hsm := newHostStateMachine(tt.Host, &BareMetalHostReconciler{}, prov, true)
info := makeDefaultReconcileInfo(tt.Host)

prov.setNextError("some error")
Expand All @@ -220,7 +220,7 @@ func TestErrorCountIncreasedWhenProvisionerFails(t *testing.T) {
func TestErrorCountIncreasedWhenRegistrationFails(t *testing.T) {
bmh := host(metal3v1alpha1.StateRegistering).build()
prov := &mockProvisioner{}
hsm := newHostStateMachine(bmh, &BareMetalHostReconciler{}, prov)
hsm := newHostStateMachine(bmh, &BareMetalHostReconciler{}, prov, true)
info := makeDefaultReconcileInfo(bmh)
bmh.Status.GoodCredentials = metal3v1alpha1.CredentialsStatus{}

Expand Down Expand Up @@ -265,7 +265,7 @@ func TestErrorCountCleared(t *testing.T) {
for _, tt := range tests {
t.Run(tt.Scenario, func(t *testing.T) {
prov := &mockProvisioner{}
hsm := newHostStateMachine(tt.Host, &BareMetalHostReconciler{}, prov)
hsm := newHostStateMachine(tt.Host, &BareMetalHostReconciler{}, prov, true)
info := makeDefaultReconcileInfo(tt.Host)

info.host.Status.ErrorCount = 1
Expand Down Expand Up @@ -367,7 +367,7 @@ func (m *mockProvisioner) UpdateHardwareState() (result provisioner.Result, err
return m.nextResult, err
}

func (m *mockProvisioner) Adopt() (result provisioner.Result, err error) {
func (m *mockProvisioner) Adopt(force bool) (result provisioner.Result, err error) {
return m.nextResult, err
}

Expand Down
Loading