From 562b6ac58e8be1baa5e968beed5d9912a3f8f153 Mon Sep 17 00:00:00 2001 From: Zane Bitter Date: Thu, 10 Dec 2020 21:40:48 -0500 Subject: [PATCH 1/6] Clear ErrorCount independently of ErrorType/Message When a call to the provisioner succeeds rather than returning an error message, that's a good sign, and a reason to not have an error message set in the Host object. But it doesn't guarantee success: if the previous failure came at the end of some long-running operation (rather than outright rejection at the beginning), it could yet fail in exactly the same way. Clearing the ErrorType as soon as we start an operation allows us to use that field to determine whether to force the provisioner to retry in the presence of an error. (It will only be set if the last thing we did was record an error, therefore if we see it then we are at the beginning of a new retry.) One known issue with this is that if a request to change state in Ironic results in a 409 Conflict response, this just sets the Dirty flag and is indistinguishable from success when viewed outside the Ironic provisioner. If such a conflict occurs, we will effectively skip a retry attempt, increment the ErrorCount again, and sleep for even longer before the next attempt. To ensure that we actually do exponential backoff between retries, leave the ErrorCount unchanged until the whole operation actually succeeds (Dirty is no longer true). This is now decoupled from the ClearError() method that clears the ErrorType and ErrorMessage. As much as possible, do the clearing of ErrorCount in the host state machine. The exception is the steady states where we only do power management - the actionResult types currently lack enough detail to distinguish when count should be cleared when viewed from the state machine. In power management states (Ready/Available, Provisioned, Externally Provisioned), count the number of errors of any type since the power state was last successfully changed. Otherwise, the ErrorCount is cleared when an operation succesfully completes. Successful re-registration or adoption is never sufficient to clear the error count, except in the registration state. --- .../metal3.io/baremetalhost_controller.go | 5 +--- .../baremetalhost_controller_test.go | 24 ----------------- controllers/metal3.io/host_state_machine.go | 10 ++++++- .../metal3.io/host_state_machine_test.go | 26 ++++++++++++++----- 4 files changed, 29 insertions(+), 36 deletions(-) diff --git a/controllers/metal3.io/baremetalhost_controller.go b/controllers/metal3.io/baremetalhost_controller.go index 73fd61b9d6..110e0ada41 100644 --- a/controllers/metal3.io/baremetalhost_controller.go +++ b/controllers/metal3.io/baremetalhost_controller.go @@ -370,10 +370,6 @@ func clearError(host *metal3v1alpha1.BareMetalHost) (dirty bool) { host.Status.ErrorMessage = "" dirty = true } - if host.Status.ErrorCount != 0 { - host.Status.ErrorCount = 0 - dirty = true - } return dirty } @@ -720,6 +716,7 @@ func (r *BareMetalHostReconciler) manageHostPower(prov provisioner.Provisioner, // state and there were no errors, so reflect the new state in the // host status field. info.host.Status.PoweredOn = info.host.Spec.Online + info.host.Status.ErrorCount = 0 return steadyStateResult } diff --git a/controllers/metal3.io/baremetalhost_controller_test.go b/controllers/metal3.io/baremetalhost_controller_test.go index 03df5bbc4e..3c02a2b966 100644 --- a/controllers/metal3.io/baremetalhost_controller_test.go +++ b/controllers/metal3.io/baremetalhost_controller_test.go @@ -1310,27 +1310,3 @@ func TestErrorCountIncrementsAlways(t *testing.T) { setErrorMessage(b, metal3v1alpha1.InspectionError, "Another error message") assert.Equal(t, b.Status.ErrorCount, 2) } - -func TestClearErrorCount(t *testing.T) { - - b := &metal3v1alpha1.BareMetalHost{ - Status: metal3v1alpha1.BareMetalHostStatus{ - ErrorCount: 5, - }, - } - - assert.True(t, clearError(b)) - assert.Equal(t, 0, b.Status.ErrorCount) -} - -func TestClearErrorCountOnlyIfNotZero(t *testing.T) { - - b := &metal3v1alpha1.BareMetalHost{ - Status: metal3v1alpha1.BareMetalHostStatus{ - ErrorCount: 5, - }, - } - - assert.True(t, clearError(b)) - assert.False(t, clearError(b)) -} diff --git a/controllers/metal3.io/host_state_machine.go b/controllers/metal3.io/host_state_machine.go index 9954a38b9e..b8e0dc70b1 100644 --- a/controllers/metal3.io/host_state_machine.go +++ b/controllers/metal3.io/host_state_machine.go @@ -240,6 +240,7 @@ func (hsm *hostStateMachine) handleRegistering(info *reconcileInfo) actionResult } else { hsm.NextState = metal3v1alpha1.StateInspecting } + hsm.Host.Status.ErrorCount = 0 return actionComplete{} } @@ -247,6 +248,7 @@ func (hsm *hostStateMachine) handleInspecting(info *reconcileInfo) actionResult actResult := hsm.Reconciler.actionInspecting(hsm.Provisioner, info) if _, complete := actResult.(actionComplete); complete { hsm.NextState = metal3v1alpha1.StateMatchProfile + hsm.Host.Status.ErrorCount = 0 } return actResult } @@ -255,12 +257,14 @@ func (hsm *hostStateMachine) handleMatchProfile(info *reconcileInfo) actionResul actResult := hsm.Reconciler.actionMatchProfile(hsm.Provisioner, info) if _, complete := actResult.(actionComplete); complete { hsm.NextState = metal3v1alpha1.StateReady + hsm.Host.Status.ErrorCount = 0 } return actResult } func (hsm *hostStateMachine) handleExternallyProvisioned(info *reconcileInfo) actionResult { if hsm.Host.Spec.ExternallyProvisioned { + // ErrorCount is cleared when appropriate inside actionManageSteadyState return hsm.Reconciler.actionManageSteadyState(hsm.Provisioner, info) } @@ -281,8 +285,8 @@ func (hsm *hostStateMachine) handleReady(info *reconcileInfo) actionResult { return actionComplete{} } + // ErrorCount is cleared when appropriate inside actionManageReady actResult := hsm.Reconciler.actionManageReady(hsm.Provisioner, info) - if _, complete := actResult.(actionComplete); complete { hsm.NextState = metal3v1alpha1.StateProvisioning } @@ -317,6 +321,7 @@ func (hsm *hostStateMachine) handleProvisioning(info *reconcileInfo) actionResul actResult := hsm.Reconciler.actionProvisioning(hsm.Provisioner, info) if _, complete := actResult.(actionComplete); complete { hsm.NextState = metal3v1alpha1.StateProvisioned + hsm.Host.Status.ErrorCount = 0 } return actResult } @@ -327,6 +332,7 @@ func (hsm *hostStateMachine) handleProvisioned(info *reconcileInfo) actionResult return actionComplete{} } + // ErrorCount is cleared when appropriate inside actionManageSteadyState return hsm.Reconciler.actionManageSteadyState(hsm.Provisioner, info) } @@ -336,6 +342,7 @@ func (hsm *hostStateMachine) handleDeprovisioning(info *reconcileInfo) actionRes if hsm.Host.DeletionTimestamp.IsZero() { if _, complete := actResult.(actionComplete); complete { hsm.NextState = metal3v1alpha1.StateReady + hsm.Host.Status.ErrorCount = 0 } } else { skipToDelete := func() actionResult { @@ -347,6 +354,7 @@ func (hsm *hostStateMachine) handleDeprovisioning(info *reconcileInfo) actionRes switch r := actResult.(type) { case actionComplete: hsm.NextState = metal3v1alpha1.StateDeleting + hsm.Host.Status.ErrorCount = 0 case actionFailed: // If the provisioner gives up deprovisioning and // deletion has been requested, continue to delete. diff --git a/controllers/metal3.io/host_state_machine_test.go b/controllers/metal3.io/host_state_machine_test.go index 7f717658e7..270511075f 100644 --- a/controllers/metal3.io/host_state_machine_test.go +++ b/controllers/metal3.io/host_state_machine_test.go @@ -234,8 +234,9 @@ func TestErrorCountIncreasedWhenRegistrationFails(t *testing.T) { func TestErrorCountCleared(t *testing.T) { tests := []struct { - Scenario string - Host *metal3v1alpha1.BareMetalHost + Scenario string + Host *metal3v1alpha1.BareMetalHost + PreserveErrorCountOnComplete bool }{ { Scenario: "registering", @@ -246,8 +247,9 @@ func TestErrorCountCleared(t *testing.T) { Host: host(metal3v1alpha1.StateInspecting).build(), }, { - Scenario: "ready", - Host: host(metal3v1alpha1.StateReady).build(), + Scenario: "ready", + Host: host(metal3v1alpha1.StateReady).build(), + PreserveErrorCountOnComplete: true, }, { Scenario: "deprovisioning", @@ -258,8 +260,9 @@ func TestErrorCountCleared(t *testing.T) { Host: host(metal3v1alpha1.StateProvisioning).SetImageURL("imageSpecUrl").build(), }, { - Scenario: "externallyProvisioned", - Host: host(metal3v1alpha1.StateExternallyProvisioned).SetExternallyProvisioned().build(), + Scenario: "externallyProvisioned", + Host: host(metal3v1alpha1.StateExternallyProvisioned).SetExternallyProvisioned().build(), + PreserveErrorCountOnComplete: true, }, } for _, tt := range tests { @@ -272,8 +275,16 @@ func TestErrorCountCleared(t *testing.T) { prov.setNextResult(true) result := hsm.ReconcileState(info) - assert.Equal(t, tt.Host.Status.ErrorCount, 0) + assert.Equal(t, 1, tt.Host.Status.ErrorCount) assert.True(t, result.Dirty()) + + prov.setNextResult(false) + hsm.ReconcileState(info) + if tt.PreserveErrorCountOnComplete { + assert.Equal(t, 1, tt.Host.Status.ErrorCount) + } else { + assert.Equal(t, 0, tt.Host.Status.ErrorCount) + } }) } } @@ -360,6 +371,7 @@ func (m *mockProvisioner) ValidateManagementAccess(credentialsChanged bool) (res } func (m *mockProvisioner) InspectHardware() (result provisioner.Result, details *metal3v1alpha1.HardwareDetails, err error) { + details = &metal3v1alpha1.HardwareDetails{} return m.nextResult, details, err } From 84ca573754f9812c2cf622745297f9cbc78e22ae Mon Sep 17 00:00:00 2001 From: Zane Bitter Date: Thu, 10 Dec 2020 12:51:49 -0500 Subject: [PATCH 2/6] Add force flag to ValidateManagementAccess() Once we see an error in the Node, it returns to the 'enroll' [sic] state and we don't have a way of determining whether we have seen and saved that error or not. Previously we always assumed we had not, and didn't retry the validation unless the credentials had changed. Add a force flag to indicate that this is a new attempt and should start again. Fixes #739 --- .../metal3.io/baremetalhost_controller.go | 2 +- .../metal3.io/host_state_machine_test.go | 2 +- pkg/provisioner/demo/demo.go | 2 +- pkg/provisioner/empty/empty.go | 2 +- pkg/provisioner/fixture/fixture.go | 2 +- pkg/provisioner/ironic/ironic.go | 4 ++-- .../ironic/validatemanagementaccess_test.go | 22 +++++++++---------- pkg/provisioner/provisioner.go | 2 +- 8 files changed, 19 insertions(+), 19 deletions(-) diff --git a/controllers/metal3.io/baremetalhost_controller.go b/controllers/metal3.io/baremetalhost_controller.go index 110e0ada41..32f5123bd1 100644 --- a/controllers/metal3.io/baremetalhost_controller.go +++ b/controllers/metal3.io/baremetalhost_controller.go @@ -434,7 +434,7 @@ func (r *BareMetalHostReconciler) actionRegistering(prov provisioner.Provisioner info.postSaveCallbacks = append(info.postSaveCallbacks, updatedCredentials.Inc) } - provResult, err := prov.ValidateManagementAccess(credsChanged) + provResult, err := prov.ValidateManagementAccess(credsChanged, info.host.Status.ErrorType == metal3v1alpha1.RegistrationError) if err != nil { noManagementAccess.Inc() return actionError{errors.Wrap(err, "failed to validate BMC access")} diff --git a/controllers/metal3.io/host_state_machine_test.go b/controllers/metal3.io/host_state_machine_test.go index 270511075f..c93dc089ff 100644 --- a/controllers/metal3.io/host_state_machine_test.go +++ b/controllers/metal3.io/host_state_machine_test.go @@ -366,7 +366,7 @@ func (m *mockProvisioner) setNextResult(dirty bool) { } } -func (m *mockProvisioner) ValidateManagementAccess(credentialsChanged bool) (result provisioner.Result, err error) { +func (m *mockProvisioner) ValidateManagementAccess(credentialsChanged, force bool) (result provisioner.Result, err error) { return m.nextResult, err } diff --git a/pkg/provisioner/demo/demo.go b/pkg/provisioner/demo/demo.go index f118dc27c3..b19791b419 100644 --- a/pkg/provisioner/demo/demo.go +++ b/pkg/provisioner/demo/demo.go @@ -68,7 +68,7 @@ func New(host *metal3v1alpha1.BareMetalHost, bmcCreds bmc.Credentials, publisher // ValidateManagementAccess tests the connection information for the // host to verify that the location and credentials work. -func (p *demoProvisioner) ValidateManagementAccess(credentialsChanged bool) (result provisioner.Result, err error) { +func (p *demoProvisioner) ValidateManagementAccess(credentialsChanged, force bool) (result provisioner.Result, err error) { p.log.Info("testing management access") hostName := p.host.ObjectMeta.Name diff --git a/pkg/provisioner/empty/empty.go b/pkg/provisioner/empty/empty.go index 3bfd226ec4..cbc71d8164 100644 --- a/pkg/provisioner/empty/empty.go +++ b/pkg/provisioner/empty/empty.go @@ -21,7 +21,7 @@ func New(host *metal3v1alpha1.BareMetalHost, bmcCreds bmc.Credentials, publisher // ValidateManagementAccess tests the connection information for the // host to verify that the location and credentials work. -func (p *emptyProvisioner) ValidateManagementAccess(credentialsChanged bool) (provisioner.Result, error) { +func (p *emptyProvisioner) ValidateManagementAccess(credentialsChanged, force bool) (provisioner.Result, error) { return provisioner.Result{}, nil } diff --git a/pkg/provisioner/fixture/fixture.go b/pkg/provisioner/fixture/fixture.go index bcd44458f1..cae9e78bec 100644 --- a/pkg/provisioner/fixture/fixture.go +++ b/pkg/provisioner/fixture/fixture.go @@ -77,7 +77,7 @@ func NewMock(host *metal3v1alpha1.BareMetalHost, bmcCreds bmc.Credentials, publi // ValidateManagementAccess tests the connection information for the // host to verify that the location and credentials work. -func (p *fixtureProvisioner) ValidateManagementAccess(credentialsChanged bool) (result provisioner.Result, err error) { +func (p *fixtureProvisioner) ValidateManagementAccess(credentialsChanged, force bool) (result provisioner.Result, err error) { p.log.Info("testing management access") // Fill in the ID of the host in the provisioning system diff --git a/pkg/provisioner/ironic/ironic.go b/pkg/provisioner/ironic/ironic.go index 95be528753..dff956c18d 100644 --- a/pkg/provisioner/ironic/ironic.go +++ b/pkg/provisioner/ironic/ironic.go @@ -333,7 +333,7 @@ func (p *ironicProvisioner) findExistingHost() (ironicNode *nodes.Node, err erro // // FIXME(dhellmann): We should rename this method to describe what it // actually does. -func (p *ironicProvisioner) ValidateManagementAccess(credentialsChanged bool) (result provisioner.Result, err error) { +func (p *ironicProvisioner) ValidateManagementAccess(credentialsChanged, force bool) (result provisioner.Result, err error) { var ironicNode *nodes.Node p.log.Info("validating management access") @@ -572,7 +572,7 @@ func (p *ironicProvisioner) ValidateManagementAccess(credentialsChanged bool) (r case nodes.Enroll: // If ironic is reporting an error, stop working on the node. - if ironicNode.LastError != "" && !credentialsChanged { + if ironicNode.LastError != "" && !(credentialsChanged || force) { result.ErrorMessage = ironicNode.LastError return result, nil } diff --git a/pkg/provisioner/ironic/validatemanagementaccess_test.go b/pkg/provisioner/ironic/validatemanagementaccess_test.go index 522b0becd9..13f4977788 100644 --- a/pkg/provisioner/ironic/validatemanagementaccess_test.go +++ b/pkg/provisioner/ironic/validatemanagementaccess_test.go @@ -33,7 +33,7 @@ func TestValidateManagementAccessNoMAC(t *testing.T) { t.Fatalf("could not create provisioner: %s", err) } - result, err := prov.ValidateManagementAccess(false) + result, err := prov.ValidateManagementAccess(false, false) if err != nil { t.Fatalf("error from ValidateManagementAccess: %s", err) } @@ -63,7 +63,7 @@ func TestValidateManagementAccessMACOptional(t *testing.T) { t.Fatalf("could not create provisioner: %s", err) } - result, err := prov.ValidateManagementAccess(false) + result, err := prov.ValidateManagementAccess(false, false) if err != nil { t.Fatalf("error from ValidateManagementAccess: %s", err) } @@ -95,7 +95,7 @@ func TestValidateManagementAccessCreateNode(t *testing.T) { t.Fatalf("could not create provisioner: %s", err) } - result, err := prov.ValidateManagementAccess(false) + result, err := prov.ValidateManagementAccess(false, false) if err != nil { t.Fatalf("error from ValidateManagementAccess: %s", err) } @@ -130,7 +130,7 @@ func TestValidateManagementAccessExistingNode(t *testing.T) { t.Fatalf("could not create provisioner: %s", err) } - result, err := prov.ValidateManagementAccess(false) + result, err := prov.ValidateManagementAccess(false, false) if err != nil { t.Fatalf("error from ValidateManagementAccess: %s", err) } @@ -193,7 +193,7 @@ func TestValidateManagementAccessExistingNodeContinue(t *testing.T) { t.Fatalf("could not create provisioner: %s", err) } - result, err := prov.ValidateManagementAccess(false) + result, err := prov.ValidateManagementAccess(false, false) if err != nil { t.Fatalf("error from ValidateManagementAccess: %s", err) } @@ -238,7 +238,7 @@ func TestValidateManagementAccessExistingNodeWaiting(t *testing.T) { t.Fatalf("could not create provisioner: %s", err) } - result, err := prov.ValidateManagementAccess(false) + result, err := prov.ValidateManagementAccess(false, false) if err != nil { t.Fatalf("error from ValidateManagementAccess: %s", err) } @@ -280,7 +280,7 @@ func TestValidateManagementAccessNewCredentials(t *testing.T) { t.Fatalf("could not create provisioner: %s", err) } - result, err := prov.ValidateManagementAccess(true) + result, err := prov.ValidateManagementAccess(true, false) if err != nil { t.Fatalf("error from ValidateManagementAccess: %s", err) } @@ -327,7 +327,7 @@ func TestValidateManagementAccessLinkExistingIronicNodeByMAC(t *testing.T) { t.Fatalf("could not create provisioner: %s", err) } - result, err := prov.ValidateManagementAccess(false) + result, err := prov.ValidateManagementAccess(false, false) if err != nil { t.Fatalf("error from ValidateManagementAccess: %s", err) } @@ -370,7 +370,7 @@ func TestValidateManagementAccessExistingPortWithWrongUUID(t *testing.T) { t.Fatalf("could not create provisioner: %s", err) } - _, err = prov.ValidateManagementAccess(false) + _, err = prov.ValidateManagementAccess(false, false) assert.EqualError(t, err, "failed to find existing host: port exists but linked node doesn't random-wrong-id: Resource not found") } @@ -412,7 +412,7 @@ func TestValidateManagementAccessExistingPortButHasName(t *testing.T) { t.Fatalf("could not create provisioner: %s", err) } - _, err = prov.ValidateManagementAccess(false) + _, err = prov.ValidateManagementAccess(false, false) assert.EqualError(t, err, "failed to find existing host: node found by MAC but has a name: wrong-name") } @@ -450,7 +450,7 @@ func TestValidateManagementAccessAddTwoHostsWithSameMAC(t *testing.T) { t.Fatalf("could not create provisioner: %s", err) } - result, err := prov.ValidateManagementAccess(false) + result, err := prov.ValidateManagementAccess(false, false) if err != nil { t.Fatalf("error from ValidateManagementAccess: %s", err) } diff --git a/pkg/provisioner/provisioner.go b/pkg/provisioner/provisioner.go index 8fe303271e..1c9aadf1af 100644 --- a/pkg/provisioner/provisioner.go +++ b/pkg/provisioner/provisioner.go @@ -43,7 +43,7 @@ type Provisioner interface { // of credentials it has are different from the credentials it has // previously been using, without implying that either set of // credentials is correct. - ValidateManagementAccess(credentialsChanged bool) (result Result, err error) + ValidateManagementAccess(credentialsChanged, force bool) (result Result, err error) // InspectHardware updates the HardwareDetails field of the host with // details of devices discovered on the hardware. It may be called From 2d02462fb45dbb5dd9dd0189ce91e0c417bbab2e Mon Sep 17 00:00:00 2001 From: Zane Bitter Date: Fri, 4 Dec 2020 15:18:17 -0500 Subject: [PATCH 3/6] Delay retry on inspect failed If inspection fails at some point before it is actually started in ironic-inspector, we were just repeatedly retrying it instead of setting an error. Instead, set an error and retry only after a backoff. --- controllers/metal3.io/baremetalhost_controller.go | 2 +- controllers/metal3.io/host_state_machine_test.go | 2 +- pkg/provisioner/demo/demo.go | 2 +- pkg/provisioner/empty/empty.go | 2 +- pkg/provisioner/fixture/fixture.go | 2 +- pkg/provisioner/ironic/inspecthardware_test.go | 2 +- pkg/provisioner/ironic/ironic.go | 11 ++++++++++- pkg/provisioner/provisioner.go | 2 +- 8 files changed, 17 insertions(+), 8 deletions(-) diff --git a/controllers/metal3.io/baremetalhost_controller.go b/controllers/metal3.io/baremetalhost_controller.go index 32f5123bd1..b1599328f6 100644 --- a/controllers/metal3.io/baremetalhost_controller.go +++ b/controllers/metal3.io/baremetalhost_controller.go @@ -472,7 +472,7 @@ func (r *BareMetalHostReconciler) actionRegistering(prov provisioner.Provisioner func (r *BareMetalHostReconciler) actionInspecting(prov provisioner.Provisioner, info *reconcileInfo) actionResult { info.log.Info("inspecting hardware") - provResult, details, err := prov.InspectHardware() + provResult, details, err := prov.InspectHardware(info.host.Status.ErrorType == metal3v1alpha1.InspectionError) if err != nil { return actionError{errors.Wrap(err, "hardware inspection failed")} } diff --git a/controllers/metal3.io/host_state_machine_test.go b/controllers/metal3.io/host_state_machine_test.go index c93dc089ff..ae8afb14ea 100644 --- a/controllers/metal3.io/host_state_machine_test.go +++ b/controllers/metal3.io/host_state_machine_test.go @@ -370,7 +370,7 @@ func (m *mockProvisioner) ValidateManagementAccess(credentialsChanged, force boo return m.nextResult, err } -func (m *mockProvisioner) InspectHardware() (result provisioner.Result, details *metal3v1alpha1.HardwareDetails, err error) { +func (m *mockProvisioner) InspectHardware(force bool) (result provisioner.Result, details *metal3v1alpha1.HardwareDetails, err error) { details = &metal3v1alpha1.HardwareDetails{} return m.nextResult, details, err } diff --git a/pkg/provisioner/demo/demo.go b/pkg/provisioner/demo/demo.go index b19791b419..826c684f45 100644 --- a/pkg/provisioner/demo/demo.go +++ b/pkg/provisioner/demo/demo.go @@ -102,7 +102,7 @@ func (p *demoProvisioner) ValidateManagementAccess(credentialsChanged, force boo // details of devices discovered on the hardware. It may be called // multiple times, and should return true for its dirty flag until the // inspection is completed. -func (p *demoProvisioner) InspectHardware() (result provisioner.Result, details *metal3v1alpha1.HardwareDetails, err error) { +func (p *demoProvisioner) InspectHardware(force bool) (result provisioner.Result, details *metal3v1alpha1.HardwareDetails, err error) { p.log.Info("inspecting hardware", "status", p.host.OperationalStatus()) hostName := p.host.ObjectMeta.Name diff --git a/pkg/provisioner/empty/empty.go b/pkg/provisioner/empty/empty.go index cbc71d8164..e205e3f26a 100644 --- a/pkg/provisioner/empty/empty.go +++ b/pkg/provisioner/empty/empty.go @@ -29,7 +29,7 @@ func (p *emptyProvisioner) ValidateManagementAccess(credentialsChanged, force bo // details of devices discovered on the hardware. It may be called // multiple times, and should return true for its dirty flag until the // inspection is completed. -func (p *emptyProvisioner) InspectHardware() (provisioner.Result, *metal3v1alpha1.HardwareDetails, error) { +func (p *emptyProvisioner) InspectHardware(force bool) (provisioner.Result, *metal3v1alpha1.HardwareDetails, error) { return provisioner.Result{}, nil, nil } diff --git a/pkg/provisioner/fixture/fixture.go b/pkg/provisioner/fixture/fixture.go index cae9e78bec..ebc2dbfc48 100644 --- a/pkg/provisioner/fixture/fixture.go +++ b/pkg/provisioner/fixture/fixture.go @@ -98,7 +98,7 @@ func (p *fixtureProvisioner) ValidateManagementAccess(credentialsChanged, force // details of devices discovered on the hardware. It may be called // multiple times, and should return true for its dirty flag until the // inspection is completed. -func (p *fixtureProvisioner) InspectHardware() (result provisioner.Result, details *metal3v1alpha1.HardwareDetails, err error) { +func (p *fixtureProvisioner) InspectHardware(force bool) (result provisioner.Result, details *metal3v1alpha1.HardwareDetails, err error) { p.log.Info("inspecting hardware", "status", p.host.OperationalStatus()) // The inspection is ongoing. We'll need to check the fixture diff --git a/pkg/provisioner/ironic/inspecthardware_test.go b/pkg/provisioner/ironic/inspecthardware_test.go index b094dffec3..7118cb7ccc 100644 --- a/pkg/provisioner/ironic/inspecthardware_test.go +++ b/pkg/provisioner/ironic/inspecthardware_test.go @@ -152,7 +152,7 @@ func TestInspectHardware(t *testing.T) { } prov.status.ID = nodeUUID - result, details, err := prov.InspectHardware() + result, details, err := prov.InspectHardware(false) assert.Equal(t, tc.expectedDirty, result.Dirty) assert.Equal(t, time.Second*time.Duration(tc.expectedRequestAfter), result.RequeueAfter) diff --git a/pkg/provisioner/ironic/ironic.go b/pkg/provisioner/ironic/ironic.go index dff956c18d..257ac965e7 100644 --- a/pkg/provisioner/ironic/ironic.go +++ b/pkg/provisioner/ironic/ironic.go @@ -651,7 +651,7 @@ func (p *ironicProvisioner) changeNodeProvisionState(ironicNode *nodes.Node, opt // details of devices discovered on the hardware. It may be called // multiple times, and should return true for its dirty flag until the // inspection is completed. -func (p *ironicProvisioner) InspectHardware() (result provisioner.Result, details *metal3v1alpha1.HardwareDetails, err error) { +func (p *ironicProvisioner) InspectHardware(force bool) (result provisioner.Result, details *metal3v1alpha1.HardwareDetails, err error) { p.log.Info("inspecting hardware", "status", p.host.OperationalStatus()) ironicNode, err := p.findExistingHost() @@ -674,6 +674,15 @@ func (p *ironicProvisioner) InspectHardware() (result provisioner.Result, detail err = nil return default: + if nodes.ProvisionState(ironicNode.ProvisionState) == nodes.InspectFail && !force { + p.log.Info("starting inspection failed", "error", status.Error) + if ironicNode.LastError == "" { + result.ErrorMessage = "Inspection failed" + } else { + result.ErrorMessage = ironicNode.LastError + } + err = nil + } p.log.Info("updating boot mode before hardware inspection") op, value := buildCapabilitiesValue(ironicNode, p.host.Status.Provisioning.BootMode) updates := nodes.UpdateOpts{ diff --git a/pkg/provisioner/provisioner.go b/pkg/provisioner/provisioner.go index 1c9aadf1af..335506796c 100644 --- a/pkg/provisioner/provisioner.go +++ b/pkg/provisioner/provisioner.go @@ -49,7 +49,7 @@ type Provisioner interface { // details of devices discovered on the hardware. It may be called // multiple times, and should return true for its dirty flag until the // inspection is completed. - InspectHardware() (result Result, details *metal3v1alpha1.HardwareDetails, err error) + InspectHardware(force bool) (result Result, details *metal3v1alpha1.HardwareDetails, err error) // UpdateHardwareState fetches the latest hardware state of the // server and updates the HardwareDetails field of the host with From 7791d21daa694af5bff006ee2dba17246772eed7 Mon Sep 17 00:00:00 2001 From: Zane Bitter Date: Thu, 10 Dec 2020 20:29:11 -0500 Subject: [PATCH 4/6] Ironic: Handle Error state correctly Error is the state that the Node goes into if deleting (i.e. deprovisioning) it fails. The only valid action from this state is to try deleting again, so do that rather than attempt to go straight to manageable. --- pkg/provisioner/ironic/ironic.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/provisioner/ironic/ironic.go b/pkg/provisioner/ironic/ironic.go index 257ac965e7..2e87a3f4cc 100644 --- a/pkg/provisioner/ironic/ironic.go +++ b/pkg/provisioner/ironic/ironic.go @@ -1347,7 +1347,7 @@ func (p *ironicProvisioner) Deprovision() (result provisioner.Result, err error) case nodes.Error: return p.changeNodeProvisionState( ironicNode, - nodes.ProvisionStateOpts{Target: nodes.TargetManage}, + nodes.ProvisionStateOpts{Target: nodes.TargetDeleted}, ) case nodes.CleanFail: From d7d38eef2ade940e54d3ce9f2be910e908331bc4 Mon Sep 17 00:00:00 2001 From: Zane Bitter Date: Thu, 10 Dec 2020 20:44:13 -0500 Subject: [PATCH 5/6] Delay retry on deprovision failed If deprovisioning fails, we were just repeatedly retrying it instead of setting an error. Instead, set an error and retry only after a backoff. If we are deprovisioning because of a deletion, give up after 3 retries (since this may indicate that the host simply doesn't exist any more) and allow the Host to be deleted. --- .../metal3.io/baremetalhost_controller.go | 2 +- controllers/metal3.io/host_state_machine.go | 7 +++---- .../metal3.io/host_state_machine_test.go | 2 +- pkg/provisioner/demo/demo.go | 2 +- pkg/provisioner/empty/empty.go | 2 +- pkg/provisioner/fixture/fixture.go | 2 +- pkg/provisioner/ironic/ironic.go | 17 ++++++++++++++++- pkg/provisioner/ironic/provision_test.go | 7 ++++--- pkg/provisioner/provisioner.go | 2 +- 9 files changed, 29 insertions(+), 14 deletions(-) diff --git a/controllers/metal3.io/baremetalhost_controller.go b/controllers/metal3.io/baremetalhost_controller.go index b1599328f6..b15a9ba924 100644 --- a/controllers/metal3.io/baremetalhost_controller.go +++ b/controllers/metal3.io/baremetalhost_controller.go @@ -604,7 +604,7 @@ func (r *BareMetalHostReconciler) actionDeprovisioning(prov provisioner.Provisio info.log.Info("deprovisioning") - provResult, err = prov.Deprovision() + provResult, err = prov.Deprovision(info.host.Status.ErrorType == metal3v1alpha1.ProvisioningError) if err != nil { return actionError{errors.Wrap(err, "failed to deprovision")} } diff --git a/controllers/metal3.io/host_state_machine.go b/controllers/metal3.io/host_state_machine.go index b8e0dc70b1..449782cebe 100644 --- a/controllers/metal3.io/host_state_machine.go +++ b/controllers/metal3.io/host_state_machine.go @@ -358,10 +358,9 @@ func (hsm *hostStateMachine) handleDeprovisioning(info *reconcileInfo) actionRes case actionFailed: // If the provisioner gives up deprovisioning and // deletion has been requested, continue to delete. - // Note that this is entirely theoretical, as the - // Ironic provisioner currently never gives up - // trying to deprovision. - return skipToDelete() + if hsm.Host.Status.ErrorCount > 3 { + return skipToDelete() + } case actionError: if r.NeedsRegistration() && !hsm.haveCreds { // If the host is not registered as a node in Ironic and we diff --git a/controllers/metal3.io/host_state_machine_test.go b/controllers/metal3.io/host_state_machine_test.go index ae8afb14ea..96834305f9 100644 --- a/controllers/metal3.io/host_state_machine_test.go +++ b/controllers/metal3.io/host_state_machine_test.go @@ -387,7 +387,7 @@ func (m *mockProvisioner) Provision(configData provisioner.HostConfigData) (resu return m.nextResult, err } -func (m *mockProvisioner) Deprovision() (result provisioner.Result, err error) { +func (m *mockProvisioner) Deprovision(force bool) (result provisioner.Result, err error) { return m.nextResult, err } diff --git a/pkg/provisioner/demo/demo.go b/pkg/provisioner/demo/demo.go index 826c684f45..9fc1bc8168 100644 --- a/pkg/provisioner/demo/demo.go +++ b/pkg/provisioner/demo/demo.go @@ -218,7 +218,7 @@ func (p *demoProvisioner) Provision(hostConf provisioner.HostConfigData) (result // Deprovision removes the host from the image. It may be called // multiple times, and should return true for its dirty flag until the // deprovisioning operation is completed. -func (p *demoProvisioner) Deprovision() (result provisioner.Result, err error) { +func (p *demoProvisioner) Deprovision(force bool) (result provisioner.Result, err error) { hostName := p.host.ObjectMeta.Name switch hostName { diff --git a/pkg/provisioner/empty/empty.go b/pkg/provisioner/empty/empty.go index e205e3f26a..a50af7fc82 100644 --- a/pkg/provisioner/empty/empty.go +++ b/pkg/provisioner/empty/empty.go @@ -57,7 +57,7 @@ func (p *emptyProvisioner) Provision(hostConf provisioner.HostConfigData) (provi // Deprovision removes the host from the image. It may be called // multiple times, and should return true for its dirty flag until the // deprovisioning operation is completed. -func (p *emptyProvisioner) Deprovision() (provisioner.Result, error) { +func (p *emptyProvisioner) Deprovision(force bool) (provisioner.Result, error) { return provisioner.Result{}, nil } diff --git a/pkg/provisioner/fixture/fixture.go b/pkg/provisioner/fixture/fixture.go index ebc2dbfc48..dcb365b30d 100644 --- a/pkg/provisioner/fixture/fixture.go +++ b/pkg/provisioner/fixture/fixture.go @@ -201,7 +201,7 @@ func (p *fixtureProvisioner) Provision(hostConf provisioner.HostConfigData) (res // Deprovision removes the host from the image. It may be called // multiple times, and should return true for its dirty flag until the // deprovisioning operation is completed. -func (p *fixtureProvisioner) Deprovision() (result provisioner.Result, err error) { +func (p *fixtureProvisioner) Deprovision(force bool) (result provisioner.Result, err error) { p.log.Info("ensuring host is deprovisioned") result.RequeueAfter = deprovisionRequeueDelay diff --git a/pkg/provisioner/ironic/ironic.go b/pkg/provisioner/ironic/ironic.go index 2e87a3f4cc..ade5a59acf 100644 --- a/pkg/provisioner/ironic/ironic.go +++ b/pkg/provisioner/ironic/ironic.go @@ -1323,7 +1323,7 @@ func (p *ironicProvisioner) setMaintenanceFlag(ironicNode *nodes.Node, value boo // Deprovision removes the host from the image. It may be called // multiple times, and should return true for its dirty flag until the // deprovisioning operation is completed. -func (p *ironicProvisioner) Deprovision() (result provisioner.Result, err error) { +func (p *ironicProvisioner) Deprovision(force bool) (result provisioner.Result, err error) { p.log.Info("deprovisioning") ironicNode, err := p.findExistingHost() @@ -1345,6 +1345,17 @@ func (p *ironicProvisioner) Deprovision() (result provisioner.Result, err error) switch nodes.ProvisionState(ironicNode.ProvisionState) { case nodes.Error: + if !force { + p.log.Info("deprovisioning failed") + if ironicNode.LastError == "" { + result.ErrorMessage = "Deprovisioning failed" + } else { + result.ErrorMessage = ironicNode.LastError + } + return result, nil + } + p.log.Info("retrying deprovisioning") + p.publisher("DeprovisioningStarted", "Image deprovisioning restarted") return p.changeNodeProvisionState( ironicNode, nodes.ProvisionStateOpts{Target: nodes.TargetDeleted}, @@ -1355,6 +1366,10 @@ func (p *ironicProvisioner) Deprovision() (result provisioner.Result, err error) p.log.Info("clearing maintenance flag") return p.setMaintenanceFlag(ironicNode, false) } + // This will return us to the manageable state without completing + // cleaning. Because cleaning happens in the process of moving from + // manageable to available, the node will still get cleaned before + // we provision it again. return p.changeNodeProvisionState( ironicNode, nodes.ProvisionStateOpts{Target: nodes.TargetManage}, diff --git a/pkg/provisioner/ironic/provision_test.go b/pkg/provisioner/ironic/provision_test.go index 6b6038ffc6..8c463ffaab 100644 --- a/pkg/provisioner/ironic/provision_test.go +++ b/pkg/provisioner/ironic/provision_test.go @@ -125,6 +125,7 @@ func TestDeprovision(t *testing.T) { ironic *testserver.IronicMock expectedDirty bool expectedError bool + expectedErrorMessage bool expectedRequestAfter int }{ { @@ -142,8 +143,7 @@ func TestDeprovision(t *testing.T) { ProvisionState: string(nodes.Error), UUID: nodeUUID, }), - expectedRequestAfter: 10, - expectedDirty: true, + expectedErrorMessage: true, }, { name: "available state", @@ -207,9 +207,10 @@ func TestDeprovision(t *testing.T) { } prov.status.ID = nodeUUID - result, err := prov.Deprovision() + result, err := prov.Deprovision(false) assert.Equal(t, tc.expectedDirty, result.Dirty) + assert.Equal(t, tc.expectedErrorMessage, result.ErrorMessage != "") assert.Equal(t, time.Second*time.Duration(tc.expectedRequestAfter), result.RequeueAfter) if !tc.expectedError { assert.NoError(t, err) diff --git a/pkg/provisioner/provisioner.go b/pkg/provisioner/provisioner.go index 335506796c..a4677027ce 100644 --- a/pkg/provisioner/provisioner.go +++ b/pkg/provisioner/provisioner.go @@ -70,7 +70,7 @@ type Provisioner interface { // Deprovision removes the host from the image. It may be called // multiple times, and should return true for its dirty flag until // the deprovisioning operation is completed. - Deprovision() (result Result, err error) + Deprovision(force bool) (result Result, err error) // Delete removes the host from the provisioning system. It may be // called multiple times, and should return true for its dirty From 7c3daefee2ab4d46e994162dc2dcc19bae59585d Mon Sep 17 00:00:00 2001 From: Zane Bitter Date: Thu, 10 Dec 2020 20:52:37 -0500 Subject: [PATCH 6/6] provisioner: Fix copy-paste error in docs --- pkg/provisioner/provisioner.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/provisioner/provisioner.go b/pkg/provisioner/provisioner.go index a4677027ce..e5e01f2c29 100644 --- a/pkg/provisioner/provisioner.go +++ b/pkg/provisioner/provisioner.go @@ -74,7 +74,7 @@ type Provisioner interface { // Delete removes the host from the provisioning system. It may be // called multiple times, and should return true for its dirty - // flag until the deprovisioning operation is completed. + // flag until the deletion operation is completed. Delete() (result Result, err error) // PowerOn ensures the server is powered on independently of any image