From cb79aeeb430bca090f06565812e14ff8d11c24d4 Mon Sep 17 00:00:00 2001 From: Doug Hellmann Date: Mon, 7 Dec 2020 13:10:30 -0500 Subject: [PATCH 01/28] remove Available() method from BareMetalHost It is up to the consumer, not the host, to decide when a host is ready to have provisioning instructions added. The checks in the Available() method will be inlined in cluster-api-provider-metal3. Signed-off-by: Doug Hellmann (cherry picked from commit 095bf63f6a837cbb05308ea3054f50c5c6d85d51) Signed-off-by: Honza Pokorny --- .../metal3.io/v1alpha1/baremetalhost_types.go | 14 ---- .../v1alpha1/baremetalhost_types_test.go | 66 ------------------- 2 files changed, 80 deletions(-) diff --git a/apis/metal3.io/v1alpha1/baremetalhost_types.go b/apis/metal3.io/v1alpha1/baremetalhost_types.go index 74fbd55b1f..45d9eebc0e 100644 --- a/apis/metal3.io/v1alpha1/baremetalhost_types.go +++ b/apis/metal3.io/v1alpha1/baremetalhost_types.go @@ -608,20 +608,6 @@ func (host *BareMetalHost) BootMode() BootMode { return mode } -// Available returns true if the host is available to be provisioned. -func (host *BareMetalHost) Available() bool { - if host.Spec.ConsumerRef != nil { - return false - } - if host.GetDeletionTimestamp() != nil { - return false - } - if host.HasError() { - return false - } - return true -} - // SetErrorMessage updates the ErrorMessage in the host Status struct // and increases the ErrorCount func (host *BareMetalHost) SetErrorMessage(errType ErrorType, message string) { diff --git a/apis/metal3.io/v1alpha1/baremetalhost_types_test.go b/apis/metal3.io/v1alpha1/baremetalhost_types_test.go index dddb47b607..136522664e 100644 --- a/apis/metal3.io/v1alpha1/baremetalhost_types_test.go +++ b/apis/metal3.io/v1alpha1/baremetalhost_types_test.go @@ -2,7 +2,6 @@ package v1alpha1 import ( "testing" - "time" "github.com/stretchr/testify/assert" @@ -10,71 +9,6 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) -func TestHostAvailable(t *testing.T) { - hostWithError := BareMetalHost{ - ObjectMeta: metav1.ObjectMeta{ - Name: "myhost", - Namespace: "myns", - }, - } - hostWithError.SetErrorMessage(RegistrationError, "oops something went wrong") - - testCases := []struct { - Host BareMetalHost - Expected bool - FailMessage string - }{ - { - Host: BareMetalHost{ - ObjectMeta: metav1.ObjectMeta{ - Name: "myhost", - Namespace: "myns", - }, - }, - Expected: true, - FailMessage: "available host returned not available", - }, - { - Host: hostWithError, - Expected: false, - FailMessage: "host with error returned as available", - }, - { - Host: BareMetalHost{ - ObjectMeta: metav1.ObjectMeta{ - Name: "myhost", - Namespace: "myns", - }, - Spec: BareMetalHostSpec{ - ConsumerRef: &corev1.ObjectReference{ - Name: "mymachine", - Namespace: "myns", - }, - }, - }, - Expected: false, - FailMessage: "host with consumerref returned as available", - }, - { - Host: BareMetalHost{ - ObjectMeta: metav1.ObjectMeta{ - Name: "myhost", - Namespace: "myns", - DeletionTimestamp: &metav1.Time{Time: time.Now()}, - }, - }, - Expected: false, - FailMessage: "deleted host returned as available", - }, - } - - for _, tc := range testCases { - if tc.Host.Available() != tc.Expected { - t.Error(tc.FailMessage) - } - } -} - func TestHostNeedsHardwareInspection(t *testing.T) { testCases := []struct { From 3e04aaf6284b97f93682dcdb9b262824fc9268d5 Mon Sep 17 00:00:00 2001 From: Doug Hellmann Date: Tue, 8 Dec 2020 09:20:28 -0500 Subject: [PATCH 02/28] remove HasError method from host type Use the API field directly instead of the convenience method. Signed-off-by: Doug Hellmann (cherry picked from commit 29c784bdb0eacad95c488021a8d7170e5c6c9fa6) Signed-off-by: Honza Pokorny --- apis/metal3.io/v1alpha1/baremetalhost_types.go | 6 ------ controllers/metal3.io/baremetalhost_controller_test.go | 4 ++-- controllers/metal3.io/demo_test.go | 4 ++-- controllers/metal3.io/host_state_machine.go | 2 +- 4 files changed, 5 insertions(+), 11 deletions(-) diff --git a/apis/metal3.io/v1alpha1/baremetalhost_types.go b/apis/metal3.io/v1alpha1/baremetalhost_types.go index 45d9eebc0e..ed3ff14b11 100644 --- a/apis/metal3.io/v1alpha1/baremetalhost_types.go +++ b/apis/metal3.io/v1alpha1/baremetalhost_types.go @@ -699,12 +699,6 @@ func (host *BareMetalHost) OperationalStatus() OperationalStatus { return host.Status.OperationalStatus } -// HasError returns a boolean indicating whether there is an error -// set for the host. -func (host *BareMetalHost) HasError() bool { - return host.Status.ErrorMessage != "" -} - // CredentialsKey returns a NamespacedName suitable for loading the // Secret containing the credentials associated with the host. func (host *BareMetalHost) CredentialsKey() types.NamespacedName { diff --git a/controllers/metal3.io/baremetalhost_controller_test.go b/controllers/metal3.io/baremetalhost_controller_test.go index 25e28b9a22..3642c10277 100644 --- a/controllers/metal3.io/baremetalhost_controller_test.go +++ b/controllers/metal3.io/baremetalhost_controller_test.go @@ -168,7 +168,7 @@ func waitForError(t *testing.T, r *BareMetalHostReconciler, host *metal3v1alpha1 tryReconcile(t, r, host, func(host *metal3v1alpha1.BareMetalHost, result reconcile.Result) bool { t.Logf("Waiting for error: %q", host.Status.ErrorMessage) - return host.HasError() + return host.Status.ErrorMessage != "" }, ) } @@ -177,7 +177,7 @@ func waitForNoError(t *testing.T, r *BareMetalHostReconciler, host *metal3v1alph tryReconcile(t, r, host, func(host *metal3v1alpha1.BareMetalHost, result reconcile.Result) bool { t.Logf("Waiting for no error message: %q", host.Status.ErrorMessage) - return !host.HasError() + return host.Status.ErrorMessage == "" }, ) } diff --git a/controllers/metal3.io/demo_test.go b/controllers/metal3.io/demo_test.go index ba68f96e22..0560323fd9 100644 --- a/controllers/metal3.io/demo_test.go +++ b/controllers/metal3.io/demo_test.go @@ -45,7 +45,7 @@ func TestDemoRegistrationError(t *testing.T) { host.Status.Provisioning.State, host.Status.ErrorMessage, ) - return host.HasError() + return host.Status.ErrorMessage != "" }, ) } @@ -168,7 +168,7 @@ func TestDemoValidationError(t *testing.T) { host.Status.Provisioning.State, host.Status.ErrorMessage, ) - return host.HasError() + return host.Status.ErrorMessage != "" }, ) } diff --git a/controllers/metal3.io/host_state_machine.go b/controllers/metal3.io/host_state_machine.go index 1835ca4761..9954a38b9e 100644 --- a/controllers/metal3.io/host_state_machine.go +++ b/controllers/metal3.io/host_state_machine.go @@ -290,7 +290,7 @@ func (hsm *hostStateMachine) handleReady(info *reconcileInfo) actionResult { } func (hsm *hostStateMachine) provisioningCancelled() bool { - if hsm.Host.HasError() { + if hsm.Host.Status.ErrorMessage != "" { return true } if hsm.Host.Spec.Image == nil { From 5ef617afdce30256d97ec51e00e05336c19ddbb9 Mon Sep 17 00:00:00 2001 From: Doug Hellmann Date: Tue, 8 Dec 2020 12:47:36 -0500 Subject: [PATCH 03/28] move error handling methods from host to private functions Move public methods of the host type to private functions in the controller package. Signed-off-by: Doug Hellmann (cherry picked from commit b0c8c586ef886021589a3b00824604fa0ef4fc60) Signed-off-by: Honza Pokorny --- .../metal3.io/v1alpha1/baremetalhost_types.go | 28 --- .../v1alpha1/baremetalhost_types_test.go | 36 ---- .../metal3.io/baremetalhost_controller.go | 54 ++++-- .../baremetalhost_controller_test.go | 159 ++++++++++++++++++ 4 files changed, 200 insertions(+), 77 deletions(-) diff --git a/apis/metal3.io/v1alpha1/baremetalhost_types.go b/apis/metal3.io/v1alpha1/baremetalhost_types.go index ed3ff14b11..4d93fe683b 100644 --- a/apis/metal3.io/v1alpha1/baremetalhost_types.go +++ b/apis/metal3.io/v1alpha1/baremetalhost_types.go @@ -608,34 +608,6 @@ func (host *BareMetalHost) BootMode() BootMode { return mode } -// SetErrorMessage updates the ErrorMessage in the host Status struct -// and increases the ErrorCount -func (host *BareMetalHost) SetErrorMessage(errType ErrorType, message string) { - host.Status.OperationalStatus = OperationalStatusError - host.Status.ErrorType = errType - host.Status.ErrorMessage = message - host.Status.ErrorCount++ -} - -// ClearError removes any existing error message. -func (host *BareMetalHost) ClearError() (dirty bool) { - dirty = host.SetOperationalStatus(OperationalStatusOK) - var emptyErrType ErrorType = "" - if host.Status.ErrorType != emptyErrType { - host.Status.ErrorType = emptyErrType - dirty = true - } - if host.Status.ErrorMessage != "" { - host.Status.ErrorMessage = "" - dirty = true - } - if host.Status.ErrorCount != 0 { - host.Status.ErrorCount = 0 - dirty = true - } - return dirty -} - // setLabel updates the given label when necessary and returns true // when a change is made or false when no change is made. func (host *BareMetalHost) setLabel(name, value string) bool { diff --git a/apis/metal3.io/v1alpha1/baremetalhost_types_test.go b/apis/metal3.io/v1alpha1/baremetalhost_types_test.go index 136522664e..5a005d5d8b 100644 --- a/apis/metal3.io/v1alpha1/baremetalhost_types_test.go +++ b/apis/metal3.io/v1alpha1/baremetalhost_types_test.go @@ -488,39 +488,3 @@ func TestBootMode(t *testing.T) { }) } } - -func TestErrorCountIncrementsAlways(t *testing.T) { - - b := &BareMetalHost{} - assert.Equal(t, b.Status.ErrorCount, 0) - - b.SetErrorMessage(RegistrationError, "An error message") - assert.Equal(t, b.Status.ErrorCount, 1) - - b.SetErrorMessage(InspectionError, "Another error message") - assert.Equal(t, b.Status.ErrorCount, 2) -} - -func TestClearErrorCount(t *testing.T) { - - b := &BareMetalHost{ - Status: BareMetalHostStatus{ - ErrorCount: 5, - }, - } - - assert.True(t, b.ClearError()) - assert.Equal(t, 0, b.Status.ErrorCount) -} - -func TestClearErrorCountOnlyIfNotZero(t *testing.T) { - - b := &BareMetalHost{ - Status: BareMetalHostStatus{ - ErrorCount: 5, - }, - } - - assert.True(t, b.ClearError()) - assert.False(t, b.ClearError()) -} diff --git a/controllers/metal3.io/baremetalhost_controller.go b/controllers/metal3.io/baremetalhost_controller.go index dfc243609a..415881a75b 100644 --- a/controllers/metal3.io/baremetalhost_controller.go +++ b/controllers/metal3.io/baremetalhost_controller.go @@ -276,7 +276,7 @@ func logResult(info *reconcileInfo, result ctrl.Result) { func recordActionFailure(info *reconcileInfo, errorType metal3v1alpha1.ErrorType, errorMessage string) actionFailed { - info.host.SetErrorMessage(errorType, errorMessage) + setErrorMessage(info.host, errorType, errorMessage) eventType := map[metal3v1alpha1.ErrorType]string{ metal3v1alpha1.RegistrationError: "RegistrationError", @@ -357,6 +357,34 @@ func clearRebootAnnotations(host *metal3v1alpha1.BareMetalHost) (dirty bool) { return } +// clearError removes any existing error message. +func clearError(host *metal3v1alpha1.BareMetalHost) (dirty bool) { + dirty = host.SetOperationalStatus(metal3v1alpha1.OperationalStatusOK) + var emptyErrType metal3v1alpha1.ErrorType = "" + if host.Status.ErrorType != emptyErrType { + host.Status.ErrorType = emptyErrType + dirty = true + } + if host.Status.ErrorMessage != "" { + host.Status.ErrorMessage = "" + dirty = true + } + if host.Status.ErrorCount != 0 { + host.Status.ErrorCount = 0 + dirty = true + } + return dirty +} + +// setErrorMessage updates the ErrorMessage in the host Status struct +// and increases the ErrorCount +func setErrorMessage(host *metal3v1alpha1.BareMetalHost, errType metal3v1alpha1.ErrorType, message string) { + host.Status.OperationalStatus = metal3v1alpha1.OperationalStatusError + host.Status.ErrorType = errType + host.Status.ErrorMessage = message + host.Status.ErrorCount++ +} + // Manage deletion of the host func (r *BareMetalHostReconciler) actionDeleting(prov provisioner.Provisioner, info *reconcileInfo) actionResult { info.log.Info( @@ -423,7 +451,7 @@ func (r *BareMetalHostReconciler) actionRegistering(prov provisioner.Provisioner if provResult.Dirty { info.log.Info("host not ready", "wait", provResult.RequeueAfter) - info.host.ClearError() + clearError(info.host) return actionContinue{provResult.RequeueAfter} } @@ -434,7 +462,7 @@ func (r *BareMetalHostReconciler) actionRegistering(prov provisioner.Provisioner registeredNewCreds := !info.host.Status.GoodCredentials.Match(*info.bmcCredsSecret) info.host.UpdateGoodCredentials(*info.bmcCredsSecret) info.log.Info("clearing previous error message") - info.host.ClearError() + clearError(info.host) if registeredNewCreds { info.publishEvent("BMCAccessValidated", "Verified access to BMC") @@ -456,7 +484,7 @@ func (r *BareMetalHostReconciler) actionInspecting(prov provisioner.Provisioner, return recordActionFailure(info, metal3v1alpha1.InspectionError, provResult.ErrorMessage) } - info.host.ClearError() + clearError(info.host) if provResult.Dirty || details == nil { return actionContinue{provResult.RequeueAfter} @@ -506,7 +534,7 @@ func (r *BareMetalHostReconciler) actionMatchProfile(prov provisioner.Provisione info.publishEvent("ProfileSet", fmt.Sprintf("Hardware profile set: %s", hardwareProfile)) } - info.host.ClearError() + clearError(info.host) return actionComplete{} } @@ -540,7 +568,7 @@ func (r *BareMetalHostReconciler) actionProvisioning(prov provisioner.Provisione // Go back into the queue and wait for the Provision() method // to return false, indicating that it has no more work to // do. - info.host.ClearError() + clearError(info.host) return actionContinue{provResult.RequeueAfter} } @@ -573,7 +601,7 @@ func (r *BareMetalHostReconciler) actionDeprovisioning(prov provisioner.Provisio return recordActionFailure(info, metal3v1alpha1.RegistrationError, provResult.ErrorMessage) } if provResult.Dirty { - info.host.ClearError() + clearError(info.host) return actionContinue{provResult.RequeueAfter} } @@ -589,7 +617,7 @@ func (r *BareMetalHostReconciler) actionDeprovisioning(prov provisioner.Provisio } if provResult.Dirty { - info.host.ClearError() + clearError(info.host) return actionContinue{provResult.RequeueAfter} } @@ -623,7 +651,7 @@ func (r *BareMetalHostReconciler) manageHostPower(prov provisioner.Provisioner, } if provResult.Dirty { - info.host.ClearError() + clearError(info.host) return actionContinue{provResult.RequeueAfter} } @@ -683,7 +711,7 @@ func (r *BareMetalHostReconciler) manageHostPower(prov provisioner.Provisioner, } powerChangeAttempts.With(metricLabels).Inc() }) - info.host.ClearError() + clearError(info.host) return actionContinue{provResult.RequeueAfter} } @@ -707,7 +735,7 @@ func (r *BareMetalHostReconciler) actionManageSteadyState(prov provisioner.Provi return recordActionFailure(info, metal3v1alpha1.RegistrationError, provResult.ErrorMessage) } if provResult.Dirty { - info.host.ClearError() + clearError(info.host) return actionContinue{provResult.RequeueAfter} } @@ -728,7 +756,7 @@ func (r *BareMetalHostReconciler) actionManageReady(prov provisioner.Provisioner if dirty { info.log.Info("updating host provisioning settings") } - info.host.ClearError() + clearError(info.host) return actionComplete{} } return r.manageHostPower(prov, info) @@ -791,7 +819,7 @@ func (r *BareMetalHostReconciler) getHostStatusFromAnnotation(host *metal3v1alph func (r *BareMetalHostReconciler) setErrorCondition(request ctrl.Request, host *metal3v1alpha1.BareMetalHost, errType metal3v1alpha1.ErrorType, message string) (err error) { reqLogger := r.Log.WithValues("baremetalhost", request.NamespacedName) - host.SetErrorMessage(errType, message) + setErrorMessage(host, errType, message) reqLogger.Info( "adding error message", diff --git a/controllers/metal3.io/baremetalhost_controller_test.go b/controllers/metal3.io/baremetalhost_controller_test.go index 3642c10277..1f90b41e30 100644 --- a/controllers/metal3.io/baremetalhost_controller_test.go +++ b/controllers/metal3.io/baremetalhost_controller_test.go @@ -1174,3 +1174,162 @@ func TestProvisionerIsReady(t *testing.T) { }, ) } + +func TestUpdateEventHandler(t *testing.T) { + cases := []struct { + name string + event event.UpdateEvent + expectedProcess bool + }{ + { + name: "process-non-bmh-events", + event: event.UpdateEvent{ + ObjectOld: &corev1.Secret{}, + ObjectNew: &corev1.Secret{}, + }, + expectedProcess: true, + }, + { + name: "process-generation-change", + event: event.UpdateEvent{ + ObjectOld: &metal3v1alpha1.BareMetalHost{}, + ObjectNew: &metal3v1alpha1.BareMetalHost{}, + MetaOld: &metav1.ObjectMeta{Generation: 0}, + MetaNew: &metav1.ObjectMeta{Generation: 1}, + }, + + expectedProcess: true, + }, + { + name: "skip-if-same-generation-finalizers-and-annotations", + event: event.UpdateEvent{ + ObjectOld: &metal3v1alpha1.BareMetalHost{}, + ObjectNew: &metal3v1alpha1.BareMetalHost{}, + MetaOld: &metav1.ObjectMeta{ + Generation: 0, + Finalizers: []string{metal3v1alpha1.BareMetalHostFinalizer}, + Annotations: map[string]string{ + metal3v1alpha1.PausedAnnotation: "true", + }, + }, + MetaNew: &metav1.ObjectMeta{ + Generation: 0, + Finalizers: []string{metal3v1alpha1.BareMetalHostFinalizer}, + Annotations: map[string]string{ + metal3v1alpha1.PausedAnnotation: "true", + }, + }, + }, + + expectedProcess: false, + }, + { + name: "process-same-generation-annotations-change", + event: event.UpdateEvent{ + ObjectOld: &metal3v1alpha1.BareMetalHost{}, + ObjectNew: &metal3v1alpha1.BareMetalHost{}, + MetaOld: &metav1.ObjectMeta{ + Generation: 0, + Finalizers: []string{metal3v1alpha1.BareMetalHostFinalizer}, + Annotations: map[string]string{}, + }, + MetaNew: &metav1.ObjectMeta{ + Generation: 0, + Finalizers: []string{metal3v1alpha1.BareMetalHostFinalizer}, + Annotations: map[string]string{ + metal3v1alpha1.PausedAnnotation: "true", + }, + }, + }, + + expectedProcess: true, + }, + { + name: "process-same-generation-finalizers-change", + event: event.UpdateEvent{ + ObjectOld: &metal3v1alpha1.BareMetalHost{}, + ObjectNew: &metal3v1alpha1.BareMetalHost{}, + MetaOld: &metav1.ObjectMeta{ + Generation: 0, + Finalizers: []string{}, + Annotations: map[string]string{ + metal3v1alpha1.PausedAnnotation: "true", + }, + }, + MetaNew: &metav1.ObjectMeta{ + Generation: 0, + Finalizers: []string{metal3v1alpha1.BareMetalHostFinalizer}, + Annotations: map[string]string{ + metal3v1alpha1.PausedAnnotation: "true", + }, + }, + }, + + expectedProcess: true, + }, + { + name: "process-same-generation-finalizers-and-annotation-change", + event: event.UpdateEvent{ + ObjectOld: &metal3v1alpha1.BareMetalHost{}, + ObjectNew: &metal3v1alpha1.BareMetalHost{}, + MetaOld: &metav1.ObjectMeta{ + Generation: 0, + Finalizers: []string{}, + Annotations: map[string]string{}, + }, + MetaNew: &metav1.ObjectMeta{ + Generation: 0, + Finalizers: []string{metal3v1alpha1.BareMetalHostFinalizer}, + Annotations: map[string]string{ + metal3v1alpha1.PausedAnnotation: "true", + }, + }, + }, + + expectedProcess: true, + }, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + r := newTestReconciler() + assert.Equal(t, tc.expectedProcess, r.updateEventHandler(tc.event)) + }) + } +} + +func TestErrorCountIncrementsAlways(t *testing.T) { + + b := &metal3v1alpha1.BareMetalHost{} + assert.Equal(t, b.Status.ErrorCount, 0) + + setErrorMessage(b, metal3v1alpha1.RegistrationError, "An error message") + assert.Equal(t, b.Status.ErrorCount, 1) + + setErrorMessage(b, metal3v1alpha1.InspectionError, "Another error message") + assert.Equal(t, b.Status.ErrorCount, 2) +} + +func TestClearErrorCount(t *testing.T) { + + b := &metal3v1alpha1.BareMetalHost{ + Status: metal3v1alpha1.BareMetalHostStatus{ + ErrorCount: 5, + }, + } + + assert.True(t, clearError(b)) + assert.Equal(t, 0, b.Status.ErrorCount) +} + +func TestClearErrorCountOnlyIfNotZero(t *testing.T) { + + b := &metal3v1alpha1.BareMetalHost{ + Status: metal3v1alpha1.BareMetalHostStatus{ + ErrorCount: 5, + }, + } + + assert.True(t, clearError(b)) + assert.False(t, clearError(b)) +} From 9b5f914b4c38467f75d6d44e7093f3a9fe458073 Mon Sep 17 00:00:00 2001 From: Andrea Fasano Date: Wed, 9 Dec 2020 08:40:48 -0500 Subject: [PATCH 04/28] Filter out status updates from the reconcile loop (cherry picked from commit 9b3dbf701cad2abbe707a36373773779af4b5f4f) Signed-off-by: Honza Pokorny --- .../metal3.io/baremetalhost_controller.go | 43 +++++++++++-------- .../baremetalhost_controller_test.go | 37 +--------------- 2 files changed, 27 insertions(+), 53 deletions(-) diff --git a/controllers/metal3.io/baremetalhost_controller.go b/controllers/metal3.io/baremetalhost_controller.go index 415881a75b..73fd61b9d6 100644 --- a/controllers/metal3.io/baremetalhost_controller.go +++ b/controllers/metal3.io/baremetalhost_controller.go @@ -20,6 +20,7 @@ import ( "encoding/json" "fmt" "os" + "reflect" "strconv" "strings" "time" @@ -937,6 +938,30 @@ func hostHasFinalizer(host *metal3v1alpha1.BareMetalHost) bool { return utils.StringInList(host.Finalizers, metal3v1alpha1.BareMetalHostFinalizer) } +func (r *BareMetalHostReconciler) updateEventHandler(e event.UpdateEvent) bool { + _, oldOK := e.ObjectOld.(*metal3v1alpha1.BareMetalHost) + _, newOK := e.ObjectNew.(*metal3v1alpha1.BareMetalHost) + if !(oldOK && newOK) { + // The thing that changed wasn't a host, so we + // need to assume that we must update. This + // happens when, for example, an owned Secret + // changes. + return true + } + + //If the update increased the resource Generation then let's process it + if e.MetaNew.GetGeneration() != e.MetaOld.GetGeneration() { + return true + } + + //Discard updates that did not increase the resource Generation (such as on Status.LastUpdated), except for the finalizers or annotations + if reflect.DeepEqual(e.MetaNew.GetFinalizers(), e.MetaOld.GetFinalizers()) && reflect.DeepEqual(e.MetaNew.GetAnnotations(), e.MetaOld.GetAnnotations()) { + return false + } + + return true +} + // SetupWithManager reigsters the reconciler to be run by the manager func (r *BareMetalHostReconciler) SetupWithManager(mgr ctrl.Manager) error { @@ -964,23 +989,7 @@ func (r *BareMetalHostReconciler) SetupWithManager(mgr ctrl.Manager) error { For(&metal3v1alpha1.BareMetalHost{}). WithEventFilter( predicate.Funcs{ - UpdateFunc: func(e event.UpdateEvent) bool { - oldHost, oldOK := e.ObjectOld.(*metal3v1alpha1.BareMetalHost) - newHost, newOK := e.ObjectNew.(*metal3v1alpha1.BareMetalHost) - if !(oldOK && newOK) { - // The thing that changed wasn't a host, so we - // need to assume that we must update. This - // happens when, for example, an owned Secret - // changes. - return true - } - - if oldHost.Status.ErrorCount != newHost.Status.ErrorCount { - //skip reconcile loop - return false - } - return true - }, + UpdateFunc: r.updateEventHandler, }). WithOptions(opts). Owns(&corev1.Secret{}). diff --git a/controllers/metal3.io/baremetalhost_controller_test.go b/controllers/metal3.io/baremetalhost_controller_test.go index 1f90b41e30..90e7361423 100644 --- a/controllers/metal3.io/baremetalhost_controller_test.go +++ b/controllers/metal3.io/baremetalhost_controller_test.go @@ -19,6 +19,7 @@ import ( ctrl "sigs.k8s.io/controller-runtime" fakeclient "sigs.k8s.io/controller-runtime/pkg/client/fake" + "sigs.k8s.io/controller-runtime/pkg/event" "sigs.k8s.io/controller-runtime/pkg/reconcile" metal3v1alpha1 "github.com/metal3-io/baremetal-operator/apis/metal3.io/v1alpha1" @@ -1297,39 +1298,3 @@ func TestUpdateEventHandler(t *testing.T) { }) } } - -func TestErrorCountIncrementsAlways(t *testing.T) { - - b := &metal3v1alpha1.BareMetalHost{} - assert.Equal(t, b.Status.ErrorCount, 0) - - setErrorMessage(b, metal3v1alpha1.RegistrationError, "An error message") - assert.Equal(t, b.Status.ErrorCount, 1) - - setErrorMessage(b, metal3v1alpha1.InspectionError, "Another error message") - assert.Equal(t, b.Status.ErrorCount, 2) -} - -func TestClearErrorCount(t *testing.T) { - - b := &metal3v1alpha1.BareMetalHost{ - Status: metal3v1alpha1.BareMetalHostStatus{ - ErrorCount: 5, - }, - } - - assert.True(t, clearError(b)) - assert.Equal(t, 0, b.Status.ErrorCount) -} - -func TestClearErrorCountOnlyIfNotZero(t *testing.T) { - - b := &metal3v1alpha1.BareMetalHost{ - Status: metal3v1alpha1.BareMetalHostStatus{ - ErrorCount: 5, - }, - } - - assert.True(t, clearError(b)) - assert.False(t, clearError(b)) -} From 16fc1d5de5caa6bb40d5c3ca3c2abb3eb1448546 Mon Sep 17 00:00:00 2001 From: Steven Hardy Date: Mon, 14 Dec 2020 18:11:06 +0000 Subject: [PATCH 05/28] ironic provisioner de-duplicate image options (cherry picked from commit 177d9efb4d627f6c3d9fae92e0b68c94c06aa713) Signed-off-by: Honza Pokorny --- pkg/provisioner/ironic/ironic.go | 144 +++++++++++-------------------- 1 file changed, 52 insertions(+), 92 deletions(-) diff --git a/pkg/provisioner/ironic/ironic.go b/pkg/provisioner/ironic/ironic.go index 0e75823d7b..bc18e93108 100644 --- a/pkg/provisioner/ironic/ironic.go +++ b/pkg/provisioner/ironic/ironic.go @@ -420,75 +420,23 @@ func (p *ironicProvisioner) ValidateManagementAccess(credentialsChanged bool) (r imageData = p.host.Spec.Image } - checksum, checksumType, ok := imageData.GetChecksum() - - if ok { - p.log.Info("setting instance info", - "image_source", imageData.URL, - "image_os_hash_value", checksum, - "image_os_hash_algo", checksumType, - ) - - updates := nodes.UpdateOpts{ - nodes.UpdateOperation{ - Op: nodes.AddOp, - Path: "/instance_info/image_source", - Value: imageData.URL, - }, - nodes.UpdateOperation{ - Op: nodes.AddOp, - Path: "/instance_info/image_os_hash_value", - Value: checksum, - }, - nodes.UpdateOperation{ - Op: nodes.AddOp, - Path: "/instance_info/image_os_hash_algo", - Value: checksumType, - }, - nodes.UpdateOperation{ - Op: nodes.ReplaceOp, - Path: "/instance_uuid", - Value: string(p.host.ObjectMeta.UID), - }, - } - - // image_checksum - // - // FIXME: For older versions of ironic that do not have - // https://review.opendev.org/#/c/711816/ failing to - // include the 'image_checksum' causes ironic to refuse to - // provision the image, even if the other hash value - // parameters are given. We only want to do that for MD5, - // however, because those versions of ironic only support - // MD5 checksums. - if checksumType == string(metal3v1alpha1.MD5) { - updates = append( - updates, - nodes.UpdateOperation{ - Op: nodes.AddOp, - Path: "/instance_info/image_checksum", - Value: checksum, - }, - ) - } - - if imageData.DiskFormat != nil { - updates = append(updates, nodes.UpdateOperation{ - Op: nodes.AddOp, - Path: "/instance_info/image_disk_format", - Value: *imageData.DiskFormat, - }) + if imageData != nil { + updates, err := p.getImageUpdateOptsForNode(ironicNode, imageData) + if err != nil { + return result, errors.Wrap(err, "Could not get Image options for node") } - _, err = nodes.Update(p.client, ironicNode.UUID, updates).Extract() - switch err.(type) { - case nil: - case gophercloud.ErrDefault409: - p.log.Info("could not update host settings in ironic, busy") - result.Dirty = true - result.RequeueAfter = provisionRequeueDelay - return result, nil - default: - return result, errors.Wrap(err, "failed to update host settings in ironic") + if len(updates) != 0 { + _, err = nodes.Update(p.client, ironicNode.UUID, updates).Extract() + switch err.(type) { + case nil: + case gophercloud.ErrDefault409: + p.log.Info("could not update host settings in ironic, busy") + result.Dirty = true + result.RequeueAfter = provisionRequeueDelay + return result, nil + default: + return result, errors.Wrap(err, "failed to update host settings in ironic") + } } } } else { @@ -775,16 +723,22 @@ func (p *ironicProvisioner) UpdateHardwareState() (result provisioner.Result, er return result, nil } -func (p *ironicProvisioner) getUpdateOptsForNode(ironicNode *nodes.Node) (updates nodes.UpdateOpts, err error) { - - hwProf, err := hardware.GetProfile(p.host.HardwareProfile()) - - if err != nil { - return updates, errors.Wrap(err, - fmt.Sprintf("Could not start provisioning with bad hardware profile %s", - p.host.HardwareProfile())) +func (p *ironicProvisioner) getImageUpdateOptsForNode(ironicNode *nodes.Node, imageData *metal3v1alpha1.Image) (updates nodes.UpdateOpts, err error) { + checksum, checksumType, ok := p.host.GetImageChecksum() + if !ok { + p.log.Info("image/checksum not found for host") + return } - + // instance_uuid + p.log.Info("setting instance_uuid") + updates = append( + updates, + nodes.UpdateOperation{ + Op: nodes.ReplaceOp, + Path: "/instance_uuid", + Value: string(p.host.ObjectMeta.UID), + }, + ) // image_source var op nodes.UpdateOp if _, ok := ironicNode.InstanceInfo["image_source"]; !ok { @@ -799,12 +753,10 @@ func (p *ironicProvisioner) getUpdateOptsForNode(ironicNode *nodes.Node) (update nodes.UpdateOperation{ Op: op, Path: "/instance_info/image_source", - Value: p.host.Spec.Image.URL, + Value: imageData.URL, }, ) - checksum, checksumType, _ := p.host.GetImageChecksum() - // image_os_hash_algo if _, ok := ironicNode.InstanceInfo["image_os_hash_algo"]; !ok { op = nodes.AddOp @@ -865,30 +817,38 @@ func (p *ironicProvisioner) getUpdateOptsForNode(ironicNode *nodes.Node) (update ) } - if p.host.Spec.Image.DiskFormat != nil { + if imageData.DiskFormat != nil { updates = append(updates, nodes.UpdateOperation{ Op: nodes.AddOp, Path: "/instance_info/image_disk_format", - Value: *p.host.Spec.Image.DiskFormat, + Value: *imageData.DiskFormat, }) } + return updates, nil +} - // instance_uuid - p.log.Info("setting instance_uuid") - updates = append( - updates, - nodes.UpdateOperation{ - Op: nodes.ReplaceOp, - Path: "/instance_uuid", - Value: string(p.host.ObjectMeta.UID), - }, - ) +func (p *ironicProvisioner) getUpdateOptsForNode(ironicNode *nodes.Node) (updates nodes.UpdateOpts, err error) { + + hwProf, err := hardware.GetProfile(p.host.HardwareProfile()) + + if err != nil { + return updates, errors.Wrap(err, + fmt.Sprintf("Could not start provisioning with bad hardware profile %s", + p.host.HardwareProfile())) + } + + imageOpts, err := p.getImageUpdateOptsForNode(ironicNode, p.host.Spec.Image) + if err != nil { + return updates, errors.Wrap(err, "Could not get Image options for node") + } + updates = append(updates, imageOpts...) // root_gb // // FIXME(dhellmann): We have to provide something for the disk // size until https://storyboard.openstack.org/#!/story/2005165 is // fixed in ironic. + var op nodes.UpdateOp if _, ok := ironicNode.InstanceInfo["root_gb"]; !ok { op = nodes.AddOp p.log.Info("adding root_gb") From 65ce67faac71d4a1410b756cb2ffaa0be8ec6bd1 Mon Sep 17 00:00:00 2001 From: Zane Bitter Date: Thu, 10 Dec 2020 21:40:48 -0500 Subject: [PATCH 06/28] Clear ErrorCount independently of ErrorType/Message When a call to the provisioner succeeds rather than returning an error message, that's a good sign, and a reason to not have an error message set in the Host object. But it doesn't guarantee success: if the previous failure came at the end of some long-running operation (rather than outright rejection at the beginning), it could yet fail in exactly the same way. Clearing the ErrorType as soon as we start an operation allows us to use that field to determine whether to force the provisioner to retry in the presence of an error. (It will only be set if the last thing we did was record an error, therefore if we see it then we are at the beginning of a new retry.) One known issue with this is that if a request to change state in Ironic results in a 409 Conflict response, this just sets the Dirty flag and is indistinguishable from success when viewed outside the Ironic provisioner. If such a conflict occurs, we will effectively skip a retry attempt, increment the ErrorCount again, and sleep for even longer before the next attempt. To ensure that we actually do exponential backoff between retries, leave the ErrorCount unchanged until the whole operation actually succeeds (Dirty is no longer true). This is now decoupled from the ClearError() method that clears the ErrorType and ErrorMessage. As much as possible, do the clearing of ErrorCount in the host state machine. The exception is the steady states where we only do power management - the actionResult types currently lack enough detail to distinguish when count should be cleared when viewed from the state machine. In power management states (Ready/Available, Provisioned, Externally Provisioned), count the number of errors of any type since the power state was last successfully changed. Otherwise, the ErrorCount is cleared when an operation succesfully completes. Successful re-registration or adoption is never sufficient to clear the error count, except in the registration state. (cherry picked from commit 562b6ac58e8be1baa5e968beed5d9912a3f8f153) Signed-off-by: Honza Pokorny --- .../metal3.io/baremetalhost_controller.go | 5 +--- .../baremetalhost_controller_test.go | 12 +++++++++ controllers/metal3.io/host_state_machine.go | 10 ++++++- .../metal3.io/host_state_machine_test.go | 26 ++++++++++++++----- 4 files changed, 41 insertions(+), 12 deletions(-) diff --git a/controllers/metal3.io/baremetalhost_controller.go b/controllers/metal3.io/baremetalhost_controller.go index 73fd61b9d6..110e0ada41 100644 --- a/controllers/metal3.io/baremetalhost_controller.go +++ b/controllers/metal3.io/baremetalhost_controller.go @@ -370,10 +370,6 @@ func clearError(host *metal3v1alpha1.BareMetalHost) (dirty bool) { host.Status.ErrorMessage = "" dirty = true } - if host.Status.ErrorCount != 0 { - host.Status.ErrorCount = 0 - dirty = true - } return dirty } @@ -720,6 +716,7 @@ func (r *BareMetalHostReconciler) manageHostPower(prov provisioner.Provisioner, // state and there were no errors, so reflect the new state in the // host status field. info.host.Status.PoweredOn = info.host.Spec.Online + info.host.Status.ErrorCount = 0 return steadyStateResult } diff --git a/controllers/metal3.io/baremetalhost_controller_test.go b/controllers/metal3.io/baremetalhost_controller_test.go index 90e7361423..3c02a2b966 100644 --- a/controllers/metal3.io/baremetalhost_controller_test.go +++ b/controllers/metal3.io/baremetalhost_controller_test.go @@ -1298,3 +1298,15 @@ func TestUpdateEventHandler(t *testing.T) { }) } } + +func TestErrorCountIncrementsAlways(t *testing.T) { + + b := &metal3v1alpha1.BareMetalHost{} + assert.Equal(t, b.Status.ErrorCount, 0) + + setErrorMessage(b, metal3v1alpha1.RegistrationError, "An error message") + assert.Equal(t, b.Status.ErrorCount, 1) + + setErrorMessage(b, metal3v1alpha1.InspectionError, "Another error message") + assert.Equal(t, b.Status.ErrorCount, 2) +} diff --git a/controllers/metal3.io/host_state_machine.go b/controllers/metal3.io/host_state_machine.go index 9954a38b9e..b8e0dc70b1 100644 --- a/controllers/metal3.io/host_state_machine.go +++ b/controllers/metal3.io/host_state_machine.go @@ -240,6 +240,7 @@ func (hsm *hostStateMachine) handleRegistering(info *reconcileInfo) actionResult } else { hsm.NextState = metal3v1alpha1.StateInspecting } + hsm.Host.Status.ErrorCount = 0 return actionComplete{} } @@ -247,6 +248,7 @@ func (hsm *hostStateMachine) handleInspecting(info *reconcileInfo) actionResult actResult := hsm.Reconciler.actionInspecting(hsm.Provisioner, info) if _, complete := actResult.(actionComplete); complete { hsm.NextState = metal3v1alpha1.StateMatchProfile + hsm.Host.Status.ErrorCount = 0 } return actResult } @@ -255,12 +257,14 @@ func (hsm *hostStateMachine) handleMatchProfile(info *reconcileInfo) actionResul actResult := hsm.Reconciler.actionMatchProfile(hsm.Provisioner, info) if _, complete := actResult.(actionComplete); complete { hsm.NextState = metal3v1alpha1.StateReady + hsm.Host.Status.ErrorCount = 0 } return actResult } func (hsm *hostStateMachine) handleExternallyProvisioned(info *reconcileInfo) actionResult { if hsm.Host.Spec.ExternallyProvisioned { + // ErrorCount is cleared when appropriate inside actionManageSteadyState return hsm.Reconciler.actionManageSteadyState(hsm.Provisioner, info) } @@ -281,8 +285,8 @@ func (hsm *hostStateMachine) handleReady(info *reconcileInfo) actionResult { return actionComplete{} } + // ErrorCount is cleared when appropriate inside actionManageReady actResult := hsm.Reconciler.actionManageReady(hsm.Provisioner, info) - if _, complete := actResult.(actionComplete); complete { hsm.NextState = metal3v1alpha1.StateProvisioning } @@ -317,6 +321,7 @@ func (hsm *hostStateMachine) handleProvisioning(info *reconcileInfo) actionResul actResult := hsm.Reconciler.actionProvisioning(hsm.Provisioner, info) if _, complete := actResult.(actionComplete); complete { hsm.NextState = metal3v1alpha1.StateProvisioned + hsm.Host.Status.ErrorCount = 0 } return actResult } @@ -327,6 +332,7 @@ func (hsm *hostStateMachine) handleProvisioned(info *reconcileInfo) actionResult return actionComplete{} } + // ErrorCount is cleared when appropriate inside actionManageSteadyState return hsm.Reconciler.actionManageSteadyState(hsm.Provisioner, info) } @@ -336,6 +342,7 @@ func (hsm *hostStateMachine) handleDeprovisioning(info *reconcileInfo) actionRes if hsm.Host.DeletionTimestamp.IsZero() { if _, complete := actResult.(actionComplete); complete { hsm.NextState = metal3v1alpha1.StateReady + hsm.Host.Status.ErrorCount = 0 } } else { skipToDelete := func() actionResult { @@ -347,6 +354,7 @@ func (hsm *hostStateMachine) handleDeprovisioning(info *reconcileInfo) actionRes switch r := actResult.(type) { case actionComplete: hsm.NextState = metal3v1alpha1.StateDeleting + hsm.Host.Status.ErrorCount = 0 case actionFailed: // If the provisioner gives up deprovisioning and // deletion has been requested, continue to delete. diff --git a/controllers/metal3.io/host_state_machine_test.go b/controllers/metal3.io/host_state_machine_test.go index 7f717658e7..270511075f 100644 --- a/controllers/metal3.io/host_state_machine_test.go +++ b/controllers/metal3.io/host_state_machine_test.go @@ -234,8 +234,9 @@ func TestErrorCountIncreasedWhenRegistrationFails(t *testing.T) { func TestErrorCountCleared(t *testing.T) { tests := []struct { - Scenario string - Host *metal3v1alpha1.BareMetalHost + Scenario string + Host *metal3v1alpha1.BareMetalHost + PreserveErrorCountOnComplete bool }{ { Scenario: "registering", @@ -246,8 +247,9 @@ func TestErrorCountCleared(t *testing.T) { Host: host(metal3v1alpha1.StateInspecting).build(), }, { - Scenario: "ready", - Host: host(metal3v1alpha1.StateReady).build(), + Scenario: "ready", + Host: host(metal3v1alpha1.StateReady).build(), + PreserveErrorCountOnComplete: true, }, { Scenario: "deprovisioning", @@ -258,8 +260,9 @@ func TestErrorCountCleared(t *testing.T) { Host: host(metal3v1alpha1.StateProvisioning).SetImageURL("imageSpecUrl").build(), }, { - Scenario: "externallyProvisioned", - Host: host(metal3v1alpha1.StateExternallyProvisioned).SetExternallyProvisioned().build(), + Scenario: "externallyProvisioned", + Host: host(metal3v1alpha1.StateExternallyProvisioned).SetExternallyProvisioned().build(), + PreserveErrorCountOnComplete: true, }, } for _, tt := range tests { @@ -272,8 +275,16 @@ func TestErrorCountCleared(t *testing.T) { prov.setNextResult(true) result := hsm.ReconcileState(info) - assert.Equal(t, tt.Host.Status.ErrorCount, 0) + assert.Equal(t, 1, tt.Host.Status.ErrorCount) assert.True(t, result.Dirty()) + + prov.setNextResult(false) + hsm.ReconcileState(info) + if tt.PreserveErrorCountOnComplete { + assert.Equal(t, 1, tt.Host.Status.ErrorCount) + } else { + assert.Equal(t, 0, tt.Host.Status.ErrorCount) + } }) } } @@ -360,6 +371,7 @@ func (m *mockProvisioner) ValidateManagementAccess(credentialsChanged bool) (res } func (m *mockProvisioner) InspectHardware() (result provisioner.Result, details *metal3v1alpha1.HardwareDetails, err error) { + details = &metal3v1alpha1.HardwareDetails{} return m.nextResult, details, err } From 3460c8e8f3489746ecca167662aaeec0fd66c585 Mon Sep 17 00:00:00 2001 From: Zane Bitter Date: Thu, 10 Dec 2020 12:51:49 -0500 Subject: [PATCH 07/28] Add force flag to ValidateManagementAccess() Once we see an error in the Node, it returns to the 'enroll' [sic] state and we don't have a way of determining whether we have seen and saved that error or not. Previously we always assumed we had not, and didn't retry the validation unless the credentials had changed. Add a force flag to indicate that this is a new attempt and should start again. Fixes #739 (cherry picked from commit 84ca573754f9812c2cf622745297f9cbc78e22ae) Signed-off-by: Honza Pokorny --- .../metal3.io/baremetalhost_controller.go | 2 +- .../metal3.io/host_state_machine_test.go | 2 +- pkg/provisioner/demo/demo.go | 2 +- pkg/provisioner/empty/empty.go | 2 +- pkg/provisioner/fixture/fixture.go | 2 +- pkg/provisioner/ironic/ironic.go | 4 ++-- .../ironic/validatemanagementaccess_test.go | 22 +++++++++---------- pkg/provisioner/provisioner.go | 2 +- 8 files changed, 19 insertions(+), 19 deletions(-) diff --git a/controllers/metal3.io/baremetalhost_controller.go b/controllers/metal3.io/baremetalhost_controller.go index 110e0ada41..32f5123bd1 100644 --- a/controllers/metal3.io/baremetalhost_controller.go +++ b/controllers/metal3.io/baremetalhost_controller.go @@ -434,7 +434,7 @@ func (r *BareMetalHostReconciler) actionRegistering(prov provisioner.Provisioner info.postSaveCallbacks = append(info.postSaveCallbacks, updatedCredentials.Inc) } - provResult, err := prov.ValidateManagementAccess(credsChanged) + provResult, err := prov.ValidateManagementAccess(credsChanged, info.host.Status.ErrorType == metal3v1alpha1.RegistrationError) if err != nil { noManagementAccess.Inc() return actionError{errors.Wrap(err, "failed to validate BMC access")} diff --git a/controllers/metal3.io/host_state_machine_test.go b/controllers/metal3.io/host_state_machine_test.go index 270511075f..c93dc089ff 100644 --- a/controllers/metal3.io/host_state_machine_test.go +++ b/controllers/metal3.io/host_state_machine_test.go @@ -366,7 +366,7 @@ func (m *mockProvisioner) setNextResult(dirty bool) { } } -func (m *mockProvisioner) ValidateManagementAccess(credentialsChanged bool) (result provisioner.Result, err error) { +func (m *mockProvisioner) ValidateManagementAccess(credentialsChanged, force bool) (result provisioner.Result, err error) { return m.nextResult, err } diff --git a/pkg/provisioner/demo/demo.go b/pkg/provisioner/demo/demo.go index f118dc27c3..b19791b419 100644 --- a/pkg/provisioner/demo/demo.go +++ b/pkg/provisioner/demo/demo.go @@ -68,7 +68,7 @@ func New(host *metal3v1alpha1.BareMetalHost, bmcCreds bmc.Credentials, publisher // ValidateManagementAccess tests the connection information for the // host to verify that the location and credentials work. -func (p *demoProvisioner) ValidateManagementAccess(credentialsChanged bool) (result provisioner.Result, err error) { +func (p *demoProvisioner) ValidateManagementAccess(credentialsChanged, force bool) (result provisioner.Result, err error) { p.log.Info("testing management access") hostName := p.host.ObjectMeta.Name diff --git a/pkg/provisioner/empty/empty.go b/pkg/provisioner/empty/empty.go index 3bfd226ec4..cbc71d8164 100644 --- a/pkg/provisioner/empty/empty.go +++ b/pkg/provisioner/empty/empty.go @@ -21,7 +21,7 @@ func New(host *metal3v1alpha1.BareMetalHost, bmcCreds bmc.Credentials, publisher // ValidateManagementAccess tests the connection information for the // host to verify that the location and credentials work. -func (p *emptyProvisioner) ValidateManagementAccess(credentialsChanged bool) (provisioner.Result, error) { +func (p *emptyProvisioner) ValidateManagementAccess(credentialsChanged, force bool) (provisioner.Result, error) { return provisioner.Result{}, nil } diff --git a/pkg/provisioner/fixture/fixture.go b/pkg/provisioner/fixture/fixture.go index bcd44458f1..cae9e78bec 100644 --- a/pkg/provisioner/fixture/fixture.go +++ b/pkg/provisioner/fixture/fixture.go @@ -77,7 +77,7 @@ func NewMock(host *metal3v1alpha1.BareMetalHost, bmcCreds bmc.Credentials, publi // ValidateManagementAccess tests the connection information for the // host to verify that the location and credentials work. -func (p *fixtureProvisioner) ValidateManagementAccess(credentialsChanged bool) (result provisioner.Result, err error) { +func (p *fixtureProvisioner) ValidateManagementAccess(credentialsChanged, force bool) (result provisioner.Result, err error) { p.log.Info("testing management access") // Fill in the ID of the host in the provisioning system diff --git a/pkg/provisioner/ironic/ironic.go b/pkg/provisioner/ironic/ironic.go index bc18e93108..2e8c876d1f 100644 --- a/pkg/provisioner/ironic/ironic.go +++ b/pkg/provisioner/ironic/ironic.go @@ -333,7 +333,7 @@ func (p *ironicProvisioner) findExistingHost() (ironicNode *nodes.Node, err erro // // FIXME(dhellmann): We should rename this method to describe what it // actually does. -func (p *ironicProvisioner) ValidateManagementAccess(credentialsChanged bool) (result provisioner.Result, err error) { +func (p *ironicProvisioner) ValidateManagementAccess(credentialsChanged, force bool) (result provisioner.Result, err error) { var ironicNode *nodes.Node p.log.Info("validating management access") @@ -520,7 +520,7 @@ func (p *ironicProvisioner) ValidateManagementAccess(credentialsChanged bool) (r case nodes.Enroll: // If ironic is reporting an error, stop working on the node. - if ironicNode.LastError != "" && !credentialsChanged { + if ironicNode.LastError != "" && !(credentialsChanged || force) { result.ErrorMessage = ironicNode.LastError return result, nil } diff --git a/pkg/provisioner/ironic/validatemanagementaccess_test.go b/pkg/provisioner/ironic/validatemanagementaccess_test.go index 522b0becd9..13f4977788 100644 --- a/pkg/provisioner/ironic/validatemanagementaccess_test.go +++ b/pkg/provisioner/ironic/validatemanagementaccess_test.go @@ -33,7 +33,7 @@ func TestValidateManagementAccessNoMAC(t *testing.T) { t.Fatalf("could not create provisioner: %s", err) } - result, err := prov.ValidateManagementAccess(false) + result, err := prov.ValidateManagementAccess(false, false) if err != nil { t.Fatalf("error from ValidateManagementAccess: %s", err) } @@ -63,7 +63,7 @@ func TestValidateManagementAccessMACOptional(t *testing.T) { t.Fatalf("could not create provisioner: %s", err) } - result, err := prov.ValidateManagementAccess(false) + result, err := prov.ValidateManagementAccess(false, false) if err != nil { t.Fatalf("error from ValidateManagementAccess: %s", err) } @@ -95,7 +95,7 @@ func TestValidateManagementAccessCreateNode(t *testing.T) { t.Fatalf("could not create provisioner: %s", err) } - result, err := prov.ValidateManagementAccess(false) + result, err := prov.ValidateManagementAccess(false, false) if err != nil { t.Fatalf("error from ValidateManagementAccess: %s", err) } @@ -130,7 +130,7 @@ func TestValidateManagementAccessExistingNode(t *testing.T) { t.Fatalf("could not create provisioner: %s", err) } - result, err := prov.ValidateManagementAccess(false) + result, err := prov.ValidateManagementAccess(false, false) if err != nil { t.Fatalf("error from ValidateManagementAccess: %s", err) } @@ -193,7 +193,7 @@ func TestValidateManagementAccessExistingNodeContinue(t *testing.T) { t.Fatalf("could not create provisioner: %s", err) } - result, err := prov.ValidateManagementAccess(false) + result, err := prov.ValidateManagementAccess(false, false) if err != nil { t.Fatalf("error from ValidateManagementAccess: %s", err) } @@ -238,7 +238,7 @@ func TestValidateManagementAccessExistingNodeWaiting(t *testing.T) { t.Fatalf("could not create provisioner: %s", err) } - result, err := prov.ValidateManagementAccess(false) + result, err := prov.ValidateManagementAccess(false, false) if err != nil { t.Fatalf("error from ValidateManagementAccess: %s", err) } @@ -280,7 +280,7 @@ func TestValidateManagementAccessNewCredentials(t *testing.T) { t.Fatalf("could not create provisioner: %s", err) } - result, err := prov.ValidateManagementAccess(true) + result, err := prov.ValidateManagementAccess(true, false) if err != nil { t.Fatalf("error from ValidateManagementAccess: %s", err) } @@ -327,7 +327,7 @@ func TestValidateManagementAccessLinkExistingIronicNodeByMAC(t *testing.T) { t.Fatalf("could not create provisioner: %s", err) } - result, err := prov.ValidateManagementAccess(false) + result, err := prov.ValidateManagementAccess(false, false) if err != nil { t.Fatalf("error from ValidateManagementAccess: %s", err) } @@ -370,7 +370,7 @@ func TestValidateManagementAccessExistingPortWithWrongUUID(t *testing.T) { t.Fatalf("could not create provisioner: %s", err) } - _, err = prov.ValidateManagementAccess(false) + _, err = prov.ValidateManagementAccess(false, false) assert.EqualError(t, err, "failed to find existing host: port exists but linked node doesn't random-wrong-id: Resource not found") } @@ -412,7 +412,7 @@ func TestValidateManagementAccessExistingPortButHasName(t *testing.T) { t.Fatalf("could not create provisioner: %s", err) } - _, err = prov.ValidateManagementAccess(false) + _, err = prov.ValidateManagementAccess(false, false) assert.EqualError(t, err, "failed to find existing host: node found by MAC but has a name: wrong-name") } @@ -450,7 +450,7 @@ func TestValidateManagementAccessAddTwoHostsWithSameMAC(t *testing.T) { t.Fatalf("could not create provisioner: %s", err) } - result, err := prov.ValidateManagementAccess(false) + result, err := prov.ValidateManagementAccess(false, false) if err != nil { t.Fatalf("error from ValidateManagementAccess: %s", err) } diff --git a/pkg/provisioner/provisioner.go b/pkg/provisioner/provisioner.go index 8fe303271e..1c9aadf1af 100644 --- a/pkg/provisioner/provisioner.go +++ b/pkg/provisioner/provisioner.go @@ -43,7 +43,7 @@ type Provisioner interface { // of credentials it has are different from the credentials it has // previously been using, without implying that either set of // credentials is correct. - ValidateManagementAccess(credentialsChanged bool) (result Result, err error) + ValidateManagementAccess(credentialsChanged, force bool) (result Result, err error) // InspectHardware updates the HardwareDetails field of the host with // details of devices discovered on the hardware. It may be called From bcda3be04f685f110b8d1af3bd0c52eeb2f2eca5 Mon Sep 17 00:00:00 2001 From: Zane Bitter Date: Fri, 4 Dec 2020 15:18:17 -0500 Subject: [PATCH 08/28] Delay retry on inspect failed If inspection fails at some point before it is actually started in ironic-inspector, we were just repeatedly retrying it instead of setting an error. Instead, set an error and retry only after a backoff. (cherry picked from commit 2d02462fb45dbb5dd9dd0189ce91e0c417bbab2e) Signed-off-by: Honza Pokorny --- controllers/metal3.io/baremetalhost_controller.go | 2 +- controllers/metal3.io/host_state_machine_test.go | 2 +- pkg/provisioner/demo/demo.go | 2 +- pkg/provisioner/empty/empty.go | 2 +- pkg/provisioner/fixture/fixture.go | 2 +- pkg/provisioner/ironic/inspecthardware_test.go | 2 +- pkg/provisioner/ironic/ironic.go | 11 ++++++++++- pkg/provisioner/provisioner.go | 2 +- 8 files changed, 17 insertions(+), 8 deletions(-) diff --git a/controllers/metal3.io/baremetalhost_controller.go b/controllers/metal3.io/baremetalhost_controller.go index 32f5123bd1..b1599328f6 100644 --- a/controllers/metal3.io/baremetalhost_controller.go +++ b/controllers/metal3.io/baremetalhost_controller.go @@ -472,7 +472,7 @@ func (r *BareMetalHostReconciler) actionRegistering(prov provisioner.Provisioner func (r *BareMetalHostReconciler) actionInspecting(prov provisioner.Provisioner, info *reconcileInfo) actionResult { info.log.Info("inspecting hardware") - provResult, details, err := prov.InspectHardware() + provResult, details, err := prov.InspectHardware(info.host.Status.ErrorType == metal3v1alpha1.InspectionError) if err != nil { return actionError{errors.Wrap(err, "hardware inspection failed")} } diff --git a/controllers/metal3.io/host_state_machine_test.go b/controllers/metal3.io/host_state_machine_test.go index c93dc089ff..ae8afb14ea 100644 --- a/controllers/metal3.io/host_state_machine_test.go +++ b/controllers/metal3.io/host_state_machine_test.go @@ -370,7 +370,7 @@ func (m *mockProvisioner) ValidateManagementAccess(credentialsChanged, force boo return m.nextResult, err } -func (m *mockProvisioner) InspectHardware() (result provisioner.Result, details *metal3v1alpha1.HardwareDetails, err error) { +func (m *mockProvisioner) InspectHardware(force bool) (result provisioner.Result, details *metal3v1alpha1.HardwareDetails, err error) { details = &metal3v1alpha1.HardwareDetails{} return m.nextResult, details, err } diff --git a/pkg/provisioner/demo/demo.go b/pkg/provisioner/demo/demo.go index b19791b419..826c684f45 100644 --- a/pkg/provisioner/demo/demo.go +++ b/pkg/provisioner/demo/demo.go @@ -102,7 +102,7 @@ func (p *demoProvisioner) ValidateManagementAccess(credentialsChanged, force boo // details of devices discovered on the hardware. It may be called // multiple times, and should return true for its dirty flag until the // inspection is completed. -func (p *demoProvisioner) InspectHardware() (result provisioner.Result, details *metal3v1alpha1.HardwareDetails, err error) { +func (p *demoProvisioner) InspectHardware(force bool) (result provisioner.Result, details *metal3v1alpha1.HardwareDetails, err error) { p.log.Info("inspecting hardware", "status", p.host.OperationalStatus()) hostName := p.host.ObjectMeta.Name diff --git a/pkg/provisioner/empty/empty.go b/pkg/provisioner/empty/empty.go index cbc71d8164..e205e3f26a 100644 --- a/pkg/provisioner/empty/empty.go +++ b/pkg/provisioner/empty/empty.go @@ -29,7 +29,7 @@ func (p *emptyProvisioner) ValidateManagementAccess(credentialsChanged, force bo // details of devices discovered on the hardware. It may be called // multiple times, and should return true for its dirty flag until the // inspection is completed. -func (p *emptyProvisioner) InspectHardware() (provisioner.Result, *metal3v1alpha1.HardwareDetails, error) { +func (p *emptyProvisioner) InspectHardware(force bool) (provisioner.Result, *metal3v1alpha1.HardwareDetails, error) { return provisioner.Result{}, nil, nil } diff --git a/pkg/provisioner/fixture/fixture.go b/pkg/provisioner/fixture/fixture.go index cae9e78bec..ebc2dbfc48 100644 --- a/pkg/provisioner/fixture/fixture.go +++ b/pkg/provisioner/fixture/fixture.go @@ -98,7 +98,7 @@ func (p *fixtureProvisioner) ValidateManagementAccess(credentialsChanged, force // details of devices discovered on the hardware. It may be called // multiple times, and should return true for its dirty flag until the // inspection is completed. -func (p *fixtureProvisioner) InspectHardware() (result provisioner.Result, details *metal3v1alpha1.HardwareDetails, err error) { +func (p *fixtureProvisioner) InspectHardware(force bool) (result provisioner.Result, details *metal3v1alpha1.HardwareDetails, err error) { p.log.Info("inspecting hardware", "status", p.host.OperationalStatus()) // The inspection is ongoing. We'll need to check the fixture diff --git a/pkg/provisioner/ironic/inspecthardware_test.go b/pkg/provisioner/ironic/inspecthardware_test.go index b094dffec3..7118cb7ccc 100644 --- a/pkg/provisioner/ironic/inspecthardware_test.go +++ b/pkg/provisioner/ironic/inspecthardware_test.go @@ -152,7 +152,7 @@ func TestInspectHardware(t *testing.T) { } prov.status.ID = nodeUUID - result, details, err := prov.InspectHardware() + result, details, err := prov.InspectHardware(false) assert.Equal(t, tc.expectedDirty, result.Dirty) assert.Equal(t, time.Second*time.Duration(tc.expectedRequestAfter), result.RequeueAfter) diff --git a/pkg/provisioner/ironic/ironic.go b/pkg/provisioner/ironic/ironic.go index 2e8c876d1f..88c5496b1e 100644 --- a/pkg/provisioner/ironic/ironic.go +++ b/pkg/provisioner/ironic/ironic.go @@ -599,7 +599,7 @@ func (p *ironicProvisioner) changeNodeProvisionState(ironicNode *nodes.Node, opt // details of devices discovered on the hardware. It may be called // multiple times, and should return true for its dirty flag until the // inspection is completed. -func (p *ironicProvisioner) InspectHardware() (result provisioner.Result, details *metal3v1alpha1.HardwareDetails, err error) { +func (p *ironicProvisioner) InspectHardware(force bool) (result provisioner.Result, details *metal3v1alpha1.HardwareDetails, err error) { p.log.Info("inspecting hardware", "status", p.host.OperationalStatus()) ironicNode, err := p.findExistingHost() @@ -622,6 +622,15 @@ func (p *ironicProvisioner) InspectHardware() (result provisioner.Result, detail err = nil return default: + if nodes.ProvisionState(ironicNode.ProvisionState) == nodes.InspectFail && !force { + p.log.Info("starting inspection failed", "error", status.Error) + if ironicNode.LastError == "" { + result.ErrorMessage = "Inspection failed" + } else { + result.ErrorMessage = ironicNode.LastError + } + err = nil + } p.log.Info("updating boot mode before hardware inspection") op, value := buildCapabilitiesValue(ironicNode, p.host.Status.Provisioning.BootMode) updates := nodes.UpdateOpts{ diff --git a/pkg/provisioner/provisioner.go b/pkg/provisioner/provisioner.go index 1c9aadf1af..335506796c 100644 --- a/pkg/provisioner/provisioner.go +++ b/pkg/provisioner/provisioner.go @@ -49,7 +49,7 @@ type Provisioner interface { // details of devices discovered on the hardware. It may be called // multiple times, and should return true for its dirty flag until the // inspection is completed. - InspectHardware() (result Result, details *metal3v1alpha1.HardwareDetails, err error) + InspectHardware(force bool) (result Result, details *metal3v1alpha1.HardwareDetails, err error) // UpdateHardwareState fetches the latest hardware state of the // server and updates the HardwareDetails field of the host with From e859a75dbfde9ed220d1609d0093aff04e78ffb5 Mon Sep 17 00:00:00 2001 From: Zane Bitter Date: Thu, 10 Dec 2020 20:29:11 -0500 Subject: [PATCH 09/28] Ironic: Handle Error state correctly Error is the state that the Node goes into if deleting (i.e. deprovisioning) it fails. The only valid action from this state is to try deleting again, so do that rather than attempt to go straight to manageable. (cherry picked from commit 7791d21daa694af5bff006ee2dba17246772eed7) Signed-off-by: Honza Pokorny --- pkg/provisioner/ironic/ironic.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/provisioner/ironic/ironic.go b/pkg/provisioner/ironic/ironic.go index 88c5496b1e..dc394ce12f 100644 --- a/pkg/provisioner/ironic/ironic.go +++ b/pkg/provisioner/ironic/ironic.go @@ -1310,7 +1310,7 @@ func (p *ironicProvisioner) Deprovision() (result provisioner.Result, err error) case nodes.Error: return p.changeNodeProvisionState( ironicNode, - nodes.ProvisionStateOpts{Target: nodes.TargetManage}, + nodes.ProvisionStateOpts{Target: nodes.TargetDeleted}, ) case nodes.CleanFail: From 9e8e885060035e993557bde981df3745766f215c Mon Sep 17 00:00:00 2001 From: Zane Bitter Date: Thu, 10 Dec 2020 20:44:13 -0500 Subject: [PATCH 10/28] Delay retry on deprovision failed If deprovisioning fails, we were just repeatedly retrying it instead of setting an error. Instead, set an error and retry only after a backoff. If we are deprovisioning because of a deletion, give up after 3 retries (since this may indicate that the host simply doesn't exist any more) and allow the Host to be deleted. (cherry picked from commit d7d38eef2ade940e54d3ce9f2be910e908331bc4) Signed-off-by: Honza Pokorny --- .../metal3.io/baremetalhost_controller.go | 2 +- controllers/metal3.io/host_state_machine.go | 7 +++---- .../metal3.io/host_state_machine_test.go | 2 +- pkg/provisioner/demo/demo.go | 2 +- pkg/provisioner/empty/empty.go | 2 +- pkg/provisioner/fixture/fixture.go | 2 +- pkg/provisioner/ironic/ironic.go | 17 ++++++++++++++++- pkg/provisioner/ironic/provision_test.go | 7 ++++--- pkg/provisioner/provisioner.go | 2 +- 9 files changed, 29 insertions(+), 14 deletions(-) diff --git a/controllers/metal3.io/baremetalhost_controller.go b/controllers/metal3.io/baremetalhost_controller.go index b1599328f6..b15a9ba924 100644 --- a/controllers/metal3.io/baremetalhost_controller.go +++ b/controllers/metal3.io/baremetalhost_controller.go @@ -604,7 +604,7 @@ func (r *BareMetalHostReconciler) actionDeprovisioning(prov provisioner.Provisio info.log.Info("deprovisioning") - provResult, err = prov.Deprovision() + provResult, err = prov.Deprovision(info.host.Status.ErrorType == metal3v1alpha1.ProvisioningError) if err != nil { return actionError{errors.Wrap(err, "failed to deprovision")} } diff --git a/controllers/metal3.io/host_state_machine.go b/controllers/metal3.io/host_state_machine.go index b8e0dc70b1..449782cebe 100644 --- a/controllers/metal3.io/host_state_machine.go +++ b/controllers/metal3.io/host_state_machine.go @@ -358,10 +358,9 @@ func (hsm *hostStateMachine) handleDeprovisioning(info *reconcileInfo) actionRes case actionFailed: // If the provisioner gives up deprovisioning and // deletion has been requested, continue to delete. - // Note that this is entirely theoretical, as the - // Ironic provisioner currently never gives up - // trying to deprovision. - return skipToDelete() + if hsm.Host.Status.ErrorCount > 3 { + return skipToDelete() + } case actionError: if r.NeedsRegistration() && !hsm.haveCreds { // If the host is not registered as a node in Ironic and we diff --git a/controllers/metal3.io/host_state_machine_test.go b/controllers/metal3.io/host_state_machine_test.go index ae8afb14ea..96834305f9 100644 --- a/controllers/metal3.io/host_state_machine_test.go +++ b/controllers/metal3.io/host_state_machine_test.go @@ -387,7 +387,7 @@ func (m *mockProvisioner) Provision(configData provisioner.HostConfigData) (resu return m.nextResult, err } -func (m *mockProvisioner) Deprovision() (result provisioner.Result, err error) { +func (m *mockProvisioner) Deprovision(force bool) (result provisioner.Result, err error) { return m.nextResult, err } diff --git a/pkg/provisioner/demo/demo.go b/pkg/provisioner/demo/demo.go index 826c684f45..9fc1bc8168 100644 --- a/pkg/provisioner/demo/demo.go +++ b/pkg/provisioner/demo/demo.go @@ -218,7 +218,7 @@ func (p *demoProvisioner) Provision(hostConf provisioner.HostConfigData) (result // Deprovision removes the host from the image. It may be called // multiple times, and should return true for its dirty flag until the // deprovisioning operation is completed. -func (p *demoProvisioner) Deprovision() (result provisioner.Result, err error) { +func (p *demoProvisioner) Deprovision(force bool) (result provisioner.Result, err error) { hostName := p.host.ObjectMeta.Name switch hostName { diff --git a/pkg/provisioner/empty/empty.go b/pkg/provisioner/empty/empty.go index e205e3f26a..a50af7fc82 100644 --- a/pkg/provisioner/empty/empty.go +++ b/pkg/provisioner/empty/empty.go @@ -57,7 +57,7 @@ func (p *emptyProvisioner) Provision(hostConf provisioner.HostConfigData) (provi // Deprovision removes the host from the image. It may be called // multiple times, and should return true for its dirty flag until the // deprovisioning operation is completed. -func (p *emptyProvisioner) Deprovision() (provisioner.Result, error) { +func (p *emptyProvisioner) Deprovision(force bool) (provisioner.Result, error) { return provisioner.Result{}, nil } diff --git a/pkg/provisioner/fixture/fixture.go b/pkg/provisioner/fixture/fixture.go index ebc2dbfc48..dcb365b30d 100644 --- a/pkg/provisioner/fixture/fixture.go +++ b/pkg/provisioner/fixture/fixture.go @@ -201,7 +201,7 @@ func (p *fixtureProvisioner) Provision(hostConf provisioner.HostConfigData) (res // Deprovision removes the host from the image. It may be called // multiple times, and should return true for its dirty flag until the // deprovisioning operation is completed. -func (p *fixtureProvisioner) Deprovision() (result provisioner.Result, err error) { +func (p *fixtureProvisioner) Deprovision(force bool) (result provisioner.Result, err error) { p.log.Info("ensuring host is deprovisioned") result.RequeueAfter = deprovisionRequeueDelay diff --git a/pkg/provisioner/ironic/ironic.go b/pkg/provisioner/ironic/ironic.go index dc394ce12f..16687bbf97 100644 --- a/pkg/provisioner/ironic/ironic.go +++ b/pkg/provisioner/ironic/ironic.go @@ -1286,7 +1286,7 @@ func (p *ironicProvisioner) setMaintenanceFlag(ironicNode *nodes.Node, value boo // Deprovision removes the host from the image. It may be called // multiple times, and should return true for its dirty flag until the // deprovisioning operation is completed. -func (p *ironicProvisioner) Deprovision() (result provisioner.Result, err error) { +func (p *ironicProvisioner) Deprovision(force bool) (result provisioner.Result, err error) { p.log.Info("deprovisioning") ironicNode, err := p.findExistingHost() @@ -1308,6 +1308,17 @@ func (p *ironicProvisioner) Deprovision() (result provisioner.Result, err error) switch nodes.ProvisionState(ironicNode.ProvisionState) { case nodes.Error: + if !force { + p.log.Info("deprovisioning failed") + if ironicNode.LastError == "" { + result.ErrorMessage = "Deprovisioning failed" + } else { + result.ErrorMessage = ironicNode.LastError + } + return result, nil + } + p.log.Info("retrying deprovisioning") + p.publisher("DeprovisioningStarted", "Image deprovisioning restarted") return p.changeNodeProvisionState( ironicNode, nodes.ProvisionStateOpts{Target: nodes.TargetDeleted}, @@ -1319,6 +1330,10 @@ func (p *ironicProvisioner) Deprovision() (result provisioner.Result, err error) p.log.Info("clearing maintenance flag") return p.setMaintenanceFlag(ironicNode, false) } + // This will return us to the manageable state without completing + // cleaning. Because cleaning happens in the process of moving from + // manageable to available, the node will still get cleaned before + // we provision it again. return p.changeNodeProvisionState( ironicNode, nodes.ProvisionStateOpts{Target: nodes.TargetManage}, diff --git a/pkg/provisioner/ironic/provision_test.go b/pkg/provisioner/ironic/provision_test.go index 30156e5fd2..5807c59ffe 100644 --- a/pkg/provisioner/ironic/provision_test.go +++ b/pkg/provisioner/ironic/provision_test.go @@ -116,6 +116,7 @@ func TestDeprovision(t *testing.T) { ironic *testserver.IronicMock expectedDirty bool expectedError bool + expectedErrorMessage bool expectedRequestAfter int }{ { @@ -133,8 +134,7 @@ func TestDeprovision(t *testing.T) { ProvisionState: string(nodes.Error), UUID: nodeUUID, }), - expectedRequestAfter: 10, - expectedDirty: true, + expectedErrorMessage: true, }, { name: "available state", @@ -215,9 +215,10 @@ func TestDeprovision(t *testing.T) { } prov.status.ID = nodeUUID - result, err := prov.Deprovision() + result, err := prov.Deprovision(false) assert.Equal(t, tc.expectedDirty, result.Dirty) + assert.Equal(t, tc.expectedErrorMessage, result.ErrorMessage != "") assert.Equal(t, time.Second*time.Duration(tc.expectedRequestAfter), result.RequeueAfter) if !tc.expectedError { assert.NoError(t, err) diff --git a/pkg/provisioner/provisioner.go b/pkg/provisioner/provisioner.go index 335506796c..a4677027ce 100644 --- a/pkg/provisioner/provisioner.go +++ b/pkg/provisioner/provisioner.go @@ -70,7 +70,7 @@ type Provisioner interface { // Deprovision removes the host from the image. It may be called // multiple times, and should return true for its dirty flag until // the deprovisioning operation is completed. - Deprovision() (result Result, err error) + Deprovision(force bool) (result Result, err error) // Delete removes the host from the provisioning system. It may be // called multiple times, and should return true for its dirty From d5b715fa3be86075076e0904d1b04e27c9e147b8 Mon Sep 17 00:00:00 2001 From: Zane Bitter Date: Thu, 10 Dec 2020 20:52:37 -0500 Subject: [PATCH 11/28] provisioner: Fix copy-paste error in docs (cherry picked from commit 7c3daefee2ab4d46e994162dc2dcc19bae59585d) Signed-off-by: Honza Pokorny --- pkg/provisioner/provisioner.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/provisioner/provisioner.go b/pkg/provisioner/provisioner.go index a4677027ce..e5e01f2c29 100644 --- a/pkg/provisioner/provisioner.go +++ b/pkg/provisioner/provisioner.go @@ -74,7 +74,7 @@ type Provisioner interface { // Delete removes the host from the provisioning system. It may be // called multiple times, and should return true for its dirty - // flag until the deprovisioning operation is completed. + // flag until the deletion operation is completed. Delete() (result Result, err error) // PowerOn ensures the server is powered on independently of any image From 048a4483f5fc379eeef87b98a5aa2511120b30b3 Mon Sep 17 00:00:00 2001 From: Zane Bitter Date: Tue, 5 Jan 2021 11:50:38 -0500 Subject: [PATCH 12/28] Ironic: Use retryAfterDelay() to handle conflicts Denote instances where we are returning due to a 409 Conflict in Ironic by calling the retryAfterDelay() function to get a Result. Note that in some cases this introduces a delay where previously there was none. Delaying after a 409 response is probably always the right thing to do. (cherry picked from commit ac23a91eae17e4f698237c6f10cccc8e038ae291) Signed-off-by: Honza Pokorny --- pkg/provisioner/ironic/delete_test.go | 4 ++-- pkg/provisioner/ironic/ironic.go | 27 +++++++++++---------------- 2 files changed, 13 insertions(+), 18 deletions(-) diff --git a/pkg/provisioner/ironic/delete_test.go b/pkg/provisioner/ironic/delete_test.go index 16b8e43889..00a1eb9a9f 100644 --- a/pkg/provisioner/ironic/delete_test.go +++ b/pkg/provisioner/ironic/delete_test.go @@ -50,7 +50,7 @@ func TestDelete(t *testing.T) { }, ).DeleteError(nodeUUID, http.StatusConflict), expectedDirty: true, - expectedRequestAfter: 0, + expectedRequestAfter: provisionRequeueDelay, }, { name: "delete-host-not-found", @@ -122,7 +122,7 @@ func TestDelete(t *testing.T) { ).NodeUpdateError(nodeUUID, http.StatusConflict), expectedDirty: true, - expectedRequestAfter: 0, + expectedRequestAfter: provisionRequeueDelay, }, { name: "not-in-maintenance-update", diff --git a/pkg/provisioner/ironic/ironic.go b/pkg/provisioner/ironic/ironic.go index 16687bbf97..f45083368c 100644 --- a/pkg/provisioner/ironic/ironic.go +++ b/pkg/provisioner/ironic/ironic.go @@ -431,9 +431,7 @@ func (p *ironicProvisioner) ValidateManagementAccess(credentialsChanged, force b case nil: case gophercloud.ErrDefault409: p.log.Info("could not update host settings in ironic, busy") - result.Dirty = true - result.RequeueAfter = provisionRequeueDelay - return result, nil + return retryAfterDelay(provisionRequeueDelay) default: return result, errors.Wrap(err, "failed to update host settings in ironic") } @@ -465,9 +463,7 @@ func (p *ironicProvisioner) ValidateManagementAccess(credentialsChanged, force b case nil: case gophercloud.ErrDefault409: p.log.Info("could not update ironic node name, busy") - result.Dirty = true - result.RequeueAfter = provisionRequeueDelay - return result, nil + return retryAfterDelay(provisionRequeueDelay) default: return result, errors.Wrap(err, "failed to update ironc node name") } @@ -490,9 +486,7 @@ func (p *ironicProvisioner) ValidateManagementAccess(credentialsChanged, force b case nil: case gophercloud.ErrDefault409: p.log.Info("could not update host driver settings, busy") - result.Dirty = true - result.RequeueAfter = provisionRequeueDelay - return result, nil + return retryAfterDelay(provisionRequeueDelay) default: return result, errors.Wrap(err, "failed to update host driver settings") } @@ -579,6 +573,8 @@ func (p *ironicProvisioner) tryChangeNodeProvisionState(ironicNode *nodes.Node, success = true case gophercloud.ErrDefault409: p.log.Info("could not change state of host, busy") + result, err = retryAfterDelay(provisionRequeueDelay) + return default: err = errors.Wrap(changeResult.Err, fmt.Sprintf("failed to change provisioning state to %q", opts.Target)) @@ -645,7 +641,7 @@ func (p *ironicProvisioner) InspectHardware(force bool) (result provisioner.Resu case nil: case gophercloud.ErrDefault409: p.log.Info("could not update host settings in ironic, busy") - result.Dirty = true + result, err = retryAfterDelay(provisionRequeueDelay) return default: err = errors.Wrap(err, "failed to update host boot mode settings in ironic") @@ -1016,8 +1012,7 @@ func (p *ironicProvisioner) startProvisioning(ironicNode *nodes.Node, hostConf p case nil: case gophercloud.ErrDefault409: p.log.Info("could not update host settings in ironic, busy") - result.Dirty = true - return result, nil + return retryAfterDelay(provisionRequeueDelay) default: return result, errors.Wrap(err, "failed to update host settings in ironic") } @@ -1029,8 +1024,7 @@ func (p *ironicProvisioner) startProvisioning(ironicNode *nodes.Node, hostConf p case nil: case gophercloud.ErrDefault409: p.log.Info("could not validate host during registration, busy") - result.Dirty = true - return result, nil + return retryAfterDelay(provisionRequeueDelay) default: return result, errors.Wrap(err, "failed to validate host during registration") } @@ -1276,6 +1270,7 @@ func (p *ironicProvisioner) setMaintenanceFlag(ironicNode *nodes.Node, value boo case nil: case gophercloud.ErrDefault409: p.log.Info("could not set host maintenance flag, busy") + return retryAfterDelay(provisionRequeueDelay) default: return result, errors.Wrap(err, "failed to set host maintenance flag") } @@ -1437,6 +1432,7 @@ func (p *ironicProvisioner) Delete() (result provisioner.Result, err error) { p.log.Info("removed") case gophercloud.ErrDefault409: p.log.Info("could not remove host, busy") + return retryAfterDelay(provisionRequeueDelay) case gophercloud.ErrDefault404: p.log.Info("did not find host to delete, OK") default: @@ -1478,8 +1474,7 @@ func (p *ironicProvisioner) changePower(ironicNode *nodes.Node, target nodes.Tar p.log.Info("power change OK") case gophercloud.ErrDefault409: p.log.Info("host is locked, trying again after delay", "delay", powerRequeueDelay) - result.Dirty = true - result.RequeueAfter = powerRequeueDelay + result, _ = retryAfterDelay(powerRequeueDelay) return result, HostLockedError{Address: p.host.Spec.BMC.Address} case gophercloud.ErrDefault400: // Error 400 Bad Request means target power state is not supported by vendor driver From 693b4fdf51dcd67bf66a7fe4e3be4de73f123027 Mon Sep 17 00:00:00 2001 From: Zane Bitter Date: Tue, 5 Jan 2021 17:38:17 -0500 Subject: [PATCH 13/28] Ironic: Use retryAfterDelay() for other retries (cherry picked from commit 5faf3616e439e852d7d1a08697b4b21b4f50a8b4) Signed-off-by: Honza Pokorny --- pkg/provisioner/ironic/ironic.go | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/pkg/provisioner/ironic/ironic.go b/pkg/provisioner/ironic/ironic.go index f45083368c..dad7c4f450 100644 --- a/pkg/provisioner/ironic/ironic.go +++ b/pkg/provisioner/ironic/ironic.go @@ -1151,8 +1151,7 @@ func (p *ironicProvisioner) Provision(hostConf provisioner.HostConfigData) (resu // top of relational databases... if ironicNode.LastError == "" { p.log.Info("failed but error message not available") - result.Dirty = true - return result, nil + return retryAfterDelay(provisionRequeueDelay) } p.log.Info("found error", "msg", ironicNode.LastError) result.ErrorMessage = fmt.Sprintf("Image provisioning failed: %s", @@ -1532,9 +1531,7 @@ func (p *ironicProvisioner) PowerOff() (result provisioner.Result, err error) { case SoftPowerOffUnsupportedError, SoftPowerOffFailed: return p.hardPowerOff() case HostLockedError: - result.RequeueAfter = powerRequeueDelay - result.Dirty = true - return result, nil + return retryAfterDelay(powerRequeueDelay) default: result.RequeueAfter = powerRequeueDelay return result, err From 829c11969c83c1b580a1fa1786e670788a5898af Mon Sep 17 00:00:00 2001 From: Zane Bitter Date: Thu, 7 Jan 2021 17:06:40 -0500 Subject: [PATCH 14/28] Ironic: Use operationComplete() when complete (cherry picked from commit c1e13c5097fc48e66178f293b9d18fc108200a97) Signed-off-by: Honza Pokorny --- pkg/provisioner/ironic/ironic.go | 25 +++++++++++++++--------- pkg/provisioner/ironic/provision_test.go | 2 +- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/pkg/provisioner/ironic/ironic.go b/pkg/provisioner/ironic/ironic.go index dad7c4f450..25ce7350f6 100644 --- a/pkg/provisioner/ironic/ironic.go +++ b/pkg/provisioner/ironic/ironic.go @@ -358,6 +358,8 @@ func (p *ironicProvisioner) ValidateManagementAccess(credentialsChanged, force b driverInfo["deploy_kernel"] = deployKernelURL driverInfo["deploy_ramdisk"] = deployRamdiskURL + result, err = operationComplete() + // If we have not found a node yet, we need to create one if ironicNode == nil { p.log.Info("registering host in ironic") @@ -542,21 +544,21 @@ func (p *ironicProvisioner) ValidateManagementAccess(credentialsChanged, force b case nodes.Manageable: p.log.Info("have manageable host") - return result, nil + return case nodes.Available: // The host is fully registered (and probably wasn't cleanly // deleted previously) p.log.Info("have available host") - return result, nil + return case nodes.Active: // The host is already running, maybe it's a master? p.log.Info("have active host", "image_source", ironicNode.InstanceInfo["image_source"]) - return result, nil + return default: - return result, nil + return } } @@ -687,6 +689,7 @@ func (p *ironicProvisioner) InspectHardware(force bool) (result provisioner.Resu details = hardwaredetails.GetHardwareDetails(data) p.publisher("InspectionComplete", "Hardware inspection completed") + result, err = operationComplete() return } @@ -1064,6 +1067,7 @@ func (p *ironicProvisioner) Adopt(force bool) (result provisioner.Result, err er case nodes.Enroll, nodes.Verifying: err = fmt.Errorf("Invalid state for adopt: %s", ironicNode.ProvisionState) + return case nodes.Manageable: _, hasImageSource := ironicNode.InstanceInfo["image_source"] _, hasBootISO := ironicNode.InstanceInfo["boot_iso"] @@ -1087,6 +1091,7 @@ func (p *ironicProvisioner) Adopt(force bool) (result provisioner.Result, err er case nodes.Adopting: result.RequeueAfter = provisionRequeueDelay result.Dirty = true + return case nodes.AdoptFail: if force { return p.changeNodeProvisionState( @@ -1098,11 +1103,12 @@ func (p *ironicProvisioner) Adopt(force bool) (result provisioner.Result, err er } else { result.ErrorMessage = fmt.Sprintf("Host adoption failed: %s", ironicNode.LastError) + return } case nodes.Active: default: } - return + return operationComplete() } // Provision writes the image from the host spec to the host. It may @@ -1241,7 +1247,7 @@ func (p *ironicProvisioner) Provision(hostConf provisioner.HostConfigData) (resu p.publisher("ProvisioningComplete", fmt.Sprintf("Image provisioning completed for %s", p.host.Spec.Image.URL)) p.log.Info("finished provisioning") - return result, nil + return operationComplete() default: // wait states like cleaning and clean wait @@ -1343,7 +1349,7 @@ func (p *ironicProvisioner) Deprovision(force bool) (result provisioner.Result, case nodes.Available: p.publisher("DeprovisioningComplete", "Image deprovisioning completed") - return result, nil + return operationComplete() case nodes.Deleting: p.log.Info("deleting") @@ -1389,7 +1395,7 @@ func (p *ironicProvisioner) Delete() (result provisioner.Result, err error) { } if ironicNode == nil { p.log.Info("no node found, already deleted") - return result, nil + return operationComplete() } p.log.Info("deleting host", @@ -1561,9 +1567,10 @@ func (p *ironicProvisioner) hardPowerOff() (result provisioner.Result, err error return result, errors.Wrap(err, "failed to power off host") } p.publisher("PowerOff", "Host powered off") + return result, err } - return result, nil + return operationComplete() } // softPowerOff sends 'soft power off' request to BM node. diff --git a/pkg/provisioner/ironic/provision_test.go b/pkg/provisioner/ironic/provision_test.go index 5807c59ffe..450e4caae0 100644 --- a/pkg/provisioner/ironic/provision_test.go +++ b/pkg/provisioner/ironic/provision_test.go @@ -57,7 +57,7 @@ func TestProvision(t *testing.T) { ProvisionState: string(nodes.Active), UUID: nodeUUID, }), - expectedRequestAfter: 10, + expectedRequestAfter: 0, expectedDirty: false, }, { From 74b5da4887704c9fd4d0533f65c6b14b3128914f Mon Sep 17 00:00:00 2001 From: Zane Bitter Date: Tue, 5 Jan 2021 17:38:44 -0500 Subject: [PATCH 15/28] Ironic: Return operationContinuing() for delays (cherry picked from commit 54c8028412ee1c6274b895be6c72a9359af380bb) Signed-off-by: Honza Pokorny --- pkg/provisioner/ironic/ironic.go | 65 +++++++++----------------------- 1 file changed, 17 insertions(+), 48 deletions(-) diff --git a/pkg/provisioner/ironic/ironic.go b/pkg/provisioner/ironic/ironic.go index 25ce7350f6..480d86342d 100644 --- a/pkg/provisioner/ironic/ironic.go +++ b/pkg/provisioner/ironic/ironic.go @@ -524,9 +524,7 @@ func (p *ironicProvisioner) ValidateManagementAccess(credentialsChanged, force b if ironicNode.TargetProvisionState == string(nodes.TargetManage) { // We have already tried to manage the node and did not // get an error, so do nothing and keep trying. - result.Dirty = true - result.RequeueAfter = provisionRequeueDelay - return result, nil + return operationContinuing(provisionRequeueDelay) } return p.changeNodeProvisionState( @@ -538,9 +536,7 @@ func (p *ironicProvisioner) ValidateManagementAccess(credentialsChanged, force b // If we're still waiting for the state to change in Ironic, // return true to indicate that we're dirty and need to be // reconciled again. - result.RequeueAfter = provisionRequeueDelay - result.Dirty = true - return result, nil + return operationContinuing(provisionRequeueDelay) case nodes.Manageable: p.log.Info("have manageable host") @@ -583,8 +579,7 @@ func (p *ironicProvisioner) tryChangeNodeProvisionState(ironicNode *nodes.Node, return } - result.Dirty = true - result.RequeueAfter = provisionRequeueDelay + result, err = operationContinuing(provisionRequeueDelay) return } @@ -615,9 +610,7 @@ func (p *ironicProvisioner) InspectHardware(force bool) (result provisioner.Resu switch nodes.ProvisionState(ironicNode.ProvisionState) { case nodes.Inspecting, nodes.InspectWait: p.log.Info("inspection already started") - result.Dirty = true - result.RequeueAfter = introspectionRequeueDelay - err = nil + result, err = operationContinuing(introspectionRequeueDelay) return default: if nodes.ProvisionState(ironicNode.ProvisionState) == nodes.InspectFail && !force { @@ -667,8 +660,7 @@ func (p *ironicProvisioner) InspectHardware(force bool) (result provisioner.Resu } if !status.Finished { p.log.Info("inspection in progress", "started_at", status.StartedAt) - result.Dirty = true // make sure we check back - result.RequeueAfter = introspectionRequeueDelay + result, err = operationContinuing(introspectionRequeueDelay) return } if status.Error != "" { @@ -1089,9 +1081,7 @@ func (p *ironicProvisioner) Adopt(force bool) (result provisioner.Result, err er }, ) case nodes.Adopting: - result.RequeueAfter = provisionRequeueDelay - result.Dirty = true - return + return operationContinuing(provisionRequeueDelay) case nodes.AdoptFail: if force { return p.changeNodeProvisionState( @@ -1141,8 +1131,6 @@ func (p *ironicProvisioner) Provision(hostConf provisioner.HostConfigData) (resu "same", ironicHasSameImage, "provisionState", ironicNode.ProvisionState) - result.RequeueAfter = provisionRequeueDelay - // Ironic has the settings it needs, see if it finds any issues // with them. switch nodes.ProvisionState(ironicNode.ProvisionState) { @@ -1254,8 +1242,7 @@ func (p *ironicProvisioner) Provision(hostConf provisioner.HostConfigData) (resu p.log.Info("waiting for host to become available", "state", ironicNode.ProvisionState, "deploy step", ironicNode.DeployStep) - result.Dirty = true - return result, nil + return operationContinuing(provisionRequeueDelay) } } @@ -1279,8 +1266,7 @@ func (p *ironicProvisioner) setMaintenanceFlag(ironicNode *nodes.Node, value boo default: return result, errors.Wrap(err, "failed to set host maintenance flag") } - result.Dirty = true - return result, nil + return operationContinuing(0) } // Deprovision removes the host from the image. It may be called @@ -1354,22 +1340,16 @@ func (p *ironicProvisioner) Deprovision(force bool) (result provisioner.Result, case nodes.Deleting: p.log.Info("deleting") // Transitions to Cleaning upon completion - result.Dirty = true - result.RequeueAfter = deprovisionRequeueDelay - return result, nil + return operationContinuing(deprovisionRequeueDelay) case nodes.Cleaning: p.log.Info("cleaning") // Transitions to Available upon completion - result.Dirty = true - result.RequeueAfter = deprovisionRequeueDelay - return result, nil + return operationContinuing(deprovisionRequeueDelay) case nodes.CleanWait: p.log.Info("cleaning") - result.Dirty = true - result.RequeueAfter = deprovisionRequeueDelay - return result, nil + return operationContinuing(deprovisionRequeueDelay) case nodes.Active: p.log.Info("starting deprovisioning") @@ -1444,8 +1424,7 @@ func (p *ironicProvisioner) Delete() (result provisioner.Result, err error) { return result, errors.Wrap(err, "failed to remove host") } - result.Dirty = true - return result, nil + return operationContinuing(0) } func (p *ironicProvisioner) changePower(ironicNode *nodes.Node, target nodes.TargetPowerState) (result provisioner.Result, err error) { @@ -1456,9 +1435,7 @@ func (p *ironicProvisioner) changePower(ironicNode *nodes.Node, target nodes.Tar "state", ironicNode.ProvisionState, "target state", ironicNode.TargetProvisionState, ) - result.Dirty = true - result.RequeueAfter = powerRequeueDelay - return result, nil + return operationContinuing(powerRequeueDelay) } powerStateOpts := nodes.PowerStateOpts{ @@ -1475,8 +1452,8 @@ func (p *ironicProvisioner) changePower(ironicNode *nodes.Node, target nodes.Tar switch changeResult.Err.(type) { case nil: - result.Dirty = true p.log.Info("power change OK") + return operationContinuing(0) case gophercloud.ErrDefault409: p.log.Info("host is locked, trying again after delay", "delay", powerRequeueDelay) result, _ = retryAfterDelay(powerRequeueDelay) @@ -1489,8 +1466,6 @@ func (p *ironicProvisioner) changePower(ironicNode *nodes.Node, target nodes.Tar p.log.Info("power change error", "message", changeResult.Err) return result, errors.Wrap(changeResult.Err, "failed to change power state") } - - return result, nil } // PowerOn ensures the server is powered on independently of any image @@ -1510,9 +1485,7 @@ func (p *ironicProvisioner) PowerOn() (result provisioner.Result, err error) { if ironicNode.PowerState != powerOn { if ironicNode.TargetPowerState == powerOn { p.log.Info("waiting for power status to change") - result.RequeueAfter = powerRequeueDelay - result.Dirty = true - return result, nil + return operationContinuing(powerRequeueDelay) } result, err = p.changePower(ironicNode, nodes.PowerOn) if err != nil { @@ -1558,9 +1531,7 @@ func (p *ironicProvisioner) hardPowerOff() (result provisioner.Result, err error if ironicNode.PowerState != powerOff { if ironicNode.TargetPowerState == powerOff { p.log.Info("waiting for power status to change") - result.RequeueAfter = powerRequeueDelay - result.Dirty = true - return result, nil + return operationContinuing(powerRequeueDelay) } result, err = p.changePower(ironicNode, nodes.PowerOff) if err != nil { @@ -1591,9 +1562,7 @@ func (p *ironicProvisioner) softPowerOff() (result provisioner.Result, err error // If the target state is either powerOff or softPowerOff, then we should wait if targetState == powerOff || targetState == softPowerOff { p.log.Info("waiting for power status to change") - result.RequeueAfter = powerRequeueDelay - result.Dirty = true - return result, nil + return operationContinuing(powerRequeueDelay) } // If the target state is unset while the last error is set, // then the last execution of soft power off has failed. From 1feeb9ff851ebd7c87ea0060dc4a0c642595afef Mon Sep 17 00:00:00 2001 From: Zane Bitter Date: Thu, 7 Jan 2021 00:05:47 -0500 Subject: [PATCH 16/28] Ironic: Use transientError() to return errors (cherry picked from commit 6857a7fc6be69a6938e440b9e905c9bb0f13d5b9) Signed-off-by: Honza Pokorny --- pkg/provisioner/ironic/ironic.go | 102 +++++++++++++-------------- pkg/provisioner/ironic/power_test.go | 1 - 2 files changed, 51 insertions(+), 52 deletions(-) diff --git a/pkg/provisioner/ironic/ironic.go b/pkg/provisioner/ironic/ironic.go index 480d86342d..09cea99b48 100644 --- a/pkg/provisioner/ironic/ironic.go +++ b/pkg/provisioner/ironic/ironic.go @@ -340,7 +340,7 @@ func (p *ironicProvisioner) ValidateManagementAccess(credentialsChanged, force b ironicNode, err = p.findExistingHost() if err != nil { - return result, errors.Wrap(err, "failed to find existing host") + return transientError(errors.Wrap(err, "failed to find existing host")) } // Some BMC types require a MAC address, so ensure we have one @@ -382,7 +382,7 @@ func (p *ironicProvisioner) ValidateManagementAccess(credentialsChanged, force b }).Extract() // FIXME(dhellmann): Handle 409 and 503? errors here. if err != nil { - return result, errors.Wrap(err, "failed to register host in ironic") + return transientError(errors.Wrap(err, "failed to register host in ironic")) } p.publisher("Registered", "Registered new host") @@ -406,7 +406,7 @@ func (p *ironicProvisioner) ValidateManagementAccess(credentialsChanged, force b PXEEnabled: &enable, }).Extract() if err != nil { - return result, errors.Wrap(err, "failed to create port in ironic") + return transientError(errors.Wrap(err, "failed to create port in ironic")) } } @@ -425,7 +425,7 @@ func (p *ironicProvisioner) ValidateManagementAccess(credentialsChanged, force b if imageData != nil { updates, err := p.getImageUpdateOptsForNode(ironicNode, imageData) if err != nil { - return result, errors.Wrap(err, "Could not get Image options for node") + return transientError(errors.Wrap(err, "Could not get Image options for node")) } if len(updates) != 0 { _, err = nodes.Update(p.client, ironicNode.UUID, updates).Extract() @@ -435,7 +435,7 @@ func (p *ironicProvisioner) ValidateManagementAccess(credentialsChanged, force b p.log.Info("could not update host settings in ironic, busy") return retryAfterDelay(provisionRequeueDelay) default: - return result, errors.Wrap(err, "failed to update host settings in ironic") + return transientError(errors.Wrap(err, "failed to update host settings in ironic")) } } } @@ -467,7 +467,7 @@ func (p *ironicProvisioner) ValidateManagementAccess(credentialsChanged, force b p.log.Info("could not update ironic node name, busy") return retryAfterDelay(provisionRequeueDelay) default: - return result, errors.Wrap(err, "failed to update ironc node name") + return transientError(errors.Wrap(err, "failed to update ironc node name")) } p.log.Info("updated ironic node name") @@ -490,7 +490,7 @@ func (p *ironicProvisioner) ValidateManagementAccess(credentialsChanged, force b p.log.Info("could not update host driver settings, busy") return retryAfterDelay(provisionRequeueDelay) default: - return result, errors.Wrap(err, "failed to update host driver settings") + return transientError(errors.Wrap(err, "failed to update host driver settings")) } p.log.Info("updated host driver settings") // We don't return here because we also have to set the @@ -574,8 +574,8 @@ func (p *ironicProvisioner) tryChangeNodeProvisionState(ironicNode *nodes.Node, result, err = retryAfterDelay(provisionRequeueDelay) return default: - err = errors.Wrap(changeResult.Err, - fmt.Sprintf("failed to change provisioning state to %q", opts.Target)) + result, err = transientError(errors.Wrap(changeResult.Err, + fmt.Sprintf("failed to change provisioning state to %q", opts.Target))) return } @@ -597,11 +597,12 @@ func (p *ironicProvisioner) InspectHardware(force bool) (result provisioner.Resu ironicNode, err := p.findExistingHost() if err != nil { - err = errors.Wrap(err, "failed to find existing host") + result, err = transientError(errors.Wrap(err, "failed to find existing host")) return } if ironicNode == nil { - return result, nil, provisioner.NeedsRegistration + result, err = transientError(provisioner.NeedsRegistration) + return } status, err := introspection.GetIntrospectionStatus(p.inspector, ironicNode.UUID).Extract() @@ -639,7 +640,7 @@ func (p *ironicProvisioner) InspectHardware(force bool) (result provisioner.Resu result, err = retryAfterDelay(provisionRequeueDelay) return default: - err = errors.Wrap(err, "failed to update host boot mode settings in ironic") + result, err = transientError(errors.Wrap(err, "failed to update host boot mode settings in ironic")) return } @@ -655,7 +656,7 @@ func (p *ironicProvisioner) InspectHardware(force bool) (result provisioner.Resu return } } - err = errors.Wrap(err, "failed to extract hardware inspection status") + result, err = transientError(errors.Wrap(err, "failed to extract hardware inspection status")) return } if !status.Finished { @@ -674,7 +675,7 @@ func (p *ironicProvisioner) InspectHardware(force bool) (result provisioner.Resu introData := introspection.GetIntrospectionData(p.inspector, ironicNode.UUID) data, err := introData.Extract() if err != nil { - err = errors.Wrap(err, "failed to retrieve hardware introspection data") + result, err = transientError(errors.Wrap(err, "failed to retrieve hardware introspection data")) return } p.log.Info("received introspection data", "data", introData.Body) @@ -695,10 +696,10 @@ func (p *ironicProvisioner) UpdateHardwareState() (result provisioner.Result, er ironicNode, err := p.findExistingHost() if err != nil { - return result, errors.Wrap(err, "failed to find existing host") + return transientError(errors.Wrap(err, "failed to find existing host")) } if ironicNode == nil { - return result, provisioner.NeedsRegistration + return transientError(provisioner.NeedsRegistration) } var discoveredVal bool @@ -1000,7 +1001,7 @@ func (p *ironicProvisioner) startProvisioning(ironicNode *nodes.Node, hostConf p updates, err := p.getUpdateOptsForNode(ironicNode) if err != nil { - return result, errors.Wrap(err, "failed to update opts for node") + return transientError(errors.Wrap(err, "failed to update opts for node")) } _, err = nodes.Update(p.client, ironicNode.UUID, updates).Extract() switch err.(type) { @@ -1009,7 +1010,7 @@ func (p *ironicProvisioner) startProvisioning(ironicNode *nodes.Node, hostConf p p.log.Info("could not update host settings in ironic, busy") return retryAfterDelay(provisionRequeueDelay) default: - return result, errors.Wrap(err, "failed to update host settings in ironic") + return transientError(errors.Wrap(err, "failed to update host settings in ironic")) } p.log.Info("validating host settings") @@ -1021,7 +1022,7 @@ func (p *ironicProvisioner) startProvisioning(ironicNode *nodes.Node, hostConf p p.log.Info("could not validate host during registration, busy") return retryAfterDelay(provisionRequeueDelay) default: - return result, errors.Wrap(err, "failed to validate host during registration") + return transientError(errors.Wrap(err, "failed to validate host during registration")) } if errorMessage != "" { result.ErrorMessage = errorMessage @@ -1047,19 +1048,16 @@ func (p *ironicProvisioner) Adopt(force bool) (result provisioner.Result, err er var ironicNode *nodes.Node if ironicNode, err = p.findExistingHost(); err != nil { - err = errors.Wrap(err, "could not find host to adpot") - return + return transientError(errors.Wrap(err, "could not find host to adpot")) } if ironicNode == nil { - err = provisioner.NeedsRegistration - return + return transientError(provisioner.NeedsRegistration) } switch nodes.ProvisionState(ironicNode.ProvisionState) { case nodes.Enroll, nodes.Verifying: - err = fmt.Errorf("Invalid state for adopt: %s", - ironicNode.ProvisionState) - return + return transientError(fmt.Errorf("Invalid state for adopt: %s", + ironicNode.ProvisionState)) case nodes.Manageable: _, hasImageSource := ironicNode.InstanceInfo["image_source"] _, hasBootISO := ironicNode.InstanceInfo["boot_iso"] @@ -1108,10 +1106,10 @@ func (p *ironicProvisioner) Provision(hostConf provisioner.HostConfigData) (resu var ironicNode *nodes.Node if ironicNode, err = p.findExistingHost(); err != nil { - return result, errors.Wrap(err, "could not find host to receive image") + return transientError(errors.Wrap(err, "could not find host to receive image")) } if ironicNode == nil { - return result, provisioner.NeedsRegistration + return transientError(provisioner.NeedsRegistration) } p.log.Info("provisioning image to host", "state", ironicNode.ProvisionState) @@ -1176,17 +1174,17 @@ func (p *ironicProvisioner) Provision(hostConf provisioner.HostConfigData) (resu // Retrieve cloud-init user data userData, err := hostConf.UserData() if err != nil { - return result, errors.Wrap(err, "could not retrieve user data") + return transientError(errors.Wrap(err, "could not retrieve user data")) } // Retrieve cloud-init network_data.json. Default value is empty networkDataRaw, err := hostConf.NetworkData() if err != nil { - return result, errors.Wrap(err, "could not retrieve network data") + return transientError(errors.Wrap(err, "could not retrieve network data")) } var networkData map[string]interface{} if err = yaml.Unmarshal([]byte(networkDataRaw), &networkData); err != nil { - return result, errors.Wrap(err, "failed to unmarshal network_data.json from secret") + return transientError(errors.Wrap(err, "failed to unmarshal network_data.json from secret")) } // Retrieve cloud-init meta_data.json with falback to default @@ -1199,11 +1197,11 @@ func (p *ironicProvisioner) Provision(hostConf provisioner.HostConfigData) (resu } metaDataRaw, err := hostConf.MetaData() if err != nil { - return result, errors.Wrap(err, "could not retrieve metadata") + return transientError(errors.Wrap(err, "could not retrieve metadata")) } if metaDataRaw != "" { if err = yaml.Unmarshal([]byte(metaDataRaw), &metaData); err != nil { - return result, errors.Wrap(err, "failed to unmarshal metadata from secret") + return transientError(errors.Wrap(err, "failed to unmarshal metadata from secret")) } } @@ -1215,7 +1213,7 @@ func (p *ironicProvisioner) Provision(hostConf provisioner.HostConfigData) (resu NetworkData: networkData, } if err != nil { - return result, errors.Wrap(err, "failed to build config drive") + return transientError(errors.Wrap(err, "failed to build config drive")) } p.log.Info("triggering provisioning with config drive") } else { @@ -1264,7 +1262,7 @@ func (p *ironicProvisioner) setMaintenanceFlag(ironicNode *nodes.Node, value boo p.log.Info("could not set host maintenance flag, busy") return retryAfterDelay(provisionRequeueDelay) default: - return result, errors.Wrap(err, "failed to set host maintenance flag") + return transientError(errors.Wrap(err, "failed to set host maintenance flag")) } return operationContinuing(0) } @@ -1277,10 +1275,10 @@ func (p *ironicProvisioner) Deprovision(force bool) (result provisioner.Result, ironicNode, err := p.findExistingHost() if err != nil { - return result, errors.Wrap(err, "failed to find existing host") + return transientError(errors.Wrap(err, "failed to find existing host")) } if ironicNode == nil { - return result, provisioner.NeedsRegistration + return transientError(provisioner.NeedsRegistration) } p.log.Info("deprovisioning host", @@ -1360,7 +1358,8 @@ func (p *ironicProvisioner) Deprovision(force bool) (result provisioner.Result, ) default: - return result, fmt.Errorf("Unhandled ironic state %s", ironicNode.ProvisionState) + // FIXME(zaneb): this error is unlikely to actually be transient + return transientError(fmt.Errorf("Unhandled ironic state %s", ironicNode.ProvisionState)) } } @@ -1371,7 +1370,7 @@ func (p *ironicProvisioner) Delete() (result provisioner.Result, err error) { ironicNode, err := p.findExistingHost() if err != nil { - return result, errors.Wrap(err, "failed to find existing host") + return transientError(errors.Wrap(err, "failed to find existing host")) } if ironicNode == nil { p.log.Info("no node found, already deleted") @@ -1421,7 +1420,7 @@ func (p *ironicProvisioner) Delete() (result provisioner.Result, err error) { case gophercloud.ErrDefault404: p.log.Info("did not find host to delete, OK") default: - return result, errors.Wrap(err, "failed to remove host") + return transientError(errors.Wrap(err, "failed to remove host")) } return operationContinuing(0) @@ -1464,7 +1463,7 @@ func (p *ironicProvisioner) changePower(ironicNode *nodes.Node, target nodes.Tar return result, SoftPowerOffUnsupportedError{Address: p.host.Spec.BMC.Address} default: p.log.Info("power change error", "message", changeResult.Err) - return result, errors.Wrap(changeResult.Err, "failed to change power state") + return transientError(errors.Wrap(changeResult.Err, "failed to change power state")) } } @@ -1475,7 +1474,7 @@ func (p *ironicProvisioner) PowerOn() (result provisioner.Result, err error) { ironicNode, err := p.findExistingHost() if err != nil { - return result, errors.Wrap(err, "failed to find existing host") + return transientError(errors.Wrap(err, "failed to find existing host")) } p.log.Info("checking current state", @@ -1488,8 +1487,11 @@ func (p *ironicProvisioner) PowerOn() (result provisioner.Result, err error) { return operationContinuing(powerRequeueDelay) } result, err = p.changePower(ironicNode, nodes.PowerOn) - if err != nil { - return result, errors.Wrap(err, "failed to power on host") + switch err.(type) { + case nil: + case HostLockedError: + default: + return transientError(errors.Wrap(err, "failed to power on host")) } p.publisher("PowerOn", "Host powered on") } @@ -1512,8 +1514,7 @@ func (p *ironicProvisioner) PowerOff() (result provisioner.Result, err error) { case HostLockedError: return retryAfterDelay(powerRequeueDelay) default: - result.RequeueAfter = powerRequeueDelay - return result, err + return transientError(err) } } return result, nil @@ -1525,7 +1526,7 @@ func (p *ironicProvisioner) hardPowerOff() (result provisioner.Result, err error ironicNode, err := p.findExistingHost() if err != nil { - return result, errors.Wrap(err, "failed to find existing host") + return transientError(errors.Wrap(err, "failed to find existing host")) } if ironicNode.PowerState != powerOff { @@ -1535,7 +1536,7 @@ func (p *ironicProvisioner) hardPowerOff() (result provisioner.Result, err error } result, err = p.changePower(ironicNode, nodes.PowerOff) if err != nil { - return result, errors.Wrap(err, "failed to power off host") + return transientError(errors.Wrap(err, "failed to power off host")) } p.publisher("PowerOff", "Host powered off") return result, err @@ -1554,7 +1555,7 @@ func (p *ironicProvisioner) softPowerOff() (result provisioner.Result, err error ironicNode, err := p.findExistingHost() if err != nil { - return result, errors.Wrap(err, "failed to find existing host") + return transientError(errors.Wrap(err, "failed to find existing host")) } if ironicNode.PowerState != powerOff { @@ -1571,8 +1572,7 @@ func (p *ironicProvisioner) softPowerOff() (result provisioner.Result, err error } result, err = p.changePower(ironicNode, nodes.SoftPowerOff) if err != nil { - result.RequeueAfter = powerRequeueDelay - return result, err + return transientError(err) } p.publisher("PowerOff", "Host soft powered off") } diff --git a/pkg/provisioner/ironic/power_test.go b/pkg/provisioner/ironic/power_test.go index 890fd19cf8..8aeda72434 100644 --- a/pkg/provisioner/ironic/power_test.go +++ b/pkg/provisioner/ironic/power_test.go @@ -73,7 +73,6 @@ func TestPowerOn(t *testing.T) { }).WithNodeStatesPower(nodeUUID, http.StatusConflict).WithNodeStatesPowerUpdate(nodeUUID, http.StatusConflict), expectedRequestAfter: 10, expectedDirty: true, - expectedError: true, }, } From db6f27808f6625424ef5d0463430581a9d244bb5 Mon Sep 17 00:00:00 2001 From: Zane Bitter Date: Thu, 7 Jan 2021 00:10:25 -0500 Subject: [PATCH 17/28] Ironic: Use operationFailed() to handle failures (cherry picked from commit 1e6fd3468854824312a7b9419e3ea4d428fe54d8) Signed-off-by: Honza Pokorny --- pkg/provisioner/ironic/ironic.go | 22 ++++++++-------------- pkg/provisioner/ironic/provision_test.go | 12 ++++++++++-- 2 files changed, 18 insertions(+), 16 deletions(-) diff --git a/pkg/provisioner/ironic/ironic.go b/pkg/provisioner/ironic/ironic.go index 09cea99b48..4033e01e1d 100644 --- a/pkg/provisioner/ironic/ironic.go +++ b/pkg/provisioner/ironic/ironic.go @@ -348,8 +348,7 @@ func (p *ironicProvisioner) ValidateManagementAccess(credentialsChanged, force b if p.bmcAccess.NeedsMAC() && p.host.Spec.BootMACAddress == "" { msg := fmt.Sprintf("BMC driver %s requires a BootMACAddress value", p.bmcAccess.Type()) p.log.Info(msg) - result.ErrorMessage = msg - return result, nil + return operationFailed(msg) } driverInfo := p.bmcAccess.DriverInfo(p.bmcCreds) @@ -517,8 +516,7 @@ func (p *ironicProvisioner) ValidateManagementAccess(credentialsChanged, force b // If ironic is reporting an error, stop working on the node. if ironicNode.LastError != "" && !(credentialsChanged || force) { - result.ErrorMessage = ironicNode.LastError - return result, nil + return operationFailed(ironicNode.LastError) } if ironicNode.TargetProvisionState == string(nodes.TargetManage) { @@ -666,7 +664,7 @@ func (p *ironicProvisioner) InspectHardware(force bool) (result provisioner.Resu } if status.Error != "" { p.log.Info("inspection failed", "error", status.Error) - result.ErrorMessage = status.Error + result, err = operationFailed(status.Error) return } @@ -1025,9 +1023,7 @@ func (p *ironicProvisioner) startProvisioning(ironicNode *nodes.Node, hostConf p return transientError(errors.Wrap(err, "failed to validate host during registration")) } if errorMessage != "" { - result.ErrorMessage = errorMessage - result.Dirty = true // validateNode() would have set the errors - return result, nil + return operationFailed(errorMessage) } // If validation is successful we can start moving the host @@ -1089,9 +1085,8 @@ func (p *ironicProvisioner) Adopt(force bool) (result provisioner.Result, err er }, ) } else { - result.ErrorMessage = fmt.Sprintf("Host adoption failed: %s", - ironicNode.LastError) - return + return operationFailed(fmt.Sprintf("Host adoption failed: %s", + ironicNode.LastError)) } case nodes.Active: default: @@ -1146,9 +1141,8 @@ func (p *ironicProvisioner) Provision(hostConf provisioner.HostConfigData) (resu return retryAfterDelay(provisionRequeueDelay) } p.log.Info("found error", "msg", ironicNode.LastError) - result.ErrorMessage = fmt.Sprintf("Image provisioning failed: %s", - ironicNode.LastError) - return result, nil + return operationFailed(fmt.Sprintf("Image provisioning failed: %s", + ironicNode.LastError)) } p.log.Info("recovering from previous failure") if provResult, err := p.startProvisioning(ironicNode, hostConf); err != nil || provResult.Dirty || provResult.ErrorMessage != "" { diff --git a/pkg/provisioner/ironic/provision_test.go b/pkg/provisioner/ironic/provision_test.go index 450e4caae0..2620310312 100644 --- a/pkg/provisioner/ironic/provision_test.go +++ b/pkg/provisioner/ironic/provision_test.go @@ -22,6 +22,7 @@ func TestProvision(t *testing.T) { ironic *testserver.IronicMock expectedDirty bool expectedError bool + expectedErrorMessage bool expectedRequestAfter int }{ { @@ -31,7 +32,8 @@ func TestProvision(t *testing.T) { UUID: nodeUUID, }), expectedRequestAfter: 0, - expectedDirty: true, + expectedDirty: false, + expectedErrorMessage: true, }, { name: "manageable state", @@ -49,7 +51,8 @@ func TestProvision(t *testing.T) { UUID: nodeUUID, }), expectedRequestAfter: 0, - expectedDirty: true, + expectedDirty: false, + expectedErrorMessage: true, }, { name: "active state", @@ -99,6 +102,11 @@ func TestProvision(t *testing.T) { assert.Equal(t, tc.expectedDirty, result.Dirty) assert.Equal(t, time.Second*time.Duration(tc.expectedRequestAfter), result.RequeueAfter) + if !tc.expectedErrorMessage { + assert.Equal(t, "", result.ErrorMessage) + } else { + assert.NotEqual(t, "", result.ErrorMessage) + } if !tc.expectedError { assert.NoError(t, err) } else { From 0f15f6355b69fa74cc9ab089164363b70538cdea Mon Sep 17 00:00:00 2001 From: Zane Bitter Date: Tue, 5 Jan 2021 14:14:07 -0500 Subject: [PATCH 18/28] Ironic: Use hostUpdated() to handle Status updates Denote instances where we are returning due to data in the Host's Status structure being updated by calling the hostUpdated() function to get a Result. (cherry picked from commit 4b4c29d5a3ad1a147f3119592b567f998dbf4599) Signed-off-by: Honza Pokorny --- pkg/provisioner/ironic/ironic.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pkg/provisioner/ironic/ironic.go b/pkg/provisioner/ironic/ironic.go index 4033e01e1d..d19b4bb949 100644 --- a/pkg/provisioner/ironic/ironic.go +++ b/pkg/provisioner/ironic/ironic.go @@ -388,7 +388,7 @@ func (p *ironicProvisioner) ValidateManagementAccess(credentialsChanged, force b // Store the ID so other methods can assume it is set and so // we can find the node again later. p.status.ID = ironicNode.UUID - result.Dirty = true + result, err = hostUpdated() p.log.Info("setting provisioning id", "ID", p.status.ID) // If we know the MAC, create a port. Otherwise we will have @@ -447,7 +447,7 @@ func (p *ironicProvisioner) ValidateManagementAccess(credentialsChanged, force b // Store the ID so other methods can assume it is set and // so we can find the node using that value next time. p.status.ID = ironicNode.UUID - result.Dirty = true + result, err = hostUpdated() p.log.Info("setting provisioning id", "ID", p.status.ID) } @@ -717,7 +717,7 @@ func (p *ironicProvisioner) UpdateHardwareState() (result provisioner.Result, er if discoveredVal != p.host.Status.PoweredOn { p.log.Info("updating power status", "discovered", discoveredVal) p.host.Status.PoweredOn = discoveredVal - result.Dirty = true + return hostUpdated() } return result, nil } From 3705a6b0d7f64185e0956c74eb45788694d862a1 Mon Sep 17 00:00:00 2001 From: Zane Bitter Date: Thu, 7 Jan 2021 21:31:26 -0500 Subject: [PATCH 19/28] Return provisioner ID from ValidateManagementAccess() The provisioner interface currently requires provisioners to write to the host's Status in order to set the ID. This is undesirable. Return the provisioner's ID for the host from ValidateManagementAccess() and write it to the Status in the controller. (cherry picked from commit 50f97a7558adff7f1c04faf0452bb1c1d1d6fcfd) Signed-off-by: Honza Pokorny --- .../metal3.io/baremetalhost_controller.go | 8 ++- .../metal3.io/host_state_machine_test.go | 4 +- pkg/provisioner/demo/demo.go | 6 +- pkg/provisioner/empty/empty.go | 4 +- pkg/provisioner/fixture/fixture.go | 10 ++- pkg/provisioner/ironic/ironic.go | 67 ++++++++++--------- .../ironic/validatemanagementaccess_test.go | 32 ++++----- pkg/provisioner/provisioner.go | 2 +- 8 files changed, 72 insertions(+), 61 deletions(-) diff --git a/controllers/metal3.io/baremetalhost_controller.go b/controllers/metal3.io/baremetalhost_controller.go index b15a9ba924..05cc6b84fb 100644 --- a/controllers/metal3.io/baremetalhost_controller.go +++ b/controllers/metal3.io/baremetalhost_controller.go @@ -434,7 +434,7 @@ func (r *BareMetalHostReconciler) actionRegistering(prov provisioner.Provisioner info.postSaveCallbacks = append(info.postSaveCallbacks, updatedCredentials.Inc) } - provResult, err := prov.ValidateManagementAccess(credsChanged, info.host.Status.ErrorType == metal3v1alpha1.RegistrationError) + provResult, provID, err := prov.ValidateManagementAccess(credsChanged, info.host.Status.ErrorType == metal3v1alpha1.RegistrationError) if err != nil { noManagementAccess.Inc() return actionError{errors.Wrap(err, "failed to validate BMC access")} @@ -446,6 +446,12 @@ func (r *BareMetalHostReconciler) actionRegistering(prov provisioner.Provisioner return recordActionFailure(info, metal3v1alpha1.RegistrationError, provResult.ErrorMessage) } + if provID != "" && info.host.Status.Provisioning.ID != provID { + info.log.Info("setting provisioning id", "ID", provID) + info.host.Status.Provisioning.ID = provID + provResult.Dirty = true + } + if provResult.Dirty { info.log.Info("host not ready", "wait", provResult.RequeueAfter) clearError(info.host) diff --git a/controllers/metal3.io/host_state_machine_test.go b/controllers/metal3.io/host_state_machine_test.go index 96834305f9..42134eac48 100644 --- a/controllers/metal3.io/host_state_machine_test.go +++ b/controllers/metal3.io/host_state_machine_test.go @@ -366,8 +366,8 @@ func (m *mockProvisioner) setNextResult(dirty bool) { } } -func (m *mockProvisioner) ValidateManagementAccess(credentialsChanged, force bool) (result provisioner.Result, err error) { - return m.nextResult, err +func (m *mockProvisioner) ValidateManagementAccess(credentialsChanged, force bool) (result provisioner.Result, provID string, err error) { + return m.nextResult, "", err } func (m *mockProvisioner) InspectHardware(force bool) (result provisioner.Result, details *metal3v1alpha1.HardwareDetails, err error) { diff --git a/pkg/provisioner/demo/demo.go b/pkg/provisioner/demo/demo.go index 9fc1bc8168..10a6fb3150 100644 --- a/pkg/provisioner/demo/demo.go +++ b/pkg/provisioner/demo/demo.go @@ -68,7 +68,7 @@ func New(host *metal3v1alpha1.BareMetalHost, bmcCreds bmc.Credentials, publisher // ValidateManagementAccess tests the connection information for the // host to verify that the location and credentials work. -func (p *demoProvisioner) ValidateManagementAccess(credentialsChanged, force bool) (result provisioner.Result, err error) { +func (p *demoProvisioner) ValidateManagementAccess(credentialsChanged, force bool) (result provisioner.Result, provID string, err error) { p.log.Info("testing management access") hostName := p.host.ObjectMeta.Name @@ -88,14 +88,14 @@ func (p *demoProvisioner) ValidateManagementAccess(credentialsChanged, force boo default: if p.host.Status.Provisioning.ID == "" { - p.host.Status.Provisioning.ID = p.host.ObjectMeta.Name + provID = p.host.ObjectMeta.Name p.log.Info("setting provisioning id", "provisioningID", p.host.Status.Provisioning.ID) result.Dirty = true } } - return result, nil + return } // InspectHardware updates the HardwareDetails field of the host with diff --git a/pkg/provisioner/empty/empty.go b/pkg/provisioner/empty/empty.go index a50af7fc82..ccd2e9516c 100644 --- a/pkg/provisioner/empty/empty.go +++ b/pkg/provisioner/empty/empty.go @@ -21,8 +21,8 @@ func New(host *metal3v1alpha1.BareMetalHost, bmcCreds bmc.Credentials, publisher // ValidateManagementAccess tests the connection information for the // host to verify that the location and credentials work. -func (p *emptyProvisioner) ValidateManagementAccess(credentialsChanged, force bool) (provisioner.Result, error) { - return provisioner.Result{}, nil +func (p *emptyProvisioner) ValidateManagementAccess(credentialsChanged, force bool) (provisioner.Result, string, error) { + return provisioner.Result{}, "", nil } // InspectHardware updates the HardwareDetails field of the host with diff --git a/pkg/provisioner/fixture/fixture.go b/pkg/provisioner/fixture/fixture.go index dcb365b30d..e8b3e7de46 100644 --- a/pkg/provisioner/fixture/fixture.go +++ b/pkg/provisioner/fixture/fixture.go @@ -77,21 +77,19 @@ func NewMock(host *metal3v1alpha1.BareMetalHost, bmcCreds bmc.Credentials, publi // ValidateManagementAccess tests the connection information for the // host to verify that the location and credentials work. -func (p *fixtureProvisioner) ValidateManagementAccess(credentialsChanged, force bool) (result provisioner.Result, err error) { +func (p *fixtureProvisioner) ValidateManagementAccess(credentialsChanged, force bool) (result provisioner.Result, provID string, err error) { p.log.Info("testing management access") // Fill in the ID of the host in the provisioning system if p.host.Status.Provisioning.ID == "" { - p.host.Status.Provisioning.ID = "temporary-fake-id" - p.log.Info("setting provisioning id", - "provisioningID", p.host.Status.Provisioning.ID) + provID = "temporary-fake-id" result.Dirty = true result.RequeueAfter = time.Second * 5 p.publisher("Registered", "Registered new host") - return result, nil + return } - return result, nil + return } // InspectHardware updates the HardwareDetails field of the host with diff --git a/pkg/provisioner/ironic/ironic.go b/pkg/provisioner/ironic/ironic.go index d19b4bb949..83a0795fc7 100644 --- a/pkg/provisioner/ironic/ironic.go +++ b/pkg/provisioner/ironic/ironic.go @@ -333,14 +333,15 @@ func (p *ironicProvisioner) findExistingHost() (ironicNode *nodes.Node, err erro // // FIXME(dhellmann): We should rename this method to describe what it // actually does. -func (p *ironicProvisioner) ValidateManagementAccess(credentialsChanged, force bool) (result provisioner.Result, err error) { +func (p *ironicProvisioner) ValidateManagementAccess(credentialsChanged, force bool) (result provisioner.Result, provID string, err error) { var ironicNode *nodes.Node p.log.Info("validating management access") ironicNode, err = p.findExistingHost() if err != nil { - return transientError(errors.Wrap(err, "failed to find existing host")) + result, err = transientError(errors.Wrap(err, "failed to find existing host")) + return } // Some BMC types require a MAC address, so ensure we have one @@ -348,7 +349,8 @@ func (p *ironicProvisioner) ValidateManagementAccess(credentialsChanged, force b if p.bmcAccess.NeedsMAC() && p.host.Spec.BootMACAddress == "" { msg := fmt.Sprintf("BMC driver %s requires a BootMACAddress value", p.bmcAccess.Type()) p.log.Info(msg) - return operationFailed(msg) + result, err = operationFailed(msg) + return } driverInfo := p.bmcAccess.DriverInfo(p.bmcCreds) @@ -381,15 +383,14 @@ func (p *ironicProvisioner) ValidateManagementAccess(credentialsChanged, force b }).Extract() // FIXME(dhellmann): Handle 409 and 503? errors here. if err != nil { - return transientError(errors.Wrap(err, "failed to register host in ironic")) + result, err = transientError(errors.Wrap(err, "failed to register host in ironic")) + return } p.publisher("Registered", "Registered new host") // Store the ID so other methods can assume it is set and so // we can find the node again later. - p.status.ID = ironicNode.UUID - result, err = hostUpdated() - p.log.Info("setting provisioning id", "ID", p.status.ID) + provID = ironicNode.UUID // If we know the MAC, create a port. Otherwise we will have // to do this after we run the introspection step. @@ -397,7 +398,7 @@ func (p *ironicProvisioner) ValidateManagementAccess(credentialsChanged, force b enable := true p.log.Info("creating port for node in ironic", "MAC", p.host.Spec.BootMACAddress) - _, err := ports.Create( + _, err = ports.Create( p.client, ports.CreateOpts{ NodeUUID: ironicNode.UUID, @@ -405,7 +406,8 @@ func (p *ironicProvisioner) ValidateManagementAccess(credentialsChanged, force b PXEEnabled: &enable, }).Extract() if err != nil { - return transientError(errors.Wrap(err, "failed to create port in ironic")) + result, err = transientError(errors.Wrap(err, "failed to create port in ironic")) + return } } @@ -422,9 +424,10 @@ func (p *ironicProvisioner) ValidateManagementAccess(credentialsChanged, force b } if imageData != nil { - updates, err := p.getImageUpdateOptsForNode(ironicNode, imageData) - if err != nil { - return transientError(errors.Wrap(err, "Could not get Image options for node")) + updates, optsErr := p.getImageUpdateOptsForNode(ironicNode, imageData) + if optsErr != nil { + result, err = transientError(errors.Wrap(optsErr, "Could not get Image options for node")) + return } if len(updates) != 0 { _, err = nodes.Update(p.client, ironicNode.UUID, updates).Extract() @@ -432,9 +435,11 @@ func (p *ironicProvisioner) ValidateManagementAccess(credentialsChanged, force b case nil: case gophercloud.ErrDefault409: p.log.Info("could not update host settings in ironic, busy") - return retryAfterDelay(provisionRequeueDelay) + result, err = retryAfterDelay(provisionRequeueDelay) + return default: - return transientError(errors.Wrap(err, "failed to update host settings in ironic")) + result, err = transientError(errors.Wrap(err, "failed to update host settings in ironic")) + return } } } @@ -443,13 +448,7 @@ func (p *ironicProvisioner) ValidateManagementAccess(credentialsChanged, force b // node in ironic by looking it up. We need to check its // settings against what we have in the host, and change them // if there are differences. - if p.status.ID != ironicNode.UUID { - // Store the ID so other methods can assume it is set and - // so we can find the node using that value next time. - p.status.ID = ironicNode.UUID - result, err = hostUpdated() - p.log.Info("setting provisioning id", "ID", p.status.ID) - } + provID = ironicNode.UUID if ironicNode.Name == "" { updates := nodes.UpdateOpts{ @@ -464,9 +463,11 @@ func (p *ironicProvisioner) ValidateManagementAccess(credentialsChanged, force b case nil: case gophercloud.ErrDefault409: p.log.Info("could not update ironic node name, busy") - return retryAfterDelay(provisionRequeueDelay) + result, err = retryAfterDelay(provisionRequeueDelay) + return default: - return transientError(errors.Wrap(err, "failed to update ironc node name")) + result, err = transientError(errors.Wrap(err, "failed to update ironc node name")) + return } p.log.Info("updated ironic node name") @@ -487,9 +488,11 @@ func (p *ironicProvisioner) ValidateManagementAccess(credentialsChanged, force b case nil: case gophercloud.ErrDefault409: p.log.Info("could not update host driver settings, busy") - return retryAfterDelay(provisionRequeueDelay) + result, err = retryAfterDelay(provisionRequeueDelay) + return default: - return transientError(errors.Wrap(err, "failed to update host driver settings")) + result, err = transientError(errors.Wrap(err, "failed to update host driver settings")) + return } p.log.Info("updated host driver settings") // We don't return here because we also have to set the @@ -516,25 +519,29 @@ func (p *ironicProvisioner) ValidateManagementAccess(credentialsChanged, force b // If ironic is reporting an error, stop working on the node. if ironicNode.LastError != "" && !(credentialsChanged || force) { - return operationFailed(ironicNode.LastError) + result, err = operationFailed(ironicNode.LastError) + return } if ironicNode.TargetProvisionState == string(nodes.TargetManage) { // We have already tried to manage the node and did not // get an error, so do nothing and keep trying. - return operationContinuing(provisionRequeueDelay) + result, err = operationContinuing(provisionRequeueDelay) + return } - return p.changeNodeProvisionState( + result, err = p.changeNodeProvisionState( ironicNode, nodes.ProvisionStateOpts{Target: nodes.TargetManage}, ) + return case nodes.Verifying: // If we're still waiting for the state to change in Ironic, // return true to indicate that we're dirty and need to be // reconciled again. - return operationContinuing(provisionRequeueDelay) + result, err = operationContinuing(provisionRequeueDelay) + return case nodes.Manageable: p.log.Info("have manageable host") @@ -1138,7 +1145,7 @@ func (p *ironicProvisioner) Provision(hostConf provisioner.HostConfigData) (resu // top of relational databases... if ironicNode.LastError == "" { p.log.Info("failed but error message not available") - return retryAfterDelay(provisionRequeueDelay) + return retryAfterDelay(0) } p.log.Info("found error", "msg", ironicNode.LastError) return operationFailed(fmt.Sprintf("Image provisioning failed: %s", diff --git a/pkg/provisioner/ironic/validatemanagementaccess_test.go b/pkg/provisioner/ironic/validatemanagementaccess_test.go index 13f4977788..378d62ff63 100644 --- a/pkg/provisioner/ironic/validatemanagementaccess_test.go +++ b/pkg/provisioner/ironic/validatemanagementaccess_test.go @@ -33,7 +33,7 @@ func TestValidateManagementAccessNoMAC(t *testing.T) { t.Fatalf("could not create provisioner: %s", err) } - result, err := prov.ValidateManagementAccess(false, false) + result, _, err := prov.ValidateManagementAccess(false, false) if err != nil { t.Fatalf("error from ValidateManagementAccess: %s", err) } @@ -63,7 +63,7 @@ func TestValidateManagementAccessMACOptional(t *testing.T) { t.Fatalf("could not create provisioner: %s", err) } - result, err := prov.ValidateManagementAccess(false, false) + result, _, err := prov.ValidateManagementAccess(false, false) if err != nil { t.Fatalf("error from ValidateManagementAccess: %s", err) } @@ -95,13 +95,13 @@ func TestValidateManagementAccessCreateNode(t *testing.T) { t.Fatalf("could not create provisioner: %s", err) } - result, err := prov.ValidateManagementAccess(false, false) + result, provID, err := prov.ValidateManagementAccess(false, false) if err != nil { t.Fatalf("error from ValidateManagementAccess: %s", err) } assert.Equal(t, "", result.ErrorMessage) assert.NotEqual(t, "", createdNode.UUID) - assert.Equal(t, createdNode.UUID, host.Status.Provisioning.ID) + assert.Equal(t, createdNode.UUID, provID) } func TestValidateManagementAccessExistingNode(t *testing.T) { @@ -130,12 +130,12 @@ func TestValidateManagementAccessExistingNode(t *testing.T) { t.Fatalf("could not create provisioner: %s", err) } - result, err := prov.ValidateManagementAccess(false, false) + result, provID, err := prov.ValidateManagementAccess(false, false) if err != nil { t.Fatalf("error from ValidateManagementAccess: %s", err) } assert.Equal(t, "", result.ErrorMessage) - assert.Equal(t, "uuid", host.Status.Provisioning.ID) + assert.Equal(t, "uuid", provID) } func TestValidateManagementAccessExistingNodeContinue(t *testing.T) { @@ -193,7 +193,7 @@ func TestValidateManagementAccessExistingNodeContinue(t *testing.T) { t.Fatalf("could not create provisioner: %s", err) } - result, err := prov.ValidateManagementAccess(false, false) + result, _, err := prov.ValidateManagementAccess(false, false) if err != nil { t.Fatalf("error from ValidateManagementAccess: %s", err) } @@ -238,7 +238,7 @@ func TestValidateManagementAccessExistingNodeWaiting(t *testing.T) { t.Fatalf("could not create provisioner: %s", err) } - result, err := prov.ValidateManagementAccess(false, false) + result, _, err := prov.ValidateManagementAccess(false, false) if err != nil { t.Fatalf("error from ValidateManagementAccess: %s", err) } @@ -280,12 +280,12 @@ func TestValidateManagementAccessNewCredentials(t *testing.T) { t.Fatalf("could not create provisioner: %s", err) } - result, err := prov.ValidateManagementAccess(true, false) + result, provID, err := prov.ValidateManagementAccess(true, false) if err != nil { t.Fatalf("error from ValidateManagementAccess: %s", err) } assert.Equal(t, "", result.ErrorMessage) - assert.Equal(t, "uuid", host.Status.Provisioning.ID) + assert.Equal(t, "uuid", provID) updates := ironic.GetLastNodeUpdateRequestFor("uuid") newValues := updates[0].Value.(map[string]interface{}) @@ -327,12 +327,12 @@ func TestValidateManagementAccessLinkExistingIronicNodeByMAC(t *testing.T) { t.Fatalf("could not create provisioner: %s", err) } - result, err := prov.ValidateManagementAccess(false, false) + result, provID, err := prov.ValidateManagementAccess(false, false) if err != nil { t.Fatalf("error from ValidateManagementAccess: %s", err) } assert.Equal(t, "", result.ErrorMessage) - assert.NotEqual(t, "", host.Status.Provisioning.ID) + assert.NotEqual(t, "", provID) } func TestValidateManagementAccessExistingPortWithWrongUUID(t *testing.T) { @@ -370,7 +370,7 @@ func TestValidateManagementAccessExistingPortWithWrongUUID(t *testing.T) { t.Fatalf("could not create provisioner: %s", err) } - _, err = prov.ValidateManagementAccess(false, false) + _, _, err = prov.ValidateManagementAccess(false, false) assert.EqualError(t, err, "failed to find existing host: port exists but linked node doesn't random-wrong-id: Resource not found") } @@ -412,7 +412,7 @@ func TestValidateManagementAccessExistingPortButHasName(t *testing.T) { t.Fatalf("could not create provisioner: %s", err) } - _, err = prov.ValidateManagementAccess(false, false) + _, _, err = prov.ValidateManagementAccess(false, false) assert.EqualError(t, err, "failed to find existing host: node found by MAC but has a name: wrong-name") } @@ -450,10 +450,10 @@ func TestValidateManagementAccessAddTwoHostsWithSameMAC(t *testing.T) { t.Fatalf("could not create provisioner: %s", err) } - result, err := prov.ValidateManagementAccess(false, false) + result, provID, err := prov.ValidateManagementAccess(false, false) if err != nil { t.Fatalf("error from ValidateManagementAccess: %s", err) } assert.Equal(t, "", result.ErrorMessage) - assert.NotEqual(t, "", host.Status.Provisioning.ID) + assert.NotEqual(t, "", provID) } diff --git a/pkg/provisioner/provisioner.go b/pkg/provisioner/provisioner.go index e5e01f2c29..a8e96377a8 100644 --- a/pkg/provisioner/provisioner.go +++ b/pkg/provisioner/provisioner.go @@ -43,7 +43,7 @@ type Provisioner interface { // of credentials it has are different from the credentials it has // previously been using, without implying that either set of // credentials is correct. - ValidateManagementAccess(credentialsChanged, force bool) (result Result, err error) + ValidateManagementAccess(credentialsChanged, force bool) (result Result, provID string, err error) // InspectHardware updates the HardwareDetails field of the host with // details of devices discovered on the hardware. It may be called From 260f79b11f7db741ec20bfa277270afa35fb96d6 Mon Sep 17 00:00:00 2001 From: Zane Bitter Date: Fri, 8 Jan 2021 10:38:15 -0500 Subject: [PATCH 20/28] Return power status from UpdateHardwareState() The provisioner interface currently requires provisioners to write to the host's Status in order to set the power status. This is undesirable. Return the power status (if known) for the host from UpdateHardwareState(), rather than a Result structure from which only the Dirty flag was used. Write the updated power state to the Status in the controller. (cherry picked from commit cd60f7dfae393e5e6725d736aef7ae25e3d716dc) Signed-off-by: Honza Pokorny --- .../metal3.io/baremetalhost_controller.go | 12 ++++---- .../metal3.io/host_state_machine_test.go | 4 +-- pkg/provisioner/demo/demo.go | 8 ++--- pkg/provisioner/empty/empty.go | 7 ++--- pkg/provisioner/fixture/fixture.go | 8 ++--- pkg/provisioner/ironic/ironic.go | 29 +++++++------------ .../ironic/updatehardwarestate_test.go | 25 ++++++++-------- pkg/provisioner/provisioner.go | 12 ++++++-- 8 files changed, 47 insertions(+), 58 deletions(-) diff --git a/controllers/metal3.io/baremetalhost_controller.go b/controllers/metal3.io/baremetalhost_controller.go index 05cc6b84fb..aa5cb8174e 100644 --- a/controllers/metal3.io/baremetalhost_controller.go +++ b/controllers/metal3.io/baremetalhost_controller.go @@ -644,18 +644,16 @@ func (r *BareMetalHostReconciler) manageHostPower(prov provisioner.Provisioner, var provResult provisioner.Result // Check the current status and save it before trying to update it. - provResult, err := prov.UpdateHardwareState() + hwState, err := prov.UpdateHardwareState() if err != nil { return actionError{errors.Wrap(err, "failed to update the host power status")} } - if provResult.ErrorMessage != "" { - return recordActionFailure(info, metal3v1alpha1.PowerManagementError, provResult.ErrorMessage) - } - - if provResult.Dirty { + if hwState.PoweredOn != nil && *hwState.PoweredOn != info.host.Status.PoweredOn { + info.log.Info("updating power status", "discovered", *hwState.PoweredOn) + info.host.Status.PoweredOn = *hwState.PoweredOn clearError(info.host) - return actionContinue{provResult.RequeueAfter} + return actionContinue{} } desiredPowerOnState := info.host.Spec.Online diff --git a/controllers/metal3.io/host_state_machine_test.go b/controllers/metal3.io/host_state_machine_test.go index 42134eac48..6ae6145f2e 100644 --- a/controllers/metal3.io/host_state_machine_test.go +++ b/controllers/metal3.io/host_state_machine_test.go @@ -375,8 +375,8 @@ func (m *mockProvisioner) InspectHardware(force bool) (result provisioner.Result return m.nextResult, details, err } -func (m *mockProvisioner) UpdateHardwareState() (result provisioner.Result, err error) { - return m.nextResult, err +func (m *mockProvisioner) UpdateHardwareState() (hwState provisioner.HardwareState, err error) { + return } func (m *mockProvisioner) Adopt(force bool) (result provisioner.Result, err error) { diff --git a/pkg/provisioner/demo/demo.go b/pkg/provisioner/demo/demo.go index 10a6fb3150..982fd4ea19 100644 --- a/pkg/provisioner/demo/demo.go +++ b/pkg/provisioner/demo/demo.go @@ -174,12 +174,10 @@ func (p *demoProvisioner) InspectHardware(force bool) (result provisioner.Result // UpdateHardwareState fetches the latest hardware state of the server // and updates the HardwareDetails field of the host with details. It // is expected to do this in the least expensive way possible, such as -// reading from a cache, and return dirty only if any state -// information has changed. -func (p *demoProvisioner) UpdateHardwareState() (result provisioner.Result, err error) { +// reading from a cache. +func (p *demoProvisioner) UpdateHardwareState() (hwState provisioner.HardwareState, err error) { p.log.Info("updating hardware state") - result.Dirty = false - return result, nil + return } // Adopt allows an externally-provisioned server to be adopted. diff --git a/pkg/provisioner/empty/empty.go b/pkg/provisioner/empty/empty.go index ccd2e9516c..842652f9c6 100644 --- a/pkg/provisioner/empty/empty.go +++ b/pkg/provisioner/empty/empty.go @@ -36,10 +36,9 @@ func (p *emptyProvisioner) InspectHardware(force bool) (provisioner.Result, *met // UpdateHardwareState fetches the latest hardware state of the server // and updates the HardwareDetails field of the host with details. It // is expected to do this in the least expensive way possible, such as -// reading from a cache, and return dirty only if any state -// information has changed. -func (p *emptyProvisioner) UpdateHardwareState() (provisioner.Result, error) { - return provisioner.Result{}, nil +// reading from a cache. +func (p *emptyProvisioner) UpdateHardwareState() (provisioner.HardwareState, error) { + return provisioner.HardwareState{}, nil } // Adopt allows an externally-provisioned server to be adopted. diff --git a/pkg/provisioner/fixture/fixture.go b/pkg/provisioner/fixture/fixture.go index e8b3e7de46..8e4470d7f9 100644 --- a/pkg/provisioner/fixture/fixture.go +++ b/pkg/provisioner/fixture/fixture.go @@ -157,14 +157,12 @@ func (p *fixtureProvisioner) InspectHardware(force bool) (result provisioner.Res // UpdateHardwareState fetches the latest hardware state of the server // and updates the HardwareDetails field of the host with details. It // is expected to do this in the least expensive way possible, such as -// reading from a cache, and return dirty only if any state -// information has changed. -func (p *fixtureProvisioner) UpdateHardwareState() (result provisioner.Result, err error) { +// reading from a cache. +func (p *fixtureProvisioner) UpdateHardwareState() (hwState provisioner.HardwareState, err error) { if !p.host.NeedsProvisioning() { p.log.Info("updating hardware state") - result.Dirty = false } - return result, nil + return } // Adopt allows an externally-provisioned server to be adopted. diff --git a/pkg/provisioner/ironic/ironic.go b/pkg/provisioner/ironic/ironic.go index 83a0795fc7..e7a514f39d 100644 --- a/pkg/provisioner/ironic/ironic.go +++ b/pkg/provisioner/ironic/ironic.go @@ -694,39 +694,30 @@ func (p *ironicProvisioner) InspectHardware(force bool) (result provisioner.Resu // UpdateHardwareState fetches the latest hardware state of the server // and updates the HardwareDetails field of the host with details. It // is expected to do this in the least expensive way possible, such as -// reading from a cache, and return dirty only if any state -// information has changed. -func (p *ironicProvisioner) UpdateHardwareState() (result provisioner.Result, err error) { +// reading from a cache. +func (p *ironicProvisioner) UpdateHardwareState() (hwState provisioner.HardwareState, err error) { p.log.Info("updating hardware state") ironicNode, err := p.findExistingHost() if err != nil { - return transientError(errors.Wrap(err, "failed to find existing host")) + err = errors.Wrap(err, "failed to find existing host") + return } if ironicNode == nil { - return transientError(provisioner.NeedsRegistration) + err = provisioner.NeedsRegistration + return } - var discoveredVal bool switch ironicNode.PowerState { - case powerOn: - discoveredVal = true - case powerOff: - discoveredVal = false + case powerOn, powerOff: + discoveredVal := ironicNode.PowerState == powerOn + hwState.PoweredOn = &discoveredVal case powerNone: p.log.Info("could not determine power state", "value", ironicNode.PowerState) - return result, nil default: p.log.Info("unknown power state", "value", ironicNode.PowerState) - return result, nil } - - if discoveredVal != p.host.Status.PoweredOn { - p.log.Info("updating power status", "discovered", discoveredVal) - p.host.Status.PoweredOn = discoveredVal - return hostUpdated() - } - return result, nil + return } func (p *ironicProvisioner) getImageUpdateOptsForNode(ironicNode *nodes.Node, imageData *metal3v1alpha1.Image) (updates nodes.UpdateOpts, err error) { diff --git a/pkg/provisioner/ironic/updatehardwarestate_test.go b/pkg/provisioner/ironic/updatehardwarestate_test.go index 63944b8137..e4e89b0b80 100644 --- a/pkg/provisioner/ironic/updatehardwarestate_test.go +++ b/pkg/provisioner/ironic/updatehardwarestate_test.go @@ -3,7 +3,6 @@ package ironic import ( "net/http" "testing" - "time" "github.com/gophercloud/gophercloud/openstack/baremetal/v1/nodes" "github.com/stretchr/testify/assert" @@ -24,9 +23,7 @@ func TestUpdateHardwareState(t *testing.T) { hostCurrentlyPowered bool hostName string - expectedDirty bool - expectedRequestAfter int - expectedResultError string + expectUnreadablePower bool expectedPublish string expectedError string @@ -36,6 +33,7 @@ func TestUpdateHardwareState(t *testing.T) { ironic: testserver.NewIronic(t).Ready().Node(nodes.Node{ UUID: nodeUUID, }), + expectUnreadablePower: true, }, { name: "updated-power-on-state", @@ -52,8 +50,6 @@ func TestUpdateHardwareState(t *testing.T) { PowerState: "power on", }), hostCurrentlyPowered: false, - - expectedDirty: true, }, { name: "updated-power-off-state", @@ -70,8 +66,6 @@ func TestUpdateHardwareState(t *testing.T) { PowerState: "power off", }), hostCurrentlyPowered: true, - - expectedDirty: true, }, { name: "no-power", @@ -79,7 +73,8 @@ func TestUpdateHardwareState(t *testing.T) { UUID: nodeUUID, PowerState: "None", }), - hostCurrentlyPowered: true, + hostCurrentlyPowered: true, + expectUnreadablePower: true, }, { name: "node-not-found", @@ -88,6 +83,8 @@ func TestUpdateHardwareState(t *testing.T) { ironic: testserver.NewIronic(t).Ready().NodeError(nodeUUID, http.StatusGatewayTimeout), expectedError: "failed to find existing host: failed to find node by ID 33ce8659-7400-4c68-9535-d10766f07a58: Expected HTTP response code \\[200\\].*", + + expectUnreadablePower: true, }, { name: "node-not-found-by-name", @@ -96,12 +93,16 @@ func TestUpdateHardwareState(t *testing.T) { ironic: testserver.NewIronic(t).Ready().NoNode(nodeUUID).NodeError("myhost", http.StatusGatewayTimeout), expectedError: "failed to find existing host: failed to find node by name worker-0: EOF", + + expectUnreadablePower: true, }, { name: "not-ironic-node", ironic: testserver.NewIronic(t).Ready().NoNode(nodeUUID).NoNode("myhost"), expectedError: "Host not registered", + + expectUnreadablePower: true, }, } @@ -136,11 +137,9 @@ func TestUpdateHardwareState(t *testing.T) { t.Fatalf("could not create provisioner: %s", err) } - result, err := prov.UpdateHardwareState() + hwStatus, err := prov.UpdateHardwareState() - assert.Equal(t, tc.expectedDirty, result.Dirty) - assert.Equal(t, time.Second*time.Duration(tc.expectedRequestAfter), result.RequeueAfter) - assert.Equal(t, tc.expectedResultError, result.ErrorMessage) + assert.Equal(t, tc.expectUnreadablePower, hwStatus.PoweredOn == nil) assert.Equal(t, tc.expectedPublish, publishedMsg) if tc.expectedError == "" { diff --git a/pkg/provisioner/provisioner.go b/pkg/provisioner/provisioner.go index a8e96377a8..18a7002770 100644 --- a/pkg/provisioner/provisioner.go +++ b/pkg/provisioner/provisioner.go @@ -54,9 +54,8 @@ type Provisioner interface { // UpdateHardwareState fetches the latest hardware state of the // server and updates the HardwareDetails field of the host with // details. It is expected to do this in the least expensive way - // possible, such as reading from a cache, and return dirty only - // if any state information has changed. - UpdateHardwareState() (result Result, err error) + // possible, such as reading from a cache. + UpdateHardwareState() (hwState HardwareState, err error) // Adopt brings an externally-provisioned host under management by // the provisioner. @@ -102,4 +101,11 @@ type Result struct { ErrorMessage string } +// HardwareState holds the response from an UpdateHardwareState call +type HardwareState struct { + // PoweredOn is a pointer to a bool indicating whether the Host is currently + // powered on. The value is nil if the power state cannot be determined. + PoweredOn *bool +} + var NeedsRegistration = errors.New("Host not registered") From ab894f5c008aef9bb0844456864b1eacb4961bad Mon Sep 17 00:00:00 2001 From: Zane Bitter Date: Fri, 8 Jan 2021 23:54:51 -0500 Subject: [PATCH 21/28] Fixture: store state separately to provisioner The fixtureProvisioner object gets recreated for each Reconcile() call, so it cannot be used to store state that is used to make different things happen on each reconciliation. Some of this state was instead being stored in the host Status, which could potentially confound some of the tests: only the controller should modify the status, and it is the thing we are trying to test. Instead, create a Fixture structure that contains persistent state for a particular Host and the provisioner Factory to create a provisioner that has access to that state. Stop writing to the Host's Status from the fixture provisioner. (cherry picked from commit 1279a47489a393377d655b0b20048770056a85ca) Signed-off-by: Honza Pokorny --- .../baremetalhost_controller_test.go | 31 +++------ main.go | 3 +- pkg/provisioner/fixture/fixture.go | 63 ++++++++++--------- 3 files changed, 47 insertions(+), 50 deletions(-) diff --git a/controllers/metal3.io/baremetalhost_controller_test.go b/controllers/metal3.io/baremetalhost_controller_test.go index 3c02a2b966..e00d8e7dc4 100644 --- a/controllers/metal3.io/baremetalhost_controller_test.go +++ b/controllers/metal3.io/baremetalhost_controller_test.go @@ -23,8 +23,6 @@ import ( "sigs.k8s.io/controller-runtime/pkg/reconcile" metal3v1alpha1 "github.com/metal3-io/baremetal-operator/apis/metal3.io/v1alpha1" - "github.com/metal3-io/baremetal-operator/pkg/bmc" - "github.com/metal3-io/baremetal-operator/pkg/provisioner" "github.com/metal3-io/baremetal-operator/pkg/provisioner/fixture" "github.com/metal3-io/baremetal-operator/pkg/utils" ) @@ -88,7 +86,7 @@ func newDefaultHost(t *testing.T) *metal3v1alpha1.BareMetalHost { return newDefaultNamedHost(t.Name(), t) } -func newTestReconcilerWithProvisionerFactory(factory provisioner.Factory, initObjs ...runtime.Object) *BareMetalHostReconciler { +func newTestReconcilerWithFixture(fix *fixture.Fixture, initObjs ...runtime.Object) *BareMetalHostReconciler { c := fakeclient.NewFakeClient(initObjs...) @@ -99,13 +97,14 @@ func newTestReconcilerWithProvisionerFactory(factory provisioner.Factory, initOb return &BareMetalHostReconciler{ Client: c, Scheme: scheme.Scheme, - ProvisionerFactory: factory, + ProvisionerFactory: fix.New, Log: ctrl.Log.WithName("controllers").WithName("BareMetalHost"), } } func newTestReconciler(initObjs ...runtime.Object) *BareMetalHostReconciler { - return newTestReconcilerWithProvisionerFactory(fixture.New, initObjs...) + fix := fixture.Fixture{} + return newTestReconcilerWithFixture(&fix, initObjs...) } type DoneFunc func(host *metal3v1alpha1.BareMetalHost, result reconcile.Result) bool @@ -1033,12 +1032,12 @@ func TestDeleteHost(t *testing.T) { host.DeletionTimestamp = &now host.Status.Provisioning.ID = "made-up-id" badSecret := newBMCCredsSecret("bmc-creds-no-user", "", "Pass") - r := newTestReconciler(host, badSecret) + fix := fixture.Fixture{} + r := newTestReconcilerWithFixture(&fix, host, badSecret) tryReconcile(t, r, host, func(host *metal3v1alpha1.BareMetalHost, result reconcile.Result) bool { - t.Logf("provisioning id: %q", host.Status.Provisioning.ID) - return host.Status.Provisioning.ID == "" + return fix.Deleted }, ) }) @@ -1156,22 +1155,12 @@ func TestUpdateRootDeviceHints(t *testing.T) { func TestProvisionerIsReady(t *testing.T) { host := newDefaultHost(t) - var prov provisioner.Provisioner - r := newTestReconcilerWithProvisionerFactory(func(host *metal3v1alpha1.BareMetalHost, bmcCreds bmc.Credentials, publisher provisioner.EventPublisher) (provisioner provisioner.Provisioner, err error) { - if prov == nil { - prov, err = fixture.NewMock(host, bmcCreds, publisher, 5) - } - return prov, err - }, host) + fix := fixture.Fixture{BecomeReadyCounter: 5} + r := newTestReconcilerWithFixture(&fix, host) tryReconcile(t, r, host, func(host *metal3v1alpha1.BareMetalHost, result reconcile.Result) bool { - if prov == nil { - return false - } - - ready, _ := prov.IsReady() - return ready + return host.Status.Provisioning.State != metal3v1alpha1.StateNone }, ) } diff --git a/main.go b/main.go index 1a8d477d58..3bb98d037a 100644 --- a/main.go +++ b/main.go @@ -123,7 +123,8 @@ func main() { if runInTestMode { ctrl.Log.Info("using test provisioner") - return fixture.New(host, bmcCreds, publish) + fix := fixture.Fixture{} + return fix.New(host, bmcCreds, publish) } else if runInDemoMode { ctrl.Log.Info("using demo provisioner") return demo.New(host, bmcCreds, publish) diff --git a/pkg/provisioner/fixture/fixture.go b/pkg/provisioner/fixture/fixture.go index 8e4470d7f9..d51cc0f9c6 100644 --- a/pkg/provisioner/fixture/fixture.go +++ b/pkg/provisioner/fixture/fixture.go @@ -52,25 +52,31 @@ type fixtureProvisioner struct { log logr.Logger // an event publisher for recording significant events publisher provisioner.EventPublisher + // state storage for the Host + state *Fixture +} + +type Fixture struct { + // counter to set the provisioner as ready + BecomeReadyCounter int + // state to manage deletion + Deleted bool // state to manage the two-step adopt process adopted bool - // counter to set the provisioner as ready - becomeReadyCounter int + // state to manage provisioning + image metal3v1alpha1.Image + // state to manage power + poweredOn bool } // New returns a new Ironic FixtureProvisioner -func New(host *metal3v1alpha1.BareMetalHost, bmcCreds bmc.Credentials, publisher provisioner.EventPublisher) (provisioner.Provisioner, error) { - return NewMock(host, bmcCreds, publisher, 0) -} - -// NewMock is used in tests to build a fixture provisioner and inject additional test parameters -func NewMock(host *metal3v1alpha1.BareMetalHost, bmcCreds bmc.Credentials, publisher provisioner.EventPublisher, becomeReadyCounter int) (provisioner.Provisioner, error) { +func (f *Fixture) New(host *metal3v1alpha1.BareMetalHost, bmcCreds bmc.Credentials, publisher provisioner.EventPublisher) (provisioner.Provisioner, error) { p := &fixtureProvisioner{ - host: host, - bmcCreds: bmcCreds, - log: log.WithValues("host", host.Name), - publisher: publisher, - becomeReadyCounter: becomeReadyCounter, + host: host, + bmcCreds: bmcCreds, + log: log.WithValues("host", host.Name), + publisher: publisher, + state: f, } return p, nil } @@ -160,6 +166,7 @@ func (p *fixtureProvisioner) InspectHardware(force bool) (result provisioner.Res // reading from a cache. func (p *fixtureProvisioner) UpdateHardwareState() (hwState provisioner.HardwareState, err error) { if !p.host.NeedsProvisioning() { + hwState.PoweredOn = &p.state.poweredOn p.log.Info("updating hardware state") } return @@ -168,8 +175,8 @@ func (p *fixtureProvisioner) UpdateHardwareState() (hwState provisioner.Hardware // Adopt allows an externally-provisioned server to be adopted. func (p *fixtureProvisioner) Adopt(force bool) (result provisioner.Result, err error) { p.log.Info("adopting host") - if p.host.Spec.ExternallyProvisioned && !p.adopted { - p.adopted = true + if p.host.Spec.ExternallyProvisioned && !p.state.adopted { + p.state.adopted = true result.Dirty = true result.RequeueAfter = provisionRequeueDelay } @@ -183,10 +190,10 @@ func (p *fixtureProvisioner) Provision(hostConf provisioner.HostConfigData) (res p.log.Info("provisioning image to host", "state", p.host.Status.Provisioning.State) - if p.host.Status.Provisioning.Image.URL == "" { + if p.state.image.URL == "" { p.publisher("ProvisioningComplete", "Image provisioning completed") p.log.Info("moving to done") - p.host.Status.Provisioning.Image = *p.host.Spec.Image + p.state.image = *p.host.Spec.Image result.Dirty = true result.RequeueAfter = provisionRequeueDelay } @@ -207,10 +214,10 @@ func (p *fixtureProvisioner) Deprovision(force bool) (result provisioner.Result, // necessary once we really have Fixture doing the deprovisioning // and we can monitor it's status. - if p.host.Status.HardwareDetails != nil { + if p.state.image.URL != "" { p.publisher("DeprovisionStarted", "Image deprovisioning started") p.log.Info("clearing hardware details") - p.host.Status.HardwareDetails = nil + p.state.image = metal3v1alpha1.Image{} result.Dirty = true return result, nil } @@ -225,9 +232,9 @@ func (p *fixtureProvisioner) Deprovision(force bool) (result provisioner.Result, func (p *fixtureProvisioner) Delete() (result provisioner.Result, err error) { p.log.Info("deleting host") - if p.host.Status.Provisioning.ID != "" { + if !p.state.Deleted { p.log.Info("clearing provisioning id") - p.host.Status.Provisioning.ID = "" + p.state.Deleted = true result.Dirty = true return result, nil } @@ -240,10 +247,10 @@ func (p *fixtureProvisioner) Delete() (result provisioner.Result, err error) { func (p *fixtureProvisioner) PowerOn() (result provisioner.Result, err error) { p.log.Info("ensuring host is powered on") - if !p.host.Status.PoweredOn { + if !p.state.poweredOn { p.publisher("PowerOn", "Host powered on") p.log.Info("changing status") - p.host.Status.PoweredOn = true + p.state.poweredOn = true result.Dirty = true return result, nil } @@ -256,10 +263,10 @@ func (p *fixtureProvisioner) PowerOn() (result provisioner.Result, err error) { func (p *fixtureProvisioner) PowerOff() (result provisioner.Result, err error) { p.log.Info("ensuring host is powered off") - if p.host.Status.PoweredOn { + if p.state.poweredOn { p.publisher("PowerOff", "Host powered off") p.log.Info("changing status") - p.host.Status.PoweredOn = false + p.state.poweredOn = false result.Dirty = true return result, nil } @@ -271,9 +278,9 @@ func (p *fixtureProvisioner) PowerOff() (result provisioner.Result, err error) { func (p *fixtureProvisioner) IsReady() (result bool, err error) { p.log.Info("checking provisioner status") - if p.becomeReadyCounter > 0 { - p.becomeReadyCounter-- + if p.state.BecomeReadyCounter > 0 { + p.state.BecomeReadyCounter-- } - return p.becomeReadyCounter == 0, nil + return p.state.BecomeReadyCounter == 0, nil } From d20781cba4906fab7183f2599422fcdf6eb879db Mon Sep 17 00:00:00 2001 From: Zane Bitter Date: Fri, 8 Jan 2021 11:24:24 -0500 Subject: [PATCH 22/28] Prevent provisioners modifying the Host There is no longer any reason for provisioners to modify the BareMetalHost object itself; they should communicate any changes explicitly through return values from the Provisioner API. To prevent this starting to happen again in future, pass only a copy of the BareMetalHost in the provisioner factory, so that no changes made by the provisioner will ever make their way back to the controller nor be written to the k8s API. (cherry picked from commit 0656b7ea800aef87ee2d53da97bd694cb357f9f0) Signed-off-by: Honza Pokorny --- controllers/metal3.io/baremetalhost_controller.go | 2 +- controllers/metal3.io/host_state_machine_test.go | 2 +- main.go | 12 +++++++----- pkg/provisioner/demo/demo.go | 4 ++-- pkg/provisioner/empty/empty.go | 2 +- pkg/provisioner/fixture/fixture.go | 6 +++--- pkg/provisioner/ironic/ironic.go | 8 ++++---- pkg/provisioner/ironic/ironic_test.go | 4 ++-- pkg/provisioner/ironic/result.go | 4 ---- pkg/provisioner/ironic/updateopts_test.go | 4 ++-- pkg/provisioner/provisioner.go | 2 +- 11 files changed, 24 insertions(+), 26 deletions(-) diff --git a/controllers/metal3.io/baremetalhost_controller.go b/controllers/metal3.io/baremetalhost_controller.go index aa5cb8174e..d0169ce7ef 100644 --- a/controllers/metal3.io/baremetalhost_controller.go +++ b/controllers/metal3.io/baremetalhost_controller.go @@ -208,7 +208,7 @@ func (r *BareMetalHostReconciler) Reconcile(request ctrl.Request) (result ctrl.R request: request, bmcCredsSecret: bmcCredsSecret, } - prov, err := r.ProvisionerFactory(host, *bmcCreds, info.publishEvent) + prov, err := r.ProvisionerFactory(*host, *bmcCreds, info.publishEvent) if err != nil { return ctrl.Result{}, errors.Wrap(err, "failed to create provisioner") } diff --git a/controllers/metal3.io/host_state_machine_test.go b/controllers/metal3.io/host_state_machine_test.go index 6ae6145f2e..51c1a97123 100644 --- a/controllers/metal3.io/host_state_machine_test.go +++ b/controllers/metal3.io/host_state_machine_test.go @@ -18,7 +18,7 @@ import ( func testStateMachine(host *metal3v1alpha1.BareMetalHost) *hostStateMachine { r := newTestReconciler() - p, _ := r.ProvisionerFactory(host, bmc.Credentials{}, + p, _ := r.ProvisionerFactory(*host.DeepCopy(), bmc.Credentials{}, func(reason, message string) {}) return newHostStateMachine(host, r, p, true) } diff --git a/main.go b/main.go index 3bb98d037a..e528383805 100644 --- a/main.go +++ b/main.go @@ -118,22 +118,24 @@ func main() { os.Exit(1) } - provisionerFactory := func(host *metal3iov1alpha1.BareMetalHost, bmcCreds bmc.Credentials, publish provisioner.EventPublisher) (provisioner.Provisioner, error) { + provisionerFactory := func(host metal3iov1alpha1.BareMetalHost, bmcCreds bmc.Credentials, publish provisioner.EventPublisher) (provisioner.Provisioner, error) { isUnmanaged := host.Spec.ExternallyProvisioned && !host.HasBMCDetails() + hostCopy := host.DeepCopy() + if runInTestMode { ctrl.Log.Info("using test provisioner") fix := fixture.Fixture{} - return fix.New(host, bmcCreds, publish) + return fix.New(*hostCopy, bmcCreds, publish) } else if runInDemoMode { ctrl.Log.Info("using demo provisioner") - return demo.New(host, bmcCreds, publish) + return demo.New(*hostCopy, bmcCreds, publish) } else if isUnmanaged { ctrl.Log.Info("using empty provisioner") - return empty.New(host, bmcCreds, publish) + return empty.New(*hostCopy, bmcCreds, publish) } ironic.LogStartup() - return ironic.New(host, bmcCreds, publish) + return ironic.New(*hostCopy, bmcCreds, publish) } if err = (&metal3iocontroller.BareMetalHostReconciler{ diff --git a/pkg/provisioner/demo/demo.go b/pkg/provisioner/demo/demo.go index 982fd4ea19..606bb99aae 100644 --- a/pkg/provisioner/demo/demo.go +++ b/pkg/provisioner/demo/demo.go @@ -46,7 +46,7 @@ const ( // and uses Ironic to manage the host. type demoProvisioner struct { // the host to be managed by this provisioner - host *metal3v1alpha1.BareMetalHost + host metal3v1alpha1.BareMetalHost // the bmc credentials bmcCreds bmc.Credentials // a logger configured for this host @@ -56,7 +56,7 @@ type demoProvisioner struct { } // New returns a new Ironic Provisioner -func New(host *metal3v1alpha1.BareMetalHost, bmcCreds bmc.Credentials, publisher provisioner.EventPublisher) (provisioner.Provisioner, error) { +func New(host metal3v1alpha1.BareMetalHost, bmcCreds bmc.Credentials, publisher provisioner.EventPublisher) (provisioner.Provisioner, error) { p := &demoProvisioner{ host: host, bmcCreds: bmcCreds, diff --git a/pkg/provisioner/empty/empty.go b/pkg/provisioner/empty/empty.go index 842652f9c6..0fbc48802c 100644 --- a/pkg/provisioner/empty/empty.go +++ b/pkg/provisioner/empty/empty.go @@ -15,7 +15,7 @@ type emptyProvisioner struct { } // New returns a new Empty Provisioner -func New(host *metal3v1alpha1.BareMetalHost, bmcCreds bmc.Credentials, publisher provisioner.EventPublisher) (provisioner.Provisioner, error) { +func New(host metal3v1alpha1.BareMetalHost, bmcCreds bmc.Credentials, publisher provisioner.EventPublisher) (provisioner.Provisioner, error) { return &emptyProvisioner{}, nil } diff --git a/pkg/provisioner/fixture/fixture.go b/pkg/provisioner/fixture/fixture.go index d51cc0f9c6..136264383e 100644 --- a/pkg/provisioner/fixture/fixture.go +++ b/pkg/provisioner/fixture/fixture.go @@ -45,7 +45,7 @@ func (cd *fixtureHostConfigData) MetaData() (string, error) { // and uses Ironic to manage the host. type fixtureProvisioner struct { // the host to be managed by this provisioner - host *metal3v1alpha1.BareMetalHost + host metal3v1alpha1.BareMetalHost // the bmc credentials bmcCreds bmc.Credentials // a logger configured for this host @@ -70,9 +70,9 @@ type Fixture struct { } // New returns a new Ironic FixtureProvisioner -func (f *Fixture) New(host *metal3v1alpha1.BareMetalHost, bmcCreds bmc.Credentials, publisher provisioner.EventPublisher) (provisioner.Provisioner, error) { +func (f *Fixture) New(host metal3v1alpha1.BareMetalHost, bmcCreds bmc.Credentials, publisher provisioner.EventPublisher) (provisioner.Provisioner, error) { p := &fixtureProvisioner{ - host: host, + host: *host.DeepCopy(), bmcCreds: bmcCreds, log: log.WithValues("host", host.Name), publisher: publisher, diff --git a/pkg/provisioner/ironic/ironic.go b/pkg/provisioner/ironic/ironic.go index e7a514f39d..768d4cdb33 100644 --- a/pkg/provisioner/ironic/ironic.go +++ b/pkg/provisioner/ironic/ironic.go @@ -105,7 +105,7 @@ func init() { // and uses Ironic to manage the host. type ironicProvisioner struct { // the host to be managed by this provisioner - host *metal3v1alpha1.BareMetalHost + host metal3v1alpha1.BareMetalHost // a shorter path to the provisioning status data structure status *metal3v1alpha1.ProvisionStatus // access parameters for the BMC @@ -137,7 +137,7 @@ func LogStartup() { // A private function to construct an ironicProvisioner (rather than a // Provisioner interface) in a consistent way for tests. -func newProvisionerWithSettings(host *metal3v1alpha1.BareMetalHost, bmcCreds bmc.Credentials, publisher provisioner.EventPublisher, ironicURL string, ironicAuthSettings clients.AuthConfig, inspectorURL string, inspectorAuthSettings clients.AuthConfig) (*ironicProvisioner, error) { +func newProvisionerWithSettings(host metal3v1alpha1.BareMetalHost, bmcCreds bmc.Credentials, publisher provisioner.EventPublisher, ironicURL string, ironicAuthSettings clients.AuthConfig, inspectorURL string, inspectorAuthSettings clients.AuthConfig) (*ironicProvisioner, error) { tlsConf := clients.TLSConfig{ TrustedCAFile: ironicTrustedCAFile, InsecureSkipVerify: ironicInsecure, @@ -156,7 +156,7 @@ func newProvisionerWithSettings(host *metal3v1alpha1.BareMetalHost, bmcCreds bmc clientIronic, clientInspector) } -func newProvisionerWithIronicClients(host *metal3v1alpha1.BareMetalHost, bmcCreds bmc.Credentials, publisher provisioner.EventPublisher, clientIronic *gophercloud.ServiceClient, clientInspector *gophercloud.ServiceClient) (*ironicProvisioner, error) { +func newProvisionerWithIronicClients(host metal3v1alpha1.BareMetalHost, bmcCreds bmc.Credentials, publisher provisioner.EventPublisher, clientIronic *gophercloud.ServiceClient, clientInspector *gophercloud.ServiceClient) (*ironicProvisioner, error) { bmcAccess, err := bmc.NewAccessDetails(host.Spec.BMC.Address, host.Spec.BMC.DisableCertificateVerification) if err != nil { @@ -182,7 +182,7 @@ func newProvisionerWithIronicClients(host *metal3v1alpha1.BareMetalHost, bmcCred // New returns a new Ironic Provisioner using the global configuration // for finding the Ironic services. -func New(host *metal3v1alpha1.BareMetalHost, bmcCreds bmc.Credentials, publisher provisioner.EventPublisher) (provisioner.Provisioner, error) { +func New(host metal3v1alpha1.BareMetalHost, bmcCreds bmc.Credentials, publisher provisioner.EventPublisher) (provisioner.Provisioner, error) { var err error if clientIronicSingleton == nil || clientInspectorSingleton == nil { tlsConf := clients.TLSConfig{ diff --git a/pkg/provisioner/ironic/ironic_test.go b/pkg/provisioner/ironic/ironic_test.go index fbf2cba020..864609a3a6 100644 --- a/pkg/provisioner/ironic/ironic_test.go +++ b/pkg/provisioner/ironic/ironic_test.go @@ -16,10 +16,10 @@ func init() { logf.SetLogger(logf.ZapLogger(true)) } -func makeHost() *metal3v1alpha1.BareMetalHost { +func makeHost() metal3v1alpha1.BareMetalHost { rotational := true - return &metal3v1alpha1.BareMetalHost{ + return metal3v1alpha1.BareMetalHost{ ObjectMeta: metav1.ObjectMeta{ Name: "myhost", Namespace: "myns", diff --git a/pkg/provisioner/ironic/result.go b/pkg/provisioner/ironic/result.go index 462085d605..cb9057e821 100644 --- a/pkg/provisioner/ironic/result.go +++ b/pkg/provisioner/ironic/result.go @@ -16,10 +16,6 @@ func retryAfterDelay(delay time.Duration) (provisioner.Result, error) { }, nil } -func hostUpdated() (provisioner.Result, error) { - return provisioner.Result{Dirty: true}, nil -} - func operationContinuing(delay time.Duration) (provisioner.Result, error) { return provisioner.Result{ Dirty: true, diff --git a/pkg/provisioner/ironic/updateopts_test.go b/pkg/provisioner/ironic/updateopts_test.go index 88714db735..d712064f9e 100644 --- a/pkg/provisioner/ironic/updateopts_test.go +++ b/pkg/provisioner/ironic/updateopts_test.go @@ -83,7 +83,7 @@ func TestGetUpdateOptsForNodeWithRootHints(t *testing.T) { } func TestGetUpdateOptsForNodeVirtual(t *testing.T) { - host := &metal3v1alpha1.BareMetalHost{ + host := metal3v1alpha1.BareMetalHost{ ObjectMeta: metav1.ObjectMeta{ Name: "myhost", Namespace: "myns", @@ -188,7 +188,7 @@ func TestGetUpdateOptsForNodeVirtual(t *testing.T) { } func TestGetUpdateOptsForNodeDell(t *testing.T) { - host := &metal3v1alpha1.BareMetalHost{ + host := metal3v1alpha1.BareMetalHost{ ObjectMeta: metav1.ObjectMeta{ Name: "myhost", Namespace: "myns", diff --git a/pkg/provisioner/provisioner.go b/pkg/provisioner/provisioner.go index 18a7002770..70c9b38027 100644 --- a/pkg/provisioner/provisioner.go +++ b/pkg/provisioner/provisioner.go @@ -17,7 +17,7 @@ Package provisioning defines the API for talking to the provisioning backend. type EventPublisher func(reason, message string) // Factory is the interface for creating new Provisioner objects. -type Factory func(host *metal3v1alpha1.BareMetalHost, bmcCreds bmc.Credentials, publish EventPublisher) (Provisioner, error) +type Factory func(host metal3v1alpha1.BareMetalHost, bmcCreds bmc.Credentials, publish EventPublisher) (Provisioner, error) // HostConfigData retrieves host configuration data type HostConfigData interface { From 24f06593207cbd5ccee417e7a614d5410e45d2f4 Mon Sep 17 00:00:00 2001 From: Zane Bitter Date: Mon, 11 Jan 2021 23:08:04 -0500 Subject: [PATCH 23/28] Make actionRegistering() logs less chatty This function is now called on basically every reconcile, so don't log information about things that haven't changed. (cherry picked from commit cbaa73dfaf2fa17b51c4ae0dcb2e7765a1e6a435) Signed-off-by: Honza Pokorny --- .../metal3.io/baremetalhost_controller.go | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/controllers/metal3.io/baremetalhost_controller.go b/controllers/metal3.io/baremetalhost_controller.go index d0169ce7ef..b944db885b 100644 --- a/controllers/metal3.io/baremetalhost_controller.go +++ b/controllers/metal3.io/baremetalhost_controller.go @@ -440,8 +440,6 @@ func (r *BareMetalHostReconciler) actionRegistering(prov provisioner.Provisioner return actionError{errors.Wrap(err, "failed to validate BMC access")} } - info.log.Info("response from validate", "provResult", provResult) - if provResult.ErrorMessage != "" { return recordActionFailure(info, metal3v1alpha1.RegistrationError, provResult.ErrorMessage) } @@ -461,14 +459,15 @@ func (r *BareMetalHostReconciler) actionRegistering(prov provisioner.Provisioner // Reaching this point means the credentials are valid and worked, // so clear any previous error and record the success in the // status block. - info.log.Info("updating credentials success status fields") - registeredNewCreds := !info.host.Status.GoodCredentials.Match(*info.bmcCredsSecret) - info.host.UpdateGoodCredentials(*info.bmcCredsSecret) - info.log.Info("clearing previous error message") - clearError(info.host) - - if registeredNewCreds { + if !info.host.Status.GoodCredentials.Match(*info.bmcCredsSecret) { + info.log.Info("updating credentials success status fields") + info.host.UpdateGoodCredentials(*info.bmcCredsSecret) info.publishEvent("BMCAccessValidated", "Verified access to BMC") + } else { + info.log.Info("verified access to the BMC") + } + if clearError(info.host) { + info.log.Info("clearing previous error message") } return actionComplete{} From 19c72b1fbdfff2eb62c57ca468ca7d52f669e514 Mon Sep 17 00:00:00 2001 From: Zane Bitter Date: Sat, 9 Jan 2021 00:26:40 -0500 Subject: [PATCH 24/28] Don't write Status on every actionContinue When the provisioner could write to the Host's Status subresource and expect it to be persisted, we had to always treat a Dirty flag returned from the provisioner as indicating that the Status was dirty and required a write. (Oddly this meant the Dirty flag was treated with almost the inverse of its intended meaning: when it was returned true the Status was rarely actually dirty, while when it was false the action was complete which always requires a write.) Now that provisioners are no longer allowed to write to the Status, we only need to request a write when the controller has updated the Status. Add a new actionUpdate to cover the couple of cases that are not already covered by actionComplete, and those where ClearError() returns dirty. The actionUpdate type comprises actionContinue so that type checks for the latter handle it correctly. Make actionContinue return false from the Dirty() method, meaning that it will not cause a write to the k8s API. The actionContinueNoWrite type is removed, since it would now be identical to actionContinue. (cherry picked from commit 3102c74cc74ff80da18a17f3f2952138f2690ee2) Signed-off-by: Honza Pokorny --- controllers/metal3.io/action_result.go | 22 ++--- .../metal3.io/baremetalhost_controller.go | 81 +++++++++++++------ controllers/metal3.io/host_state_machine.go | 8 +- 3 files changed, 75 insertions(+), 36 deletions(-) diff --git a/controllers/metal3.io/action_result.go b/controllers/metal3.io/action_result.go index 5f3a9717e3..a746f05c98 100644 --- a/controllers/metal3.io/action_result.go +++ b/controllers/metal3.io/action_result.go @@ -25,17 +25,6 @@ type actionResult interface { Dirty() bool } -// actionContinueNoWrite is a result indicating that the current action is still -// in progress, and that the resource should remain in the same provisioning -// state without writing the status -type actionContinueNoWrite struct { - actionContinue -} - -func (r actionContinueNoWrite) Dirty() bool { - return false -} - // actionContinue is a result indicating that the current action is still // in progress, and that the resource should remain in the same provisioning // state. @@ -51,6 +40,17 @@ func (r actionContinue) Result() (result reconcile.Result, err error) { } func (r actionContinue) Dirty() bool { + return false +} + +// actionUpdate is a result indicating that the current action is still +// in progress, and that the resource should remain in the same provisioning +// state but write new Status data. +type actionUpdate struct { + actionContinue +} + +func (r actionUpdate) Dirty() bool { return true } diff --git a/controllers/metal3.io/baremetalhost_controller.go b/controllers/metal3.io/baremetalhost_controller.go index b944db885b..fe2e24770f 100644 --- a/controllers/metal3.io/baremetalhost_controller.go +++ b/controllers/metal3.io/baremetalhost_controller.go @@ -419,19 +419,21 @@ func (r *BareMetalHostReconciler) actionUnmanaged(prov provisioner.Provisioner, if info.host.HasBMCDetails() { return actionComplete{} } - return actionContinueNoWrite{actionContinue{unmanagedRetryDelay}} + return actionContinue{unmanagedRetryDelay} } // Test the credentials by connecting to the management controller. func (r *BareMetalHostReconciler) actionRegistering(prov provisioner.Provisioner, info *reconcileInfo) actionResult { info.log.Info("registering and validating access to management controller", "credentials", info.host.Status.TriedCredentials) + dirty := false credsChanged := !info.host.Status.TriedCredentials.Match(*info.bmcCredsSecret) if credsChanged { info.log.Info("new credentials") info.host.UpdateTriedCredentials(*info.bmcCredsSecret) info.postSaveCallbacks = append(info.postSaveCallbacks, updatedCredentials.Inc) + dirty = true } provResult, provID, err := prov.ValidateManagementAccess(credsChanged, info.host.Status.ErrorType == metal3v1alpha1.RegistrationError) @@ -447,13 +449,19 @@ func (r *BareMetalHostReconciler) actionRegistering(prov provisioner.Provisioner if provID != "" && info.host.Status.Provisioning.ID != provID { info.log.Info("setting provisioning id", "ID", provID) info.host.Status.Provisioning.ID = provID - provResult.Dirty = true + dirty = true } if provResult.Dirty { info.log.Info("host not ready", "wait", provResult.RequeueAfter) - clearError(info.host) - return actionContinue{provResult.RequeueAfter} + result := actionContinue{provResult.RequeueAfter} + if clearError(info.host) { + dirty = true + } + if dirty { + return actionUpdate{result} + } + return result } // Reaching this point means the credentials are valid and worked, @@ -463,13 +471,22 @@ func (r *BareMetalHostReconciler) actionRegistering(prov provisioner.Provisioner info.log.Info("updating credentials success status fields") info.host.UpdateGoodCredentials(*info.bmcCredsSecret) info.publishEvent("BMCAccessValidated", "Verified access to BMC") + dirty = true } else { info.log.Info("verified access to the BMC") } if clearError(info.host) { info.log.Info("clearing previous error message") + dirty = true } + if dirty { + // Normally returning actionComplete would be sufficient to ensure + // changes are written, but when this is called in a state other than + // Registering, the state machine will ignore actionComplete and + // proceed with the action function for the current state. + return actionUpdate{} + } return actionComplete{} } @@ -486,12 +503,15 @@ func (r *BareMetalHostReconciler) actionInspecting(prov provisioner.Provisioner, return recordActionFailure(info, metal3v1alpha1.InspectionError, provResult.ErrorMessage) } - clearError(info.host) - if provResult.Dirty || details == nil { - return actionContinue{provResult.RequeueAfter} + result := actionContinue{provResult.RequeueAfter} + if clearError(info.host) { + return actionUpdate{result} + } + return result } + clearError(info.host) info.host.Status.HardwareDetails = details return actionComplete{} } @@ -553,7 +573,7 @@ func (r *BareMetalHostReconciler) actionProvisioning(prov provisioner.Provisione if err := r.Update(context.TODO(), info.host); err != nil { return actionError{errors.Wrap(err, "failed to remove reboot annotations from host")} } - return actionContinueNoWrite{} + return actionContinue{} } provResult, err := prov.Provision(hostConf) @@ -570,8 +590,11 @@ func (r *BareMetalHostReconciler) actionProvisioning(prov provisioner.Provisione // Go back into the queue and wait for the Provision() method // to return false, indicating that it has no more work to // do. - clearError(info.host) - return actionContinue{provResult.RequeueAfter} + result := actionContinue{provResult.RequeueAfter} + if clearError(info.host) { + return actionUpdate{result} + } + return result } // If the provisioner had no work, ensure the image settings match. @@ -603,8 +626,11 @@ func (r *BareMetalHostReconciler) actionDeprovisioning(prov provisioner.Provisio return recordActionFailure(info, metal3v1alpha1.RegistrationError, provResult.ErrorMessage) } if provResult.Dirty { - clearError(info.host) - return actionContinue{provResult.RequeueAfter} + result := actionContinue{provResult.RequeueAfter} + if clearError(info.host) { + return actionUpdate{result} + } + return result } info.log.Info("deprovisioning") @@ -619,15 +645,18 @@ func (r *BareMetalHostReconciler) actionDeprovisioning(prov provisioner.Provisio } if provResult.Dirty { - clearError(info.host) - return actionContinue{provResult.RequeueAfter} + result := actionContinue{provResult.RequeueAfter} + if clearError(info.host) { + return actionUpdate{result} + } + return result } if clearRebootAnnotations(info.host) { if err = r.Update(context.TODO(), info.host); err != nil { return actionError{errors.Wrap(err, "failed to remove reboot annotations from host")} } - return actionContinueNoWrite{} + return actionContinue{} } // After the provisioner is done, clear the provisioning settings @@ -652,7 +681,7 @@ func (r *BareMetalHostReconciler) manageHostPower(prov provisioner.Provisioner, info.log.Info("updating power status", "discovered", *hwState.PoweredOn) info.host.Status.PoweredOn = *hwState.PoweredOn clearError(info.host) - return actionContinue{} + return actionUpdate{} } desiredPowerOnState := info.host.Spec.Online @@ -665,7 +694,7 @@ func (r *BareMetalHostReconciler) manageHostPower(prov provisioner.Provisioner, return actionError{errors.Wrap(err, "failed to remove reboot annotation from host")} } - return actionContinueNoWrite{} + return actionContinue{} } } @@ -680,7 +709,7 @@ func (r *BareMetalHostReconciler) manageHostPower(prov provisioner.Provisioner, // a delay. steadyStateResult := actionContinue{time.Second * 60} if info.host.Status.PoweredOn == desiredPowerOnState { - return actionContinueNoWrite{steadyStateResult} + return steadyStateResult } info.log.Info("power state change needed", @@ -711,8 +740,11 @@ func (r *BareMetalHostReconciler) manageHostPower(prov provisioner.Provisioner, } powerChangeAttempts.With(metricLabels).Inc() }) - clearError(info.host) - return actionContinue{provResult.RequeueAfter} + result := actionContinue{provResult.RequeueAfter} + if clearError(info.host) { + return actionUpdate{result} + } + return result } // The provisioner did not have to do anything to change the power @@ -720,7 +752,7 @@ func (r *BareMetalHostReconciler) manageHostPower(prov provisioner.Provisioner, // host status field. info.host.Status.PoweredOn = info.host.Spec.Online info.host.Status.ErrorCount = 0 - return steadyStateResult + return actionUpdate{steadyStateResult} } // A host reaching this action handler should be provisioned or externally @@ -736,8 +768,11 @@ func (r *BareMetalHostReconciler) actionManageSteadyState(prov provisioner.Provi return recordActionFailure(info, metal3v1alpha1.RegistrationError, provResult.ErrorMessage) } if provResult.Dirty { - clearError(info.host) - return actionContinue{provResult.RequeueAfter} + result := actionContinue{provResult.RequeueAfter} + if clearError(info.host) { + return actionUpdate{result} + } + return result } return r.manageHostPower(prov, info) diff --git a/controllers/metal3.io/host_state_machine.go b/controllers/metal3.io/host_state_machine.go index 449782cebe..89a60dd0c9 100644 --- a/controllers/metal3.io/host_state_machine.go +++ b/controllers/metal3.io/host_state_machine.go @@ -63,7 +63,7 @@ func recordStateBegin(host *metal3v1alpha1.BareMetalHost, state metal3v1alpha1.P } } -func recordStateEnd(info *reconcileInfo, host *metal3v1alpha1.BareMetalHost, state metal3v1alpha1.ProvisioningState, time metav1.Time) { +func recordStateEnd(info *reconcileInfo, host *metal3v1alpha1.BareMetalHost, state metal3v1alpha1.ProvisioningState, time metav1.Time) (changed bool) { if prevMetric := host.OperationMetricForState(state); prevMetric != nil { if !prevMetric.Start.IsZero() && prevMetric.End.IsZero() { prevMetric.End = time @@ -71,8 +71,10 @@ func recordStateEnd(info *reconcileInfo, host *metal3v1alpha1.BareMetalHost, sta observer := stateTime[state].With(hostMetricLabels(info.request)) observer.Observe(prevMetric.Duration().Seconds()) }) + changed = true } } + return } func (hsm *hostStateMachine) updateHostStateFrom(initialState metal3v1alpha1.ProvisioningState, @@ -193,7 +195,9 @@ func (hsm *hostStateMachine) ensureRegistered(info *reconcileInfo) (result actio result = hsm.Reconciler.actionRegistering(hsm.Provisioner, info) if _, complete := result.(actionComplete); complete { if hsm.NextState != metal3v1alpha1.StateRegistering { - recordStateEnd(info, hsm.Host, metal3v1alpha1.StateRegistering, metav1.Now()) + if recordStateEnd(info, hsm.Host, metal3v1alpha1.StateRegistering, metav1.Now()) { + return actionUpdate{} + } } if needsReregister { // Host was re-registered, so requeue and run the state machine on From 4e60bf6bb6e6f61a68cd9dee7ec78c5a233efec7 Mon Sep 17 00:00:00 2001 From: Zane Bitter Date: Thu, 14 Jan 2021 11:45:43 -0500 Subject: [PATCH 25/28] Stop treating actionRegistering() like a state action All of the action() methods of the reconciler are the handlers used by the host state machine to carry out the action in the current provisioning state, except for actionRegistering(). actionRegistering() is called in every state (by the state machine's ensureRegistered() method) prior to calling the action method for the current state. Rename actionRegistering() to registerHost() to eliminate the implication that it is an action specific to the Registering state. Simplify the return handling by returning nil when validating management access succeeded and there were no changes required to the Status, since this is what we want to return from ensureRegistered() (indicating no need to return early before executing the state's action method). This will occur both in general operation when there is no re-registration required, and also potentially at the conclusion of a re-registration where there was no credentials change and errors had already been cleared (by a previous invocation that returned actionUpdate). At the conclusion of a registration where changes need to be written to the Status, e.g. by clearing a previous error state or updating the known good credentials, actionComplete is returned. This is later translated to actionUpdate by ensureRegistered(). This enables that scenario to be distinguished from that of a change needing to be written while registration is still in progress, which results in actionUpdate. This means there is no longer a need in ensureRegistered() to guess when the registerHost() method might have resulted in a change to the Status. (cherry picked from commit 98d713159488832fcc367e277abf4601bd6ab8f8) Signed-off-by: Honza Pokorny --- .../metal3.io/baremetalhost_controller.go | 10 ++---- controllers/metal3.io/host_state_machine.go | 32 +++++++------------ 2 files changed, 14 insertions(+), 28 deletions(-) diff --git a/controllers/metal3.io/baremetalhost_controller.go b/controllers/metal3.io/baremetalhost_controller.go index fe2e24770f..502116e22a 100644 --- a/controllers/metal3.io/baremetalhost_controller.go +++ b/controllers/metal3.io/baremetalhost_controller.go @@ -423,7 +423,7 @@ func (r *BareMetalHostReconciler) actionUnmanaged(prov provisioner.Provisioner, } // Test the credentials by connecting to the management controller. -func (r *BareMetalHostReconciler) actionRegistering(prov provisioner.Provisioner, info *reconcileInfo) actionResult { +func (r *BareMetalHostReconciler) registerHost(prov provisioner.Provisioner, info *reconcileInfo) actionResult { info.log.Info("registering and validating access to management controller", "credentials", info.host.Status.TriedCredentials) dirty := false @@ -481,13 +481,9 @@ func (r *BareMetalHostReconciler) actionRegistering(prov provisioner.Provisioner } if dirty { - // Normally returning actionComplete would be sufficient to ensure - // changes are written, but when this is called in a state other than - // Registering, the state machine will ignore actionComplete and - // proceed with the action function for the current state. - return actionUpdate{} + return actionComplete{} } - return actionComplete{} + return nil } // Ensure we have the information about the hardware on the host. diff --git a/controllers/metal3.io/host_state_machine.go b/controllers/metal3.io/host_state_machine.go index 89a60dd0c9..45ba5bb096 100644 --- a/controllers/metal3.io/host_state_machine.go +++ b/controllers/metal3.io/host_state_machine.go @@ -172,8 +172,6 @@ func (hsm *hostStateMachine) ensureRegistered(info *reconcileInfo) (result actio return } - needsReregister := false - switch hsm.NextState { case metal3v1alpha1.StateNone, metal3v1alpha1.StateUnmanaged: // We haven't yet reached the Registration state, so don't attempt @@ -184,32 +182,24 @@ func (hsm *hostStateMachine) ensureRegistered(info *reconcileInfo) (result actio return case metal3v1alpha1.StateRegistering: default: - needsReregister = (hsm.Host.Status.ErrorType == metal3v1alpha1.RegistrationError || - !hsm.Host.Status.GoodCredentials.Match(*info.bmcCredsSecret)) - if needsReregister { + if hsm.Host.Status.ErrorType == metal3v1alpha1.RegistrationError || + !hsm.Host.Status.GoodCredentials.Match(*info.bmcCredsSecret) { info.log.Info("Retrying registration") recordStateBegin(hsm.Host, metal3v1alpha1.StateRegistering, metav1.Now()) } } - result = hsm.Reconciler.actionRegistering(hsm.Provisioner, info) - if _, complete := result.(actionComplete); complete { - if hsm.NextState != metal3v1alpha1.StateRegistering { - if recordStateEnd(info, hsm.Host, metal3v1alpha1.StateRegistering, metav1.Now()) { - return actionUpdate{} - } - } - if needsReregister { - // Host was re-registered, so requeue and run the state machine on - // the next reconcile - result = actionContinue{} - } else { - // Allow the state machine to run, either because we were just - // reconfirming an existing registration, or because we are in the - // Registering state - result = nil + result = hsm.Reconciler.registerHost(hsm.Provisioner, info) + _, complete := result.(actionComplete) + if (result == nil || complete) && + hsm.NextState != metal3v1alpha1.StateRegistering { + if recordStateEnd(info, hsm.Host, metal3v1alpha1.StateRegistering, metav1.Now()) { + result = actionUpdate{} } } + if complete { + result = actionUpdate{} + } return } From 7bfc8a29f9c13a8e4b894b6ecf4ade5f63af40f3 Mon Sep 17 00:00:00 2001 From: Zane Bitter Date: Tue, 8 Dec 2020 16:11:34 -0500 Subject: [PATCH 26/28] Handle deploy failed state when deprovisioning If provisioning fails and the node ends up in the deploy failed state, we try to recover by deprovisioning. That would work better if we actually handled deploy failed in Deprovision(). (cherry picked from commit a308ddfa556e67a0bc42de415133e4c05e493168) Signed-off-by: Honza Pokorny --- pkg/provisioner/ironic/ironic.go | 2 +- pkg/provisioner/ironic/provision_test.go | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/pkg/provisioner/ironic/ironic.go b/pkg/provisioner/ironic/ironic.go index 768d4cdb33..2674774321 100644 --- a/pkg/provisioner/ironic/ironic.go +++ b/pkg/provisioner/ironic/ironic.go @@ -1341,7 +1341,7 @@ func (p *ironicProvisioner) Deprovision(force bool) (result provisioner.Result, p.log.Info("cleaning") return operationContinuing(deprovisionRequeueDelay) - case nodes.Active: + case nodes.Active, nodes.DeployFail: p.log.Info("starting deprovisioning") p.publisher("DeprovisioningStarted", "Image deprovisioning started") return p.changeNodeProvisionState( diff --git a/pkg/provisioner/ironic/provision_test.go b/pkg/provisioner/ironic/provision_test.go index 2620310312..31ec7a5810 100644 --- a/pkg/provisioner/ironic/provision_test.go +++ b/pkg/provisioner/ironic/provision_test.go @@ -136,6 +136,15 @@ func TestDeprovision(t *testing.T) { expectedRequestAfter: 10, expectedDirty: true, }, + { + name: "deploy failed state", + ironic: testserver.NewIronic(t).WithDefaultResponses().Node(nodes.Node{ + ProvisionState: string(nodes.DeployFail), + UUID: nodeUUID, + }), + expectedRequestAfter: 10, + expectedDirty: true, + }, { name: "error state", ironic: testserver.NewIronic(t).WithDefaultResponses().Node(nodes.Node{ From 39aecec2543e5b82bc9372217d2f6f12fc8d71c0 Mon Sep 17 00:00:00 2001 From: Zane Bitter Date: Tue, 8 Dec 2020 16:37:32 -0500 Subject: [PATCH 27/28] Don't adopt in deprovisioning if provisioning didn't complete In the deprovisioning state, calling Adopt() on a freshly-reregistered host will fail if we didn't previously complete provisioning because there will be no image data stored in the status to pass to the node for adoption. Only call Adopt() if the previous provisioning completed and recorded the requisite image data. If the node has just been re-registered, it will be in the manageable state anyway, and thus will still get cleaned before it is next provisioned. (cherry picked from commit 24ba8a0ac35e3a18e22b94a089eb19cc92c5e883) Signed-off-by: Honza Pokorny --- .../metal3.io/baremetalhost_controller.go | 32 ++++++++++--------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/controllers/metal3.io/baremetalhost_controller.go b/controllers/metal3.io/baremetalhost_controller.go index 502116e22a..d940abbf55 100644 --- a/controllers/metal3.io/baremetalhost_controller.go +++ b/controllers/metal3.io/baremetalhost_controller.go @@ -612,26 +612,28 @@ func clearHostProvisioningSettings(host *metal3v1alpha1.BareMetalHost) { } func (r *BareMetalHostReconciler) actionDeprovisioning(prov provisioner.Provisioner, info *reconcileInfo) actionResult { - // Adopt the host in case it has been re-registered during the - // deprovisioning process before it completed - provResult, err := prov.Adopt(info.host.Status.ErrorType == metal3v1alpha1.RegistrationError) - if err != nil { - return actionError{err} - } - if provResult.ErrorMessage != "" { - return recordActionFailure(info, metal3v1alpha1.RegistrationError, provResult.ErrorMessage) - } - if provResult.Dirty { - result := actionContinue{provResult.RequeueAfter} - if clearError(info.host) { - return actionUpdate{result} + if info.host.Status.Provisioning.Image.URL != "" { + // Adopt the host in case it has been re-registered during the + // deprovisioning process before it completed + provResult, err := prov.Adopt(info.host.Status.ErrorType == metal3v1alpha1.RegistrationError) + if err != nil { + return actionError{err} + } + if provResult.ErrorMessage != "" { + return recordActionFailure(info, metal3v1alpha1.RegistrationError, provResult.ErrorMessage) + } + if provResult.Dirty { + result := actionContinue{provResult.RequeueAfter} + if clearError(info.host) { + return actionUpdate{result} + } + return result } - return result } info.log.Info("deprovisioning") - provResult, err = prov.Deprovision(info.host.Status.ErrorType == metal3v1alpha1.ProvisioningError) + provResult, err := prov.Deprovision(info.host.Status.ErrorType == metal3v1alpha1.ProvisioningError) if err != nil { return actionError{errors.Wrap(err, "failed to deprovision")} } From 07ffe194d9231402f3b4d9eae9c947ef506031ab Mon Sep 17 00:00:00 2001 From: Stephen Benjamin Date: Mon, 18 Jan 2021 10:51:01 -0500 Subject: [PATCH 28/28] Ensure adoption is retried upon failure Currently when adoption fails, we use RegistrationError but this gets cleared any time registration succeeds. We need a separate signal for adoption failure, which allows us to force retry the adoption again. This adds a new AdoptionError, and ensures that if registration succeeds we don't clear an AdoptionError. Co-authored-by: Andrea Fasano (cherry picked from commit a10a2adb9fe23723d550079b5af2d8da004a6e87) Signed-off-by: Honza Pokorny --- .../metal3.io/v1alpha1/baremetalhost_types.go | 5 +- .../crd/bases/metal3.io_baremetalhosts.yaml | 1 + config/render/capm3.yaml | 1 + .../metal3.io/baremetalhost_controller.go | 25 ++++---- .../metal3.io/host_state_machine_test.go | 62 +++++++++++++++++++ 5 files changed, 82 insertions(+), 12 deletions(-) diff --git a/apis/metal3.io/v1alpha1/baremetalhost_types.go b/apis/metal3.io/v1alpha1/baremetalhost_types.go index 4d93fe683b..7ac672efbe 100644 --- a/apis/metal3.io/v1alpha1/baremetalhost_types.go +++ b/apis/metal3.io/v1alpha1/baremetalhost_types.go @@ -123,6 +123,9 @@ const ( type ErrorType string const ( + // ProvisionedRegistrationError is an error condition occurring when the controller + // is unable to re-register an already provisioned host. + ProvisionedRegistrationError ErrorType = "provisioned registration error" // RegistrationError is an error condition occurring when the // controller is unable to connect to the Host's baseboard management // controller. @@ -521,7 +524,7 @@ type BareMetalHostStatus struct { // ErrorType indicates the type of failure encountered when the // OperationalStatus is OperationalStatusError - // +kubebuilder:validation:Enum=registration error;inspection error;provisioning error;power management error + // +kubebuilder:validation:Enum=provisioned registration error;registration error;inspection error;provisioning error;power management error ErrorType ErrorType `json:"errorType,omitempty"` // LastUpdated identifies when this status was last observed. diff --git a/config/crd/bases/metal3.io_baremetalhosts.yaml b/config/crd/bases/metal3.io_baremetalhosts.yaml index 767b31856a..fe52ca031c 100644 --- a/config/crd/bases/metal3.io_baremetalhosts.yaml +++ b/config/crd/bases/metal3.io_baremetalhosts.yaml @@ -262,6 +262,7 @@ spec: errorType: description: ErrorType indicates the type of failure encountered when the OperationalStatus is OperationalStatusError enum: + - provisioned registration error - registration error - inspection error - provisioning error diff --git a/config/render/capm3.yaml b/config/render/capm3.yaml index e99fc20eca..d10547a31c 100644 --- a/config/render/capm3.yaml +++ b/config/render/capm3.yaml @@ -260,6 +260,7 @@ spec: errorType: description: ErrorType indicates the type of failure encountered when the OperationalStatus is OperationalStatusError enum: + - provisioned registration error - registration error - inspection error - provisioning error diff --git a/controllers/metal3.io/baremetalhost_controller.go b/controllers/metal3.io/baremetalhost_controller.go index d940abbf55..d5967ce282 100644 --- a/controllers/metal3.io/baremetalhost_controller.go +++ b/controllers/metal3.io/baremetalhost_controller.go @@ -280,10 +280,11 @@ func recordActionFailure(info *reconcileInfo, errorType metal3v1alpha1.ErrorType setErrorMessage(info.host, errorType, errorMessage) eventType := map[metal3v1alpha1.ErrorType]string{ - metal3v1alpha1.RegistrationError: "RegistrationError", - metal3v1alpha1.InspectionError: "InspectionError", - metal3v1alpha1.ProvisioningError: "ProvisioningError", - metal3v1alpha1.PowerManagementError: "PowerManagementError", + metal3v1alpha1.ProvisionedRegistrationError: "ProvisionedRegistrationError", + metal3v1alpha1.RegistrationError: "RegistrationError", + metal3v1alpha1.InspectionError: "InspectionError", + metal3v1alpha1.ProvisioningError: "ProvisioningError", + metal3v1alpha1.PowerManagementError: "PowerManagementError", }[errorType] counter := actionFailureCounters.WithLabelValues(eventType) @@ -467,7 +468,8 @@ func (r *BareMetalHostReconciler) registerHost(prov provisioner.Provisioner, inf // Reaching this point means the credentials are valid and worked, // so clear any previous error and record the success in the // status block. - if !info.host.Status.GoodCredentials.Match(*info.bmcCredsSecret) { + registeredNewCreds := !info.host.Status.GoodCredentials.Match(*info.bmcCredsSecret) + if registeredNewCreds { info.log.Info("updating credentials success status fields") info.host.UpdateGoodCredentials(*info.bmcCredsSecret) info.publishEvent("BMCAccessValidated", "Verified access to BMC") @@ -475,9 +477,10 @@ func (r *BareMetalHostReconciler) registerHost(prov provisioner.Provisioner, inf } else { info.log.Info("verified access to the BMC") } - if clearError(info.host) { + + if info.host.Status.ErrorType == metal3v1alpha1.RegistrationError || registeredNewCreds { info.log.Info("clearing previous error message") - dirty = true + dirty = clearError(info.host) } if dirty { @@ -615,12 +618,12 @@ func (r *BareMetalHostReconciler) actionDeprovisioning(prov provisioner.Provisio if info.host.Status.Provisioning.Image.URL != "" { // Adopt the host in case it has been re-registered during the // deprovisioning process before it completed - provResult, err := prov.Adopt(info.host.Status.ErrorType == metal3v1alpha1.RegistrationError) + provResult, err := prov.Adopt(info.host.Status.ErrorType == metal3v1alpha1.ProvisionedRegistrationError) if err != nil { return actionError{err} } if provResult.ErrorMessage != "" { - return recordActionFailure(info, metal3v1alpha1.RegistrationError, provResult.ErrorMessage) + return recordActionFailure(info, metal3v1alpha1.ProvisionedRegistrationError, provResult.ErrorMessage) } if provResult.Dirty { result := actionContinue{provResult.RequeueAfter} @@ -758,12 +761,12 @@ func (r *BareMetalHostReconciler) manageHostPower(prov provisioner.Provisioner, // action. We use the Adopt() API to make sure that the provisioner is aware of // the provisioning details. Then we monitor its power status. func (r *BareMetalHostReconciler) actionManageSteadyState(prov provisioner.Provisioner, info *reconcileInfo) actionResult { - provResult, err := prov.Adopt(info.host.Status.ErrorType == metal3v1alpha1.RegistrationError) + provResult, err := prov.Adopt(info.host.Status.ErrorType == metal3v1alpha1.ProvisionedRegistrationError) if err != nil { return actionError{err} } if provResult.ErrorMessage != "" { - return recordActionFailure(info, metal3v1alpha1.RegistrationError, provResult.ErrorMessage) + return recordActionFailure(info, metal3v1alpha1.ProvisionedRegistrationError, provResult.ErrorMessage) } if provResult.Dirty { result := actionContinue{provResult.RequeueAfter} diff --git a/controllers/metal3.io/host_state_machine_test.go b/controllers/metal3.io/host_state_machine_test.go index 51c1a97123..58d9a3d233 100644 --- a/controllers/metal3.io/host_state_machine_test.go +++ b/controllers/metal3.io/host_state_machine_test.go @@ -289,6 +289,59 @@ func TestErrorCountCleared(t *testing.T) { } } +func TestErrorClean(t *testing.T) { + + tests := []struct { + Scenario string + Host *metal3v1alpha1.BareMetalHost + SecretName string + ExpectError bool + }{ + { + Scenario: "clean-after-registration-error", + Host: host(metal3v1alpha1.StateInspecting). + SetStatusError(metal3v1alpha1.OperationalStatusError, metal3v1alpha1.RegistrationError, "some error", 1). + build(), + }, + { + Scenario: "not-clean-after-provisioned-registration-error", + Host: host(metal3v1alpha1.StateInspecting). + SetStatusError(metal3v1alpha1.OperationalStatusError, metal3v1alpha1.ProvisionedRegistrationError, "some error", 1). + build(), + ExpectError: true, + }, + { + Scenario: "clean-after-creds-change", + Host: host(metal3v1alpha1.StateReady). + SetStatusError(metal3v1alpha1.OperationalStatusError, metal3v1alpha1.InspectionError, "some error", 1). + build(), + SecretName: "NewCreds", + }, + } + for _, tt := range tests { + t.Run(tt.Scenario, func(t *testing.T) { + prov := &mockProvisioner{} + hsm := newHostStateMachine(tt.Host, &BareMetalHostReconciler{}, prov, true) + + info := makeDefaultReconcileInfo(tt.Host) + if tt.SecretName != "" { + info.bmcCredsSecret.Name = tt.SecretName + } + + hsm.ReconcileState(info) + + if tt.ExpectError { + assert.Equal(t, tt.Host.Status.ErrorType, v1alpha1.ProvisionedRegistrationError) + assert.NotEmpty(t, tt.Host.Status.ErrorMessage) + } else { + assert.Equal(t, tt.Host.Status.OperationalStatus, v1alpha1.OperationalStatusOK) + assert.Empty(t, tt.Host.Status.ErrorType) + assert.Empty(t, tt.Host.Status.ErrorMessage) + } + }) + } +} + type hostBuilder struct { metal3v1alpha1.BareMetalHost } @@ -334,6 +387,15 @@ func (hb *hostBuilder) SetImageURL(url string) *hostBuilder { return hb } +func (hb *hostBuilder) SetStatusError(opStatus metal3v1alpha1.OperationalStatus, errType metal3v1alpha1.ErrorType, errMsg string, errCount int) *hostBuilder { + hb.Status.OperationalStatus = opStatus + hb.Status.ErrorType = errType + hb.Status.ErrorMessage = errMsg + hb.Status.ErrorCount = errCount + + return hb +} + func makeDefaultReconcileInfo(host *metal3v1alpha1.BareMetalHost) *reconcileInfo { return &reconcileInfo{ log: logf.Log.WithName("controllers").WithName("BareMetalHost").WithName("host_state_machine"),