diff --git a/apis/metal3.io/v1alpha1/baremetalhost_types.go b/apis/metal3.io/v1alpha1/baremetalhost_types.go index f6296924fe..01873401f2 100644 --- a/apis/metal3.io/v1alpha1/baremetalhost_types.go +++ b/apis/metal3.io/v1alpha1/baremetalhost_types.go @@ -136,6 +136,9 @@ const ( // PowerManagementError is an error condition occurring when the // controller is unable to modify the power state of the Host. PowerManagementError ErrorType = "power management error" + // TooManyHostsError is an error condition occurring when + // there are too many hosts simultaneously being provisioned + TooManyHostsError ErrorType = "too many hosts error" ) // ProvisioningState defines the states the provisioner will report @@ -520,7 +523,7 @@ type BareMetalHostStatus struct { // ErrorType indicates the type of failure encountered when the // OperationalStatus is OperationalStatusError - // +kubebuilder:validation:Enum=registration error;inspection error;provisioning error;power management error + // +kubebuilder:validation:Enum=registration error;inspection error;provisioning error;power management error;too many hosts error ErrorType ErrorType `json:"errorType,omitempty"` // LastUpdated identifies when this status was last observed. diff --git a/config/crd/bases/metal3.io_baremetalhosts.yaml b/config/crd/bases/metal3.io_baremetalhosts.yaml index 24b64bb8a6..31ec247c75 100644 --- a/config/crd/bases/metal3.io_baremetalhosts.yaml +++ b/config/crd/bases/metal3.io_baremetalhosts.yaml @@ -262,6 +262,7 @@ spec: - inspection error - provisioning error - power management error + - too many hosts error type: string goodCredentials: description: the last credentials we were able to validate as working diff --git a/config/render/capm3.yaml b/config/render/capm3.yaml index 1e9f259eee..722c87c44a 100644 --- a/config/render/capm3.yaml +++ b/config/render/capm3.yaml @@ -260,6 +260,7 @@ spec: - inspection error - provisioning error - power management error + - too many hosts error type: string goodCredentials: description: the last credentials we were able to validate as working diff --git a/controllers/metal3.io/baremetalhost_controller.go b/controllers/metal3.io/baremetalhost_controller.go index 5aab69c3ec..13a114a409 100644 --- a/controllers/metal3.io/baremetalhost_controller.go +++ b/controllers/metal3.io/baremetalhost_controller.go @@ -22,6 +22,7 @@ import ( "os" "strconv" "strings" + "sync" "time" "github.com/go-logr/logr" @@ -50,9 +51,21 @@ const ( unmanagedRetryDelay = time.Minute * 10 provisionerNotReadyRetryDelay = time.Second * 30 rebootAnnotationPrefix = "reboot.metal3.io" + maxProvisioningHostsDefault = 20 ) +var maxProvisioningHosts int + func init() { + maxProvisioningHosts = maxProvisioningHostsDefault + if maxHostsStr := os.Getenv("MAX_CONCURRENT_PROVISIONING_HOSTS"); maxHostsStr != "" { + value, err := strconv.Atoi(maxHostsStr) + if err != nil { + fmt.Fprintf(os.Stderr, "Cannot start: Invalid value set for variable MAX_CONCURRENT_PROVISIONING_HOSTS=%s", maxHostsStr) + os.Exit(1) + } + maxProvisioningHosts = value + } } // BareMetalHostReconciler reconciles a BareMetalHost object @@ -61,6 +74,22 @@ type BareMetalHostReconciler struct { Log logr.Logger Scheme *runtime.Scheme ProvisionerFactory provisioner.Factory + + currentProvisioningHosts map[string]struct{} + mu sync.Mutex +} + +// NewBareMetalHostReconciler creates a new reconciler instance +func NewBareMetalHostReconciler(client client.Client, scheme *runtime.Scheme, factory provisioner.Factory) *BareMetalHostReconciler { + r := BareMetalHostReconciler{ + Client: client, + Log: ctrl.Log.WithName("controllers").WithName("BareMetalHost"), + Scheme: scheme, + ProvisionerFactory: factory, + + currentProvisioningHosts: make(map[string]struct{}), + } + return &r } // Instead of passing a zillion arguments to the action of a phase, @@ -914,9 +943,62 @@ func hostHasFinalizer(host *metal3v1alpha1.BareMetalHost) bool { return utils.StringInList(host.Finalizers, metal3v1alpha1.BareMetalHostFinalizer) } +// Resync the set at the startup +func (r *BareMetalHostReconciler) syncProvisioningHosts(mgr ctrl.Manager) { + hosts := &metal3v1alpha1.BareMetalHostList{} + err := mgr.GetAPIReader().List(context.TODO(), hosts) + if err != nil { + os.Exit(1) + } + + for _, host := range hosts.Items { + r.checkProvisioningHost(host, host.Status.Provisioning.State) + } +} + +// Check if there's a free slot for hosts to be provisioned +func (r *BareMetalHostReconciler) checkProvisioningHost(host metal3v1alpha1.BareMetalHost, state metal3v1alpha1.ProvisioningState) bool { + r.mu.Lock() + defer r.mu.Unlock() + + switch state { + case metal3v1alpha1.StateInspecting, metal3v1alpha1.StateProvisioning: + if len(r.currentProvisioningHosts) >= maxProvisioningHosts { + return false + } + r.currentProvisioningHosts[host.Name] = struct{}{} + default: + delete(r.currentProvisioningHosts, host.Name) + } + + return true +} + +// Check if there's a free slot for hosts that have been previously delayed +func (r *BareMetalHostReconciler) checkDelayedHost(info *reconcileInfo) (bool, actionResult) { + r.mu.Lock() + defer r.mu.Unlock() + + switch info.host.Status.ErrorType { + case metal3v1alpha1.TooManyHostsError: + if len(r.currentProvisioningHosts) >= maxProvisioningHosts { + // No available slots, current action delayed + return true, recordActionFailure(info, metal3v1alpha1.TooManyHostsError, "Delayed, too many hosts") + } + + // A slot could be available, let's cleanup the host and retry + info.host.ClearError() + return true, actionContinue{} + } + + return false, nil +} + // SetupWithManager reigsters the reconciler to be run by the manager func (r *BareMetalHostReconciler) SetupWithManager(mgr ctrl.Manager) error { + r.syncProvisioningHosts(mgr) + maxConcurrentReconciles := 3 if mcrEnv, ok := os.LookupEnv("BMO_CONCURRENCY"); ok { mcr, err := strconv.Atoi(mcrEnv) diff --git a/controllers/metal3.io/baremetalhost_controller_test.go b/controllers/metal3.io/baremetalhost_controller_test.go index 25e28b9a22..cc68e430de 100644 --- a/controllers/metal3.io/baremetalhost_controller_test.go +++ b/controllers/metal3.io/baremetalhost_controller_test.go @@ -95,12 +95,7 @@ func newTestReconcilerWithProvisionerFactory(factory provisioner.Factory, initOb bmcSecret := newBMCCredsSecret(defaultSecretName, "User", "Pass") c.Create(goctx.TODO(), bmcSecret) - return &BareMetalHostReconciler{ - Client: c, - Scheme: scheme.Scheme, - ProvisionerFactory: factory, - Log: ctrl.Log.WithName("controllers").WithName("BareMetalHost"), - } + return NewBareMetalHostReconciler(c, scheme.Scheme, factory) } func newTestReconciler(initObjs ...runtime.Object) *BareMetalHostReconciler { diff --git a/controllers/metal3.io/demo_test.go b/controllers/metal3.io/demo_test.go index ba68f96e22..cc8658db93 100644 --- a/controllers/metal3.io/demo_test.go +++ b/controllers/metal3.io/demo_test.go @@ -8,7 +8,6 @@ import ( "k8s.io/client-go/kubernetes/scheme" - ctrl "sigs.k8s.io/controller-runtime" fakeclient "sigs.k8s.io/controller-runtime/pkg/client/fake" "sigs.k8s.io/controller-runtime/pkg/reconcile" @@ -24,12 +23,7 @@ func newDemoReconciler(initObjs ...runtime.Object) *BareMetalHostReconciler { bmcSecret := newSecret(defaultSecretName, map[string]string{"username": "User", "password": "Pass"}) c.Create(goctx.TODO(), bmcSecret) - return &BareMetalHostReconciler{ - Client: c, - Scheme: scheme.Scheme, - ProvisionerFactory: demo.New, - Log: ctrl.Log.WithName("controller").WithName("BareMetalHost"), - } + return NewBareMetalHostReconciler(c, scheme.Scheme, demo.New) } // TestDemoRegistrationError tests that a host with the right name reports diff --git a/controllers/metal3.io/host_state_machine.go b/controllers/metal3.io/host_state_machine.go index 3427fa47bd..0af8cbe460 100644 --- a/controllers/metal3.io/host_state_machine.go +++ b/controllers/metal3.io/host_state_machine.go @@ -73,8 +73,13 @@ func recordStateEnd(info *reconcileInfo, host *metal3v1alpha1.BareMetalHost, sta } func (hsm *hostStateMachine) updateHostStateFrom(initialState metal3v1alpha1.ProvisioningState, - info *reconcileInfo) { + info *reconcileInfo) bool { if hsm.NextState != initialState { + + if !hsm.Reconciler.checkProvisioningHost(*info.host, hsm.NextState) { + return false + } + info.log.Info("changing provisioning state", "old", initialState, "new", hsm.NextState) @@ -104,11 +109,22 @@ func (hsm *hostStateMachine) updateHostStateFrom(initialState metal3v1alpha1.Pro } } } + + return true } -func (hsm *hostStateMachine) ReconcileState(info *reconcileInfo) actionResult { +func (hsm *hostStateMachine) ReconcileState(info *reconcileInfo) (actionRes actionResult) { initialState := hsm.Host.Status.Provisioning.State - defer hsm.updateHostStateFrom(initialState, info) + defer func() { + if !hsm.updateHostStateFrom(initialState, info) { + actionRes = recordActionFailure(info, metal3v1alpha1.TooManyHostsError, "Delayed, too many hosts") + } + }() + + // Check if any immediate action is required for a delayed host + if delayed, actionRes := hsm.Reconciler.checkDelayedHost(info); delayed { + return actionRes + } if hsm.checkInitiateDelete() { info.log.Info("Initiating host deletion") diff --git a/controllers/metal3.io/host_state_machine_test.go b/controllers/metal3.io/host_state_machine_test.go index 8df390c0f2..13632b246a 100644 --- a/controllers/metal3.io/host_state_machine_test.go +++ b/controllers/metal3.io/host_state_machine_test.go @@ -205,7 +205,7 @@ func TestErrorCountIncreasedWhenProvisionerFails(t *testing.T) { for _, tt := range tests { t.Run(tt.Scenario, func(t *testing.T) { prov := &mockProvisioner{} - hsm := newHostStateMachine(tt.Host, &BareMetalHostReconciler{}, prov) + hsm := newHostStateMachine(tt.Host, newTestReconciler(), prov) info := makeDefaultReconcileInfo(tt.Host) prov.setNextError("some error") @@ -220,7 +220,7 @@ func TestErrorCountIncreasedWhenProvisionerFails(t *testing.T) { func TestErrorCountIncreasedWhenRegistrationFails(t *testing.T) { bmh := host(metal3v1alpha1.StateRegistering).build() prov := &mockProvisioner{} - hsm := newHostStateMachine(bmh, &BareMetalHostReconciler{}, prov) + hsm := newHostStateMachine(bmh, newTestReconciler(), prov) info := makeDefaultReconcileInfo(bmh) bmh.Status.GoodCredentials = metal3v1alpha1.CredentialsStatus{} @@ -265,7 +265,7 @@ func TestErrorCountCleared(t *testing.T) { for _, tt := range tests { t.Run(tt.Scenario, func(t *testing.T) { prov := &mockProvisioner{} - hsm := newHostStateMachine(tt.Host, &BareMetalHostReconciler{}, prov) + hsm := newHostStateMachine(tt.Host, newTestReconciler(), prov) info := makeDefaultReconcileInfo(tt.Host) info.host.Status.ErrorCount = 1 diff --git a/main.go b/main.go index 657ea93284..dd7b14a8dd 100644 --- a/main.go +++ b/main.go @@ -128,12 +128,8 @@ func main() { ironic.LogStartup() } - if err = (&metal3iocontroller.BareMetalHostReconciler{ - Client: mgr.GetClient(), - Log: ctrl.Log.WithName("controllers").WithName("BareMetalHost"), - Scheme: mgr.GetScheme(), - ProvisionerFactory: provisionerFactory, - }).SetupWithManager(mgr); err != nil { + reconciler := metal3iocontroller.NewBareMetalHostReconciler(mgr.GetClient(), mgr.GetScheme(), provisionerFactory) + if err = reconciler.SetupWithManager(mgr); err != nil { setupLog.Error(err, "unable to create controller", "controller", "BareMetalHost") os.Exit(1) }