diff --git a/apis/metal3.io/v1alpha1/baremetalhost_types.go b/apis/metal3.io/v1alpha1/baremetalhost_types.go index 7ac672efbe..98db16bef6 100644 --- a/apis/metal3.io/v1alpha1/baremetalhost_types.go +++ b/apis/metal3.io/v1alpha1/baremetalhost_types.go @@ -116,6 +116,8 @@ const ( // OperationalStatusError is the status value for when the host // has any sort of error. OperationalStatusError OperationalStatus = "error" + + OperationalStatusDelayed = "delayed" ) // ErrorType indicates the class of problem that has caused the Host resource @@ -470,6 +472,18 @@ type CredentialsStatus struct { Version string `json:"credentialsVersion,omitempty"` } +// RebootMode defines known variations of reboot modes +type RebootMode string + +const ( + RebootModeHard RebootMode = "hard" + RebootModeSoft RebootMode = "soft" +) + +type RebootAnnotationArguments struct { + Mode RebootMode `json:"mode"` +} + // Match compares the saved status information with the name and // content of a secret object. func (cs CredentialsStatus) Match(secret corev1.Secret) bool { @@ -519,7 +533,7 @@ type BareMetalHostStatus struct { // after modifying this file // OperationalStatus holds the status of the host - // +kubebuilder:validation:Enum="";OK;discovered;error + // +kubebuilder:validation:Enum="";OK;discovered;error;delayed OperationalStatus OperationalStatus `json:"operationalStatus"` // ErrorType indicates the type of failure encountered when the diff --git a/apis/metal3.io/v1alpha1/zz_generated.deepcopy.go b/apis/metal3.io/v1alpha1/zz_generated.deepcopy.go index 928d79202c..499b596bfc 100644 --- a/apis/metal3.io/v1alpha1/zz_generated.deepcopy.go +++ b/apis/metal3.io/v1alpha1/zz_generated.deepcopy.go @@ -393,6 +393,21 @@ func (in *ProvisionStatus) DeepCopy() *ProvisionStatus { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *RebootAnnotationArguments) DeepCopyInto(out *RebootAnnotationArguments) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RebootAnnotationArguments. +func (in *RebootAnnotationArguments) DeepCopy() *RebootAnnotationArguments { + if in == nil { + return nil + } + out := new(RebootAnnotationArguments) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *RootDeviceHints) DeepCopyInto(out *RootDeviceHints) { *out = *in diff --git a/config/crd/bases/metal3.io_baremetalhosts.yaml b/config/crd/bases/metal3.io_baremetalhosts.yaml index fe52ca031c..b9f58bd9cb 100644 --- a/config/crd/bases/metal3.io_baremetalhosts.yaml +++ b/config/crd/bases/metal3.io_baremetalhosts.yaml @@ -524,6 +524,7 @@ spec: - OK - discovered - error + - delayed type: string poweredOn: description: indicator for whether or not the host is powered on diff --git a/config/render/capm3.yaml b/config/render/capm3.yaml index d10547a31c..5524ce8c71 100644 --- a/config/render/capm3.yaml +++ b/config/render/capm3.yaml @@ -522,6 +522,7 @@ spec: - OK - discovered - error + - delayed type: string poweredOn: description: indicator for whether or not the host is powered on diff --git a/controllers/metal3.io/action_result.go b/controllers/metal3.io/action_result.go index a746f05c98..0e628c2dd7 100644 --- a/controllers/metal3.io/action_result.go +++ b/controllers/metal3.io/action_result.go @@ -12,7 +12,9 @@ import ( "github.com/metal3-io/baremetal-operator/pkg/provisioner" ) -const maxBackOffCount = 10 +// This is an upper limit for the ErrorCount, so that the max backoff +// timeout will not exceed (roughly) 8 hours +const maxBackOffCount = 9 func init() { rand.Seed(time.Now().UTC().UnixNano()) @@ -54,6 +56,18 @@ func (r actionUpdate) Dirty() bool { return true } +// actionDelayed it's the same of an actionUpdate, but the requeue time +// 
is calculated using a fixed backoff with jitter +type actionDelayed struct { + actionUpdate +} + +func (r actionDelayed) Result() (result reconcile.Result, err error) { + result.RequeueAfter = calculateBackoff(1) + result.Requeue = true + return +} + // actionComplete is a result indicating that the current action has completed, // and that the resource should transition to the next state. type actionComplete struct { @@ -110,6 +124,16 @@ type actionFailed struct { errorCount int } +// Distribution sample for errorCount values: +// 1 [1m, 2m] +// 2 [2m, 4m] +// 3 [4m, 8m] +// 4 [8m, 16m] +// 5 [16m, 32m] +// 6 [32m, 1h4m] +// 7 [1h4m, 2h8m] +// 8 [2h8m, 4h16m] +// 9 [4h16m, 8h32m] func calculateBackoff(errorCount int) time.Duration { if errorCount > maxBackOffCount { @@ -119,7 +143,7 @@ func calculateBackoff(errorCount int) time.Duration { base := math.Exp2(float64(errorCount)) /* #nosec */ backOff := base - (rand.Float64() * base * 0.5) - backOffDuration := time.Minute * time.Duration(backOff) + backOffDuration := time.Duration(float64(time.Minute) * backOff) return backOffDuration } diff --git a/controllers/metal3.io/action_result_test.go b/controllers/metal3.io/action_result_test.go index cb2fd9dc3a..770a87ee3f 100644 --- a/controllers/metal3.io/action_result_test.go +++ b/controllers/metal3.io/action_result_test.go @@ -11,7 +11,7 @@ import ( func TestBackoffIncrements(t *testing.T) { var backOff time.Duration - for i := 0; i < maxBackOffCount; i++ { + for i := 1; i <= maxBackOffCount; i++ { prev := backOff backOff = calculateBackoff(i) diff --git a/controllers/metal3.io/baremetalhost_controller.go b/controllers/metal3.io/baremetalhost_controller.go index d5967ce282..a63f83e346 100644 --- a/controllers/metal3.io/baremetalhost_controller.go +++ b/controllers/metal3.io/baremetalhost_controller.go @@ -51,11 +51,9 @@ const ( unmanagedRetryDelay = time.Minute * 10 provisionerNotReadyRetryDelay = time.Second * 30 rebootAnnotationPrefix = "reboot.metal3.io" + 
inspectAnnotationPrefix = "inspect.metal3.io" ) -func init() { -} - // BareMetalHostReconciler reconciles a BareMetalHost object type BareMetalHostReconciler struct { client.Client @@ -214,7 +212,6 @@ func (r *BareMetalHostReconciler) Reconcile(request ctrl.Request) (result ctrl.R } ready, err := prov.IsReady() - if err != nil { return ctrl.Result{}, errors.Wrap(err, "failed to check services availability") } @@ -295,6 +292,15 @@ func recordActionFailure(info *reconcileInfo, errorType metal3v1alpha1.ErrorType return actionFailed{dirty: true, ErrorType: errorType, errorCount: info.host.Status.ErrorCount} } +func recordActionDelayed(info *reconcileInfo) actionResult { + + counter := delayedProvisioningHostCounters.With(hostMetricLabels(info.request)) + info.postSaveCallbacks = append(info.postSaveCallbacks, counter.Inc) + + info.host.SetOperationalStatus(metal3v1alpha1.OperationalStatusDelayed) + return actionDelayed{} +} + func (r *BareMetalHostReconciler) credentialsErrorResult(err error, request ctrl.Request, host *metal3v1alpha1.BareMetalHost) (ctrl.Result, error) { switch err.(type) { // In the event a credential secret is defined, but we cannot find it @@ -333,13 +339,40 @@ func (r *BareMetalHostReconciler) credentialsErrorResult(err error, request ctrl } // hasRebootAnnotation checks for existence of reboot annotations and returns true if at least one exist -func hasRebootAnnotation(host *metal3v1alpha1.BareMetalHost) bool { - for annotation := range host.Annotations { +func hasRebootAnnotation(info *reconcileInfo) (hasReboot bool, rebootMode metal3v1alpha1.RebootMode) { + rebootMode = metal3v1alpha1.RebootModeSoft + + for annotation, value := range info.host.GetAnnotations() { if isRebootAnnotation(annotation) { - return true + hasReboot = true + newRebootMode := getRebootMode(value, info) + // If any annotation has asked for a hard reboot, that + // mode takes precedence. 
+ if newRebootMode == metal3v1alpha1.RebootModeHard { + rebootMode = newRebootMode + } + // Don't use a break here as we may have multiple clients setting + // reboot annotations and we always want hard requests honoured } } - return false + return +} + +func getRebootMode(annotation string, info *reconcileInfo) metal3v1alpha1.RebootMode { + + if annotation == "" { + info.log.Info("No reboot annotation value specified, assuming soft-reboot.") + return metal3v1alpha1.RebootModeSoft + } + + annotations := metal3v1alpha1.RebootAnnotationArguments{} + err := json.Unmarshal([]byte(annotation), &annotations) + if err != nil { + info.publishEvent("InvalidAnnotationValue", fmt.Sprintf("could not parse reboot annotation (%s) - invalid json, assuming soft-reboot", annotation)) + info.log.Info(fmt.Sprintf("Could not parse reboot annotation (%q) - invalid json, assuming soft-reboot", annotation)) + return metal3v1alpha1.RebootModeSoft + } + return annotations.Mode } // isRebootAnnotation returns true if the provided annotation is a reboot annotation (either suffixed or not) @@ -359,6 +392,16 @@ func clearRebootAnnotations(host *metal3v1alpha1.BareMetalHost) (dirty bool) { return } +// inspectionDisabled checks for existence of inspect.metal3.io=disabled +// which means we don't inspect even in Inspecting state +func inspectionDisabled(host *metal3v1alpha1.BareMetalHost) bool { + annotations := host.GetAnnotations() + if annotations[inspectAnnotationPrefix] == "disabled" { + return true + } + return false +} + // clearError removes any existing error message. func clearError(host *metal3v1alpha1.BareMetalHost) (dirty bool) { dirty = host.SetOperationalStatus(metal3v1alpha1.OperationalStatusOK) @@ -491,6 +534,13 @@ func (r *BareMetalHostReconciler) registerHost(prov provisioner.Provisioner, inf // Ensure we have the information about the hardware on the host. 
func (r *BareMetalHostReconciler) actionInspecting(prov provisioner.Provisioner, info *reconcileInfo) actionResult { + + if inspectionDisabled(info.host) { + info.log.Info("inspection disabled by annotation") + info.publishEvent("InspectionSkipped", "disabled by annotation") + return actionComplete{} + } + info.log.Info("inspecting hardware") provResult, details, err := prov.InspectHardware(info.host.Status.ErrorType == metal3v1alpha1.InspectionError) @@ -701,7 +751,9 @@ func (r *BareMetalHostReconciler) manageHostPower(prov provisioner.Provisioner, provState := info.host.Status.Provisioning.State isProvisioned := provState == metal3v1alpha1.StateProvisioned || provState == metal3v1alpha1.StateExternallyProvisioned - if hasRebootAnnotation(info.host) && isProvisioned { + + desiredReboot, desiredRebootMode := hasRebootAnnotation(info) + if desiredReboot && isProvisioned { desiredPowerOnState = false } @@ -716,12 +768,13 @@ func (r *BareMetalHostReconciler) manageHostPower(prov provisioner.Provisioner, info.log.Info("power state change needed", "expected", desiredPowerOnState, "actual", info.host.Status.PoweredOn, + "reboot mode", desiredRebootMode, "reboot process", desiredPowerOnState != info.host.Spec.Online) if desiredPowerOnState { provResult, err = prov.PowerOn() } else { - provResult, err = prov.PowerOff() + provResult, err = prov.PowerOff(desiredRebootMode) } if err != nil { return actionError{errors.Wrap(err, "failed to manage power state of host")} diff --git a/controllers/metal3.io/baremetalhost_controller_test.go b/controllers/metal3.io/baremetalhost_controller_test.go index e00d8e7dc4..282b4daf0f 100644 --- a/controllers/metal3.io/baremetalhost_controller_test.go +++ b/controllers/metal3.io/baremetalhost_controller_test.go @@ -20,6 +20,7 @@ import ( ctrl "sigs.k8s.io/controller-runtime" fakeclient "sigs.k8s.io/controller-runtime/pkg/client/fake" "sigs.k8s.io/controller-runtime/pkg/event" + logf "sigs.k8s.io/controller-runtime/pkg/log" 
"sigs.k8s.io/controller-runtime/pkg/reconcile" metal3v1alpha1 "github.com/metal3-io/baremetal-operator/apis/metal3.io/v1alpha1" @@ -296,6 +297,25 @@ func TestPause(t *testing.T) { ) } +// TestInspectDisabled ensures that Inspection is skipped when disabled +func TestInspectDisabled(t *testing.T) { + host := newDefaultHost(t) + host.Annotations = map[string]string{ + inspectAnnotationPrefix: "disabled", + } + r := newTestReconciler(host) + waitForProvisioningState(t, r, host, metal3v1alpha1.StateMatchProfile) + assert.Nil(t, host.Status.HardwareDetails) +} + +// TestInspectEnabled ensures that Inspection is completed when not disabled +func TestInspectEnabled(t *testing.T) { + host := newDefaultHost(t) + r := newTestReconciler(host) + waitForProvisioningState(t, r, host, metal3v1alpha1.StateMatchProfile) + assert.NotNil(t, host.Status.HardwareDetails) +} + // TestAddFinalizers ensures that the finalizers for the host are // updated as part of reconciling it. func TestAddFinalizers(t *testing.T) { @@ -330,43 +350,106 @@ func TestSetLastUpdated(t *testing.T) { ) } -func TestHasRebootAnnotation(t *testing.T) { +func TestInspectionDisabledAnnotation(t *testing.T) { host := newDefaultHost(t) host.Annotations = make(map[string]string) - if hasRebootAnnotation(host) { - t.Fail() + assert.False(t, inspectionDisabled(host)) + + host.Annotations[inspectAnnotationPrefix] = "disabled" + assert.True(t, inspectionDisabled(host)) +} + +func makeReconcileInfo(host *metal3v1alpha1.BareMetalHost) *reconcileInfo { + return &reconcileInfo{ + log: logf.Log.WithName("controllers").WithName("BareMetalHost").WithName("baremetal_controller"), + host: host, } +} + +func TestHasRebootAnnotation(t *testing.T) { + host := newDefaultHost(t) + info := makeReconcileInfo(host) + host.Annotations = make(map[string]string) + + hasReboot, rebootMode := hasRebootAnnotation(info) + assert.False(t, hasReboot) + assert.Equal(t, metal3v1alpha1.RebootModeSoft, rebootMode) host.Annotations = 
make(map[string]string) suffixedAnnotation := rebootAnnotationPrefix + "/foo" host.Annotations[suffixedAnnotation] = "" - if !hasRebootAnnotation(host) { - t.Fail() - } + hasReboot, rebootMode = hasRebootAnnotation(info) + assert.True(t, hasReboot) + assert.Equal(t, metal3v1alpha1.RebootModeSoft, rebootMode) delete(host.Annotations, suffixedAnnotation) host.Annotations[rebootAnnotationPrefix] = "" - if !hasRebootAnnotation(host) { - t.Fail() - } + hasReboot, rebootMode = hasRebootAnnotation(info) + assert.True(t, hasReboot) + assert.Equal(t, metal3v1alpha1.RebootModeSoft, rebootMode) host.Annotations[suffixedAnnotation] = "" - if !hasRebootAnnotation(host) { - t.Fail() - } + hasReboot, rebootMode = hasRebootAnnotation(info) + assert.True(t, hasReboot) + assert.Equal(t, metal3v1alpha1.RebootModeSoft, rebootMode) //two suffixed annotations to simulate multiple clients host.Annotations[suffixedAnnotation+"bar"] = "" - if !hasRebootAnnotation(host) { - t.Fail() - } + hasReboot, rebootMode = hasRebootAnnotation(info) + assert.True(t, hasReboot) + assert.Equal(t, metal3v1alpha1.RebootModeSoft, rebootMode) +} + +func TestHasHardRebootAnnotation(t *testing.T) { + host := newDefaultHost(t) + info := makeReconcileInfo(host) + host.Annotations = make(map[string]string) + + hasReboot, rebootMode := hasRebootAnnotation(info) + assert.False(t, hasReboot) + assert.Equal(t, metal3v1alpha1.RebootModeSoft, rebootMode) + + host.Annotations = make(map[string]string) + suffixedAnnotation := rebootAnnotationPrefix + "/foo/" + host.Annotations[suffixedAnnotation] = "{\"mode\": \"hard\"}" + + hasReboot, rebootMode = hasRebootAnnotation(info) + assert.True(t, hasReboot) + assert.Equal(t, metal3v1alpha1.RebootModeHard, rebootMode) + + delete(host.Annotations, suffixedAnnotation) + host.Annotations[rebootAnnotationPrefix] = "{\"mode\": \"soft\"}" + + hasReboot, rebootMode = hasRebootAnnotation(info) + assert.True(t, hasReboot) + assert.Equal(t, metal3v1alpha1.RebootModeSoft, rebootMode) + 
delete(host.Annotations, suffixedAnnotation) + host.Annotations[rebootAnnotationPrefix] = "{\"bad\"= \"json\"]" + + hasReboot, rebootMode = hasRebootAnnotation(info) + assert.True(t, hasReboot) + assert.Equal(t, metal3v1alpha1.RebootModeSoft, rebootMode) + + host.Annotations[suffixedAnnotation] = "" + + hasReboot, rebootMode = hasRebootAnnotation(info) + assert.True(t, hasReboot) + assert.Equal(t, metal3v1alpha1.RebootModeSoft, rebootMode) + + //two suffixed annotations to simulate multiple clients + + host.Annotations[suffixedAnnotation+"bar"] = "{\"mode\": \"hard\"}" + + hasReboot, rebootMode = hasRebootAnnotation(info) + assert.True(t, hasReboot) + assert.Equal(t, metal3v1alpha1.RebootModeHard, rebootMode) } // TestRebootWithSuffixlessAnnotation tests full reboot cycle with suffixless @@ -1290,12 +1373,14 @@ func TestUpdateEventHandler(t *testing.T) { func TestErrorCountIncrementsAlways(t *testing.T) { + errorTypes := []metal3v1alpha1.ErrorType{metal3v1alpha1.RegistrationError, metal3v1alpha1.InspectionError, metal3v1alpha1.ProvisioningError, metal3v1alpha1.PowerManagementError} + b := &metal3v1alpha1.BareMetalHost{} assert.Equal(t, b.Status.ErrorCount, 0) - setErrorMessage(b, metal3v1alpha1.RegistrationError, "An error message") - assert.Equal(t, b.Status.ErrorCount, 1) - - setErrorMessage(b, metal3v1alpha1.InspectionError, "Another error message") - assert.Equal(t, b.Status.ErrorCount, 2) + for _, c := range errorTypes { + before := b.Status.ErrorCount + setErrorMessage(b, c, "An error message") + assert.Equal(t, before+1, b.Status.ErrorCount) + } } diff --git a/controllers/metal3.io/host_state_machine.go b/controllers/metal3.io/host_state_machine.go index 45ba5bb096..4f9a1c77f0 100644 --- a/controllers/metal3.io/host_state_machine.go +++ b/controllers/metal3.io/host_state_machine.go @@ -6,6 +6,7 @@ import ( metal3v1alpha1 "github.com/metal3-io/baremetal-operator/apis/metal3.io/v1alpha1" "github.com/metal3-io/baremetal-operator/pkg/provisioner" + 
"github.com/pkg/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) @@ -77,9 +78,33 @@ func recordStateEnd(info *reconcileInfo, host *metal3v1alpha1.BareMetalHost, sta return } +func (hsm *hostStateMachine) ensureProvisioningCapacity(info *reconcileInfo) actionResult { + hasCapacity, err := hsm.Provisioner.HasProvisioningCapacity() + if err != nil { + return actionError{errors.Wrap(err, "failed to get hosts currently being provisioned")} + } + if !hasCapacity { + return recordActionDelayed(info) + } + + return nil +} + func (hsm *hostStateMachine) updateHostStateFrom(initialState metal3v1alpha1.ProvisioningState, - info *reconcileInfo) { + info *reconcileInfo) actionResult { if hsm.NextState != initialState { + + // Check if there is a free slot available when trying to + // provision an host - if not the action will be delayed. + // The check is limited to only the provisioning states to + // avoid putting an excessive pressure on the provisioner + switch hsm.NextState { + case metal3v1alpha1.StateInspecting, metal3v1alpha1.StateProvisioning: + if actionRes := hsm.ensureProvisioningCapacity(info); actionRes != nil { + return actionRes + } + } + info.log.Info("changing provisioning state", "old", initialState, "new", hsm.NextState) @@ -109,11 +134,47 @@ func (hsm *hostStateMachine) updateHostStateFrom(initialState metal3v1alpha1.Pro } } } + + return nil } -func (hsm *hostStateMachine) ReconcileState(info *reconcileInfo) actionResult { +func (hsm *hostStateMachine) checkDelayedHost(info *reconcileInfo) actionResult { + + // Check if there's a free slot for hosts that have been previously delayed + if info.host.Status.OperationalStatus == metal3v1alpha1.OperationalStatusDelayed { + if actionRes := hsm.ensureProvisioningCapacity(info); actionRes != nil { + return actionRes + } + + // A slot is available, let's cleanup the status and retry + clearError(info.host) + return actionUpdate{} + } + + // Make sure the check is re-applied when provisioning an + // host not 
yet tracked by the provisioner + switch info.host.Status.Provisioning.State { + case metal3v1alpha1.StateInspecting, metal3v1alpha1.StateProvisioning: + if actionRes := hsm.ensureProvisioningCapacity(info); actionRes != nil { + return actionRes + } + } + + return nil +} + +func (hsm *hostStateMachine) ReconcileState(info *reconcileInfo) (actionRes actionResult) { initialState := hsm.Host.Status.Provisioning.State - defer hsm.updateHostStateFrom(initialState, info) + + defer func() { + if overrideAction := hsm.updateHostStateFrom(initialState, info); overrideAction != nil { + actionRes = overrideAction + } + }() + + if delayedResult := hsm.checkDelayedHost(info); delayedResult != nil { + return delayedResult + } if hsm.checkInitiateDelete() { info.log.Info("Initiating host deletion") diff --git a/controllers/metal3.io/host_state_machine_test.go b/controllers/metal3.io/host_state_machine_test.go index 58d9a3d233..a1dc1cb70b 100644 --- a/controllers/metal3.io/host_state_machine_test.go +++ b/controllers/metal3.io/host_state_machine_test.go @@ -2,6 +2,7 @@ package controllers import ( "testing" + "time" metal3v1alpha1 "github.com/metal3-io/baremetal-operator/apis/metal3.io/v1alpha1" "github.com/metal3-io/baremetal-operator/pkg/bmc" @@ -11,6 +12,7 @@ import ( "github.com/metal3-io/baremetal-operator/apis/metal3.io/v1alpha1" "github.com/metal3-io/baremetal-operator/pkg/provisioner" + promutil "github.com/prometheus/client_golang/prometheus/testutil" corev1 "k8s.io/api/core/v1" ctrl "sigs.k8s.io/controller-runtime" logf "sigs.k8s.io/controller-runtime/pkg/log" @@ -23,6 +25,126 @@ func testStateMachine(host *metal3v1alpha1.BareMetalHost) *hostStateMachine { return newHostStateMachine(host, r, p, true) } +func TestProvisioningCapacity(t *testing.T) { + testCases := []struct { + Scenario string + + HasProvisioningCapacity bool + Host *metal3v1alpha1.BareMetalHost + + ExpectedProvisioningState metal3v1alpha1.ProvisioningState + ExpectedDelayed bool + }{ + { + Scenario: 
"transition-to-inspecting-delayed", + Host: host(metal3v1alpha1.StateRegistering).build(), + HasProvisioningCapacity: false, + + ExpectedProvisioningState: metal3v1alpha1.StateRegistering, + ExpectedDelayed: true, + }, + { + Scenario: "transition-to-provisioning-delayed", + Host: host(metal3v1alpha1.StateReady).build(), + HasProvisioningCapacity: false, + + ExpectedProvisioningState: metal3v1alpha1.StateReady, + ExpectedDelayed: true, + }, + { + Scenario: "transition-to-inspecting-ok", + Host: host(metal3v1alpha1.StateRegistering).build(), + HasProvisioningCapacity: true, + + ExpectedProvisioningState: metal3v1alpha1.StateInspecting, + ExpectedDelayed: false, + }, + { + Scenario: "transition-to-provisioning-ok", + Host: host(metal3v1alpha1.StateReady).build(), + HasProvisioningCapacity: true, + + ExpectedProvisioningState: metal3v1alpha1.StateProvisioning, + ExpectedDelayed: false, + }, + + { + Scenario: "already-delayed-delayed", + Host: host(metal3v1alpha1.StateReady).SetOperationalStatus(metal3v1alpha1.OperationalStatusDelayed).build(), + HasProvisioningCapacity: false, + + ExpectedProvisioningState: metal3v1alpha1.StateReady, + ExpectedDelayed: true, + }, + { + Scenario: "already-delayed-ok", + Host: host(metal3v1alpha1.StateReady).SetOperationalStatus(metal3v1alpha1.OperationalStatusDelayed).build(), + HasProvisioningCapacity: true, + + ExpectedProvisioningState: metal3v1alpha1.StateReady, + ExpectedDelayed: false, + }, + + { + Scenario: "untracked-inspecting-delayed", + Host: host(metal3v1alpha1.StateInspecting).build(), + HasProvisioningCapacity: false, + + ExpectedProvisioningState: metal3v1alpha1.StateInspecting, + ExpectedDelayed: true, + }, + { + Scenario: "untracked-inspecting-ok", + Host: host(metal3v1alpha1.StateInspecting).build(), + HasProvisioningCapacity: true, + + ExpectedProvisioningState: metal3v1alpha1.StateMatchProfile, + ExpectedDelayed: false, + }, + { + Scenario: "untracked-inspecting-delayed", + Host: 
host(metal3v1alpha1.StateProvisioning).build(), + HasProvisioningCapacity: false, + + ExpectedProvisioningState: metal3v1alpha1.StateProvisioning, + ExpectedDelayed: true, + }, + { + Scenario: "untracked-provisioning-ok", + Host: host(metal3v1alpha1.StateProvisioning).build(), + HasProvisioningCapacity: true, + + ExpectedProvisioningState: metal3v1alpha1.StateProvisioned, + ExpectedDelayed: false, + }, + } + + for _, tc := range testCases { + t.Run(tc.Scenario, func(t *testing.T) { + prov := newMockProvisioner() + prov.setHasProvisioningCapacity(tc.HasProvisioningCapacity) + hsm := newHostStateMachine(tc.Host, &BareMetalHostReconciler{}, prov, true) + info := makeDefaultReconcileInfo(tc.Host) + delayedProvisioningHostCounters.Reset() + + result := hsm.ReconcileState(info) + + assert.Equal(t, tc.ExpectedProvisioningState, tc.Host.Status.Provisioning.State) + assert.Equal(t, tc.ExpectedDelayed, metal3v1alpha1.OperationalStatusDelayed == tc.Host.Status.OperationalStatus, "Expected OperationalStatusDelayed") + assert.Equal(t, tc.ExpectedDelayed, assert.ObjectsAreEqual(actionDelayed{}, result), "Expected actionDelayed") + + if tc.ExpectedDelayed { + counter, _ := delayedProvisioningHostCounters.GetMetricWith(hostMetricLabels(info.request)) + initialCounterValue := promutil.ToFloat64(counter) + for _, sb := range info.postSaveCallbacks { + sb() + } + assert.Greater(t, promutil.ToFloat64(counter), initialCounterValue) + } + }) + } +} + func TestProvisioningCancelled(t *testing.T) { testCases := []struct { Scenario string @@ -175,116 +297,119 @@ func TestProvisioningCancelled(t *testing.T) { } } -func TestErrorCountIncreasedWhenProvisionerFails(t *testing.T) { +func TestErrorCountIncreasedOnActionFailure(t *testing.T) { tests := []struct { - Scenario string - Host *metal3v1alpha1.BareMetalHost + Scenario string + Host *metal3v1alpha1.BareMetalHost + ProvisionerErrorOn string }{ { - Scenario: "inspecting", - Host: host(metal3v1alpha1.StateInspecting).build(), + Scenario: 
"registration", + Host: host(metal3v1alpha1.StateRegistering).build(), + ProvisionerErrorOn: "ValidateManagementAccess", }, { - Scenario: "ready", - Host: host(metal3v1alpha1.StateReady).build(), + Scenario: "inspecting", + Host: host(metal3v1alpha1.StateInspecting).build(), + ProvisionerErrorOn: "InspectHardware", }, { - Scenario: "deprovisioning", - Host: host(metal3v1alpha1.StateDeprovisioning).build(), + Scenario: "provisioning", + Host: host(metal3v1alpha1.StateProvisioning).SetImageURL("imageSpecUrl").build(), + ProvisionerErrorOn: "Provision", }, { - Scenario: "provisioning", - Host: host(metal3v1alpha1.StateProvisioning).SetImageURL("imageSpecUrl").build(), + Scenario: "deprovisioning", + Host: host(metal3v1alpha1.StateDeprovisioning).build(), + ProvisionerErrorOn: "Deprovision", }, { - Scenario: "externallyProvisioned", - Host: host(metal3v1alpha1.StateExternallyProvisioned).SetExternallyProvisioned().build(), + Scenario: "ready-power-on", + Host: host(metal3v1alpha1.StateReady).SetStatusImageURL("imageSpecUrl").SetStatusPoweredOn(false).build(), + ProvisionerErrorOn: "PowerOn", + }, + { + Scenario: "ready-power-off", + Host: host(metal3v1alpha1.StateReady).SetStatusImageURL("imageSpecUrl").SetOnline(false).build(), + ProvisionerErrorOn: "PowerOff", + }, + { + Scenario: "externally-provisioned-adopt-failed", + Host: host(metal3v1alpha1.StateExternallyProvisioned).SetExternallyProvisioned().build(), + ProvisionerErrorOn: "Adopt", + }, + { + Scenario: "provisioned-adopt-failed", + Host: host(metal3v1alpha1.StateProvisioned).build(), + ProvisionerErrorOn: "Adopt", }, } for _, tt := range tests { t.Run(tt.Scenario, func(t *testing.T) { - prov := &mockProvisioner{} + prov := newMockProvisioner() hsm := newHostStateMachine(tt.Host, &BareMetalHostReconciler{}, prov, true) info := makeDefaultReconcileInfo(tt.Host) - prov.setNextError("some error") + prov.setNextError(tt.ProvisionerErrorOn, "some error") result := hsm.ReconcileState(info) - assert.Greater(t, 
tt.Host.Status.ErrorCount, 0) + assert.Equal(t, 1, tt.Host.Status.ErrorCount) assert.True(t, result.Dirty()) }) } } -func TestErrorCountIncreasedWhenRegistrationFails(t *testing.T) { - bmh := host(metal3v1alpha1.StateRegistering).build() - prov := &mockProvisioner{} - hsm := newHostStateMachine(bmh, &BareMetalHostReconciler{}, prov, true) - info := makeDefaultReconcileInfo(bmh) - bmh.Status.GoodCredentials = metal3v1alpha1.CredentialsStatus{} - - prov.setNextError("some error") - result := hsm.ReconcileState(info) - - assert.Greater(t, bmh.Status.ErrorCount, 0) - assert.True(t, result.Dirty()) -} - -func TestErrorCountCleared(t *testing.T) { +func TestErrorCountClearedOnStateTransition(t *testing.T) { tests := []struct { Scenario string Host *metal3v1alpha1.BareMetalHost + TargetState metal3v1alpha1.ProvisioningState PreserveErrorCountOnComplete bool }{ { - Scenario: "registering", - Host: host(metal3v1alpha1.StateRegistering).build(), + Scenario: "registering-to-inspecting", + Host: host(metal3v1alpha1.StateRegistering).build(), + TargetState: metal3v1alpha1.StateInspecting, }, { - Scenario: "inspecting", - Host: host(metal3v1alpha1.StateInspecting).build(), + Scenario: "inspecting-to-matchprofile", + Host: host(metal3v1alpha1.StateInspecting).build(), + TargetState: metal3v1alpha1.StateMatchProfile, }, { - Scenario: "ready", - Host: host(metal3v1alpha1.StateReady).build(), - PreserveErrorCountOnComplete: true, + Scenario: "matchprofile-to-ready", + Host: host(metal3v1alpha1.StateMatchProfile).build(), + TargetState: metal3v1alpha1.StateReady, }, { - Scenario: "deprovisioning", - Host: host(metal3v1alpha1.StateDeprovisioning).build(), + Scenario: "provisioning-to-provisioned", + Host: host(metal3v1alpha1.StateProvisioning).build(), + TargetState: metal3v1alpha1.StateProvisioned, }, { - Scenario: "provisioning", - Host: host(metal3v1alpha1.StateProvisioning).SetImageURL("imageSpecUrl").build(), + Scenario: "deprovisioning-to-ready", + Host: 
host(metal3v1alpha1.StateDeprovisioning).build(), + TargetState: metal3v1alpha1.StateReady, }, { - Scenario: "externallyProvisioned", - Host: host(metal3v1alpha1.StateExternallyProvisioned).SetExternallyProvisioned().build(), - PreserveErrorCountOnComplete: true, + Scenario: "deprovisioning-to-deleting", + Host: host(metal3v1alpha1.StateDeprovisioning).setDeletion().build(), + TargetState: metal3v1alpha1.StateDeleting, }, } for _, tt := range tests { t.Run(tt.Scenario, func(t *testing.T) { - prov := &mockProvisioner{} + prov := newMockProvisioner() hsm := newHostStateMachine(tt.Host, &BareMetalHostReconciler{}, prov, true) info := makeDefaultReconcileInfo(tt.Host) info.host.Status.ErrorCount = 1 - prov.setNextResult(true) - result := hsm.ReconcileState(info) - - assert.Equal(t, 1, tt.Host.Status.ErrorCount) - assert.True(t, result.Dirty()) - - prov.setNextResult(false) hsm.ReconcileState(info) - if tt.PreserveErrorCountOnComplete { - assert.Equal(t, 1, tt.Host.Status.ErrorCount) - } else { - assert.Equal(t, 0, tt.Host.Status.ErrorCount) - } + + assert.Equal(t, tt.TargetState, info.host.Status.Provisioning.State) + assert.Equal(t, info.host.Status.ErrorCount, 0) }) } } @@ -303,13 +428,6 @@ func TestErrorClean(t *testing.T) { SetStatusError(metal3v1alpha1.OperationalStatusError, metal3v1alpha1.RegistrationError, "some error", 1). build(), }, - { - Scenario: "not-clean-after-provisioned-registration-error", - Host: host(metal3v1alpha1.StateInspecting). - SetStatusError(metal3v1alpha1.OperationalStatusError, metal3v1alpha1.ProvisionedRegistrationError, "some error", 1). - build(), - ExpectError: true, - }, { Scenario: "clean-after-creds-change", Host: host(metal3v1alpha1.StateReady). 
@@ -320,7 +438,7 @@ func TestErrorClean(t *testing.T) { } for _, tt := range tests { t.Run(tt.Scenario, func(t *testing.T) { - prov := &mockProvisioner{} + prov := newMockProvisioner() hsm := newHostStateMachine(tt.Host, &BareMetalHostReconciler{}, prov, true) info := makeDefaultReconcileInfo(tt.Host) @@ -331,10 +449,10 @@ func TestErrorClean(t *testing.T) { hsm.ReconcileState(info) if tt.ExpectError { - assert.Equal(t, tt.Host.Status.ErrorType, v1alpha1.ProvisionedRegistrationError) + assert.Equal(t, v1alpha1.ProvisionedRegistrationError, tt.Host.Status.ErrorType) assert.NotEmpty(t, tt.Host.Status.ErrorMessage) } else { - assert.Equal(t, tt.Host.Status.OperationalStatus, v1alpha1.OperationalStatusOK) + assert.Equal(t, v1alpha1.OperationalStatusOK, tt.Host.Status.OperationalStatus) assert.Empty(t, tt.Host.Status.ErrorType) assert.Empty(t, tt.Host.Status.ErrorMessage) } @@ -347,20 +465,36 @@ type hostBuilder struct { } func host(state metal3v1alpha1.ProvisioningState) *hostBuilder { + + creds := metal3v1alpha1.CredentialsStatus{ + Reference: &corev1.SecretReference{ + Name: "secretRefName", + Namespace: "secretNs", + }, + Version: "100", + } + return &hostBuilder{ metal3v1alpha1.BareMetalHost{ + Spec: v1alpha1.BareMetalHostSpec{ + Online: true, + Image: &v1alpha1.Image{ + URL: "not-empty", + }, + RootDeviceHints: &v1alpha1.RootDeviceHints{}, + }, Status: metal3v1alpha1.BareMetalHostStatus{ Provisioning: metal3v1alpha1.ProvisionStatus{ State: state, BootMode: v1alpha1.DefaultBootMode, - }, - GoodCredentials: metal3v1alpha1.CredentialsStatus{ - Reference: &corev1.SecretReference{ - Name: "secretRefName", - Namespace: "secretNs", + Image: v1alpha1.Image{ + URL: "", //needs provisioning }, - Version: "100", }, + GoodCredentials: creds, + TriedCredentials: creds, + OperationalStatus: metal3v1alpha1.OperationalStatusOK, + PoweredOn: true, }, }, } @@ -396,6 +530,32 @@ func (hb *hostBuilder) SetStatusError(opStatus metal3v1alpha1.OperationalStatus, return hb } +func (hb 
*hostBuilder) SetStatusImageURL(url string) *hostBuilder { + hb.Status.Provisioning.Image.URL = url + return hb +} + +func (hb *hostBuilder) SetStatusPoweredOn(status bool) *hostBuilder { + hb.Status.PoweredOn = status + return hb +} + +func (hb *hostBuilder) SetOnline(status bool) *hostBuilder { + hb.Spec.Online = status + return hb +} + +func (hb *hostBuilder) SetOperationalStatus(status metal3v1alpha1.OperationalStatus) *hostBuilder { + hb.Status.OperationalStatus = status + return hb +} + +func (hb *hostBuilder) setDeletion() *hostBuilder { + date := metav1.Date(2021, time.January, 18, 10, 18, 0, 0, time.UTC) + hb.DeletionTimestamp = &date + return hb +} + func makeDefaultReconcileInfo(host *metal3v1alpha1.BareMetalHost) *reconcileInfo { return &reconcileInfo{ log: logf.Log.WithName("controllers").WithName("BareMetalHost").WithName("host_state_machine"), @@ -411,30 +571,46 @@ func makeDefaultReconcileInfo(host *metal3v1alpha1.BareMetalHost) *reconcileInfo } } +func newMockProvisioner() *mockProvisioner { + return &mockProvisioner{ + hasProvisioningCapacity: true, + nextResults: make(map[string]provisioner.Result), + } +} + type mockProvisioner struct { - nextResult provisioner.Result + hasProvisioningCapacity bool + nextResults map[string]provisioner.Result } -func (m *mockProvisioner) setNextError(msg string) { - m.nextResult = provisioner.Result{ - ErrorMessage: msg, +func (m *mockProvisioner) getNextResultByMethod(name string) (result provisioner.Result) { + if value, ok := m.nextResults[name]; ok { + result = value } + return +} + +func (m *mockProvisioner) setHasProvisioningCapacity(hasCapacity bool) { + m.hasProvisioningCapacity = hasCapacity } -func (m *mockProvisioner) setNextResult(dirty bool) { - m.nextResult = provisioner.Result{ - Dirty: dirty, - ErrorMessage: "", +func (m *mockProvisioner) HasProvisioningCapacity() (result bool, err error) { + return m.hasProvisioningCapacity, nil +} + +func (m *mockProvisioner) setNextError(methodName, msg string) 
{ + m.nextResults[methodName] = provisioner.Result{ + ErrorMessage: msg, } } func (m *mockProvisioner) ValidateManagementAccess(credentialsChanged, force bool) (result provisioner.Result, provID string, err error) { - return m.nextResult, "", err + return m.getNextResultByMethod("ValidateManagementAccess"), "", err } func (m *mockProvisioner) InspectHardware(force bool) (result provisioner.Result, details *metal3v1alpha1.HardwareDetails, err error) { details = &metal3v1alpha1.HardwareDetails{} - return m.nextResult, details, err + return m.getNextResultByMethod("InspectHardware"), details, err } func (m *mockProvisioner) UpdateHardwareState() (hwState provisioner.HardwareState, err error) { @@ -442,27 +618,27 @@ func (m *mockProvisioner) UpdateHardwareState() (hwState provisioner.HardwareSta } func (m *mockProvisioner) Adopt(force bool) (result provisioner.Result, err error) { - return m.nextResult, err + return m.getNextResultByMethod("Adopt"), err } func (m *mockProvisioner) Provision(configData provisioner.HostConfigData) (result provisioner.Result, err error) { - return m.nextResult, err + return m.getNextResultByMethod("Provision"), err } func (m *mockProvisioner) Deprovision(force bool) (result provisioner.Result, err error) { - return m.nextResult, err + return m.getNextResultByMethod("Deprovision"), err } func (m *mockProvisioner) Delete() (result provisioner.Result, err error) { - return m.nextResult, err + return m.getNextResultByMethod("Delete"), err } func (m *mockProvisioner) PowerOn() (result provisioner.Result, err error) { - return m.nextResult, err + return m.getNextResultByMethod("PowerOn"), err } -func (m *mockProvisioner) PowerOff() (result provisioner.Result, err error) { - return m.nextResult, err +func (m *mockProvisioner) PowerOff(rebootMode metal3v1alpha1.RebootMode) (result provisioner.Result, err error) { + return m.getNextResultByMethod("PowerOff"), err } func (m *mockProvisioner) IsReady() (result bool, err error) { diff --git 
a/controllers/metal3.io/metrics.go b/controllers/metal3.io/metrics.go index 9e08387892..b016f2e68e 100644 --- a/controllers/metal3.io/metrics.go +++ b/controllers/metal3.io/metrics.go @@ -61,6 +61,10 @@ var hostConfigDataError = prometheus.NewCounterVec(prometheus.CounterOpts{ Name: "metal3_host_config_data_error_total", Help: "Number of times the operator has failed to retrieve host configuration data", }, []string{labelHostDataType}) +var delayedProvisioningHostCounters = prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "metal3_delayed_provisioning_total", + Help: "The number of times hosts have been delayed while provisioning due to a busy provisioner", +}, []string{labelHostNamespace, labelHostName}) var slowOperationBuckets = []float64{30, 90, 180, 360, 720, 1440} @@ -111,7 +115,8 @@ func init() { reconcileCounters, reconcileErrorCounter, actionFailureCounters, - powerChangeAttempts) + powerChangeAttempts, + delayedProvisioningHostCounters) for _, collector := range stateTime { metrics.Registry.MustRegister(collector) diff --git a/docs/configuration.md b/docs/configuration.md index 94bfacec36..a958bd984f 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -24,6 +24,12 @@ validation. It is highly recommend to not set it to True. `BMO_CONCURRENCY` -- The number of concurrent reconciles performed by the Operator. Default is 3. +`PROVISIONING_LIMIT` -- The desired maximum number of hosts that could be provisioned +simultaneously by the Operator. The Operator will try to enforce this limit, +but overflows could happen in case of slow provisioners and / or higher number of +concurrent reconciles. For such reasons, it is highly recommended to keep +BMO_CONCURRENCY value lower than the requested PROVISIONING_LIMIT. Default is 20. 
+ Kustomization Configuration --------------------------- diff --git a/pkg/provisioner/demo/demo.go b/pkg/provisioner/demo/demo.go index 606bb99aae..067468cfea 100644 --- a/pkg/provisioner/demo/demo.go +++ b/pkg/provisioner/demo/demo.go @@ -66,6 +66,10 @@ func New(host metal3v1alpha1.BareMetalHost, bmcCreds bmc.Credentials, publisher return p, nil } +func (m *demoProvisioner) HasProvisioningCapacity() (result bool, err error) { + return true, nil +} + // ValidateManagementAccess tests the connection information for the // host to verify that the location and credentials work. func (p *demoProvisioner) ValidateManagementAccess(credentialsChanged, force bool) (result provisioner.Result, provID string, err error) { @@ -283,7 +287,7 @@ func (p *demoProvisioner) PowerOn() (result provisioner.Result, err error) { // PowerOff ensures the server is powered off independently of any image // provisioning operation. -func (p *demoProvisioner) PowerOff() (result provisioner.Result, err error) { +func (p *demoProvisioner) PowerOff(rebootMode metal3v1alpha1.RebootMode) (result provisioner.Result, err error) { hostName := p.host.ObjectMeta.Name switch hostName { diff --git a/pkg/provisioner/empty/empty.go b/pkg/provisioner/empty/empty.go index 0fbc48802c..b46520c169 100644 --- a/pkg/provisioner/empty/empty.go +++ b/pkg/provisioner/empty/empty.go @@ -75,7 +75,7 @@ func (p *emptyProvisioner) PowerOn() (provisioner.Result, error) { // PowerOff ensures the server is powered off independently of any image // provisioning operation. 
-func (p *emptyProvisioner) PowerOff() (provisioner.Result, error) { +func (p *emptyProvisioner) PowerOff(rebootMode metal3v1alpha1.RebootMode) (provisioner.Result, error) { return provisioner.Result{}, nil } @@ -83,3 +83,7 @@ func (p *emptyProvisioner) PowerOff() (provisioner.Result, error) { func (p *emptyProvisioner) IsReady() (bool, error) { return true, nil } + +func (p *emptyProvisioner) HasProvisioningCapacity() (result bool, err error) { + return true, nil +} diff --git a/pkg/provisioner/fixture/fixture.go b/pkg/provisioner/fixture/fixture.go index 136264383e..cc2077ee65 100644 --- a/pkg/provisioner/fixture/fixture.go +++ b/pkg/provisioner/fixture/fixture.go @@ -21,6 +21,7 @@ type fixtureHostConfigData struct { metaData string } +// NewHostConfigData creates new host configuration data func NewHostConfigData(userData string, networkData string, metaData string) provisioner.HostConfigData { return &fixtureHostConfigData{ userData: userData, @@ -81,6 +82,10 @@ func (f *Fixture) New(host metal3v1alpha1.BareMetalHost, bmcCreds bmc.Credential return p, nil } +func (p *fixtureProvisioner) HasProvisioningCapacity() (result bool, err error) { + return true, nil +} + // ValidateManagementAccess tests the connection information for the // host to verify that the location and credentials work. func (p *fixtureProvisioner) ValidateManagementAccess(credentialsChanged, force bool) (result provisioner.Result, provID string, err error) { @@ -260,7 +265,7 @@ func (p *fixtureProvisioner) PowerOn() (result provisioner.Result, err error) { // PowerOff ensures the server is powered off independently of any image // provisioning operation. 
-func (p *fixtureProvisioner) PowerOff() (result provisioner.Result, err error) { +func (p *fixtureProvisioner) PowerOff(rebootMode metal3v1alpha1.RebootMode) (result provisioner.Result, err error) { p.log.Info("ensuring host is powered off") if p.state.poweredOn { diff --git a/pkg/provisioner/ironic/ironic.go b/pkg/provisioner/ironic/ironic.go index 2674774321..d755682cb3 100644 --- a/pkg/provisioner/ironic/ironic.go +++ b/pkg/provisioner/ironic/ironic.go @@ -3,6 +3,7 @@ package ironic import ( "fmt" "os" + "strconv" "strings" "time" @@ -39,6 +40,7 @@ var ( ironicInsecure bool ironicAuth clients.AuthConfig inspectorAuth clients.AuthConfig + maxProvisioningHosts int = 20 // Keep pointers to ironic and inspector clients configured with // the global auth settings to reuse the connection between @@ -99,6 +101,15 @@ func init() { if strings.ToLower(ironicInsecureStr) == "true" { ironicInsecure = true } + + if maxHostsStr := os.Getenv("PROVISIONING_LIMIT"); maxHostsStr != "" { + value, err := strconv.Atoi(maxHostsStr) + if err != nil { + fmt.Fprintf(os.Stderr, "Cannot start: Invalid value set for variable PROVISIONING_LIMIT=%s", maxHostsStr) + os.Exit(1) + } + maxProvisioningHosts = value + } } // Provisioner implements the provisioning.Provisioner interface @@ -1493,10 +1504,14 @@ func (p *ironicProvisioner) PowerOn() (result provisioner.Result, err error) { // PowerOff ensures the server is powered off independently of any image // provisioning operation. 
-func (p *ironicProvisioner) PowerOff() (result provisioner.Result, err error) { - p.log.Info("ensuring host is powered off") +func (p *ironicProvisioner) PowerOff(rebootMode metal3v1alpha1.RebootMode) (result provisioner.Result, err error) { + p.log.Info(fmt.Sprintf("ensuring host is powered off (mode: %s)", rebootMode)) - result, err = p.softPowerOff() + if rebootMode == metal3v1alpha1.RebootModeHard { + result, err = p.hardPowerOff() + } else { + result, err = p.softPowerOff() + } if err != nil { switch err.(type) { // In case of soft power off is unsupported or has failed, @@ -1579,3 +1594,52 @@ func (p *ironicProvisioner) IsReady() (result bool, err error) { checker := newIronicDependenciesChecker(p.client, p.inspector, p.log) return checker.IsReady() } + +func (p *ironicProvisioner) HasProvisioningCapacity() (result bool, err error) { + + hosts, err := p.loadProvisioningHosts() + if err != nil { + p.log.Error(err, "Unable to get hosts currently being provisioned") + return false, err + } + + // If the current host is already under processing then let's skip the test + if _, ok := hosts[p.host.Name]; ok { + return true, nil + } + + return len(hosts) < maxProvisioningHosts, nil +} + +func (p *ironicProvisioner) loadProvisioningHosts() (hosts map[string]struct{}, err error) { + + hosts = make(map[string]struct{}) + pager := nodes.List(p.client, nodes.ListOpts{ + Fields: []string{"uuid,name,provision_state,driver_internal_info,target_provision_state"}, + }) + if pager.Err != nil { + return nil, pager.Err + } + + page, err := pager.AllPages() + if err != nil { + return nil, err + } + + allNodes, err := nodes.ExtractNodes(page) + if err != nil { + return nil, err + } + + for _, node := range allNodes { + + switch nodes.ProvisionState(node.ProvisionState) { + case nodes.Cleaning, nodes.CleanWait, + nodes.Inspecting, nodes.InspectWait, + nodes.Deploying, nodes.DeployWait: + hosts[node.Name] = struct{}{} + } + } + + return hosts, nil +} diff --git 
a/pkg/provisioner/ironic/power_test.go b/pkg/provisioner/ironic/power_test.go index 8aeda72434..02ff369440 100644 --- a/pkg/provisioner/ironic/power_test.go +++ b/pkg/provisioner/ironic/power_test.go @@ -9,6 +9,7 @@ import ( "github.com/gophercloud/gophercloud/openstack/baremetalintrospection/v1/introspection" "github.com/stretchr/testify/assert" + metal3v1alpha1 "github.com/metal3-io/baremetal-operator/apis/metal3.io/v1alpha1" "github.com/metal3-io/baremetal-operator/pkg/bmc" "github.com/metal3-io/baremetal-operator/pkg/provisioner/ironic/clients" "github.com/metal3-io/baremetal-operator/pkg/provisioner/ironic/testserver" @@ -123,6 +124,7 @@ func TestPowerOff(t *testing.T) { expectedDirty bool expectedError bool expectedRequestAfter int + rebootMode metal3v1alpha1.RebootMode }{ { name: "node-already-power-off", @@ -150,6 +152,18 @@ func TestPowerOff(t *testing.T) { UUID: nodeUUID, }), expectedDirty: true, + rebootMode: metal3v1alpha1.RebootModeSoft, + }, + { + name: "power-off hard", + ironic: testserver.NewIronic(t).WithDefaultResponses().Node(nodes.Node{ + PowerState: powerOn, + TargetPowerState: powerOn, + TargetProvisionState: "", + UUID: nodeUUID, + }), + expectedDirty: true, + rebootMode: metal3v1alpha1.RebootModeHard, }, { name: "power-off wait for Provisioning state", @@ -199,7 +213,8 @@ func TestPowerOff(t *testing.T) { } prov.status.ID = nodeUUID - result, err := prov.PowerOff() + // We pass the RebootMode type here to define the reboot action + result, err := prov.PowerOff(tc.rebootMode) assert.Equal(t, tc.expectedDirty, result.Dirty) assert.Equal(t, time.Second*time.Duration(tc.expectedRequestAfter), result.RequeueAfter) diff --git a/pkg/provisioner/ironic/provisioncapacity_test.go b/pkg/provisioner/ironic/provisioncapacity_test.go new file mode 100644 index 0000000000..f37f0c2737 --- /dev/null +++ b/pkg/provisioner/ironic/provisioncapacity_test.go @@ -0,0 +1,102 @@ +package ironic + +import ( + "fmt" + "testing" + + 
"github.com/gophercloud/gophercloud/openstack/baremetal/v1/nodes" + "github.com/stretchr/testify/assert" + + "github.com/metal3-io/baremetal-operator/pkg/bmc" + "github.com/metal3-io/baremetal-operator/pkg/provisioner/ironic/clients" + "github.com/metal3-io/baremetal-operator/pkg/provisioner/ironic/testserver" +) + +func TestHasProvisioningCapacity(t *testing.T) { + + provisioningStates := []nodes.ProvisionState{nodes.Cleaning, nodes.CleanWait, nodes.Inspecting, nodes.InspectWait, nodes.Deploying, nodes.DeployWait} + + cases := []struct { + name string + provisioningLimit int + nodeStates []nodes.ProvisionState + hostName string + + expectedHasCapacity bool + expectedError string + }{ + { + name: "no-capacity", + provisioningLimit: 6, + nodeStates: provisioningStates, + + expectedHasCapacity: false, + }, + { + name: "enough-capacity", + provisioningLimit: 7, + nodeStates: provisioningStates, + + expectedHasCapacity: true, + }, + { + name: "ignore-check-if-already-provisioning", + provisioningLimit: 6, + nodeStates: provisioningStates, + hostName: "node-1", + + expectedHasCapacity: true, + }, + { + name: "enough-capacity-due-not-provisioning-states", + provisioningLimit: 1, + nodeStates: []nodes.ProvisionState{nodes.Active, nodes.AdoptFail, nodes.Adopting, nodes.Available, nodes.CleanFail}, + + expectedHasCapacity: true, + }, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + + allNodes := []nodes.Node{} + for n, state := range tc.nodeStates { + allNodes = append(allNodes, nodes.Node{ + Name: fmt.Sprintf("node-%d", n), + ProvisionState: string(state), + }) + } + + ironic := testserver.NewIronic(t).Nodes(allNodes).Start() + defer ironic.Stop() + + inspector := testserver.NewInspector(t).Start() + defer inspector.Stop() + + host := makeHost() + host.Name = tc.hostName + + auth := clients.AuthConfig{Type: clients.NoAuth} + + maxProvisioningHosts = tc.provisioningLimit + + prov, err := newProvisionerWithSettings(host, bmc.Credentials{}, 
nullEventPublisher, + ironic.Endpoint(), auth, inspector.Endpoint(), auth, + ) + if err != nil { + t.Fatalf("could not create provisioner: %s", err) + } + + result, err := prov.HasProvisioningCapacity() + + assert.Equal(t, tc.expectedHasCapacity, result) + + if tc.expectedError == "" { + assert.NoError(t, err) + } else { + assert.Error(t, err) + assert.Regexp(t, tc.expectedError, err.Error()) + } + }) + } +} diff --git a/pkg/provisioner/ironic/testserver/ironic.go b/pkg/provisioner/ironic/testserver/ironic.go index 6a3dea91e4..5e912e35df 100644 --- a/pkg/provisioner/ironic/testserver/ironic.go +++ b/pkg/provisioner/ironic/testserver/ironic.go @@ -254,3 +254,15 @@ func (m *IronicMock) Port(port ports.Port) *IronicMock { return m } + +// Nodes configure the server with a valid response for /v1/nodes +func (m *IronicMock) Nodes(allNodes []nodes.Node) *IronicMock { + resp := struct { + Nodes []nodes.Node `json:"nodes"` + }{ + Nodes: allNodes, + } + + m.ResponseJSON(m.buildURL("/v1/nodes", http.MethodGet), resp) + return m +} diff --git a/pkg/provisioner/ironic/testserver/server.go b/pkg/provisioner/ironic/testserver/server.go index b66a8de7f2..4e35f1ee59 100644 --- a/pkg/provisioner/ironic/testserver/server.go +++ b/pkg/provisioner/ironic/testserver/server.go @@ -70,12 +70,12 @@ func (m *MockServer) Endpoint() string { func (m *MockServer) logRequest(r *http.Request, response string) { m.t.Logf("%s: %s %s -> %s", m.name, r.Method, r.URL, response) - m.Requests += r.RequestURI + ";" + m.Requests += r.URL.Path + ";" bodyRaw, _ := ioutil.ReadAll(r.Body) m.FullRequests = append(m.FullRequests, simpleRequest{ - pattern: r.URL.String(), + pattern: r.URL.Path, method: r.Method, body: string(bodyRaw), }) @@ -93,8 +93,7 @@ func (m *MockServer) Handler(pattern string, handlerFunc http.HandlerFunc) *Mock func (m *MockServer) buildHandler(pattern string) func(http.ResponseWriter, *http.Request) { handler := func(w http.ResponseWriter, r *http.Request) { - - if response, ok := 
m.responsesByMethod[r.URL.String()][r.Method]; ok { + if response, ok := m.responsesByMethod[r.URL.Path][r.Method]; ok { m.sendData(w, r, response.code, response.payload) return } @@ -224,7 +223,7 @@ func (m *MockServer) AddDefaultResponse(patternWithVars string, httpMethod strin func (m *MockServer) defaultHandler(w http.ResponseWriter, r *http.Request) { - url := r.URL.String() + url := r.URL.Path method := r.Method for _, response := range m.defaultResponses { diff --git a/pkg/provisioner/provisioner.go b/pkg/provisioner/provisioner.go index 70c9b38027..198dfe6f1f 100644 --- a/pkg/provisioner/provisioner.go +++ b/pkg/provisioner/provisioner.go @@ -81,12 +81,16 @@ type Provisioner interface { PowerOn() (result Result, err error) // PowerOff ensures the server is powered off independently of any image - // provisioning operation. - PowerOff() (result Result, err error) + // provisioning operation. The rebootMode argument specifies whether + // a hard power off (RebootModeHard) or a soft power off (RebootModeSoft) is required. + PowerOff(rebootMode metal3v1alpha1.RebootMode) (result Result, err error) // IsReady checks if the provisioning backend is available to accept // all the incoming requests. IsReady() (result bool, err error) + + // HasProvisioningCapacity checks if the backend has a free provisioning slot for the current host + HasProvisioningCapacity() (result bool, err error) } // Result holds the response from a call in the Provsioner API. diff --git a/vendor/github.com/prometheus/client_golang/prometheus/testutil/lint.go b/vendor/github.com/prometheus/client_golang/prometheus/testutil/lint.go new file mode 100644 index 0000000000..7681877a89 --- /dev/null +++ b/vendor/github.com/prometheus/client_golang/prometheus/testutil/lint.go @@ -0,0 +1,46 @@ +// Copyright 2020 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package testutil + +import ( + "fmt" + + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/testutil/promlint" +) + +// CollectAndLint registers the provided Collector with a newly created pedantic +// Registry. It then calls GatherAndLint with that Registry and with the +// provided metricNames. +func CollectAndLint(c prometheus.Collector, metricNames ...string) ([]promlint.Problem, error) { + reg := prometheus.NewPedanticRegistry() + if err := reg.Register(c); err != nil { + return nil, fmt.Errorf("registering collector failed: %s", err) + } + return GatherAndLint(reg, metricNames...) +} + +// GatherAndLint gathers all metrics from the provided Gatherer and checks them +// with the linter in the promlint package. If any metricNames are provided, +// only metrics with those names are checked. 
+func GatherAndLint(g prometheus.Gatherer, metricNames ...string) ([]promlint.Problem, error) { + got, err := g.Gather() + if err != nil { + return nil, fmt.Errorf("gathering metrics failed: %s", err) + } + if metricNames != nil { + got = filterMetrics(got, metricNames) + } + return promlint.NewWithMetricFamilies(got).Lint() +} diff --git a/vendor/github.com/prometheus/client_golang/prometheus/testutil/promlint/promlint.go b/vendor/github.com/prometheus/client_golang/prometheus/testutil/promlint/promlint.go new file mode 100644 index 0000000000..ec80617062 --- /dev/null +++ b/vendor/github.com/prometheus/client_golang/prometheus/testutil/promlint/promlint.go @@ -0,0 +1,386 @@ +// Copyright 2020 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package promlint provides a linter for Prometheus metrics. +package promlint + +import ( + "fmt" + "io" + "regexp" + "sort" + "strings" + + "github.com/prometheus/common/expfmt" + + dto "github.com/prometheus/client_model/go" +) + +// A Linter is a Prometheus metrics linter. It identifies issues with metric +// names, types, and metadata, and reports them to the caller. +type Linter struct { + // The linter will read metrics in the Prometheus text format from r and + // then lint it, _and_ it will lint the metrics provided directly as + // MetricFamily proto messages in mfs. Note, however, that the current + // constructor functions New and NewWithMetricFamilies only ever set one + // of them. 
+ r io.Reader + mfs []*dto.MetricFamily +} + +// A Problem is an issue detected by a Linter. +type Problem struct { + // The name of the metric indicated by this Problem. + Metric string + + // A description of the issue for this Problem. + Text string +} + +// newProblem is helper function to create a Problem. +func newProblem(mf *dto.MetricFamily, text string) Problem { + return Problem{ + Metric: mf.GetName(), + Text: text, + } +} + +// New creates a new Linter that reads an input stream of Prometheus metrics in +// the Prometheus text exposition format. +func New(r io.Reader) *Linter { + return &Linter{ + r: r, + } +} + +// NewWithMetricFamilies creates a new Linter that reads from a slice of +// MetricFamily protobuf messages. +func NewWithMetricFamilies(mfs []*dto.MetricFamily) *Linter { + return &Linter{ + mfs: mfs, + } +} + +// Lint performs a linting pass, returning a slice of Problems indicating any +// issues found in the metrics stream. The slice is sorted by metric name +// and issue description. +func (l *Linter) Lint() ([]Problem, error) { + var problems []Problem + + if l.r != nil { + d := expfmt.NewDecoder(l.r, expfmt.FmtText) + + mf := &dto.MetricFamily{} + for { + if err := d.Decode(mf); err != nil { + if err == io.EOF { + break + } + + return nil, err + } + + problems = append(problems, lint(mf)...) + } + } + for _, mf := range l.mfs { + problems = append(problems, lint(mf)...) + } + + // Ensure deterministic output. + sort.SliceStable(problems, func(i, j int) bool { + if problems[i].Metric == problems[j].Metric { + return problems[i].Text < problems[j].Text + } + return problems[i].Metric < problems[j].Metric + }) + + return problems, nil +} + +// lint is the entry point for linting a single metric. 
+func lint(mf *dto.MetricFamily) []Problem { + fns := []func(mf *dto.MetricFamily) []Problem{ + lintHelp, + lintMetricUnits, + lintCounter, + lintHistogramSummaryReserved, + lintMetricTypeInName, + lintReservedChars, + lintCamelCase, + lintUnitAbbreviations, + } + + var problems []Problem + for _, fn := range fns { + problems = append(problems, fn(mf)...) + } + + // TODO(mdlayher): lint rules for specific metrics types. + return problems +} + +// lintHelp detects issues related to the help text for a metric. +func lintHelp(mf *dto.MetricFamily) []Problem { + var problems []Problem + + // Expect all metrics to have help text available. + if mf.Help == nil { + problems = append(problems, newProblem(mf, "no help text")) + } + + return problems +} + +// lintMetricUnits detects issues with metric unit names. +func lintMetricUnits(mf *dto.MetricFamily) []Problem { + var problems []Problem + + unit, base, ok := metricUnits(*mf.Name) + if !ok { + // No known units detected. + return nil + } + + // Unit is already a base unit. + if unit == base { + return nil + } + + problems = append(problems, newProblem(mf, fmt.Sprintf("use base unit %q instead of %q", base, unit))) + + return problems +} + +// lintCounter detects issues specific to counters, as well as patterns that should +// only be used with counters. 
+func lintCounter(mf *dto.MetricFamily) []Problem { + var problems []Problem + + isCounter := mf.GetType() == dto.MetricType_COUNTER + isUntyped := mf.GetType() == dto.MetricType_UNTYPED + hasTotalSuffix := strings.HasSuffix(mf.GetName(), "_total") + + switch { + case isCounter && !hasTotalSuffix: + problems = append(problems, newProblem(mf, `counter metrics should have "_total" suffix`)) + case !isUntyped && !isCounter && hasTotalSuffix: + problems = append(problems, newProblem(mf, `non-counter metrics should not have "_total" suffix`)) + } + + return problems +} + +// lintHistogramSummaryReserved detects when other types of metrics use names or labels +// reserved for use by histograms and/or summaries. +func lintHistogramSummaryReserved(mf *dto.MetricFamily) []Problem { + // These rules do not apply to untyped metrics. + t := mf.GetType() + if t == dto.MetricType_UNTYPED { + return nil + } + + var problems []Problem + + isHistogram := t == dto.MetricType_HISTOGRAM + isSummary := t == dto.MetricType_SUMMARY + + n := mf.GetName() + + if !isHistogram && strings.HasSuffix(n, "_bucket") { + problems = append(problems, newProblem(mf, `non-histogram metrics should not have "_bucket" suffix`)) + } + if !isHistogram && !isSummary && strings.HasSuffix(n, "_count") { + problems = append(problems, newProblem(mf, `non-histogram and non-summary metrics should not have "_count" suffix`)) + } + if !isHistogram && !isSummary && strings.HasSuffix(n, "_sum") { + problems = append(problems, newProblem(mf, `non-histogram and non-summary metrics should not have "_sum" suffix`)) + } + + for _, m := range mf.GetMetric() { + for _, l := range m.GetLabel() { + ln := l.GetName() + + if !isHistogram && ln == "le" { + problems = append(problems, newProblem(mf, `non-histogram metrics should not have "le" label`)) + } + if !isSummary && ln == "quantile" { + problems = append(problems, newProblem(mf, `non-summary metrics should not have "quantile" label`)) + } + } + } + + return problems +} + 
+// lintMetricTypeInName detects when metric types are included in the metric name. +func lintMetricTypeInName(mf *dto.MetricFamily) []Problem { + var problems []Problem + n := strings.ToLower(mf.GetName()) + + for i, t := range dto.MetricType_name { + if i == int32(dto.MetricType_UNTYPED) { + continue + } + + typename := strings.ToLower(t) + if strings.Contains(n, "_"+typename+"_") || strings.HasSuffix(n, "_"+typename) { + problems = append(problems, newProblem(mf, fmt.Sprintf(`metric name should not include type '%s'`, typename))) + } + } + return problems +} + +// lintReservedChars detects colons in metric names. +func lintReservedChars(mf *dto.MetricFamily) []Problem { + var problems []Problem + if strings.Contains(mf.GetName(), ":") { + problems = append(problems, newProblem(mf, "metric names should not contain ':'")) + } + return problems +} + +var camelCase = regexp.MustCompile(`[a-z][A-Z]`) + +// lintCamelCase detects metric names and label names written in camelCase. +func lintCamelCase(mf *dto.MetricFamily) []Problem { + var problems []Problem + if camelCase.FindString(mf.GetName()) != "" { + problems = append(problems, newProblem(mf, "metric names should be written in 'snake_case' not 'camelCase'")) + } + + for _, m := range mf.GetMetric() { + for _, l := range m.GetLabel() { + if camelCase.FindString(l.GetName()) != "" { + problems = append(problems, newProblem(mf, "label names should be written in 'snake_case' not 'camelCase'")) + } + } + } + return problems +} + +// lintUnitAbbreviations detects abbreviated units in the metric name. 
func lintUnitAbbreviations(mf *dto.MetricFamily) []Problem {
	var problems []Problem
	n := strings.ToLower(mf.GetName())
	// Flag an abbreviation only when it appears as a whole underscore-delimited
	// word ("_ms_") or as the final suffix ("_ms") of the name.
	for _, s := range unitAbbreviations {
		if strings.Contains(n, "_"+s+"_") || strings.HasSuffix(n, "_"+s) {
			problems = append(problems, newProblem(mf, "metric names should not contain abbreviated units"))
		}
	}
	return problems
}

// metricUnits attempts to detect known unit types used as part of a metric name,
// e.g. "foo_bytes_total" or "bar_baz_milligrams".
// It returns the matched (possibly prefixed) unit, its base unit, and whether a
// match was found.
func metricUnits(m string) (unit string, base string, ok bool) {
	ss := strings.Split(m, "_")

	for unit, base := range units {
		// Also check for "no prefix".
		for _, p := range append(unitPrefixes, "") {
			for _, s := range ss {
				// Attempt to explicitly match a known unit with a known prefix,
				// as some words may look like "units" when matching suffix.
				//
				// As an example, "thermometers" should not match "meters", but
				// "kilometers" should.
				if s == p+unit {
					return p + unit, base, true
				}
			}
		}
	}

	return "", "", false
}

// Units and their possible prefixes recognized by this library. More can be
// added over time as needed.
var (
	// map a unit to the appropriate base unit.
	units = map[string]string{
		// Base units.
		"amperes": "amperes",
		"bytes":   "bytes",
		"celsius": "celsius", // Also allow Celsius because it is common in typical Prometheus use cases.
		"grams":   "grams",
		"joules":  "joules",
		"kelvin":  "kelvin", // SI base unit, used in special cases (e.g. color temperature, scientific measurements).
		"meters":  "meters", // Both American and international spelling permitted.
		"metres":  "metres",
		"seconds": "seconds",
		"volts":   "volts",

		// Non base units.
		// Time.
		"minutes": "seconds",
		"hours":   "seconds",
		"days":    "seconds",
		"weeks":   "seconds",
		// Temperature.
		"kelvins":    "kelvin",
		"fahrenheit": "celsius",
		"rankine":    "celsius",
		// Length.
		"inches": "meters",
		"yards":  "meters",
		"miles":  "meters",
		// Bytes.
		"bits": "bytes",
		// Energy.
		"calories": "joules",
		// Mass.
		"pounds": "grams",
		"ounces": "grams",
	}

	unitPrefixes = []string{
		"pico",
		"nano",
		"micro",
		"milli",
		"centi",
		"deci",
		"deca",
		"hecto",
		"kilo",
		"kibi",
		"mega",
		"mibi",
		"giga",
		"gibi",
		"tera",
		"tebi",
		"peta",
		"pebi",
	}

	// Common abbreviations that we'd like to discourage.
	unitAbbreviations = []string{
		"s",
		"ms",
		"us",
		"ns",
		"sec",
		"b",
		"kb",
		"mb",
		"gb",
		"tb",
		"pb",
		"m",
		"h",
		"d",
	}
)
diff --git a/vendor/github.com/prometheus/client_golang/prometheus/testutil/testutil.go b/vendor/github.com/prometheus/client_golang/prometheus/testutil/testutil.go
new file mode 100644
index 0000000000..9af60ce1d2
--- /dev/null
+++ b/vendor/github.com/prometheus/client_golang/prometheus/testutil/testutil.go
@@ -0,0 +1,230 @@
+// Copyright 2018 The Prometheus Authors
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package testutil provides helpers to test code using the prometheus package
+// of client_golang.
+//
+// While writing unit tests to verify correct instrumentation of your code, it's
+// a common mistake to mostly test the instrumentation library instead of your
+// own code.
Rather than verifying that a prometheus.Counter's value has changed +// as expected or that it shows up in the exposition after registration, it is +// in general more robust and more faithful to the concept of unit tests to use +// mock implementations of the prometheus.Counter and prometheus.Registerer +// interfaces that simply assert that the Add or Register methods have been +// called with the expected arguments. However, this might be overkill in simple +// scenarios. The ToFloat64 function is provided for simple inspection of a +// single-value metric, but it has to be used with caution. +// +// End-to-end tests to verify all or larger parts of the metrics exposition can +// be implemented with the CollectAndCompare or GatherAndCompare functions. The +// most appropriate use is not so much testing instrumentation of your code, but +// testing custom prometheus.Collector implementations and in particular whole +// exporters, i.e. programs that retrieve telemetry data from a 3rd party source +// and convert it into Prometheus metrics. +// +// In a similar pattern, CollectAndLint and GatherAndLint can be used to detect +// metrics that have issues with their name, type, or metadata without being +// necessarily invalid, e.g. a counter with a name missing the “_total” suffix. +package testutil + +import ( + "bytes" + "fmt" + "io" + + "github.com/prometheus/common/expfmt" + + dto "github.com/prometheus/client_model/go" + + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/internal" +) + +// ToFloat64 collects all Metrics from the provided Collector. It expects that +// this results in exactly one Metric being collected, which must be a Gauge, +// Counter, or Untyped. In all other cases, ToFloat64 panics. ToFloat64 returns +// the value of the collected Metric. 
+// +// The Collector provided is typically a simple instance of Gauge or Counter, or +// – less commonly – a GaugeVec or CounterVec with exactly one element. But any +// Collector fulfilling the prerequisites described above will do. +// +// Use this function with caution. It is computationally very expensive and thus +// not suited at all to read values from Metrics in regular code. This is really +// only for testing purposes, and even for testing, other approaches are often +// more appropriate (see this package's documentation). +// +// A clear anti-pattern would be to use a metric type from the prometheus +// package to track values that are also needed for something else than the +// exposition of Prometheus metrics. For example, you would like to track the +// number of items in a queue because your code should reject queuing further +// items if a certain limit is reached. It is tempting to track the number of +// items in a prometheus.Gauge, as it is then easily available as a metric for +// exposition, too. However, then you would need to call ToFloat64 in your +// regular code, potentially quite often. The recommended way is to track the +// number of items conventionally (in the way you would have done it without +// considering Prometheus metrics) and then expose the number with a +// prometheus.GaugeFunc. 
func ToFloat64(c prometheus.Collector) float64 {
	var (
		m      prometheus.Metric
		mCount int
		mChan  = make(chan prometheus.Metric)
		done   = make(chan struct{})
	)

	// Drain mChan in a separate goroutine, remembering the last metric seen
	// and how many were collected in total; done signals the drain finished.
	go func() {
		for m = range mChan {
			mCount++
		}
		close(done)
	}()

	c.Collect(mChan)
	close(mChan)
	<-done

	if mCount != 1 {
		panic(fmt.Errorf("collected %d metrics instead of exactly 1", mCount))
	}

	pb := &dto.Metric{}
	// NOTE(review): the error from Write is ignored here; an unusable metric
	// ends up caught by the panic below instead — confirm against upstream.
	m.Write(pb)
	if pb.Gauge != nil {
		return pb.Gauge.GetValue()
	}
	if pb.Counter != nil {
		return pb.Counter.GetValue()
	}
	if pb.Untyped != nil {
		return pb.Untyped.GetValue()
	}
	panic(fmt.Errorf("collected a non-gauge/counter/untyped metric: %s", pb))
}

// CollectAndCount registers the provided Collector with a newly created
// pedantic Registry. It then calls GatherAndCount with that Registry and with
// the provided metricNames. In the unlikely case that the registration or the
// gathering fails, this function panics. (This is inconsistent with the other
// CollectAnd… functions in this package and has historical reasons. Changing
// the function signature would be a breaking change and will therefore only
// happen with the next major version bump.)
func CollectAndCount(c prometheus.Collector, metricNames ...string) int {
	reg := prometheus.NewPedanticRegistry()
	if err := reg.Register(c); err != nil {
		panic(fmt.Errorf("registering collector failed: %s", err))
	}
	result, err := GatherAndCount(reg, metricNames...)
	if err != nil {
		panic(err)
	}
	return result
}

// GatherAndCount gathers all metrics from the provided Gatherer and counts
// them. It returns the number of metric children in all gathered metric
// families together. If any metricNames are provided, only metrics with those
// names are counted.
func GatherAndCount(g prometheus.Gatherer, metricNames ...string) (int, error) {
	got, err := g.Gather()
	if err != nil {
		return 0, fmt.Errorf("gathering metrics failed: %s", err)
	}
	if metricNames != nil {
		got = filterMetrics(got, metricNames)
	}

	// Sum the number of metric children across every gathered family.
	result := 0
	for _, mf := range got {
		result += len(mf.GetMetric())
	}
	return result, nil
}

// CollectAndCompare registers the provided Collector with a newly created
// pedantic Registry. It then calls GatherAndCompare with that Registry and with
// the provided metricNames.
func CollectAndCompare(c prometheus.Collector, expected io.Reader, metricNames ...string) error {
	reg := prometheus.NewPedanticRegistry()
	if err := reg.Register(c); err != nil {
		return fmt.Errorf("registering collector failed: %s", err)
	}
	return GatherAndCompare(reg, expected, metricNames...)
}

// GatherAndCompare gathers all metrics from the provided Gatherer and compares
// it to an expected output read from the provided Reader in the Prometheus text
// exposition format. If any metricNames are provided, only metrics with those
// names are compared.
func GatherAndCompare(g prometheus.Gatherer, expected io.Reader, metricNames ...string) error {
	got, err := g.Gather()
	if err != nil {
		return fmt.Errorf("gathering metrics failed: %s", err)
	}
	if metricNames != nil {
		got = filterMetrics(got, metricNames)
	}
	var tp expfmt.TextParser
	wantRaw, err := tp.TextToMetricFamilies(expected)
	if err != nil {
		return fmt.Errorf("parsing expected metrics failed: %s", err)
	}
	// Normalize the parsed expectation so ordering matches the (already
	// sorted) gathered output before comparing.
	want := internal.NormalizeMetricFamilies(wantRaw)

	return compare(got, want)
}

// compare encodes both provided slices of metric families into the text format,
// compares their string message, and returns an error if they do not match.
// The error contains the encoded text of both the desired and the actual
// result.
func compare(got, want []*dto.MetricFamily) error {
	var gotBuf, wantBuf bytes.Buffer
	// Encode both sides to the text exposition format so the comparison (and
	// any mismatch report) is a plain string diff.
	enc := expfmt.NewEncoder(&gotBuf, expfmt.FmtText)
	for _, mf := range got {
		if err := enc.Encode(mf); err != nil {
			return fmt.Errorf("encoding gathered metrics failed: %s", err)
		}
	}
	enc = expfmt.NewEncoder(&wantBuf, expfmt.FmtText)
	for _, mf := range want {
		if err := enc.Encode(mf); err != nil {
			return fmt.Errorf("encoding expected metrics failed: %s", err)
		}
	}

	if wantBuf.String() != gotBuf.String() {
		return fmt.Errorf(`
metric output does not match expectation; want:

%s
got:

%s`, wantBuf.String(), gotBuf.String())

	}
	return nil
}

// filterMetrics returns only those metric families whose name appears in
// names, preserving the original order of metrics.
func filterMetrics(metrics []*dto.MetricFamily, names []string) []*dto.MetricFamily {
	var filtered []*dto.MetricFamily
	for _, m := range metrics {
		for _, name := range names {
			if m.GetName() == name {
				filtered = append(filtered, m)
				break
			}
		}
	}
	return filtered
}
diff --git a/vendor/modules.txt b/vendor/modules.txt
index 4ef7bafcf3..549e4a862b 100644
--- a/vendor/modules.txt
+++ b/vendor/modules.txt
@@ -340,6 +340,8 @@ github.com/posener/script
 github.com/prometheus/client_golang/prometheus
 github.com/prometheus/client_golang/prometheus/internal
 github.com/prometheus/client_golang/prometheus/promhttp
+github.com/prometheus/client_golang/prometheus/testutil
+github.com/prometheus/client_golang/prometheus/testutil/promlint
 # github.com/prometheus/client_model v0.2.0
 github.com/prometheus/client_model/go
 # github.com/prometheus/common v0.10.0