Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 15 additions & 1 deletion apis/metal3.io/v1alpha1/baremetalhost_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,8 @@ const (
// OperationalStatusError is the status value for when the host
// has any sort of error.
OperationalStatusError OperationalStatus = "error"

// OperationalStatusDelayed is the status value for when an action on
// the host has been delayed and will be retried later.
OperationalStatusDelayed OperationalStatus = "delayed"
)

// ErrorType indicates the class of problem that has caused the Host resource
Expand Down Expand Up @@ -470,6 +472,18 @@ type CredentialsStatus struct {
Version string `json:"credentialsVersion,omitempty"`
}

// RebootMode defines known variations of reboot modes.
type RebootMode string

const (
// RebootModeHard is the "hard" reboot mode. When several reboot
// annotations are present on a host, a hard request takes precedence.
RebootModeHard RebootMode = "hard"
// RebootModeSoft is the "soft" reboot mode. It is the mode assumed
// when a reboot annotation has no value or its value cannot be parsed.
RebootModeSoft RebootMode = "soft"
)

// RebootAnnotationArguments defines the arguments of a reboot
// annotation, carried as JSON in the annotation's value.
type RebootAnnotationArguments struct {
// Mode is the requested reboot mode, read from the "mode" JSON key.
Mode RebootMode `json:"mode"`
}

// Match compares the saved status information with the name and
// content of a secret object.
func (cs CredentialsStatus) Match(secret corev1.Secret) bool {
Expand Down Expand Up @@ -519,7 +533,7 @@ type BareMetalHostStatus struct {
// after modifying this file

// OperationalStatus holds the status of the host
// +kubebuilder:validation:Enum="";OK;discovered;error
// +kubebuilder:validation:Enum="";OK;discovered;error;delayed
OperationalStatus OperationalStatus `json:"operationalStatus"`

// ErrorType indicates the type of failure encountered when the
Expand Down
15 changes: 15 additions & 0 deletions apis/metal3.io/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions config/crd/bases/metal3.io_baremetalhosts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -524,6 +524,7 @@ spec:
- OK
- discovered
- error
- delayed
type: string
poweredOn:
description: indicator for whether or not the host is powered on
Expand Down
1 change: 1 addition & 0 deletions config/render/capm3.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -522,6 +522,7 @@ spec:
- OK
- discovered
- error
- delayed
type: string
poweredOn:
description: indicator for whether or not the host is powered on
Expand Down
28 changes: 26 additions & 2 deletions controllers/metal3.io/action_result.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,9 @@ import (
"github.com/metal3-io/baremetal-operator/pkg/provisioner"
)

const maxBackOffCount = 10
// This is an upper limit for the ErrorCount, so that the max backoff
// timeout will not exceed (roughly) 8 hours
const maxBackOffCount = 9

func init() {
rand.Seed(time.Now().UTC().UnixNano())
Expand Down Expand Up @@ -54,6 +56,18 @@ func (r actionUpdate) Dirty() bool {
return true
}

// actionDelayed behaves like an actionUpdate, except that the requeue
// time is computed by calculateBackoff with a fixed error count of 1,
// i.e. a fixed backoff with jitter.
type actionDelayed struct {
	actionUpdate
}

// Result asks for the host to be requeued after the fixed backoff
// interval; it never reports an error.
func (r actionDelayed) Result() (result reconcile.Result, err error) {
	return reconcile.Result{
		Requeue:      true,
		RequeueAfter: calculateBackoff(1),
	}, nil
}

// actionComplete is a result indicating that the current action has completed,
// and that the resource should transition to the next state.
type actionComplete struct {
Expand Down Expand Up @@ -110,6 +124,16 @@ type actionFailed struct {
errorCount int
}

// Distribution sample for errorCount values:
// 1 [1m, 2m]
// 2 [2m, 4m]
// 3 [4m, 8m]
// 4 [8m, 16m]
// 5 [16m, 32m]
// 6 [32m, 1h4m]
// 7 [1h4m, 2h8m]
// 8 [2h8m, 4h16m]
// 9 [4h16m, 8h32m]
func calculateBackoff(errorCount int) time.Duration {

if errorCount > maxBackOffCount {
Expand All @@ -119,7 +143,7 @@ func calculateBackoff(errorCount int) time.Duration {
base := math.Exp2(float64(errorCount))
/* #nosec */
backOff := base - (rand.Float64() * base * 0.5)
backOffDuration := time.Minute * time.Duration(backOff)
backOffDuration := time.Duration(float64(time.Minute) * backOff)
return backOffDuration
}

Expand Down
2 changes: 1 addition & 1 deletion controllers/metal3.io/action_result_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ import (
func TestBackoffIncrements(t *testing.T) {

var backOff time.Duration
for i := 0; i < maxBackOffCount; i++ {
for i := 1; i <= maxBackOffCount; i++ {
prev := backOff
backOff = calculateBackoff(i)

Expand Down
73 changes: 63 additions & 10 deletions controllers/metal3.io/baremetalhost_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,11 +51,9 @@ const (
unmanagedRetryDelay = time.Minute * 10
provisionerNotReadyRetryDelay = time.Second * 30
rebootAnnotationPrefix = "reboot.metal3.io"
inspectAnnotationPrefix = "inspect.metal3.io"
)

func init() {
}

// BareMetalHostReconciler reconciles a BareMetalHost object
type BareMetalHostReconciler struct {
client.Client
Expand Down Expand Up @@ -214,7 +212,6 @@ func (r *BareMetalHostReconciler) Reconcile(request ctrl.Request) (result ctrl.R
}

ready, err := prov.IsReady()

if err != nil {
return ctrl.Result{}, errors.Wrap(err, "failed to check services availability")
}
Expand Down Expand Up @@ -295,6 +292,15 @@ func recordActionFailure(info *reconcileInfo, errorType metal3v1alpha1.ErrorType
return actionFailed{dirty: true, ErrorType: errorType, errorCount: info.host.Status.ErrorCount}
}

// recordActionDelayed flags the host as delayed: it registers an
// increment of the delayed-provisioning counter for this request as a
// post-save callback, sets the host's operational status to
// OperationalStatusDelayed, and returns an actionDelayed result.
func recordActionDelayed(info *reconcileInfo) actionResult {
	info.postSaveCallbacks = append(
		info.postSaveCallbacks,
		delayedProvisioningHostCounters.With(hostMetricLabels(info.request)).Inc,
	)
	info.host.SetOperationalStatus(metal3v1alpha1.OperationalStatusDelayed)
	return actionDelayed{}
}

func (r *BareMetalHostReconciler) credentialsErrorResult(err error, request ctrl.Request, host *metal3v1alpha1.BareMetalHost) (ctrl.Result, error) {
switch err.(type) {
// In the event a credential secret is defined, but we cannot find it
Expand Down Expand Up @@ -333,13 +339,40 @@ func (r *BareMetalHostReconciler) credentialsErrorResult(err error, request ctrl
}

// hasRebootAnnotation checks for the existence of reboot annotations. It returns true if at least one exists, along with the requested reboot mode (a hard request takes precedence over soft).
func hasRebootAnnotation(host *metal3v1alpha1.BareMetalHost) bool {
for annotation := range host.Annotations {
func hasRebootAnnotation(info *reconcileInfo) (hasReboot bool, rebootMode metal3v1alpha1.RebootMode) {
rebootMode = metal3v1alpha1.RebootModeSoft

for annotation, value := range info.host.GetAnnotations() {
if isRebootAnnotation(annotation) {
return true
hasReboot = true
newRebootMode := getRebootMode(value, info)
// If any annotation has asked for a hard reboot, that
// mode takes precedence.
if newRebootMode == metal3v1alpha1.RebootModeHard {
rebootMode = newRebootMode
}
// Don't use a break here as we may have multiple clients setting
// reboot annotations and we always want hard requests honoured
}
}
return false
return
}

// getRebootMode extracts the reboot mode from the value of a reboot
// annotation. An empty value, or a value that cannot be unmarshalled
// into RebootAnnotationArguments, yields RebootModeSoft (the latter case
// also publishes an InvalidAnnotationValue event). Otherwise the decoded
// "mode" field is returned as-is.
func getRebootMode(annotation string, info *reconcileInfo) metal3v1alpha1.RebootMode {
	if len(annotation) == 0 {
		info.log.Info("No reboot annotation value specified, assuming soft-reboot.")
		return metal3v1alpha1.RebootModeSoft
	}

	var args metal3v1alpha1.RebootAnnotationArguments
	if parseErr := json.Unmarshal([]byte(annotation), &args); parseErr != nil {
		info.publishEvent("InvalidAnnotationValue", fmt.Sprintf("could not parse reboot annotation (%s) - invalid json, assuming soft-reboot", annotation))
		info.log.Info(fmt.Sprintf("Could not parse reboot annotation (%q) - invalid json, assuming soft-reboot", annotation))
		return metal3v1alpha1.RebootModeSoft
	}

	// The decoded mode is not validated here; callers decide how to
	// treat values other than the known modes.
	return args.Mode
}

// isRebootAnnotation returns true if the provided annotation is a reboot annotation (either suffixed or not)
Expand All @@ -359,6 +392,16 @@ func clearRebootAnnotations(host *metal3v1alpha1.BareMetalHost) (dirty bool) {
return
}

// inspectionDisabled reports whether the host carries the
// inspect.metal3.io annotation with the value "disabled", in which case
// inspection is skipped even in the Inspecting state.
func inspectionDisabled(host *metal3v1alpha1.BareMetalHost) bool {
	return host.GetAnnotations()[inspectAnnotationPrefix] == "disabled"
}

// clearError removes any existing error message.
func clearError(host *metal3v1alpha1.BareMetalHost) (dirty bool) {
dirty = host.SetOperationalStatus(metal3v1alpha1.OperationalStatusOK)
Expand Down Expand Up @@ -491,6 +534,13 @@ func (r *BareMetalHostReconciler) registerHost(prov provisioner.Provisioner, inf

// Ensure we have the information about the hardware on the host.
func (r *BareMetalHostReconciler) actionInspecting(prov provisioner.Provisioner, info *reconcileInfo) actionResult {

if inspectionDisabled(info.host) {
info.log.Info("inspection disabled by annotation")
info.publishEvent("InspectionSkipped", "disabled by annotation")
return actionComplete{}
}

info.log.Info("inspecting hardware")

provResult, details, err := prov.InspectHardware(info.host.Status.ErrorType == metal3v1alpha1.InspectionError)
Expand Down Expand Up @@ -701,7 +751,9 @@ func (r *BareMetalHostReconciler) manageHostPower(prov provisioner.Provisioner,

provState := info.host.Status.Provisioning.State
isProvisioned := provState == metal3v1alpha1.StateProvisioned || provState == metal3v1alpha1.StateExternallyProvisioned
if hasRebootAnnotation(info.host) && isProvisioned {

desiredReboot, desiredRebootMode := hasRebootAnnotation(info)
if desiredReboot && isProvisioned {
desiredPowerOnState = false
}

Expand All @@ -716,12 +768,13 @@ func (r *BareMetalHostReconciler) manageHostPower(prov provisioner.Provisioner,
info.log.Info("power state change needed",
"expected", desiredPowerOnState,
"actual", info.host.Status.PoweredOn,
"reboot mode", desiredRebootMode,
"reboot process", desiredPowerOnState != info.host.Spec.Online)

if desiredPowerOnState {
provResult, err = prov.PowerOn()
} else {
provResult, err = prov.PowerOff()
provResult, err = prov.PowerOff(desiredRebootMode)
}
if err != nil {
return actionError{errors.Wrap(err, "failed to manage power state of host")}
Expand Down
Loading