-
Notifications
You must be signed in to change notification settings - Fork 310
Better handling of power management errors #841
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
5b37ea1
3684785
85c0526
82a2492
e4614b3
67a27dc
b86a290
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1519,7 +1519,7 @@ func (p *ironicProvisioner) changePower(ironicNode *nodes.Node, target nodes.Tar | |
|
|
||
| // PowerOn ensures the server is powered on independently of any image | ||
| // provisioning operation. | ||
| func (p *ironicProvisioner) PowerOn() (result provisioner.Result, err error) { | ||
| func (p *ironicProvisioner) PowerOn(force bool) (result provisioner.Result, err error) { | ||
| p.log.Info("ensuring host is powered on") | ||
|
|
||
| ironicNode, err := p.getNode() | ||
|
|
@@ -1535,46 +1535,39 @@ func (p *ironicProvisioner) PowerOn() (result provisioner.Result, err error) { | |
| p.log.Info("waiting for power status to change") | ||
| return operationContinuing(powerRequeueDelay) | ||
| } | ||
| result, err = p.changePower(ironicNode, nodes.PowerOn) | ||
| if ironicNode.LastError != "" && !force { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @sadasu (maybe I missed some bits but) can you please clarify the reason to check
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @andfasano Consider the case where the first call to changePower() resulted in operationContinuing(), and when PowerOn() is called again LastError is no longer empty. In that case we do not want to proceed to changePower() without first reporting the contents of LastError.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. My point was that, if LastError is set just after a call to changePower(), maybe an immediate […]. In any case, it seems that in the current code the […]
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't know that there's any reason to think a LastError will show up immediately after calling changePower(). In fact, quite the opposite - I'd expect it to almost always fail because of a timeout. When the force flag is passed we return success, which will result in […]. If you call changePower first, then either you will never see an error that appears asynchronously (presumably all of them, since the purpose of LastError is recording asynchronous errors), or you would have to call it only when the force flag is set, which means you couldn't change the power without first flagging a power management error.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ok |
||
| p.log.Info("PowerOn operation failed", "msg", ironicNode.LastError) | ||
| return operationFailed(fmt.Sprintf("PowerOn operation failed: %s", | ||
| ironicNode.LastError)) | ||
| } | ||
| if result, err = p.changePower(ironicNode, nodes.PowerOn); err == nil { | ||
| p.publisher("PowerOn", "Host powered on") | ||
| return result, nil | ||
| } | ||
| switch err.(type) { | ||
| case nil: | ||
| case HostLockedError: | ||
| return retryAfterDelay(powerRequeueDelay) | ||
| default: | ||
| return transientError(errors.Wrap(err, "failed to power on host")) | ||
| return transientError(errors.Wrap(err, "failed to PowerOn node")) | ||
| } | ||
| p.publisher("PowerOn", "Host powered on") | ||
| } | ||
|
|
||
| return result, nil | ||
| } | ||
|
|
||
| // PowerOff ensures the server is powered off independently of any image | ||
| // provisioning operation. | ||
| func (p *ironicProvisioner) PowerOff(rebootMode metal3v1alpha1.RebootMode) (result provisioner.Result, err error) { | ||
| func (p *ironicProvisioner) PowerOff(rebootMode metal3v1alpha1.RebootMode, force bool) (result provisioner.Result, err error) { | ||
| p.log.Info(fmt.Sprintf("ensuring host is powered off (mode: %s)", rebootMode)) | ||
|
|
||
| if rebootMode == metal3v1alpha1.RebootModeHard { | ||
| result, err = p.hardPowerOff() | ||
| } else { | ||
| result, err = p.softPowerOff() | ||
| if rebootMode == metal3v1alpha1.RebootModeSoft { | ||
| return p.softPowerOff() | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's a little unfortunate that we have to record a user-visible failure here when soft power off isn't supported. This is still an improvement to the readability though, because we're no longer calling |
||
| } | ||
| if err != nil { | ||
| switch err.(type) { | ||
| // In case of soft power off is unsupported or has failed, | ||
| // we activate hard power off. | ||
| case SoftPowerOffUnsupportedError, SoftPowerOffFailed: | ||
| return p.hardPowerOff() | ||
| case HostLockedError: | ||
| return retryAfterDelay(powerRequeueDelay) | ||
| default: | ||
| return transientError(err) | ||
| } | ||
| } | ||
| return result, nil | ||
| // Reboot mode is hard or force flag is set | ||
| return p.hardPowerOff(force) | ||
| } | ||
|
|
||
| // hardPowerOff sends 'power off' request to BM node and waits for the result | ||
| func (p *ironicProvisioner) hardPowerOff() (result provisioner.Result, err error) { | ||
| func (p *ironicProvisioner) hardPowerOff(force bool) (result provisioner.Result, err error) { | ||
| p.log.Info("ensuring host is powered off by \"hard power off\" command") | ||
|
|
||
| ironicNode, err := p.getNode() | ||
|
|
@@ -1583,13 +1576,22 @@ func (p *ironicProvisioner) hardPowerOff() (result provisioner.Result, err error | |
| } | ||
|
|
||
| if ironicNode.PowerState != powerOff { | ||
| if ironicNode.LastError != "" && !force { | ||
| p.log.Info("hard power off error", "msg", ironicNode.LastError) | ||
| return operationFailed(ironicNode.LastError) | ||
| } | ||
| if ironicNode.TargetPowerState == powerOff { | ||
| p.log.Info("waiting for power status to change") | ||
| return operationContinuing(powerRequeueDelay) | ||
| } | ||
| result, err = p.changePower(ironicNode, nodes.PowerOff) | ||
| if err != nil { | ||
| return transientError(errors.Wrap(err, "failed to power off host")) | ||
| switch err.(type) { | ||
| case HostLockedError: | ||
| return retryAfterDelay(powerRequeueDelay) | ||
| default: | ||
| return transientError(errors.Wrap(err, "failed to power off host")) | ||
| } | ||
|
stbenjam marked this conversation as resolved.
|
||
| } | ||
| p.publisher("PowerOff", "Host powered off") | ||
| return result, err | ||
|
|
@@ -1621,11 +1623,17 @@ func (p *ironicProvisioner) softPowerOff() (result provisioner.Result, err error | |
| // If the target state is unset while the last error is set, | ||
| // then the last execution of soft power off has failed. | ||
| if targetState == "" && ironicNode.LastError != "" { | ||
| return result, SoftPowerOffFailed{} | ||
|
sadasu marked this conversation as resolved.
|
||
| p.log.Info("soft power off error", "msg", ironicNode.LastError) | ||
| return operationFailed(ironicNode.LastError) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The code in this function up to this point is now essentially common to both |
||
| } | ||
| result, err = p.changePower(ironicNode, nodes.SoftPowerOff) | ||
| if err != nil { | ||
| return transientError(err) | ||
| switch err.(type) { | ||
| case HostLockedError: | ||
| return retryAfterDelay(powerRequeueDelay) | ||
| default: | ||
| return transientError(errors.Wrap(err, "failed to power off host")) | ||
| } | ||
| } | ||
| p.publisher("PowerOff", "Host soft powered off") | ||
| } | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should we be returning operationComplete() here like hardPowerOff() does?
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ideally, yep, good spotting. The default value of result is equivalent to what operationComplete() returns so it doesn't technically make any difference, but from a documentation perspective we should return the result of changePower inside the if statement and operationComplete() outside. I must have missed this when I originally did the result functions. |
||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.