Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions controllers/metal3.io/baremetalhost_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -1005,9 +1005,12 @@ func (r *BareMetalHostReconciler) manageHostPower(prov provisioner.Provisioner,
"reboot process", desiredPowerOnState != info.host.Spec.Online)

if desiredPowerOnState {
provResult, err = prov.PowerOn()
provResult, err = prov.PowerOn(info.host.Status.ErrorType == metal3v1alpha1.PowerManagementError)
} else {
provResult, err = prov.PowerOff(desiredRebootMode)
if info.host.Status.ErrorCount > 0 {
desiredRebootMode = metal3v1alpha1.RebootModeHard
}
provResult, err = prov.PowerOff(desiredRebootMode, info.host.Status.ErrorType == metal3v1alpha1.PowerManagementError)
Comment thread
stbenjam marked this conversation as resolved.
}
if err != nil {
return actionError{errors.Wrap(err, "failed to manage power state of host")}
Expand Down
4 changes: 2 additions & 2 deletions controllers/metal3.io/host_state_machine_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1172,11 +1172,11 @@ func (m *mockProvisioner) Detach() (result provisioner.Result, err error) {
return res, err
}

func (m *mockProvisioner) PowerOn() (result provisioner.Result, err error) {
func (m *mockProvisioner) PowerOn(force bool) (result provisioner.Result, err error) {
return m.getNextResultByMethod("PowerOn"), err
}

func (m *mockProvisioner) PowerOff(rebootMode metal3v1alpha1.RebootMode) (result provisioner.Result, err error) {
func (m *mockProvisioner) PowerOff(rebootMode metal3v1alpha1.RebootMode, force bool) (result provisioner.Result, err error) {
return m.getNextResultByMethod("PowerOff"), err
}

Expand Down
4 changes: 2 additions & 2 deletions pkg/provisioner/demo/demo.go
Original file line number Diff line number Diff line change
Expand Up @@ -305,7 +305,7 @@ func (p *demoProvisioner) Detach() (result provisioner.Result, err error) {

// PowerOn ensures the server is powered on independently of any image
// provisioning operation.
func (p *demoProvisioner) PowerOn() (result provisioner.Result, err error) {
func (p *demoProvisioner) PowerOn(force bool) (result provisioner.Result, err error) {

hostName := p.objectMeta.Name
switch hostName {
Expand All @@ -326,7 +326,7 @@ func (p *demoProvisioner) PowerOn() (result provisioner.Result, err error) {

// PowerOff ensures the server is powered off independently of any image
// provisioning operation.
func (p *demoProvisioner) PowerOff(rebootMode metal3v1alpha1.RebootMode) (result provisioner.Result, err error) {
func (p *demoProvisioner) PowerOff(rebootMode metal3v1alpha1.RebootMode, force bool) (result provisioner.Result, err error) {

hostName := p.objectMeta.Name
switch hostName {
Expand Down
4 changes: 2 additions & 2 deletions pkg/provisioner/fixture/fixture.go
Original file line number Diff line number Diff line change
Expand Up @@ -291,7 +291,7 @@ func (p *fixtureProvisioner) Detach() (result provisioner.Result, err error) {

// PowerOn ensures the server is powered on independently of any image
// provisioning operation.
func (p *fixtureProvisioner) PowerOn() (result provisioner.Result, err error) {
func (p *fixtureProvisioner) PowerOn(force bool) (result provisioner.Result, err error) {
p.log.Info("ensuring host is powered on")

if !p.state.poweredOn {
Expand All @@ -307,7 +307,7 @@ func (p *fixtureProvisioner) PowerOn() (result provisioner.Result, err error) {

// PowerOff ensures the server is powered off independently of any image
// provisioning operation.
func (p *fixtureProvisioner) PowerOff(rebootMode metal3v1alpha1.RebootMode) (result provisioner.Result, err error) {
func (p *fixtureProvisioner) PowerOff(rebootMode metal3v1alpha1.RebootMode, force bool) (result provisioner.Result, err error) {
p.log.Info("ensuring host is powered off")

if p.state.poweredOn {
Expand Down
9 changes: 0 additions & 9 deletions pkg/provisioner/ironic/errors.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,6 @@ func (e SoftPowerOffUnsupportedError) Error() string {
return "soft power off is unsupported on BMC"
}

// SoftPowerOffFailed is returned when the soft power off command
// finishes with failure.
type SoftPowerOffFailed struct {
}

func (e SoftPowerOffFailed) Error() string {
return "Soft power off has failed on BMC"
}

// HostLockedError is returned when the BMC host is
// locked.
type HostLockedError struct {
Expand Down
64 changes: 36 additions & 28 deletions pkg/provisioner/ironic/ironic.go
Original file line number Diff line number Diff line change
Expand Up @@ -1519,7 +1519,7 @@ func (p *ironicProvisioner) changePower(ironicNode *nodes.Node, target nodes.Tar

// PowerOn ensures the server is powered on independently of any image
// provisioning operation.
func (p *ironicProvisioner) PowerOn() (result provisioner.Result, err error) {
func (p *ironicProvisioner) PowerOn(force bool) (result provisioner.Result, err error) {
p.log.Info("ensuring host is powered on")

ironicNode, err := p.getNode()
Expand All @@ -1535,46 +1535,39 @@ func (p *ironicProvisioner) PowerOn() (result provisioner.Result, err error) {
p.log.Info("waiting for power status to change")
return operationContinuing(powerRequeueDelay)
}
result, err = p.changePower(ironicNode, nodes.PowerOn)
if ironicNode.LastError != "" && !force {
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@sadasu (maybe I missed some bits but) can you please clarify the reason to check ironicNode.LastError before invoking changePower? Couldn't be something to be managed just after checking changePower?

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@andfasano When the 1st call to changePower() resulted in operationContinuing() and when PowerOn() is called again and this time the Last Error is not empty. We do not want to continue to changePower() without reporting the contents of LastError.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My point was that, if LastError is set just after a call to changePower(), maybe an immediate p.getNode() after it would be sufficient to inspect the node without waiting another loop (with the possibility eventually to return immediately an operationFailed).

In any case, it seems that in the current code the operationFailed is invoked just once (the second time). If the problem persists, the subsequent invocations will skip it given that the force flag will result to be set (PowerManagementError)

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't know that there's any reason to think a LastError will show up immediately after calling changePower(). In fact, quite the opposite - I'd expect it to almost always fail because of a timeout.

When the force flag is passed we return success, which will result in clearError() being called so that next time around the force flag will be unset and we will report an error if another one occurs. This is consistent with all of the other methods that take a force flag.

If you call changePower first then either you will never see an error that appears asynchronously (presumably all of them, since the purpose of LastError is recording asynchronous errors) or you would have to only call it when the force flag is set, which means you couldn't change the power without first flagging a power management error.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok

p.log.Info("PowerOn operation failed", "msg", ironicNode.LastError)
return operationFailed(fmt.Sprintf("PowerOn operation failed: %s",
ironicNode.LastError))
}
if result, err = p.changePower(ironicNode, nodes.PowerOn); err == nil {
p.publisher("PowerOn", "Host powered on")
return result, nil
}
switch err.(type) {
case nil:
case HostLockedError:
return retryAfterDelay(powerRequeueDelay)
default:
return transientError(errors.Wrap(err, "failed to power on host"))
return transientError(errors.Wrap(err, "failed to PowerOn node"))
}
p.publisher("PowerOn", "Host powered on")
}

return result, nil
}

// PowerOff ensures the server is powered off independently of any image
// provisioning operation.
func (p *ironicProvisioner) PowerOff(rebootMode metal3v1alpha1.RebootMode) (result provisioner.Result, err error) {
func (p *ironicProvisioner) PowerOff(rebootMode metal3v1alpha1.RebootMode, force bool) (result provisioner.Result, err error) {
p.log.Info(fmt.Sprintf("ensuring host is powered off (mode: %s)", rebootMode))

if rebootMode == metal3v1alpha1.RebootModeHard {
result, err = p.hardPowerOff()
} else {
result, err = p.softPowerOff()
if rebootMode == metal3v1alpha1.RebootModeSoft {
return p.softPowerOff()
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's a little unfortunate that we have to record a user-visible failure here when soft power off isn't supported. This is still an improvement to the readability though, because we're no longer calling softPowerOff() to check the state of a hard power off we fell back to.
I think it should be possible to get the best of both in future with some refactoring...

}
if err != nil {
switch err.(type) {
// In case of soft power off is unsupported or has failed,
// we activate hard power off.
case SoftPowerOffUnsupportedError, SoftPowerOffFailed:
return p.hardPowerOff()
case HostLockedError:
return retryAfterDelay(powerRequeueDelay)
default:
return transientError(err)
}
}
return result, nil
// Reboot mode is hard or force flag is set
return p.hardPowerOff(force)
}

// hardPowerOff sends 'power off' request to BM node and waits for the result
func (p *ironicProvisioner) hardPowerOff() (result provisioner.Result, err error) {
func (p *ironicProvisioner) hardPowerOff(force bool) (result provisioner.Result, err error) {
p.log.Info("ensuring host is powered off by \"hard power off\" command")

ironicNode, err := p.getNode()
Expand All @@ -1583,13 +1576,22 @@ func (p *ironicProvisioner) hardPowerOff() (result provisioner.Result, err error
}

if ironicNode.PowerState != powerOff {
if ironicNode.LastError != "" && !force {
p.log.Info("hard power off error", "msg", ironicNode.LastError)
return operationFailed(ironicNode.LastError)
}
if ironicNode.TargetPowerState == powerOff {
p.log.Info("waiting for power status to change")
return operationContinuing(powerRequeueDelay)
}
result, err = p.changePower(ironicNode, nodes.PowerOff)
if err != nil {
return transientError(errors.Wrap(err, "failed to power off host"))
switch err.(type) {
case HostLockedError:
return retryAfterDelay(powerRequeueDelay)
default:
return transientError(errors.Wrap(err, "failed to power off host"))
}
Comment thread
stbenjam marked this conversation as resolved.
}
p.publisher("PowerOff", "Host powered off")
return result, err
Expand Down Expand Up @@ -1621,11 +1623,17 @@ func (p *ironicProvisioner) softPowerOff() (result provisioner.Result, err error
// If the target state is unset while the last error is set,
// then the last execution of soft power off has failed.
if targetState == "" && ironicNode.LastError != "" {
return result, SoftPowerOffFailed{}
Comment thread
sadasu marked this conversation as resolved.
p.log.Info("soft power off error", "msg", ironicNode.LastError)
return operationFailed(ironicNode.LastError)
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The code in this function up to this point is now essentially common to both softPowerOff() and hardPowerOff(), so we could extract it back up into PowerOff().
With that done we could allow an immediate fall back from SoftPowerOffUnsupported to trying a hard power off without reporting an error, since we'd then only call softPowerOff() (or hardPowerOff()) at the start of a power off attempt.

}
result, err = p.changePower(ironicNode, nodes.SoftPowerOff)
if err != nil {
return transientError(err)
switch err.(type) {
case HostLockedError:
return retryAfterDelay(powerRequeueDelay)
default:
return transientError(errors.Wrap(err, "failed to power off host"))
}
}
p.publisher("PowerOff", "Host soft powered off")
}
Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we be returning operationComplete() here like hardPowerOff() does?

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ideally, yep, good spotting. The default value of result is equivalent to what operationComplete() returns so it doesn't technically make any difference, but from a documentation perspective we should return the result of changePower inside the if statement and operationComplete() outside. I must have missed this when I originally did the result functions.

Expand Down
77 changes: 75 additions & 2 deletions pkg/provisioner/ironic/power_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,13 @@ func TestPowerOn(t *testing.T) {
nodeUUID := "33ce8659-7400-4c68-9535-d10766f07a58"
cases := []struct {
name string
force bool
ironic *testserver.IronicMock

expectedDirty bool
expectedError bool
expectedRequestAfter int
expectedErrorResult bool
}{
{
name: "node-already-power-on",
Expand Down Expand Up @@ -75,6 +77,30 @@ func TestPowerOn(t *testing.T) {
expectedRequestAfter: 10,
expectedDirty: true,
},
{
name: "power-on with LastError",
ironic: testserver.NewIronic(t).Ready().Node(nodes.Node{
PowerState: powerOff,
TargetPowerState: powerOff,
UUID: nodeUUID,
LastError: "power on failed",
}),
expectedRequestAfter: 0,
expectedDirty: false,
expectedErrorResult: true,
},
{
name: "power-on with LastError",
force: true,
ironic: testserver.NewIronic(t).Ready().Node(nodes.Node{
PowerState: powerOff,
TargetPowerState: powerOff,
UUID: nodeUUID,
LastError: "power on failed",
}),
expectedError: true,
expectedErrorResult: false,
},
}

for _, tc := range cases {
Expand All @@ -101,7 +127,7 @@ func TestPowerOn(t *testing.T) {
t.Fatalf("could not create provisioner: %s", err)
}

result, err := prov.PowerOn()
result, err := prov.PowerOn(tc.force)

assert.Equal(t, tc.expectedDirty, result.Dirty)
assert.Equal(t, time.Second*time.Duration(tc.expectedRequestAfter), result.RequeueAfter)
Expand All @@ -110,6 +136,9 @@ func TestPowerOn(t *testing.T) {
} else {
assert.Error(t, err)
}
if tc.expectedErrorResult {
assert.Contains(t, result.ErrorMessage, "PowerOn operation failed")
}
})
}
}
Expand All @@ -120,10 +149,12 @@ func TestPowerOff(t *testing.T) {
cases := []struct {
name string
ironic *testserver.IronicMock
force bool

expectedDirty bool
expectedError bool
expectedRequestAfter int
expectedErrorResult bool
rebootMode metal3v1alpha1.RebootMode
}{
{
Expand Down Expand Up @@ -187,6 +218,44 @@ func TestPowerOff(t *testing.T) {
expectedRequestAfter: 10,
expectedDirty: true,
},
{
name: "power-off soft with force",
ironic: testserver.NewIronic(t).WithDefaultResponses().Node(nodes.Node{
PowerState: powerOn,
TargetPowerState: powerOn,
TargetProvisionState: "",
UUID: nodeUUID,
}),
rebootMode: metal3v1alpha1.RebootModeSoft,
force: true,
expectedDirty: true,
},
{
name: "power-off hard with LastError",
ironic: testserver.NewIronic(t).WithDefaultResponses().Node(nodes.Node{
PowerState: powerOn,
TargetPowerState: powerOn,
TargetProvisionState: "",
UUID: nodeUUID,
LastError: "hard power off failed",
}),
rebootMode: metal3v1alpha1.RebootModeHard,
expectedDirty: false,
expectedErrorResult: true,
},
{
name: "power-off hard with force",
ironic: testserver.NewIronic(t).WithDefaultResponses().Node(nodes.Node{
PowerState: powerOn,
TargetPowerState: powerOn,
TargetProvisionState: "",
UUID: nodeUUID,
LastError: "hard power off failed",
}),
rebootMode: metal3v1alpha1.RebootModeHard,
force: true,
expectedDirty: true,
},
}

for _, tc := range cases {
Expand Down Expand Up @@ -214,7 +283,7 @@ func TestPowerOff(t *testing.T) {
}

// We pass the RebootMode type here to define the reboot action
result, err := prov.PowerOff(tc.rebootMode)
result, err := prov.PowerOff(tc.rebootMode, tc.force)

assert.Equal(t, tc.expectedDirty, result.Dirty)
assert.Equal(t, time.Second*time.Duration(tc.expectedRequestAfter), result.RequeueAfter)
Expand All @@ -223,6 +292,10 @@ func TestPowerOff(t *testing.T) {
} else {
assert.Error(t, err)
}
if tc.expectedErrorResult {
assert.Contains(t, result.ErrorMessage, "hard power off failed")
}

})
}
}
4 changes: 2 additions & 2 deletions pkg/provisioner/provisioner.go
Original file line number Diff line number Diff line change
Expand Up @@ -144,12 +144,12 @@ type Provisioner interface {

// PowerOn ensures the server is powered on independently of any image
// provisioning operation.
PowerOn() (result Result, err error)
PowerOn(force bool) (result Result, err error)

// PowerOff ensures the server is powered off independently of any image
// provisioning operation. The boolean argument may be used to specify
// if a hard reboot (force power off) is required - true if so.
PowerOff(rebootMode metal3v1alpha1.RebootMode) (result Result, err error)
PowerOff(rebootMode metal3v1alpha1.RebootMode, force bool) (result Result, err error)

// IsReady checks if the provisioning backend is available to accept
// all the incoming requests.
Expand Down