Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions pkg/controller/baremetalhost/host_state_machine.go
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,11 @@ func (hsm *hostStateMachine) handleRegistrationError(info *reconcileInfo) action
hsm.NextState = metal3v1alpha1.StateRegistering
return actionComplete{}
}
if hsm.Host.Spec.Image != nil {
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If credentials are wrong, and image exists, this code will constantly retry registration, isn't it?

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, that's correct. We have another PR up to introduce similar retry behavior for other failures.

info.log.Info("Image is set; will retry registration")
hsm.NextState = metal3v1alpha1.StateRegistering
return actionComplete{}
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This requeues with no delay. Since handleRegistration also requeues with no delay after a failure, this means if the credentials are wrong we will be constantly cycling the status in a tight loop.

I think we need to merge #610 instead of returning actionComplete here. With that patch if we continue to return actionFailed then we will retry with an appropriate backoff.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That makes sense.

}
return actionFailed{}
}

Expand Down
62 changes: 44 additions & 18 deletions pkg/provisioner/ironic/ironic.go
Original file line number Diff line number Diff line change
Expand Up @@ -398,11 +398,6 @@ func (p *ironicProvisioner) ValidateManagementAccess(credentialsChanged bool) (r
}
}

// ironicNode, err = nodes.Get(p.client, p.status.ID).Extract()
// if err != nil {
// return result, errors.Wrap(err, "failed to get provisioning state in ironic")
// }

p.log.Info("current provision state",
"lastError", ironicNode.LastError,
"current", ironicNode.ProvisionState,
Expand Down Expand Up @@ -594,15 +589,7 @@ func (p *ironicProvisioner) UpdateHardwareState() (result provisioner.Result, er
return result, nil
}

func (p *ironicProvisioner) getUpdateOptsForNode(ironicNode *nodes.Node) (updates nodes.UpdateOpts, err error) {

hwProf, err := hardware.GetProfile(p.host.HardwareProfile())

if err != nil {
return updates, errors.Wrap(err,
fmt.Sprintf("Could not start provisioning with bad hardware profile %s",
p.host.HardwareProfile()))
}
func (p *ironicProvisioner) getUpdateOptsForNode(ironicNode *nodes.Node, forProvisioning bool) (updates nodes.UpdateOpts, err error) {

// image_source
var op nodes.UpdateOp
Expand All @@ -622,7 +609,10 @@ func (p *ironicProvisioner) getUpdateOptsForNode(ironicNode *nodes.Node) (update
},
)

checksum, checksumType, _ := p.host.GetImageChecksum()
checksum, checksumType, imageOK := p.host.GetImageChecksum()
if !imageOK {
return updates, fmt.Errorf("missing or invalid image details")
}

// image_os_hash_algo
if _, ok := ironicNode.InstanceInfo["image_os_hash_algo"]; !ok {
Expand Down Expand Up @@ -703,6 +693,18 @@ func (p *ironicProvisioner) getUpdateOptsForNode(ironicNode *nodes.Node) (update
},
)

if !forProvisioning {
return updates, nil
}

hwProf, err := hardware.GetProfile(p.host.HardwareProfile())

if err != nil {
return updates, errors.Wrap(err,
fmt.Sprintf("Could not start provisioning with bad hardware profile %s",
p.host.HardwareProfile()))
}

// root_gb
//
// FIXME(dhellmann): We have to provide something for the disk
Expand Down Expand Up @@ -791,8 +793,7 @@ func (p *ironicProvisioner) getUpdateOptsForNode(ironicNode *nodes.Node) (update
)

// boot_mode
_, ok := ironicNode.InstanceInfo["deploy_boot_mode"]
if !ok {
if _, ok := ironicNode.InstanceInfo["deploy_boot_mode"]; !ok {
op = nodes.AddOp
} else {
op = nodes.ReplaceOp
Expand All @@ -818,7 +819,7 @@ func (p *ironicProvisioner) startProvisioning(ironicNode *nodes.Node, hostConf p

p.log.Info("starting provisioning", "node properties", ironicNode.Properties)

updates, err := p.getUpdateOptsForNode(ironicNode)
updates, err := p.getUpdateOptsForNode(ironicNode, true)
_, err = nodes.Update(p.client, ironicNode.UUID, updates).Extract()
switch err.(type) {
case nil:
Expand Down Expand Up @@ -892,6 +893,31 @@ func (p *ironicProvisioner) Adopt() (result provisioner.Result, err error) {
err = fmt.Errorf("Invalid state for adopt: %s",
ironicNode.ProvisionState)
case nodes.Manageable:
if p.host.Spec.Image == nil {
// Without an image, adoption will fail. We can stop and
// wait for the image to be set, but we need to do that in
// a way that ensures we don't enter the states where we
// try to manage the host power status. Setting our own
// error message here avoids exposing the error message
// from Ironic that talks about fields in Ironic with
// names the user may not recognize.
result.ErrorMessage = "Image details missing for externally provisioned server."
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this actually an error? Can't we consider getting to the Manageable state a success? It means we got past Verifying, which is the important thing.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can't go any further without the image. So unless we add a new state to the BMH state machine for hosts without images, we have to do something here to keep it from progressing and ending up stuck in a failure when adoption doesn't work.

Maybe we do need another state?

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We're in the ExternallyProvisioned state... what further stuff do we have to do? Getting to Manageable means we can control the power, doesn't it?

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, we have to adopt the host (see line 921).

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've just noticed that we allow the Host to freely switch between the ExternallyProvisioned and Ready states without deprovisioning it first. That's likely a bug.

We have to adopt the host if we are in the Provisioned state. And we might want to adopt the host if we know the image so that Host can transition directly from ExternallyProvisioned->Provisioned without cleaning/inspection (although we don't support that today... we just go from ExternallyProvisioned->Ready without cleaning lol). But I'm not aware of any reason now or in the future that we would have to adopt an externally-provisioned host when we haven't been told what image (if any) is expected to be running on it.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've just noticed that we allow the Host to freely switch between the ExternallyProvisioned and Ready states without deprovisioning it first. That's likely a bug.

Fun. Maybe? I could go either way.

We have to adopt the host if we are in the Provisioned state. And we might want to adopt the host if we know the image so that Host can transition directly from ExternallyProvisioned->Provisioned without cleaning/inspection (although we don't support that today... we just go from ExternallyProvisioned->Ready without cleaning lol). But I'm not aware of any reason now or in the future that we would have to adopt an externally-provisioned host when we haven't been told what image (if any) is expected to be running on it.

According to @dtantsur or @juliakreger when I was writing that code, Ironic only monitors the power state of hosts that are adopted. Maybe I'm mis-remembering that, though? In any case, adoption today requires the image, and that's why this fix is preventing adoption until the image is set.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

According to @dtantsur or @juliakreger when I was writing that code, Ironic only monitors the power state of hosts that are adopted.

Ah, I hadn't heard that. It's possible - adopting moves it to the active state (i.e. provisioned), so that shouldn't be required per se, but it may be we need to get to the available state before we could manage the power. If you don't have an image, the way to get there is via cleaning, which we don't want to do on an externally provisioned host (until it goes back to Ready!).

If that's the case, then I can see why we would treat this as an error, since it means we can't change the power state while externally provisioned.

It's tempting to just pass some bogus image, but there's no way to replace it with the real one if we get it later without dropping the Ironic DB.

return
}

updates, err := p.getUpdateOptsForNode(ironicNode, false)
_, err = nodes.Update(p.client, ironicNode.UUID, updates).Extract()
switch err.(type) {
case nil:
case gophercloud.ErrDefault409:
p.log.Info("could not update host settings in ironic while adopting, busy")
result.Dirty = true
return result, nil
default:
return result, errors.Wrap(err,
"failed to update host settings in ironic while adopting")
}

return p.changeNodeProvisionState(
ironicNode,
nodes.ProvisionStateOpts{
Expand Down
11 changes: 6 additions & 5 deletions pkg/provisioner/ironic/ironic_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ func TestGetUpdateOptsForNodeWithRootHints(t *testing.T) {
prov, err := newProvisioner(makeHost(), bmc.Credentials{}, eventPublisher)
ironicNode := &nodes.Node{}

patches, err := prov.getUpdateOptsForNode(ironicNode)
patches, err := prov.getUpdateOptsForNode(ironicNode, true)
if err != nil {
t.Fatal(err)
}
Expand Down Expand Up @@ -219,7 +219,7 @@ func TestGetUpdateOptsForNodeVirtual(t *testing.T) {
prov, err := newProvisioner(host, bmc.Credentials{}, eventPublisher)
ironicNode := &nodes.Node{}

patches, err := prov.getUpdateOptsForNode(ironicNode)
patches, err := prov.getUpdateOptsForNode(ironicNode, true)
if err != nil {
t.Fatal(err)
}
Expand Down Expand Up @@ -314,7 +314,7 @@ func TestGetUpdateOptsForNodeDell(t *testing.T) {
prov, err := newProvisioner(host, bmc.Credentials{}, eventPublisher)
ironicNode := &nodes.Node{}

patches, err := prov.getUpdateOptsForNode(ironicNode)
patches, err := prov.getUpdateOptsForNode(ironicNode, true)
if err != nil {
t.Fatal(err)
}
Expand Down Expand Up @@ -468,7 +468,7 @@ func TestGetUpdateOptsForNodeBootMode(t *testing.T) {
}

prov, err := newProvisioner(&host, bmc.Credentials{}, eventPublisher)
patches, err := prov.getUpdateOptsForNode(&tc.Node)
patches, err := prov.getUpdateOptsForNode(&tc.Node, true)
if err != nil {
t.Fatal(err)
}
Expand Down Expand Up @@ -506,7 +506,8 @@ func makeHost() *metal3v1alpha1.BareMetalHost {
},
Spec: metal3v1alpha1.BareMetalHostSpec{
Image: &metal3v1alpha1.Image{
URL: "not-empty",
URL: "not-empty",
Checksum: "not-empty",
},
Online: true,
HardwareProfile: "libvirt",
Expand Down