-
Notifications
You must be signed in to change notification settings - Fork 310
handle externally provisioned hosts without image settings #609
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -206,6 +206,11 @@ func (hsm *hostStateMachine) handleRegistrationError(info *reconcileInfo) action | |
| hsm.NextState = metal3v1alpha1.StateRegistering | ||
| return actionComplete{} | ||
| } | ||
| if hsm.Host.Spec.Image != nil { | ||
| info.log.Info("Image is set; will retry registration") | ||
| hsm.NextState = metal3v1alpha1.StateRegistering | ||
| return actionComplete{} | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This requeues with no delay. Since handleRegistration also requeues with no delay after a failure, this means if the credentials are wrong we will be constantly cycling the status in a tight loop. I think we need to merge #610 instead of returning actionComplete here. With that patch if we continue to return actionFailed then we will retry with an appropriate backoff.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That makes sense. |
||
| } | ||
| return actionFailed{} | ||
| } | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -398,11 +398,6 @@ func (p *ironicProvisioner) ValidateManagementAccess(credentialsChanged bool) (r | |
| } | ||
| } | ||
|
|
||
| // ironicNode, err = nodes.Get(p.client, p.status.ID).Extract() | ||
| // if err != nil { | ||
| // return result, errors.Wrap(err, "failed to get provisioning state in ironic") | ||
| // } | ||
|
|
||
| p.log.Info("current provision state", | ||
| "lastError", ironicNode.LastError, | ||
| "current", ironicNode.ProvisionState, | ||
|
|
@@ -594,15 +589,7 @@ func (p *ironicProvisioner) UpdateHardwareState() (result provisioner.Result, er | |
| return result, nil | ||
| } | ||
|
|
||
| func (p *ironicProvisioner) getUpdateOptsForNode(ironicNode *nodes.Node) (updates nodes.UpdateOpts, err error) { | ||
|
|
||
| hwProf, err := hardware.GetProfile(p.host.HardwareProfile()) | ||
|
|
||
| if err != nil { | ||
| return updates, errors.Wrap(err, | ||
| fmt.Sprintf("Could not start provisioning with bad hardware profile %s", | ||
| p.host.HardwareProfile())) | ||
| } | ||
| func (p *ironicProvisioner) getUpdateOptsForNode(ironicNode *nodes.Node, forProvisioning bool) (updates nodes.UpdateOpts, err error) { | ||
|
|
||
| // image_source | ||
| var op nodes.UpdateOp | ||
|
|
@@ -622,7 +609,10 @@ func (p *ironicProvisioner) getUpdateOptsForNode(ironicNode *nodes.Node) (update | |
| }, | ||
| ) | ||
|
|
||
| checksum, checksumType, _ := p.host.GetImageChecksum() | ||
| checksum, checksumType, imageOK := p.host.GetImageChecksum() | ||
| if !imageOK { | ||
| return updates, fmt.Errorf("missing or invalid image details") | ||
| } | ||
|
|
||
| // image_os_hash_algo | ||
| if _, ok := ironicNode.InstanceInfo["image_os_hash_algo"]; !ok { | ||
|
|
@@ -703,6 +693,18 @@ func (p *ironicProvisioner) getUpdateOptsForNode(ironicNode *nodes.Node) (update | |
| }, | ||
| ) | ||
|
|
||
| if !forProvisioning { | ||
| return updates, nil | ||
| } | ||
|
|
||
| hwProf, err := hardware.GetProfile(p.host.HardwareProfile()) | ||
|
|
||
| if err != nil { | ||
| return updates, errors.Wrap(err, | ||
| fmt.Sprintf("Could not start provisioning with bad hardware profile %s", | ||
| p.host.HardwareProfile())) | ||
| } | ||
|
|
||
| // root_gb | ||
| // | ||
| // FIXME(dhellmann): We have to provide something for the disk | ||
|
|
@@ -791,8 +793,7 @@ func (p *ironicProvisioner) getUpdateOptsForNode(ironicNode *nodes.Node) (update | |
| ) | ||
|
|
||
| // boot_mode | ||
| _, ok := ironicNode.InstanceInfo["deploy_boot_mode"] | ||
| if !ok { | ||
| if _, ok := ironicNode.InstanceInfo["deploy_boot_mode"]; !ok { | ||
| op = nodes.AddOp | ||
| } else { | ||
| op = nodes.ReplaceOp | ||
|
|
@@ -818,7 +819,7 @@ func (p *ironicProvisioner) startProvisioning(ironicNode *nodes.Node, hostConf p | |
|
|
||
| p.log.Info("starting provisioning", "node properties", ironicNode.Properties) | ||
|
|
||
| updates, err := p.getUpdateOptsForNode(ironicNode) | ||
| updates, err := p.getUpdateOptsForNode(ironicNode, true) | ||
| _, err = nodes.Update(p.client, ironicNode.UUID, updates).Extract() | ||
| switch err.(type) { | ||
| case nil: | ||
|
|
@@ -892,6 +893,31 @@ func (p *ironicProvisioner) Adopt() (result provisioner.Result, err error) { | |
| err = fmt.Errorf("Invalid state for adopt: %s", | ||
| ironicNode.ProvisionState) | ||
| case nodes.Manageable: | ||
| if p.host.Spec.Image == nil { | ||
| // Without an image, adoption will fail. We can stop and | ||
| // wait for the image to be set, but we need to do that in | ||
| // a way that ensures we don't enter the states where we | ||
| // try to manage the host power status. Setting our own | ||
| // error message here avoids exposing the error message | ||
| // from Ironic that talks about fields in Ironic with | ||
| // names the user may not recognize. | ||
| result.ErrorMessage = "Image details missing for externally provisioned server." | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this actually an error? Can't we consider getting to the Manageable state a success? It means we got past Verifying, which is the important thing.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We can't go any further without the image. So unless we add a new state to the BMH state machine for hosts without images, we have to do something here to keep it from progressing and ending up stuck in a failure when adoption doesn't work. Maybe we do need another state?
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We're in the ExternallyProvisioned state... what further stuff do we have to do? Getting to Manageable means we can control the power, doesn't it?
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No, we have to adopt the host (see line 921).
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I've just noticed that we allow the Host to freely switch between the ExternallyProvisioned and Ready states without deprovisioning it first. That's likely a bug. We have to adopt the host if we are in the Provisioned state. And we might want to adopt the host if we know the image so that Host can transition directly from ExternallyProvisioned->Provisioned without cleaning/inspection (although we don't support that today... we just go from ExternallyProvisioned->Ready without cleaning lol). But I'm not aware of any reason now or in the future that we would have to adopt an externally-provisioned host when we haven't been told what image (if any) is expected to be running on it.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Fun. Maybe? I could go either way.
According to @dtantsur or @juliakreger when I was writing that code, Ironic only monitors the power state of hosts that are adopted. Maybe I'm mis-remembering that, though? In any case, adoption today requires the image, and that's why this fix is preventing adoption until the image is set.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Ah, I hadn't heard that. It's possible - adopting moves it to the active state (i.e. provisioned), so that shouldn't be required per se, but it may be we need to get to the available state before we could manage the power. If you don't have an image, the way to get there is via cleaning, which we don't want to do on an externally provisioned host (until it goes back to Ready!). If that's the case, then I can see why we would treat this as an error, since it means we can't change the power state while externally provisioned. It's tempting to just pass some bogus image, but there's no way to replace it with the real one if we get it later without dropping the Ironic DB. |
||
| return | ||
| } | ||
|
|
||
| updates, err := p.getUpdateOptsForNode(ironicNode, false) | ||
| _, err = nodes.Update(p.client, ironicNode.UUID, updates).Extract() | ||
| switch err.(type) { | ||
| case nil: | ||
| case gophercloud.ErrDefault409: | ||
| p.log.Info("could not update host settings in ironic while adopting, busy") | ||
| result.Dirty = true | ||
| return result, nil | ||
| default: | ||
| return result, errors.Wrap(err, | ||
| "failed to update host settings in ironic while adopting") | ||
| } | ||
|
|
||
| return p.changeNodeProvisionState( | ||
| ironicNode, | ||
| nodes.ProvisionStateOpts{ | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If credentials are wrong, and image exists, this code will constantly retry registration, isn't it?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, that's correct. We have another PR up to introduce similar retry behavior for other failures.