- 
                Notifications
    You must be signed in to change notification settings 
- Fork 60
[reconfigurator] Planner should wait for all MGS updates before proceeding to zones #9001
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 40 commits
5b0d884
              477cb9b
              9f4968d
              b8766df
              0e5bd57
              0c5197a
              5a1a74f
              60240f2
              ad2e83e
              36c788e
              ed81591
              4efa776
              14a43ac
              b040913
              70f8495
              e1ea3d2
              f2c7c2a
              3293c4f
              6d866d6
              e8f027b
              1da94b0
              d0d37e6
              03b7432
              4f15d18
              dd8f911
              23e8f54
              f47e1cd
              4fb55c4
              34a3906
              0781a21
              5c04f22
              dc2f232
              b070a69
              fcdaa5b
              1c6a46c
              eecfe7c
              f6f2c1e
              8c3d064
              05d2043
              957ac0f
              e3a47fd
              9f5ca2f
              e863155
              cc8e07a
              2444040
              19e8e26
              a78d9b1
              1298942
              b29203f
              45f9dbd
              7d9d1a0
              0e367a9
              4d18593
              7e4e36e
              f921c50
              cb197db
              01391ed
              52d399f
              46ef2b2
              4ca45ba
              536996d
              File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Large diffs are not rendered by default.
| Original file line number | Diff line number | Diff line change | 
|---|---|---|
|  | @@ -12,6 +12,7 @@ use nexus_types::deployment::BlueprintHostPhase2DesiredContents; | |
| use nexus_types::deployment::PendingMgsUpdate; | ||
| use nexus_types::deployment::PendingMgsUpdateDetails; | ||
| use nexus_types::deployment::PendingMgsUpdateHostPhase1Details; | ||
| use nexus_types::deployment::planning_report::FailedMgsUpdateReason; | ||
| use nexus_types::inventory::BaseboardId; | ||
| use nexus_types::inventory::Collection; | ||
| use omicron_common::api::external::TufArtifactMeta; | ||
|  | @@ -32,7 +33,7 @@ use tufaceous_artifact::ArtifactKind; | |
| /// | ||
| /// This is generated by the planning process whenever it also generates host | ||
| /// phase 1 updates. | ||
| #[derive(Debug, PartialEq, Eq)] | ||
| #[derive(Debug, Clone, PartialEq, Eq)] | ||
| pub(crate) struct PendingHostPhase2Changes { | ||
| by_sled: BTreeMap<SledUuid, (M2Slot, BlueprintHostPhase2DesiredContents)>, | ||
| } | ||
|  | @@ -270,24 +271,28 @@ pub(super) fn try_make_update( | |
| baseboard_id: &Arc<BaseboardId>, | ||
| inventory: &Collection, | ||
| current_artifacts: &TufRepoDescription, | ||
| ) -> Option<(PendingMgsUpdate, PendingHostPhase2Changes)> { | ||
| ) -> Result< | ||
| Option<(PendingMgsUpdate, PendingHostPhase2Changes)>, | ||
| FailedMgsUpdateReason, | ||
| > { | ||
| let Some(sp_info) = inventory.sps.get(baseboard_id) else { | ||
| warn!( | ||
|         
                  karencfv marked this conversation as resolved.
              Outdated
          
            Show resolved
            Hide resolved | ||
| log, | ||
| "cannot configure host OS update for board \ | ||
| (missing SP info from inventory)"; | ||
| baseboard_id, | ||
| ); | ||
| return None; | ||
| return Err(FailedMgsUpdateReason::SpNotInInventory); | ||
| }; | ||
|  | ||
| // Only configure host OS updates for sleds. | ||
| // | ||
| // We don't bother logging a return value of `None` for non-sleds, because | ||
| // we will never attempt to configure an update for them (nor should we). | ||
| // For the same reason, we do not return an error. | ||
| match sp_info.sp_type { | ||
| SpType::Sled => (), | ||
| SpType::Power | SpType::Switch => return None, | ||
| SpType::Power | SpType::Switch => return Ok(None), | ||
| } | ||
|  | ||
| let Some(sled_agent) = inventory.sled_agents.iter().find(|sled_agent| { | ||
|  | @@ -299,7 +304,7 @@ pub(super) fn try_make_update( | |
| (missing sled-agent info from inventory)"; | ||
| baseboard_id, | ||
| ); | ||
| return None; | ||
| return Err(FailedMgsUpdateReason::SledAgentInfoNotInInventory); | ||
| }; | ||
| let Some(last_reconciliation) = sled_agent.last_reconciliation.as_ref() | ||
| else { | ||
|  | @@ -309,7 +314,7 @@ pub(super) fn try_make_update( | |
| (missing last reconciliation details from inventory)"; | ||
| baseboard_id, | ||
| ); | ||
| return None; | ||
| return Err(FailedMgsUpdateReason::LastReconciliationNotInInventory); | ||
| }; | ||
| let boot_disk = match &last_reconciliation.boot_partitions.boot_disk { | ||
| Ok(boot_disk) => *boot_disk, | ||
|  | @@ -323,7 +328,9 @@ pub(super) fn try_make_update( | |
| baseboard_id, | ||
| "err" => err, | ||
| ); | ||
| return None; | ||
| return Err(FailedMgsUpdateReason::UnableToDetermineBootDisk( | ||
| err.to_string(), | ||
| )); | ||
| } | ||
| }; | ||
| let active_phase_2_hash = | ||
|  | @@ -340,7 +347,11 @@ pub(super) fn try_make_update( | |
| "boot_disk" => ?boot_disk, | ||
| "err" => err, | ||
| ); | ||
| return None; | ||
| return Err( | ||
| FailedMgsUpdateReason::UnableToRetrieveBootDiskPhase2Image( | ||
| err.to_string(), | ||
| ), | ||
| ); | ||
| } | ||
| }; | ||
|  | ||
|  | @@ -353,7 +364,7 @@ pub(super) fn try_make_update( | |
| (inventory missing current active host phase 1 slot)"; | ||
| baseboard_id, | ||
| ); | ||
| return None; | ||
| return Err(FailedMgsUpdateReason::ActiveHostPhase1SlotNotInInventory); | ||
| }; | ||
|  | ||
| // TODO-correctness What should we do if the active phase 1 slot doesn't | ||
|  | @@ -376,7 +387,9 @@ pub(super) fn try_make_update( | |
| "active_phase_1_slot" => ?active_phase_1_slot, | ||
| "boot_disk" => ?boot_disk, | ||
| ); | ||
| return None; | ||
| return Err( | ||
| FailedMgsUpdateReason::ActiveHostPhase1SlotBootDiskMismatch, | ||
| ); | ||
| } | ||
|  | ||
| let Some(active_phase_1_hash) = inventory | ||
|  | @@ -390,7 +403,7 @@ pub(super) fn try_make_update( | |
| baseboard_id, | ||
| "slot" => ?active_phase_1_slot, | ||
| ); | ||
| return None; | ||
| return Err(FailedMgsUpdateReason::ActiveHostPhase1HashNotInInventory); | ||
| }; | ||
|  | ||
| let Some(inactive_phase_1_hash) = inventory | ||
|  | @@ -407,7 +420,9 @@ pub(super) fn try_make_update( | |
| baseboard_id, | ||
| "slot" => ?active_phase_1_slot.toggled(), | ||
| ); | ||
| return None; | ||
| return Err( | ||
| FailedMgsUpdateReason::InactiveHostPhase1HashNotInInventory, | ||
| ); | ||
| }; | ||
|  | ||
| let mut phase_1_artifacts = Vec::with_capacity(1); | ||
|  | @@ -433,7 +448,7 @@ pub(super) fn try_make_update( | |
| (no phase 1 artifact)"; | ||
| baseboard_id, | ||
| ); | ||
| return None; | ||
| return Err(FailedMgsUpdateReason::NoMatchingArtifactFound); | ||
| } | ||
| (_, []) => { | ||
| warn!( | ||
|  | @@ -442,7 +457,7 @@ pub(super) fn try_make_update( | |
| (no phase 2 artifact)"; | ||
| baseboard_id, | ||
| ); | ||
| return None; | ||
| return Err(FailedMgsUpdateReason::NoMatchingArtifactFound); | ||
| There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could we add some context to  Another thought, although this might be nonsense after reading the rest of the PR - we could potentially have different error types for each kind of update, so we could have more host-specific error variants, and then combine the error types in a higher-level enum? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That makes sense, I'll address this in the follow up PR that will break up  | ||
| } | ||
| // "TUF is broken" cases: have multiple of one or the other. This | ||
| // should be impossible unless we shipped a TUF repo with multiple | ||
|  | @@ -457,7 +472,7 @@ pub(super) fn try_make_update( | |
| "num-phase-1-images" => phase_1_artifacts.len(), | ||
| "num-phase-2-images" => phase_2_artifacts.len(), | ||
| ); | ||
| return None; | ||
| return Err(FailedMgsUpdateReason::TooManyMatchingArtifacts); | ||
| } | ||
| }; | ||
|  | ||
|  | @@ -469,7 +484,7 @@ pub(super) fn try_make_update( | |
| // this sled will fail to boot if it were rebooted now.) | ||
| if active_phase_2_hash == phase_2_artifact.hash { | ||
| debug!(log, "no host OS update needed for board"; baseboard_id); | ||
| return None; | ||
| return Ok(None); | ||
| } | ||
|  | ||
| // Before we can proceed with the phase 1 update, we need sled-agent to | ||
|  | @@ -485,7 +500,7 @@ pub(super) fn try_make_update( | |
| phase_2_artifact, | ||
| ); | ||
|  | ||
| Some(( | ||
| Ok(Some(( | ||
| PendingMgsUpdate { | ||
| baseboard_id: baseboard_id.clone(), | ||
| sp_type: sp_info.sp_type, | ||
|  | @@ -505,7 +520,7 @@ pub(super) fn try_make_update( | |
| artifact_version: phase_1_artifact.id.version.clone(), | ||
| }, | ||
| pending_host_phase_2_changes, | ||
| )) | ||
| ))) | ||
| } | ||
|  | ||
| #[cfg(test)] | ||
|  | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We don't necessarily need to change this on this PR, but this type is pretty beefy. I wonder if it would be clearer for us to define an enum with three states; something like (names are hard)
so details like "
Ok(None)means there's no update needed" are more explicit?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I like that idea! I've left some TODOs to finish up in a follow up PR in cb197db