-
Notifications
You must be signed in to change notification settings - Fork 61
[reconfigurator] Planner should wait for all MGS updates before proceeding to zones #9001
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 35 commits
5b0d884
477cb9b
9f4968d
b8766df
0e5bd57
0c5197a
5a1a74f
60240f2
ad2e83e
36c788e
ed81591
4efa776
14a43ac
b040913
70f8495
e1ea3d2
f2c7c2a
3293c4f
6d866d6
e8f027b
1da94b0
d0d37e6
03b7432
4f15d18
dd8f911
23e8f54
f47e1cd
4fb55c4
34a3906
0781a21
5c04f22
dc2f232
b070a69
fcdaa5b
1c6a46c
eecfe7c
f6f2c1e
8c3d064
05d2043
957ac0f
e3a47fd
9f5ca2f
e863155
cc8e07a
2444040
19e8e26
a78d9b1
1298942
b29203f
45f9dbd
7d9d1a0
0e367a9
4d18593
7e4e36e
f921c50
cb197db
01391ed
52d399f
46ef2b2
4ca45ba
536996d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Large diffs are not rendered by default.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -12,6 +12,7 @@ use nexus_types::deployment::BlueprintHostPhase2DesiredContents; | |
| use nexus_types::deployment::PendingMgsUpdate; | ||
| use nexus_types::deployment::PendingMgsUpdateDetails; | ||
| use nexus_types::deployment::PendingMgsUpdateHostPhase1Details; | ||
| use nexus_types::deployment::planning_report::FailedMgsUpdateReason; | ||
| use nexus_types::inventory::BaseboardId; | ||
| use nexus_types::inventory::Collection; | ||
| use omicron_common::api::external::TufArtifactMeta; | ||
|
|
@@ -32,7 +33,7 @@ use tufaceous_artifact::ArtifactKind; | |
| /// | ||
| /// This is generated by the planning process whenever it also generates host | ||
| /// phase 1 updates. | ||
| #[derive(Debug, PartialEq, Eq)] | ||
| #[derive(Debug, Clone, PartialEq, Eq)] | ||
| pub(crate) struct PendingHostPhase2Changes { | ||
| by_sled: BTreeMap<SledUuid, (M2Slot, BlueprintHostPhase2DesiredContents)>, | ||
| } | ||
|
|
@@ -270,24 +271,28 @@ pub(super) fn try_make_update( | |
| baseboard_id: &Arc<BaseboardId>, | ||
| inventory: &Collection, | ||
| current_artifacts: &TufRepoDescription, | ||
| ) -> Option<(PendingMgsUpdate, PendingHostPhase2Changes)> { | ||
| ) -> Result< | ||
| Option<(PendingMgsUpdate, PendingHostPhase2Changes)>, | ||
| FailedMgsUpdateReason, | ||
| > { | ||
| let Some(sp_info) = inventory.sps.get(baseboard_id) else { | ||
| warn!( | ||
karencfv marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| log, | ||
| "cannot configure host OS update for board \ | ||
| (missing SP info from inventory)"; | ||
| baseboard_id, | ||
| ); | ||
| return None; | ||
| return Err(FailedMgsUpdateReason::SpNotInInventory); | ||
| }; | ||
|
|
||
| // Only configure host OS updates for sleds. | ||
| // | ||
| // We don't bother logging a return value of `None` for non-sleds, because | ||
| // we will never attempt to configure an update for them (nor should we). | ||
| // For the same reason, we do not return an error. | ||
| match sp_info.sp_type { | ||
| SpType::Sled => (), | ||
| SpType::Power | SpType::Switch => return None, | ||
| SpType::Power | SpType::Switch => return Ok(None), | ||
| } | ||
|
|
||
| let Some(sled_agent) = inventory.sled_agents.iter().find(|sled_agent| { | ||
|
|
@@ -299,7 +304,7 @@ pub(super) fn try_make_update( | |
| (missing sled-agent info from inventory)"; | ||
| baseboard_id, | ||
| ); | ||
| return None; | ||
| return Err(FailedMgsUpdateReason::SledAgentInfoNotInInventory); | ||
| }; | ||
| let Some(last_reconciliation) = sled_agent.last_reconciliation.as_ref() | ||
| else { | ||
|
|
@@ -309,7 +314,7 @@ pub(super) fn try_make_update( | |
| (missing last reconciliation details from inventory)"; | ||
| baseboard_id, | ||
| ); | ||
| return None; | ||
| return Err(FailedMgsUpdateReason::LastReconciliationNotInInventory); | ||
| }; | ||
| let boot_disk = match &last_reconciliation.boot_partitions.boot_disk { | ||
| Ok(boot_disk) => *boot_disk, | ||
|
|
@@ -323,7 +328,9 @@ pub(super) fn try_make_update( | |
| baseboard_id, | ||
| "err" => err, | ||
| ); | ||
| return None; | ||
| return Err(FailedMgsUpdateReason::UnableToDetermineBootDisk( | ||
| err.to_string(), | ||
| )); | ||
| } | ||
| }; | ||
| let active_phase_2_hash = | ||
|
|
@@ -340,7 +347,11 @@ pub(super) fn try_make_update( | |
| "boot_disk" => ?boot_disk, | ||
| "err" => err, | ||
| ); | ||
| return None; | ||
| return Err( | ||
| FailedMgsUpdateReason::UnableToRetrieveBootDiskPhase2Image( | ||
| err.to_string(), | ||
| ), | ||
| ); | ||
| } | ||
| }; | ||
|
|
||
|
|
@@ -353,7 +364,7 @@ pub(super) fn try_make_update( | |
| (inventory missing current active host phase 1 slot)"; | ||
| baseboard_id, | ||
| ); | ||
| return None; | ||
| return Err(FailedMgsUpdateReason::ActiveHostPhase1SlotNotInInventory); | ||
| }; | ||
|
|
||
| // TODO-correctness What should we do if the active phase 1 slot doesn't | ||
|
|
@@ -376,7 +387,9 @@ pub(super) fn try_make_update( | |
| "active_phase_1_slot" => ?active_phase_1_slot, | ||
| "boot_disk" => ?boot_disk, | ||
| ); | ||
| return None; | ||
| return Err( | ||
| FailedMgsUpdateReason::ActiveHostPhase1SlotBootDiskMismatch, | ||
| ); | ||
| } | ||
|
|
||
| let Some(active_phase_1_hash) = inventory | ||
|
|
@@ -390,7 +403,7 @@ pub(super) fn try_make_update( | |
| baseboard_id, | ||
| "slot" => ?active_phase_1_slot, | ||
| ); | ||
| return None; | ||
| return Err(FailedMgsUpdateReason::ActiveHostPhase1HashNotInInventory); | ||
| }; | ||
|
|
||
| let Some(inactive_phase_1_hash) = inventory | ||
|
|
@@ -407,7 +420,9 @@ pub(super) fn try_make_update( | |
| baseboard_id, | ||
| "slot" => ?active_phase_1_slot.toggled(), | ||
| ); | ||
| return None; | ||
| return Err( | ||
| FailedMgsUpdateReason::InactiveHostPhase1HashNotInInventory, | ||
| ); | ||
| }; | ||
|
|
||
| let mut phase_1_artifacts = Vec::with_capacity(1); | ||
|
|
@@ -434,7 +449,7 @@ pub(super) fn try_make_update( | |
| (no phase 1 artifact)"; | ||
| baseboard_id, | ||
| ); | ||
| return None; | ||
| return Err(FailedMgsUpdateReason::NoMatchingArtifactFound); | ||
| } | ||
| (_, []) => { | ||
| warn!( | ||
|
|
@@ -443,7 +458,7 @@ pub(super) fn try_make_update( | |
| (no phase 2 artifact)"; | ||
| baseboard_id, | ||
| ); | ||
| return None; | ||
| return Err(FailedMgsUpdateReason::NoMatchingArtifactFound); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could we add some context to Another thought, although this might be nonsense after reading the rest of the PR - we could potentially have different error types for each kind of update, so we could have more host-specific error variants, and then combine the error types in a higher-level enum?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That makes sense, I'll address this in the follow up PR that will break up |
||
| } | ||
| // "TUF is broken" cases: have multiple of one or the other. This | ||
| // should be impossible unless we shipped a TUF repo with multiple | ||
|
|
@@ -458,7 +473,7 @@ pub(super) fn try_make_update( | |
| "num-phase-1-images" => phase_1_artifacts.len(), | ||
| "num-phase-2-images" => phase_2_artifacts.len(), | ||
| ); | ||
| return None; | ||
| return Err(FailedMgsUpdateReason::TooManyMatchingArtifacts); | ||
| } | ||
| }; | ||
|
|
||
|
|
@@ -470,7 +485,7 @@ pub(super) fn try_make_update( | |
| // this sled will fail to boot if it were rebooted now.) | ||
| if active_phase_2_hash == phase_2_artifact.hash { | ||
| debug!(log, "no host OS update needed for board"; baseboard_id); | ||
| return None; | ||
| return Ok(None); | ||
| } | ||
|
|
||
| // Before we can proceed with the phase 1 update, we need sled-agent to | ||
|
|
@@ -486,7 +501,7 @@ pub(super) fn try_make_update( | |
| phase_2_artifact, | ||
| ); | ||
|
|
||
| Some(( | ||
| Ok(Some(( | ||
| PendingMgsUpdate { | ||
| baseboard_id: baseboard_id.clone(), | ||
| sp_type: sp_info.sp_type, | ||
|
|
@@ -506,7 +521,7 @@ pub(super) fn try_make_update( | |
| artifact_version: phase_1_artifact.id.version.clone(), | ||
| }, | ||
| pending_host_phase_2_changes, | ||
| )) | ||
| ))) | ||
| } | ||
|
|
||
| #[cfg(test)] | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We don't necessarily need to change this on this PR, but this type is pretty beefy. I wonder if it would be clearer for us to define an enum with three states; something like (names are hard)
so details like "
Ok(None)means there's no update needed" are more explicit?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I like that idea! I've left some TODOs to finish up in a follow up PR in cb197db