Skip to content
Merged
Show file tree
Hide file tree
Changes from 40 commits
Commits
Show all changes
61 commits
Select commit Hold shift + click to select a range
5b0d884
scaffolding
karencfv Aug 29, 2025
477cb9b
Some thoughts
karencfv Aug 30, 2025
9f4968d
Implement functionality for RoT bootloader
karencfv Sep 3, 2025
b8766df
Plumb skipped updates through
karencfv Sep 3, 2025
0e5bd57
populate SkippedMgsUpdates
karencfv Sep 3, 2025
0c5197a
remove unnecessary checks
karencfv Sep 3, 2025
5a1a74f
do the todos
karencfv Sep 3, 2025
60240f2
expectorate
karencfv Sep 3, 2025
ad2e83e
Make SkippedMgsUpdates a vec because we need all records
karencfv Sep 4, 2025
36c788e
fix no pending updates bug
karencfv Sep 4, 2025
ed81591
Clean up
karencfv Sep 4, 2025
4efa776
error type clean up
karencfv Sep 4, 2025
14a43ac
clean up tuple mess
karencfv Sep 4, 2025
b040913
clean up
karencfv Sep 4, 2025
70f8495
fully working sample
karencfv Sep 5, 2025
e1ea3d2
refactor try_make_update
karencfv Sep 5, 2025
f2c7c2a
use builder pattern
karencfv Sep 5, 2025
3293c4f
remove unnecessary struct
karencfv Sep 5, 2025
6d866d6
improve error messages
karencfv Sep 5, 2025
e8f027b
clippy
karencfv Sep 5, 2025
1da94b0
Make the tests pass
karencfv Sep 5, 2025
d0d37e6
Fix openapi generation
karencfv Sep 5, 2025
03b7432
Mull over tests
karencfv Sep 5, 2025
4f15d18
at least the error is different now 😑
karencfv Sep 15, 2025
dd8f911
Fix test_update_boundary_ntp and test_update_crucible_pantry
karencfv Sep 16, 2025
23e8f54
fix test_update_cockroach
karencfv Sep 16, 2025
f47e1cd
finally all the tests pass
karencfv Sep 16, 2025
4fb55c4
merge main
karencfv Sep 16, 2025
34a3906
fix after merge
karencfv Sep 16, 2025
0781a21
clean up
karencfv Sep 17, 2025
5c04f22
add test
karencfv Sep 17, 2025
dc2f232
Clean up
karencfv Sep 17, 2025
b070a69
merge main
karencfv Sep 17, 2025
fcdaa5b
Fix after merge
karencfv Sep 17, 2025
1c6a46c
moar cleanup
karencfv Sep 17, 2025
eecfe7c
remove debugging println
karencfv Sep 17, 2025
f6f2c1e
merge main
karencfv Sep 22, 2025
8c3d064
extract target release testing into its own test
karencfv Sep 22, 2025
05d2043
wasn't a typo
karencfv Sep 22, 2025
957ac0f
test clean up logfiles
karencfv Sep 22, 2025
e3a47fd
remove unnecessary logging
karencfv Sep 23, 2025
9f5ca2f
Address comments
karencfv Sep 23, 2025
e863155
Remove unnecessary test
karencfv Sep 23, 2025
cc8e07a
Bail on failed update and improve testing
karencfv Sep 23, 2025
2444040
address style comments
karencfv Sep 23, 2025
19e8e26
Get rid of SkippedMgsUpdates
karencfv Sep 23, 2025
a78d9b1
use blocked instead of skipped
karencfv Sep 23, 2025
1298942
address comments
karencfv Sep 24, 2025
b29203f
Merge main
karencfv Sep 24, 2025
45f9dbd
expectorate
karencfv Sep 24, 2025
7d9d1a0
tests are passing 🎉
karencfv Sep 24, 2025
0e367a9
remove unnecessary blueprint updates
karencfv Sep 24, 2025
4d18593
jfc merge again
karencfv Sep 24, 2025
7e4e36e
generate openapi spec
karencfv Sep 24, 2025
f921c50
fmt
karencfv Sep 24, 2025
cb197db
add the todos
karencfv Sep 24, 2025
01391ed
fmt
karencfv Sep 24, 2025
52d399f
Address comments
karencfv Sep 24, 2025
46ef2b2
merge main
karencfv Sep 24, 2025
4ca45ba
fixes, expectorations, and openapi doc gen after merge with main
karencfv Sep 24, 2025
536996d
fmt 😑
karencfv Sep 24, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ generated inventory collection eb0796d5-ab8a-4f7b-a884-b4aeacb8ab51 from configu
> # we added has no disks.
> blueprint-plan dbcbd3d6-41ff-48ae-ac0b-1becc9b2fd21 eb0796d5-ab8a-4f7b-a884-b4aeacb8ab51
INFO skipping noop image source check for all sleds, reason: no target release is currently set
WARN cannot issue more MGS-driven updates (no current artifacts)
INFO system in initial release state no update artifacts available (no update necessary)
generated blueprint 8da82a8e-bf97-4fbd-8ddd-9f6462732cf1 based on parent blueprint dbcbd3d6-41ff-48ae-ac0b-1becc9b2fd21
planning report for blueprint 8da82a8e-bf97-4fbd-8ddd-9f6462732cf1:
* no zpools in service for NTP zones on sleds: 00320471-945d-413c-85e7-03e091a70b3c
Expand Down
4 changes: 2 additions & 2 deletions dev-tools/reconfigurator-cli/tests/output/cmds-example-stdout
Original file line number Diff line number Diff line change
Expand Up @@ -599,7 +599,7 @@ T ENA ID PARENT

> blueprint-plan ade5749d-bdf3-4fab-a8ae-00bea01b3a5a
INFO skipping noop image source check for all sleds, reason: no target release is currently set
WARN cannot issue more MGS-driven updates (no current artifacts)
INFO system in initial release state no update artifacts available (no update necessary)
generated blueprint 86db3308-f817-4626-8838-4085949a6a41 based on parent blueprint ade5749d-bdf3-4fab-a8ae-00bea01b3a5a
empty planning report for blueprint 86db3308-f817-4626-8838-4085949a6a41.

Expand Down Expand Up @@ -1837,7 +1837,7 @@ INTERNAL DNS STATUS
> # sled to be expunged.
> blueprint-plan latest
INFO skipping noop image source check for all sleds, reason: no target release is currently set
WARN cannot issue more MGS-driven updates (no current artifacts)
INFO system in initial release state no update artifacts available (no update necessary)
generated blueprint 86db3308-f817-4626-8838-4085949a6a41 based on parent blueprint ade5749d-bdf3-4fab-a8ae-00bea01b3a5a
empty planning report for blueprint 86db3308-f817-4626-8838-4085949a6a41.

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -840,7 +840,7 @@ empty planning report for blueprint 366b0b68-d80e-4bc1-abd3-dc69837847e0.
> # blueprint-plan will place a new external DNS zone, diff DNS to see the new zone has `ns<N>` and NS records.
> blueprint-plan 366b0b68-d80e-4bc1-abd3-dc69837847e0
INFO skipping noop image source check for all sleds, reason: no target release is currently set
WARN cannot issue more MGS-driven updates (no current artifacts)
INFO system in initial release state no update artifacts available (no update necessary)
generated blueprint 9c998c1d-1a7b-440a-ae0c-40f781dea6e2 based on parent blueprint 366b0b68-d80e-4bc1-abd3-dc69837847e0
planning report for blueprint 9c998c1d-1a7b-440a-ae0c-40f781dea6e2:
* discretionary zones placed:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -648,7 +648,7 @@ external DNS:
> # Planning a new blueprint will now replace the expunged zone, with new records for its replacement.
> blueprint-plan 58d5e830-0884-47d8-a7cd-b2b3751adeb4
INFO skipping noop image source check for all sleds, reason: no target release is currently set
WARN cannot issue more MGS-driven updates (no current artifacts)
INFO system in initial release state no update artifacts available (no update necessary)
generated blueprint af934083-59b5-4bf6-8966-6fb5292c29e1 based on parent blueprint 58d5e830-0884-47d8-a7cd-b2b3751adeb4
planning report for blueprint af934083-59b5-4bf6-8966-6fb5292c29e1:
* discretionary zones placed:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1699,7 +1699,7 @@ planning report for blueprint 8f2d1f39-7c88-4701-aa43-56bf281b28c1:
* skipping noop zone image source check on sled d81c6a84-79b8-4958-ae41-ea46c9b19763: all 6 zones are already from artifacts
* 1 pending MGS update:
* model0:serial0: RotBootloader(PendingMgsUpdateRotBootloaderDetails { expected_stage0_version: ArtifactVersion("0.0.1"), expected_stage0_next_version: NoValidVersion })
* zone updates waiting on pending MGS updates (RoT / SP / Host OS / etc.)
* zone updates waiting on pending MGS updates (RoT bootloader / RoT / SP / Host OS)


> blueprint-show latest
Expand Down Expand Up @@ -1879,7 +1879,7 @@ planning report for blueprint 8f2d1f39-7c88-4701-aa43-56bf281b28c1:
* skipping noop zone image source check on sled d81c6a84-79b8-4958-ae41-ea46c9b19763: all 6 zones are already from artifacts
* 1 pending MGS update:
* model0:serial0: RotBootloader(PendingMgsUpdateRotBootloaderDetails { expected_stage0_version: ArtifactVersion("0.0.1"), expected_stage0_next_version: NoValidVersion })
* zone updates waiting on pending MGS updates (RoT / SP / Host OS / etc.)
* zone updates waiting on pending MGS updates (RoT bootloader / RoT / SP / Host OS)



Expand Down

Large diffs are not rendered by default.

51 changes: 33 additions & 18 deletions nexus/reconfigurator/planning/src/mgs_updates/host_phase_1.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ use nexus_types::deployment::BlueprintHostPhase2DesiredContents;
use nexus_types::deployment::PendingMgsUpdate;
use nexus_types::deployment::PendingMgsUpdateDetails;
use nexus_types::deployment::PendingMgsUpdateHostPhase1Details;
use nexus_types::deployment::planning_report::FailedMgsUpdateReason;
use nexus_types::inventory::BaseboardId;
use nexus_types::inventory::Collection;
use omicron_common::api::external::TufArtifactMeta;
Expand All @@ -32,7 +33,7 @@ use tufaceous_artifact::ArtifactKind;
///
/// This is generated by the planning process whenever it also generates host
/// phase 1 updates.
#[derive(Debug, PartialEq, Eq)]
#[derive(Debug, Clone, PartialEq, Eq)]
pub(crate) struct PendingHostPhase2Changes {
by_sled: BTreeMap<SledUuid, (M2Slot, BlueprintHostPhase2DesiredContents)>,
}
Expand Down Expand Up @@ -270,24 +271,28 @@ pub(super) fn try_make_update(
baseboard_id: &Arc<BaseboardId>,
inventory: &Collection,
current_artifacts: &TufRepoDescription,
) -> Option<(PendingMgsUpdate, PendingHostPhase2Changes)> {
) -> Result<
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We don't necessarily need to change this in this PR, but this type is pretty beefy. I wonder if it would be clearer for us to define an enum with three states, something like the following (names are hard):

enum UpdateAttempt {
    NoUpdateNeeded,
    Planned(PendingMgsUpdate, PendingHostPhase2Changes),
    Error(FailedMgsUpdateReason),
}

so details like "Ok(None) means there's no update needed" are more explicit?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I like that idea! I've left some TODOs to finish up in a follow-up PR in cb197db.

Option<(PendingMgsUpdate, PendingHostPhase2Changes)>,
FailedMgsUpdateReason,
> {
let Some(sp_info) = inventory.sps.get(baseboard_id) else {
warn!(
log,
"cannot configure host OS update for board \
(missing SP info from inventory)";
baseboard_id,
);
return None;
return Err(FailedMgsUpdateReason::SpNotInInventory);
};

// Only configure host OS updates for sleds.
//
// We don't bother logging a return value of `None` for non-sleds, because
// we will never attempt to configure an update for them (nor should we).
// For the same reason, we do not return an error.
match sp_info.sp_type {
SpType::Sled => (),
SpType::Power | SpType::Switch => return None,
SpType::Power | SpType::Switch => return Ok(None),
}

let Some(sled_agent) = inventory.sled_agents.iter().find(|sled_agent| {
Expand All @@ -299,7 +304,7 @@ pub(super) fn try_make_update(
(missing sled-agent info from inventory)";
baseboard_id,
);
return None;
return Err(FailedMgsUpdateReason::SledAgentInfoNotInInventory);
};
let Some(last_reconciliation) = sled_agent.last_reconciliation.as_ref()
else {
Expand All @@ -309,7 +314,7 @@ pub(super) fn try_make_update(
(missing last reconciliation details from inventory)";
baseboard_id,
);
return None;
return Err(FailedMgsUpdateReason::LastReconciliationNotInInventory);
};
let boot_disk = match &last_reconciliation.boot_partitions.boot_disk {
Ok(boot_disk) => *boot_disk,
Expand All @@ -323,7 +328,9 @@ pub(super) fn try_make_update(
baseboard_id,
"err" => err,
);
return None;
return Err(FailedMgsUpdateReason::UnableToDetermineBootDisk(
err.to_string(),
));
}
};
let active_phase_2_hash =
Expand All @@ -340,7 +347,11 @@ pub(super) fn try_make_update(
"boot_disk" => ?boot_disk,
"err" => err,
);
return None;
return Err(
FailedMgsUpdateReason::UnableToRetrieveBootDiskPhase2Image(
err.to_string(),
),
);
}
};

Expand All @@ -353,7 +364,7 @@ pub(super) fn try_make_update(
(inventory missing current active host phase 1 slot)";
baseboard_id,
);
return None;
return Err(FailedMgsUpdateReason::ActiveHostPhase1SlotNotInInventory);
};

// TODO-correctness What should we do if the active phase 1 slot doesn't
Expand All @@ -376,7 +387,9 @@ pub(super) fn try_make_update(
"active_phase_1_slot" => ?active_phase_1_slot,
"boot_disk" => ?boot_disk,
);
return None;
return Err(
FailedMgsUpdateReason::ActiveHostPhase1SlotBootDiskMismatch,
);
}

let Some(active_phase_1_hash) = inventory
Expand All @@ -390,7 +403,7 @@ pub(super) fn try_make_update(
baseboard_id,
"slot" => ?active_phase_1_slot,
);
return None;
return Err(FailedMgsUpdateReason::ActiveHostPhase1HashNotInInventory);
};

let Some(inactive_phase_1_hash) = inventory
Expand All @@ -407,7 +420,9 @@ pub(super) fn try_make_update(
baseboard_id,
"slot" => ?active_phase_1_slot.toggled(),
);
return None;
return Err(
FailedMgsUpdateReason::InactiveHostPhase1HashNotInInventory,
);
};

let mut phase_1_artifacts = Vec::with_capacity(1);
Expand All @@ -433,7 +448,7 @@ pub(super) fn try_make_update(
(no phase 1 artifact)";
baseboard_id,
);
return None;
return Err(FailedMgsUpdateReason::NoMatchingArtifactFound);
}
(_, []) => {
warn!(
Expand All @@ -442,7 +457,7 @@ pub(super) fn try_make_update(
(no phase 2 artifact)";
baseboard_id,
);
return None;
return Err(FailedMgsUpdateReason::NoMatchingArtifactFound);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could we add some context to NoMatchingArtifactFound? It may be obvious for the other kinds of updates, but for the host it looks like we've lost whether the problem is phase 1 or phase 2.

Another thought, although this might be nonsense after reading the rest of the PR: we could potentially have different error types for each kind of update, so we could have more host-specific error variants, and then combine the error types in a higher-level enum?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That makes sense. I'll address this in the follow-up PR that will break up FailedMgsUpdateReason (cb197db).

}
// "TUF is broken" cases: have multiple of one or the other. This
// should be impossible unless we shipped a TUF repo with multiple
Expand All @@ -457,7 +472,7 @@ pub(super) fn try_make_update(
"num-phase-1-images" => phase_1_artifacts.len(),
"num-phase-2-images" => phase_2_artifacts.len(),
);
return None;
return Err(FailedMgsUpdateReason::TooManyMatchingArtifacts);
}
};

Expand All @@ -469,7 +484,7 @@ pub(super) fn try_make_update(
// this sled will fail to boot if it were rebooted now.)
if active_phase_2_hash == phase_2_artifact.hash {
debug!(log, "no host OS update needed for board"; baseboard_id);
return None;
return Ok(None);
}

// Before we can proceed with the phase 1 update, we need sled-agent to
Expand All @@ -485,7 +500,7 @@ pub(super) fn try_make_update(
phase_2_artifact,
);

Some((
Ok(Some((
PendingMgsUpdate {
baseboard_id: baseboard_id.clone(),
sp_type: sp_info.sp_type,
Expand All @@ -505,7 +520,7 @@ pub(super) fn try_make_update(
artifact_version: phase_1_artifact.id.version.clone(),
},
pending_host_phase_2_changes,
))
)))
}

#[cfg(test)]
Expand Down
Loading
Loading