Skip to content
Merged
Show file tree
Hide file tree
Changes from 42 commits
Commits
Show all changes
44 commits
Select commit Hold shift + click to select a range
17d9ae4
Fix docstring for BlueprintZoneImageSource
plotnick Mar 26, 2025
a87f755
Plumb target release TUF repo through planner
plotnick Mar 26, 2025
851128a
Plan zone updates from TUF repo
plotnick Mar 26, 2025
85e94bb
Plan according to RFD 565 §9
plotnick Apr 23, 2025
194a8a3
Don't update an already up-to-date zone
plotnick Apr 28, 2025
50c1169
Don't trust inventory zones' image_source
plotnick May 2, 2025
ad7103e
Fix failing tests
plotnick May 3, 2025
55b8e0e
Rename datastore methods: update_tuf_* → tuf_*
plotnick May 10, 2025
a7c1e17
Fix typed UUID
plotnick May 10, 2025
57f7154
Rename ControlPlaneZone → NonNexusOmicronZone
plotnick May 10, 2025
77b2156
Simplify OrderedComponent logic
plotnick May 10, 2025
e31fc60
Simplify out-of-date zone collection
plotnick May 10, 2025
7f1f522
Make planning error an error
plotnick May 10, 2025
050933e
Explicitly list which zone kinds get in-place updates
plotnick May 10, 2025
2daabf5
Uncomment test assertion
plotnick May 10, 2025
88733d2
Type-safe fake-zone names
plotnick May 10, 2025
2e0ee41
Merge branch 'main' into plan-target-release
plotnick May 10, 2025
3ae171e
Fix doc bug from renamed method
plotnick May 10, 2025
499106a
Merge branch 'main' into plan-target-release
plotnick May 16, 2025
959e1e4
Merge branch 'main' into plan-target-release
plotnick May 19, 2025
3611b42
Refactor zone update readiness check
plotnick May 21, 2025
b2f09a3
Merge branch 'main' into plan-target-release
plotnick May 22, 2025
70edce7
Check zone image source in inventory
plotnick May 22, 2025
29d2093
initial draft: planning SP, some pieces missing
davepacheco May 28, 2025
38dc53a
add docs
davepacheco Jun 3, 2025
f28b929
fix openapi
davepacheco Jun 4, 2025
bd7bcd6
update comments based on review feedback
davepacheco Jun 4, 2025
bf2ab7c
review feedback
davepacheco Jun 4, 2025
a862939
Merge branch 'dap/sp-planning' into dap/sp-planning-real
davepacheco Jun 4, 2025
7acf01d
Merge remote-tracking branch 'origin/plan-target-release' into dap/sp…
davepacheco Jun 4, 2025
8182a46
incorporate SP updates into planner
davepacheco Jun 4, 2025
78fbd8c
Merge remote-tracking branch 'origin/main' into dap/sp-planning-real
davepacheco Jun 4, 2025
aa00005
Merge branch 'main' into plan-target-release
davepacheco Jun 4, 2025
f715a9a
Merge remote-tracking branch 'origin/plan-target-release' into dap/sp…
davepacheco Jun 4, 2025
7705d1c
rustfmt
davepacheco Jun 4, 2025
ceaaec3
fix tests
davepacheco Jun 4, 2025
709ae6d
remove XXX
davepacheco Jun 5, 2025
ad44964
Merge commit '2d077d531260590a56908d0ee0186f38c0d96d7a' into dap/sp-p…
davepacheco Jun 6, 2025
52a368e
Merge commit 'd2aa2c5f207b9918fab30a7b95790b6bfd29d75d' into dap/sp-p…
davepacheco Jun 6, 2025
51a41f1
Merge branch 'main' into dap/sp-planning-real
davepacheco Jun 6, 2025
ce355a2
Merge branch 'main' into dap/sp-planning-real
davepacheco Jun 11, 2025
ac66c42
fix test
davepacheco Jun 12, 2025
405b29c
Merge branch 'main' into dap/sp-planning-real
davepacheco Jun 13, 2025
29df3f6
review feedback
davepacheco Jun 13, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 15 additions & 12 deletions dev-tools/omdb/tests/usage_errors.out
Original file line number Diff line number Diff line change
Expand Up @@ -507,19 +507,22 @@ Options:
Show sleds that match the given filter

Possible values:
- all: All sleds in the system, regardless of policy or state
- commissioned: All sleds that are currently part of the control plane cluster
- decommissioned: All sleds that were previously part of the control plane
- all: All sleds in the system, regardless of policy or state
- commissioned: All sleds that are currently part of the control plane
cluster
- decommissioned: All sleds that were previously part of the control plane
cluster but have been decommissioned
- discretionary: Sleds that are eligible for discretionary services
- in-service: Sleds that are in service (even if they might not be eligible
for discretionary services)
- query-during-inventory: Sleds whose sled agents should be queried for inventory
- reservation-create: Sleds on which reservations can be created
- vpc-routing: Sleds which should be sent OPTE V2P mappings and Routing rules
- vpc-firewall: Sleds which should be sent VPC firewall rules
- tuf-artifact-replication: Sleds which should have TUF repo artifacts replicated onto
them
- discretionary: Sleds that are eligible for discretionary services
- in-service: Sleds that are in service (even if they might not be
eligible for discretionary services)
- query-during-inventory: Sleds whose sled agents should be queried for inventory
- reservation-create: Sleds on which reservations can be created
- vpc-routing: Sleds which should be sent OPTE V2P mappings and Routing
rules
- vpc-firewall: Sleds which should be sent VPC firewall rules
- tuf-artifact-replication: Sleds which should have TUF repo artifacts replicated
onto them
- sps-updated-by-reconfigurator: Sleds whose SPs should be updated by Reconfigurator

--log-level <LOG_LEVEL>
log level filter
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -497,6 +497,7 @@ WARN failed to place all new desired InternalDns zones, placed: 0, wanted_to_pla
INFO sufficient ExternalDns zones exist in plan, desired_count: 0, current_count: 0
WARN failed to place all new desired Nexus zones, placed: 0, wanted_to_place: 3
INFO sufficient Oximeter zones exist in plan, desired_count: 0, current_count: 0
WARN cannot issue more SP updates (no current artifacts)
INFO some zones not yet up-to-date, sled_id: 89d02b1b-478c-401a-8e28-7a26f74fa41b
INFO will ensure cockroachdb setting, setting: cluster.preserve_downgrade_option, value: DoNotModify
generated blueprint 86db3308-f817-4626-8838-4085949a6a41 based on parent blueprint ade5749d-bdf3-4fab-a8ae-00bea01b3a5a
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -974,6 +974,7 @@ INFO sufficient InternalDns zones exist in plan, desired_count: 3, current_count
INFO added zone to sled, sled_id: a88790de-5962-4871-8686-61c1fd5b7094, kind: ExternalDns
INFO sufficient Nexus zones exist in plan, desired_count: 3, current_count: 3
INFO sufficient Oximeter zones exist in plan, desired_count: 0, current_count: 0
WARN cannot issue more SP updates (no current artifacts)
INFO some zones not yet up-to-date, sled_id: a88790de-5962-4871-8686-61c1fd5b7094
INFO will ensure cockroachdb setting, setting: cluster.preserve_downgrade_option, value: DoNotModify
generated blueprint 9c998c1d-1a7b-440a-ae0c-40f781dea6e2 based on parent blueprint 366b0b68-d80e-4bc1-abd3-dc69837847e0
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1004,6 +1004,7 @@ INFO added zone to sled, sled_id: d81c6a84-79b8-4958-ae41-ea46c9b19763, kind: In
INFO sufficient ExternalDns zones exist in plan, desired_count: 3, current_count: 3
INFO sufficient Nexus zones exist in plan, desired_count: 3, current_count: 3
INFO sufficient Oximeter zones exist in plan, desired_count: 0, current_count: 0
WARN cannot issue more SP updates (no current artifacts)
INFO some zones not yet up-to-date, sled_id: d81c6a84-79b8-4958-ae41-ea46c9b19763
INFO will ensure cockroachdb setting, setting: cluster.preserve_downgrade_option, value: DoNotModify
generated blueprint af934083-59b5-4bf6-8966-6fb5292c29e1 based on parent blueprint 58d5e830-0884-47d8-a7cd-b2b3751adeb4
Expand Down
5 changes: 5 additions & 0 deletions nexus/db-queries/src/db/datastore/deployment.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1884,6 +1884,7 @@ mod tests {
use nexus_types::external_api::views::PhysicalDiskState;
use nexus_types::external_api::views::SledPolicy;
use nexus_types::external_api::views::SledState;
use nexus_types::inventory::BaseboardId;
use nexus_types::inventory::Collection;
use omicron_common::address::IpRange;
use omicron_common::address::Ipv6Subnet;
Expand Down Expand Up @@ -2002,6 +2003,10 @@ mod tests {
policy: SledPolicy::provisionable(),
state: SledState::Active,
resources,
baseboard_id: BaseboardId {
part_number: String::from("unused"),
serial_number: String::from("unused"),
},
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1960,6 +1960,13 @@ impl<'a> BlueprintBuilder<'a> {
Ok(self.resource_allocator()?.inject_untracked_external_dns_ip(addr)?)
}

/// Replace the blueprint's entire set of pending MGS-managed updates
/// (SP, RoT, etc.) with `updates`, discarding any previously recorded
/// entries.
///
/// Unlike [`Self::pending_mgs_update_insert`], this is a wholesale
/// replacement: callers (e.g. the planner's MGS-update step) are expected
/// to have computed the full desired set themselves.
pub fn pending_mgs_updates_replace_all(
    &mut self,
    updates: nexus_types::deployment::PendingMgsUpdates,
) {
    self.pending_mgs_updates = updates;
}

pub fn pending_mgs_update_insert(
&mut self,
update: nexus_types::deployment::PendingMgsUpdate,
Expand Down
70 changes: 69 additions & 1 deletion nexus/reconfigurator/planning/src/planner.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,9 @@ use crate::blueprint_builder::Error;
use crate::blueprint_builder::Operation;
use crate::blueprint_editor::DisksEditError;
use crate::blueprint_editor::SledEditError;
use crate::mgs_updates::plan_mgs_updates;
use crate::planner::omicron_zone_placement::PlacementError;
use gateway_client::types::SpType;
use nexus_sled_agent_shared::inventory::OmicronZoneType;
use nexus_sled_agent_shared::inventory::ZoneKind;
use nexus_types::deployment::Blueprint;
Expand Down Expand Up @@ -50,6 +52,11 @@ pub use self::rng::SledPlannerRng;
mod omicron_zone_placement;
pub(crate) mod rng;

/// Outcome of a planning step, used by `do_plan` to decide whether to proceed
/// to the next step.
enum UpdateStepResult {
    /// This step has nothing (further) to do; planning continues with the
    /// next step (here: zone updates).
    ContinueToNextStep,
    /// This step still has outstanding work (pending MGS updates were
    /// emitted); the caller skips the zone-update step for this pass.
    Waiting,
}

pub struct Planner<'a> {
log: Logger,
input: &'a PlanningInput,
Expand Down Expand Up @@ -115,7 +122,11 @@ impl<'a> Planner<'a> {
self.do_plan_expunge()?;
self.do_plan_add()?;
self.do_plan_decommission()?;
self.do_plan_zone_updates()?;
if let UpdateStepResult::ContinueToNextStep =
self.do_plan_mgs_updates()?
{
self.do_plan_zone_updates()?;
}
Copy link
Contributor

@jgallagher jgallagher Jun 13, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This might be nitpicky, but it seems a little weird that if we don't get ContinueToNextStep, we skip the immediate next step but still do the rest of planning (which admittedly is not very much). Maybe this should be specific to "can we do further updates"? (#8284 / #8285 / #8298 are all closely related, so also fine to defer this until we do some combination of them.)

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, I've been assuming we'd iterate on the exact control flow / pattern here as we add more steps.

self.do_plan_cockroachdb_settings();
Ok(())
}
Expand Down Expand Up @@ -901,6 +912,63 @@ impl<'a> Planner<'a> {
Ok(())
}

/// Update at most one MGS-managed device (SP, RoT, etc.), if any are out of
/// date.
fn do_plan_mgs_updates(&mut self) -> Result<UpdateStepResult, Error> {
// Determine which baseboards we will consider updating.
//
// Sleds may be present but not adopted as part of the control plane.
// In deployed systems, this would probably only happen if a sled was
// about to be added. In dev/test environments, it's common to leave
// some number of sleds out of the control plane for various reasons.
// Inventory will still report them, but we don't want to touch them.
//
// For better or worse, switches and PSCs do not have the same idea of
// being adopted into the control plane. If they're present, they're
// part of the system, and we will update them.
Comment on lines +951 to +953
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we have an issue about this? I don't think trust quorum interacts with non-sled components at all, but I think in the fullness of time we want some kind of auth on the management network, which presumably involves the control plane being aware of PSCs/switches and knowing whether or not it's okay to talk to them?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We certainly could. I'm not sure exactly what I'd file at this point. Something like: "eventually we will have the business requirement to lock down this network, and when we do, we'll have to better manage the lifecycle of these components". Or "lifecycle of switches and PSCs could be controlled, like sleds".

I know @bnaecker has been thinking about this a bit in the context of multi-rack.

let included_sled_baseboards: BTreeSet<_> = self
.input
.all_sleds(SledFilter::SpsUpdatedByReconfigurator)
.map(|(_sled_id, details)| &details.baseboard_id)
.collect();
let included_baseboards =
self.inventory
.sps
.iter()
.filter_map(|(baseboard_id, sp_state)| {
let do_include = match sp_state.sp_type {
SpType::Sled => included_sled_baseboards
.contains(baseboard_id.as_ref()),
SpType::Power => true,
SpType::Switch => true,
};
do_include.then_some(baseboard_id.clone())
})
.collect();

// Compute the new set of PendingMgsUpdates.
let current_updates =
&self.blueprint.parent_blueprint().pending_mgs_updates;
let current_artifacts = self.input.tuf_repo();
let next = plan_mgs_updates(
&self.log,
&self.inventory,
&included_baseboards,
&current_updates,
current_artifacts,
1,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we add a comment for what this is, or put it in a constant? I assume it's "number of upgrades to attempt"?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good call. Added in 29df3f6.

);

// TODO This is not quite right. See oxidecomputer/omicron#8285.
let rv = if next.is_empty() {
UpdateStepResult::ContinueToNextStep
} else {
UpdateStepResult::Waiting
};
self.blueprint.pending_mgs_updates_replace_all(next);
Ok(rv)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It doesn't look like it's possible for this step to fail - do you think it will be in the future? Or could we change the return type to just UpdateStepResult?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've changed it to not fail in 29df3f6.

(I had it this way because I thought it was clearer to have the planning functions have the same signature, and because I think it's possible we will want to allow this to fail, but I think we'll work this out in the follow-on PRs like #8284 so I'm happy to keep it simpler for now.)

}

/// Update at most one existing zone to use a new image source.
fn do_plan_zone_updates(&mut self) -> Result<(), Error> {
// We are only interested in non-decommissioned sleds.
Expand Down
12 changes: 12 additions & 0 deletions nexus/reconfigurator/planning/src/system.rs
Original file line number Diff line number Diff line change
Expand Up @@ -491,6 +491,18 @@ impl SystemDescription {
policy: sled.policy,
state: sled.state,
resources: sled.resources.clone(),
baseboard_id: BaseboardId {
part_number: sled
.inventory_sled_agent
.baseboard
.model()
.to_owned(),
serial_number: sled
.inventory_sled_agent
.baseboard
.identifier()
.to_owned(),
},
};
builder.add_sled(sled.sled_id, sled_details)?;
}
Expand Down
5 changes: 5 additions & 0 deletions nexus/reconfigurator/preparation/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ use nexus_types::deployment::SledResources;
use nexus_types::deployment::UnstableReconfiguratorState;
use nexus_types::identity::Asset;
use nexus_types::identity::Resource;
use nexus_types::inventory::BaseboardId;
use nexus_types::inventory::Collection;
use omicron_common::address::IpRange;
use omicron_common::address::Ipv6Subnet;
Expand Down Expand Up @@ -274,6 +275,10 @@ impl PlanningInputFromDb<'_> {
policy: sled_row.policy(),
state: sled_row.state().into(),
resources: SledResources { subnet, zpools },
baseboard_id: BaseboardId {
part_number: sled_row.part_number().to_owned(),
serial_number: sled_row.serial_number().to_owned(),
},
};
// TODO-cleanup use `TypedUuid` everywhere
let sled_id = SledUuid::from_untyped_uuid(sled_id);
Expand Down
11 changes: 11 additions & 0 deletions nexus/types/src/deployment/planning_input.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ use crate::external_api::views::PhysicalDiskState;
use crate::external_api::views::SledPolicy;
use crate::external_api::views::SledProvisionPolicy;
use crate::external_api::views::SledState;
use crate::inventory::BaseboardId;
use chrono::DateTime;
use chrono::Utc;
use clap::ValueEnum;
Expand Down Expand Up @@ -724,6 +725,9 @@ pub enum SledFilter {

/// Sleds which should have TUF repo artifacts replicated onto them.
TufArtifactReplication,

/// Sleds whose SPs should be updated by Reconfigurator
SpsUpdatedByReconfigurator,
}

impl SledFilter {
Expand Down Expand Up @@ -780,6 +784,7 @@ impl SledPolicy {
SledFilter::VpcRouting => true,
SledFilter::VpcFirewall => true,
SledFilter::TufArtifactReplication => true,
SledFilter::SpsUpdatedByReconfigurator => true,
},
SledPolicy::InService {
provision_policy: SledProvisionPolicy::NonProvisionable,
Expand All @@ -794,6 +799,7 @@ impl SledPolicy {
SledFilter::VpcRouting => true,
SledFilter::VpcFirewall => true,
SledFilter::TufArtifactReplication => true,
SledFilter::SpsUpdatedByReconfigurator => true,
},
SledPolicy::Expunged => match filter {
SledFilter::All => true,
Expand All @@ -806,6 +812,7 @@ impl SledPolicy {
SledFilter::VpcRouting => false,
SledFilter::VpcFirewall => false,
SledFilter::TufArtifactReplication => false,
SledFilter::SpsUpdatedByReconfigurator => false,
},
}
}
Expand Down Expand Up @@ -840,6 +847,7 @@ impl SledState {
SledFilter::VpcRouting => true,
SledFilter::VpcFirewall => true,
SledFilter::TufArtifactReplication => true,
SledFilter::SpsUpdatedByReconfigurator => true,
},
SledState::Decommissioned => match filter {
SledFilter::All => true,
Expand All @@ -852,6 +860,7 @@ impl SledState {
SledFilter::VpcRouting => false,
SledFilter::VpcFirewall => false,
SledFilter::TufArtifactReplication => false,
SledFilter::SpsUpdatedByReconfigurator => false,
},
}
}
Expand Down Expand Up @@ -1058,6 +1067,8 @@ pub struct SledDetails {
pub state: SledState,
/// current resources allocated to this sled
pub resources: SledResources,
/// baseboard id for this sled
pub baseboard_id: BaseboardId,
}

#[derive(Debug, thiserror::Error)]
Expand Down
Loading