From 17d9ae464de524293810c412a8e6ae7e213c2365 Mon Sep 17 00:00:00 2001 From: Alex Plotnick Date: Wed, 26 Mar 2025 13:01:56 -0600 Subject: [PATCH 01/32] Fix docstring for BlueprintZoneImageSource --- nexus/types/src/deployment.rs | 2 +- openapi/nexus-internal.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/nexus/types/src/deployment.rs b/nexus/types/src/deployment.rs index 769f74a46f7..16502409892 100644 --- a/nexus/types/src/deployment.rs +++ b/nexus/types/src/deployment.rs @@ -960,7 +960,7 @@ impl fmt::Display for BlueprintZoneDisposition { } } -/// Where a blueprint's image source is located. +/// Where the zone's image source is located. /// /// This is the blueprint version of [`OmicronZoneImageSource`]. #[derive( diff --git a/openapi/nexus-internal.json b/openapi/nexus-internal.json index a9c6d4fa074..6ce81159b8a 100644 --- a/openapi/nexus-internal.json +++ b/openapi/nexus-internal.json @@ -2786,7 +2786,7 @@ ] }, "BlueprintZoneImageSource": { - "description": "Where a blueprint's image source is located.\n\nThis is the blueprint version of [`OmicronZoneImageSource`].", + "description": "Where the zone's image source is located.\n\nThis is the blueprint version of [`OmicronZoneImageSource`].", "oneOf": [ { "description": "This zone's image source is whatever happens to be on the sled's \"install\" dataset.\n\nThis is whatever was put in place at the factory or by the latest MUPdate. The image used here can vary by sled and even over time (if the sled gets MUPdated again).\n\nHistorically, this was the only source for zone images. In an system with automated control-plane-driven update we expect to only use this variant in emergencies where the system had to be recovered via MUPdate.", From a87f75524b69136b8d15ce81f8007786a40abe7a Mon Sep 17 00:00:00 2001 From: Alex Plotnick Date: Wed, 26 Mar 2025 13:03:34 -0600 Subject: [PATCH 02/32] Plumb target release TUF repo through planner --- Cargo.lock | 1 + nexus/db-queries/src/db/datastore/update.rs | 36 +++++++++++++++++++- nexus/reconfigurator/execution/src/dns.rs | 1 + nexus/reconfigurator/planning/Cargo.toml | 1 + nexus/reconfigurator/planning/src/system.rs | 4 +++ nexus/reconfigurator/preparation/src/lib.rs | 19 ++++++++++- nexus/src/app/update.rs | 2 +- nexus/src/external_api/http_entrypoints.rs | 2 +- nexus/types/src/deployment/planning_input.rs | 13 +++++++ 9 files changed, 75 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a14b71ecddc..6b5fb5eff54 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6441,6 +6441,7 @@ dependencies = [ "oxnet", "proptest", "rand 0.8.5", + "semver 1.0.25", "sled-agent-client", "slog", "slog-error-chain", diff --git a/nexus/db-queries/src/db/datastore/update.rs b/nexus/db-queries/src/db/datastore/update.rs index cd0127cf979..6a9b62f2ef0 100644 --- a/nexus/db-queries/src/db/datastore/update.rs +++ b/nexus/db-queries/src/db/datastore/update.rs @@ -22,6 +22,7 @@ use omicron_common::api::external::{ self, CreateResult, DataPageParams, Generation, ListResultVec, LookupResult, LookupType, ResourceType, TufRepoInsertStatus, }; +use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::TufRepoKind; use omicron_uuid_kinds::TypedUuid; use swrite::{SWrite, swrite}; @@ -106,8 +107,41 @@ impl DataStore { }) } + /// Returns a TUF repo description. 
+ pub async fn update_tuf_repo_get_by_id( + &self, + opctx: &OpContext, + repo_id: TypedUuid, + ) -> LookupResult { + opctx.authorize(authz::Action::Read, &authz::FLEET).await?; + + use nexus_db_schema::schema::tuf_repo::dsl; + + let conn = self.pool_connection_authorized(opctx).await?; + let repo_id = repo_id.into_untyped_uuid(); + let repo = dsl::tuf_repo + .filter(dsl::id.eq(repo_id)) + .select(TufRepo::as_select()) + .first_async::(&*conn) + .await + .map_err(|e| { + public_error_from_diesel( + e, + ErrorHandler::NotFoundByLookup( + ResourceType::TufRepo, + LookupType::ById(repo_id), + ), + ) + })?; + + let artifacts = artifacts_for_repo(repo.id.into(), &conn) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; + Ok(TufRepoDescription { repo, artifacts }) + } + /// Returns the TUF repo description corresponding to this system version. - pub async fn update_tuf_repo_get( + pub async fn update_tuf_repo_get_by_version( &self, opctx: &OpContext, system_version: SemverVersion, diff --git a/nexus/reconfigurator/execution/src/dns.rs b/nexus/reconfigurator/execution/src/dns.rs index ad93ee5a1bf..2c77e99145e 100644 --- a/nexus/reconfigurator/execution/src/dns.rs +++ b/nexus/reconfigurator/execution/src/dns.rs @@ -1424,6 +1424,7 @@ mod test { target_crucible_pantry_zone_count: CRUCIBLE_PANTRY_REDUNDANCY, clickhouse_policy: None, oximeter_read_policy: OximeterReadPolicy::new(1), + tuf_repo: None, log, } .build() diff --git a/nexus/reconfigurator/planning/Cargo.toml b/nexus/reconfigurator/planning/Cargo.toml index 695e65eb584..465c838741d 100644 --- a/nexus/reconfigurator/planning/Cargo.toml +++ b/nexus/reconfigurator/planning/Cargo.toml @@ -29,6 +29,7 @@ omicron-uuid-kinds.workspace = true once_cell.workspace = true oxnet.workspace = true rand.workspace = true +semver.workspace = true sled-agent-client.workspace = true slog.workspace = true slog-error-chain.workspace = true diff --git a/nexus/reconfigurator/planning/src/system.rs b/nexus/reconfigurator/planning/src/system.rs index ad389795f50..f4b947ecc59 100644 --- a/nexus/reconfigurator/planning/src/system.rs +++ b/nexus/reconfigurator/planning/src/system.rs @@ -44,6 +44,7 @@ use omicron_common::address::SLED_PREFIX; use omicron_common::address::get_sled_address; use omicron_common::api::external::ByteCount; use omicron_common::api::external::Generation; +use omicron_common::api::external::TufRepoDescription; use omicron_common::disk::DiskIdentity; use omicron_common::disk::DiskVariant; use omicron_common::policy::INTERNAL_DNS_REDUNDANCY; @@ -96,6 +97,7 @@ pub struct SystemDescription { external_dns_version: Generation, clickhouse_policy: Option, oximeter_read_policy: OximeterReadPolicy, + tuf_repo: Option, } impl SystemDescription { @@ -175,6 +177,7 @@ impl SystemDescription { external_dns_version: Generation::new(), clickhouse_policy: None, oximeter_read_policy: OximeterReadPolicy::new(1), + tuf_repo: None, } } @@ -427,6 +430,7 @@ impl SystemDescription { .target_crucible_pantry_zone_count, clickhouse_policy: self.clickhouse_policy.clone(), oximeter_read_policy: self.oximeter_read_policy.clone(), + tuf_repo: self.tuf_repo.clone(), }; let mut builder = PlanningInputBuilder::new( policy, diff --git a/nexus/reconfigurator/preparation/src/lib.rs b/nexus/reconfigurator/preparation/src/lib.rs index 497d0353124..48da2704e69 100644 --- a/nexus/reconfigurator/preparation/src/lib.rs +++ b/nexus/reconfigurator/preparation/src/lib.rs @@ -39,6 +39,7 @@ use omicron_common::address::SLED_PREFIX; use 
omicron_common::api::external::Error; use omicron_common::api::external::InternalContext; use omicron_common::api::external::LookupType; +use omicron_common::api::external::TufRepoDescription; use omicron_common::disk::DiskIdentity; use omicron_common::policy::BOUNDARY_NTP_REDUNDANCY; use omicron_common::policy::COCKROACHDB_REDUNDANCY; @@ -78,6 +79,7 @@ pub struct PlanningInputFromDb<'a> { pub cockroachdb_settings: &'a CockroachDbSettings, pub clickhouse_policy: Option, pub oximeter_read_policy: OximeterReadPolicy, + pub tuf_repo: Option, pub log: &'a Logger, } @@ -140,11 +142,24 @@ impl PlanningInputFromDb<'_> { .cockroachdb_settings(opctx) .await .internal_context("fetching cockroachdb settings")?; - let clickhouse_policy = datastore .clickhouse_policy_get_latest(opctx) .await .internal_context("fetching clickhouse policy")?; + let target_release = datastore + .target_release_get_current(opctx) + .await + .internal_context("fetching current target release")?; + let tuf_repo = match target_release.tuf_repo_id { + None => None, + Some(repo_id) => Some( + datastore + .update_tuf_repo_get_by_id(opctx, repo_id.into()) + .await + .internal_context("fetching target release repo")? + .into_external(), + ), + }; let oximeter_read_policy = datastore .oximeter_read_policy_get_latest(opctx) @@ -171,6 +186,7 @@ impl PlanningInputFromDb<'_> { cockroachdb_settings: &cockroachdb_settings, clickhouse_policy, oximeter_read_policy, + tuf_repo, } .build() .internal_context("assembling planning_input")?; @@ -194,6 +210,7 @@ impl PlanningInputFromDb<'_> { .target_crucible_pantry_zone_count, clickhouse_policy: self.clickhouse_policy.clone(), oximeter_read_policy: self.oximeter_read_policy.clone(), + tuf_repo: self.tuf_repo.clone(), }; let mut builder = PlanningInputBuilder::new( policy, diff --git a/nexus/src/app/update.rs b/nexus/src/app/update.rs index d0e1d05e5d8..5ba1b9e0a40 100644 --- a/nexus/src/app/update.rs +++ b/nexus/src/app/update.rs @@ -88,7 +88,7 @@ impl super::Nexus { let tuf_repo_description = self .db_datastore - .update_tuf_repo_get(opctx, system_version.into()) + .update_tuf_repo_get_by_version(opctx, system_version.into()) .await .map_err(HttpError::from)?; diff --git a/nexus/src/external_api/http_entrypoints.rs b/nexus/src/external_api/http_entrypoints.rs index 3b6010ff4d0..ed0089f4c8d 100644 --- a/nexus/src/external_api/http_entrypoints.rs +++ b/nexus/src/external_api/http_entrypoints.rs @@ -6575,7 +6575,7 @@ impl NexusExternalApi for NexusExternalApiImpl { // Fetch the TUF repo metadata and update the target release. let tuf_repo_id = nexus .datastore() - .update_tuf_repo_get(&opctx, system_version.into()) + .update_tuf_repo_get_by_version(&opctx, system_version.into()) .await? 
.repo .id; diff --git a/nexus/types/src/deployment/planning_input.rs b/nexus/types/src/deployment/planning_input.rs index d6faffd6961..e4b300b39bb 100644 --- a/nexus/types/src/deployment/planning_input.rs +++ b/nexus/types/src/deployment/planning_input.rs @@ -23,6 +23,7 @@ use omicron_common::address::IpRange; use omicron_common::address::Ipv6Subnet; use omicron_common::address::SLED_PREFIX; use omicron_common::api::external::Generation; +use omicron_common::api::external::TufRepoDescription; use omicron_common::api::internal::shared::SourceNatConfigError; use omicron_common::disk::DiskIdentity; use omicron_common::policy::SINGLE_NODE_CLICKHOUSE_REDUNDANCY; @@ -152,6 +153,10 @@ impl PlanningInput { .unwrap_or(0) } + pub fn tuf_repo(&self) -> Option<&TufRepoDescription> { + self.policy.tuf_repo.as_ref() + } + pub fn service_ip_pool_ranges(&self) -> &[IpRange] { &self.policy.service_ip_pool_ranges } @@ -918,6 +923,13 @@ pub struct Policy { /// Eventually we will only allow reads from a cluster and this policy will /// no longer exist. pub oximeter_read_policy: OximeterReadPolicy, + + /// Desired system software release repository. + /// + /// New zones will use artifacts in this repo as their image sources, + /// and at most one extant zone may be modified to use it or replaced + /// with one that does. + pub tuf_repo: Option, } #[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)] @@ -1087,6 +1099,7 @@ impl PlanningInputBuilder { target_crucible_pantry_zone_count: 0, clickhouse_policy: None, oximeter_read_policy: OximeterReadPolicy::new(1), + tuf_repo: None, }, internal_dns_version: Generation::new(), external_dns_version: Generation::new(), From 851128a72f8f11c1e17e959c813beb8c252449d9 Mon Sep 17 00:00:00 2001 From: Alex Plotnick Date: Wed, 26 Mar 2025 13:04:01 -0600 Subject: [PATCH 03/32] Plan zone updates from TUF repo --- .../output/cmd-expunge-newly-added-stdout | 1 + nexus-sled-agent-shared/src/inventory.rs | 19 + .../planning/src/blueprint_builder/builder.rs | 65 +- nexus/reconfigurator/planning/src/planner.rs | 562 +++++++++++++++++- 4 files changed, 632 insertions(+), 15 deletions(-) diff --git a/dev-tools/reconfigurator-cli/tests/output/cmd-expunge-newly-added-stdout b/dev-tools/reconfigurator-cli/tests/output/cmd-expunge-newly-added-stdout index 6c0d2f67e32..c1c471f0476 100644 --- a/dev-tools/reconfigurator-cli/tests/output/cmd-expunge-newly-added-stdout +++ b/dev-tools/reconfigurator-cli/tests/output/cmd-expunge-newly-added-stdout @@ -628,6 +628,7 @@ INFO sufficient InternalDns zones exist in plan, desired_count: 3, current_count INFO added zone to sled, sled_id: a88790de-5962-4871-8686-61c1fd5b7094, kind: ExternalDns INFO sufficient Nexus zones exist in plan, desired_count: 3, current_count: 3 INFO sufficient Oximeter zones exist in plan, desired_count: 0, current_count: 0 +INFO all zones up-to-date INFO will ensure cockroachdb setting, setting: cluster.preserve_downgrade_option, value: DoNotModify generated blueprint 9c998c1d-1a7b-440a-ae0c-40f781dea6e2 based on parent blueprint 366b0b68-d80e-4bc1-abd3-dc69837847e0 diff --git a/nexus-sled-agent-shared/src/inventory.rs b/nexus-sled-agent-shared/src/inventory.rs index 5b8fdb39133..29c5924ed10 100644 --- a/nexus-sled-agent-shared/src/inventory.rs +++ b/nexus-sled-agent-shared/src/inventory.rs @@ -617,6 +617,25 @@ impl ZoneKind { ZoneKind::Oximeter => "oximeter", } } + + /// Return a string used as an artifact name for control-plane zones. + /// This is **not guaranteed** to be stable. 
+ pub fn artifact_name(self) -> &'static str { + match self { + ZoneKind::BoundaryNtp => "ntp", + ZoneKind::Clickhouse => "clickhouse", + ZoneKind::ClickhouseKeeper => "clickhouse_keeper", + ZoneKind::ClickhouseServer => "clickhouse", + ZoneKind::CockroachDb => "cockroachdb", + ZoneKind::Crucible => "crucible-zone", + ZoneKind::CruciblePantry => "crucible-pantry-zone", + ZoneKind::ExternalDns => "external-dns", + ZoneKind::InternalDns => "internal-dns", + ZoneKind::InternalNtp => "ntp", + ZoneKind::Nexus => "nexus", + ZoneKind::Oximeter => "oximeter", + } + } } /// Where Sled Agent should get the image for a zone. diff --git a/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs b/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs index 864b0511845..46989d3f75a 100644 --- a/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs +++ b/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs @@ -33,6 +33,7 @@ use nexus_types::deployment::BlueprintSledConfig; use nexus_types::deployment::BlueprintZoneConfig; use nexus_types::deployment::BlueprintZoneDisposition; use nexus_types::deployment::BlueprintZoneImageSource; +use nexus_types::deployment::BlueprintZoneImageVersion; use nexus_types::deployment::BlueprintZoneType; use nexus_types::deployment::ClickhouseClusterConfig; use nexus_types::deployment::CockroachDbPreserveDowngrade; @@ -82,6 +83,7 @@ use std::net::Ipv6Addr; use std::net::SocketAddr; use std::net::SocketAddrV6; use thiserror::Error; +use tufaceous_artifact::KnownArtifactKind; use super::ClickhouseZonesThatShouldBeRunning; use super::clickhouse::ClickhouseAllocator; @@ -1159,13 +1161,14 @@ impl<'a> BlueprintBuilder<'a> { gz_address: dns_subnet.gz_address(), gz_address_index, }); + let image_source = self.zone_image_source(zone_type.kind()); let zone = BlueprintZoneConfig { disposition: BlueprintZoneDisposition::InService, id: self.rng.sled_rng(sled_id).next_zone(), filesystem_pool: zpool, zone_type, - image_source: BlueprintZoneImageSource::InstallDataset, + image_source, }; self.sled_add_zone(sled_id, zone) @@ -1211,13 +1214,14 @@ impl<'a> BlueprintBuilder<'a> { dns_address, nic, }); + let image_source = self.zone_image_source(zone_type.kind()); let zone = BlueprintZoneConfig { disposition: BlueprintZoneDisposition::InService, id, filesystem_pool: pool_name, zone_type, - image_source: BlueprintZoneImageSource::InstallDataset, + image_source, }; self.sled_add_zone(sled_id, zone) } @@ -1250,13 +1254,14 @@ impl<'a> BlueprintBuilder<'a> { }); let filesystem_pool = self.sled_select_zpool(sled_id, zone_type.kind())?; + let image_source = self.zone_image_source(zone_type.kind()); let zone = BlueprintZoneConfig { disposition: BlueprintZoneDisposition::InService, id: self.rng.sled_rng(sled_id).next_zone(), filesystem_pool, zone_type, - image_source: BlueprintZoneImageSource::InstallDataset, + image_source, }; self.sled_add_zone(sled_id, zone)?; @@ -1402,13 +1407,14 @@ impl<'a> BlueprintBuilder<'a> { }); let filesystem_pool = self.sled_select_zpool(sled_id, zone_type.kind())?; + let image_source = self.zone_image_source(zone_type.kind()); let zone = BlueprintZoneConfig { disposition: BlueprintZoneDisposition::InService, id: nexus_id, filesystem_pool, zone_type, - image_source: BlueprintZoneImageSource::InstallDataset, + image_source, }; self.sled_add_zone(sled_id, zone) } @@ -1427,13 +1433,14 @@ impl<'a> BlueprintBuilder<'a> { }); let filesystem_pool = self.sled_select_zpool(sled_id, zone_type.kind())?; + let image_source = 
self.zone_image_source(zone_type.kind()); let zone = BlueprintZoneConfig { disposition: BlueprintZoneDisposition::InService, id: oximeter_id, filesystem_pool, zone_type, - image_source: BlueprintZoneImageSource::InstallDataset, + image_source, }; self.sled_add_zone(sled_id, zone) } @@ -1451,13 +1458,14 @@ impl<'a> BlueprintBuilder<'a> { ); let filesystem_pool = self.sled_select_zpool(sled_id, zone_type.kind())?; + let image_source = self.zone_image_source(zone_type.kind()); let zone = BlueprintZoneConfig { disposition: BlueprintZoneDisposition::InService, id: pantry_id, filesystem_pool, zone_type, - image_source: BlueprintZoneImageSource::InstallDataset, + image_source, }; self.sled_add_zone(sled_id, zone) } @@ -1485,13 +1493,14 @@ impl<'a> BlueprintBuilder<'a> { dataset: OmicronZoneDataset { pool_name: pool_name.clone() }, }); let filesystem_pool = pool_name; + let image_source = self.zone_image_source(zone_type.kind()); let zone = BlueprintZoneConfig { disposition: BlueprintZoneDisposition::InService, id: zone_id, filesystem_pool, zone_type, - image_source: BlueprintZoneImageSource::InstallDataset, + image_source, }; self.sled_add_zone(sled_id, zone) } @@ -1511,13 +1520,14 @@ impl<'a> BlueprintBuilder<'a> { address, dataset: OmicronZoneDataset { pool_name: pool_name.clone() }, }); + let image_source = self.zone_image_source(zone_type.kind()); let zone = BlueprintZoneConfig { disposition: BlueprintZoneDisposition::InService, id, filesystem_pool: pool_name, zone_type, - image_source: BlueprintZoneImageSource::InstallDataset, + image_source, }; self.sled_add_zone(sled_id, zone) } @@ -1539,13 +1549,14 @@ impl<'a> BlueprintBuilder<'a> { }, ); let filesystem_pool = pool_name; + let image_source = self.zone_image_source(zone_type.kind()); let zone = BlueprintZoneConfig { disposition: BlueprintZoneDisposition::InService, id: zone_id, filesystem_pool, zone_type, - image_source: BlueprintZoneImageSource::InstallDataset, + image_source, }; self.sled_add_zone(sled_id, zone) } @@ -1567,13 +1578,14 @@ impl<'a> BlueprintBuilder<'a> { }, ); let filesystem_pool = pool_name; + let image_source = self.zone_image_source(zone_type.kind()); let zone = BlueprintZoneConfig { disposition: BlueprintZoneDisposition::InService, id: zone_id, filesystem_pool, zone_type, - image_source: BlueprintZoneImageSource::InstallDataset, + image_source, }; self.sled_add_zone(sled_id, zone) } @@ -1693,6 +1705,7 @@ impl<'a> BlueprintBuilder<'a> { }); let filesystem_pool = self.sled_select_zpool(sled_id, zone_type.kind())?; + let image_source = self.zone_image_source(zone_type.kind()); self.sled_add_zone( sled_id, @@ -1701,7 +1714,7 @@ impl<'a> BlueprintBuilder<'a> { id: new_zone_id, filesystem_pool, zone_type, - image_source: BlueprintZoneImageSource::InstallDataset, + image_source, }, ) } @@ -1889,6 +1902,36 @@ impl<'a> BlueprintBuilder<'a> { ) { self.pending_mgs_updates.remove(baseboard_id); } + + /// Try to find an artifact in the release repo that contains an image + /// for a zone of the given kind. Defaults to the install dataset. 
+ pub(crate) fn zone_image_source( + &self, + zone_kind: ZoneKind, + ) -> BlueprintZoneImageSource { + self.input + .tuf_repo() + .and_then(|repo| { + repo.artifacts + .iter() + .find(|artifact| { + artifact + .id + .kind + .to_known() + .map(|kind| matches!(kind, KnownArtifactKind::Zone)) + .unwrap_or(false) + && artifact.id.name == zone_kind.artifact_name() + }) + .map(|artifact| BlueprintZoneImageSource::Artifact { + version: BlueprintZoneImageVersion::Available { + version: artifact.id.version.clone(), + }, + hash: artifact.hash, + }) + }) + .unwrap_or(BlueprintZoneImageSource::InstallDataset) + } } // Helper to validate that the system hasn't gone off the rails. There should diff --git a/nexus/reconfigurator/planning/src/planner.rs b/nexus/reconfigurator/planning/src/planner.rs index 1ee0f8ddda0..58b7d8c8857 100644 --- a/nexus/reconfigurator/planning/src/planner.rs +++ b/nexus/reconfigurator/planning/src/planner.rs @@ -18,6 +18,7 @@ use nexus_sled_agent_shared::inventory::OmicronZoneType; use nexus_sled_agent_shared::inventory::ZoneKind; use nexus_types::deployment::Blueprint; use nexus_types::deployment::BlueprintPhysicalDiskDisposition; +use nexus_types::deployment::BlueprintZoneConfig; use nexus_types::deployment::BlueprintZoneDisposition; use nexus_types::deployment::CockroachDbClusterVersion; use nexus_types::deployment::CockroachDbPreserveDowngrade; @@ -36,6 +37,8 @@ use omicron_uuid_kinds::PhysicalDiskUuid; use omicron_uuid_kinds::SledUuid; use slog::error; use slog::{Logger, info, warn}; +use std::cmp::Ordering; +use std::collections::BTreeMap; use std::collections::BTreeSet; use std::str::FromStr; @@ -110,14 +113,11 @@ impl<'a> Planner<'a> { } fn do_plan(&mut self) -> Result<(), Error> { - // We perform planning in two loops: the first one turns expunged sleds - // into expunged zones, and the second one adds services. - self.do_plan_expunge()?; self.do_plan_add()?; self.do_plan_decommission()?; + self.do_plan_zone_image_source()?; self.do_plan_cockroachdb_settings(); - Ok(()) } @@ -877,6 +877,185 @@ impl<'a> Planner<'a> { Ok(()) } + /// Update at most one existing zone to use a new image source. + fn do_plan_zone_image_source(&mut self) -> Result<(), Error> { + // We are only interested in non-decommissioned sleds. + let sleds = self + .input + .all_sleds(SledFilter::Commissioned) + .map(|(id, _details)| id) + .collect::>(); + + // Wait for all current zones to become up-to-date. + let inventory_zones = self + .inventory + .all_omicron_zones() + .map(|z| (z.id, z)) + .collect::>(); + for sled_id in sleds.iter().cloned() { + if self + .blueprint + .current_sled_zones( + sled_id, + BlueprintZoneDisposition::is_in_service, + ) + .any(|zone| { + inventory_zones + .get(&zone.id) + .map(|z| { + z.image_source != zone.image_source.clone().into() + }) + .unwrap_or(false) + }) + { + info!( + self.log, "zones not yet up-to-date"; + "sled_id" => %sled_id, + ); + return Ok(()); + } + } + + // Choose among the out-of-date zones one with the fewest dependencies. 
+ let mut out_of_date_zones = sleds + .into_iter() + .flat_map(|sled_id| { + self.blueprint + .current_sled_zones( + sled_id, + BlueprintZoneDisposition::is_in_service, + ) + .filter_map(|zone| { + (zone.image_source + != self + .blueprint + .zone_image_source(zone.zone_type.kind())) + .then(|| (sled_id, zone.clone())) + }) + .collect::>() + }) + .collect::>(); + + sort_zones_by_deps(&mut out_of_date_zones); + if let Some((sled_id, zone)) = out_of_date_zones.first() { + return self.update_zone_image_source(*sled_id, zone); + } + + info!(self.log, "all zones up-to-date"); + Ok(()) + } + + /// Update a zone to use a new image source, either in-place or by + /// expunging & replacing it (depending on the kind of zone). + fn update_zone_image_source( + &mut self, + sled_id: SledUuid, + zone: &BlueprintZoneConfig, + ) -> Result<(), Error> { + let zone_kind = zone.zone_type.kind(); + let image_source = self.blueprint.zone_image_source(zone_kind); + match zone_kind { + ZoneKind::Crucible + | ZoneKind::CockroachDb + | ZoneKind::Clickhouse => { + info!( + self.log, "updating zone image source in-place"; + "sled_id" => %sled_id, + "zone_id" => %zone.id, + "image_source" => %image_source, + ); + self.blueprint.sled_set_zone_source( + sled_id, + zone.id, + image_source, + )?; + } + ZoneKind::BoundaryNtp => { + info!( + self.log, "replacing out-of-date BoundaryNtp zone"; + "sled_id" => %sled_id, + "zone_id" => %zone.id, + ); + self.blueprint.sled_expunge_zone(sled_id, zone.id)?; + self.blueprint + .sled_promote_internal_ntp_to_boundary_ntp(sled_id)?; + } + ZoneKind::ClickhouseKeeper => { + info!( + self.log, "replacing out-of-date ClickhouseKeeper zone"; + "sled_id" => %sled_id, + "zone_id" => %zone.id, + ); + self.blueprint.sled_expunge_zone(sled_id, zone.id)?; + self.blueprint.sled_add_zone_clickhouse_keeper(sled_id)?; + } + ZoneKind::ClickhouseServer => { + info!( + self.log, "replacing out-of-date ClickhouseServer zone"; + "sled_id" => %sled_id, + "zone_id" => %zone.id, + ); + self.blueprint.sled_expunge_zone(sled_id, zone.id)?; + self.blueprint.sled_add_zone_clickhouse_server(sled_id)?; + } + ZoneKind::CruciblePantry => { + info!( + self.log, "replacing out-of-date CruciblePantry zone"; + "sled_id" => %sled_id, + "zone_id" => %zone.id, + ); + self.blueprint.sled_expunge_zone(sled_id, zone.id)?; + self.blueprint.sled_add_zone_crucible_pantry(sled_id)?; + } + ZoneKind::ExternalDns => { + info!( + self.log, "replacing out-of-date ExternalDns zone"; + "sled_id" => %sled_id, + "zone_id" => %zone.id, + ); + self.blueprint.sled_expunge_zone(sled_id, zone.id)?; + self.blueprint.sled_add_zone_external_dns(sled_id)?; + } + ZoneKind::InternalDns => { + info!( + self.log, "replacing out-of-date InternalDns zone"; + "sled_id" => %sled_id, + "zone_id" => %zone.id, + ); + self.blueprint.sled_expunge_zone(sled_id, zone.id)?; + self.blueprint.sled_add_zone_internal_dns(sled_id)?; + } + ZoneKind::InternalNtp => { + info!( + self.log, "replacing out-of-date InternalNtp zone"; + "sled_id" => %sled_id, + "zone_id" => %zone.id, + ); + self.blueprint.sled_expunge_zone(sled_id, zone.id)?; + self.blueprint.sled_ensure_zone_ntp(sled_id)?; + } + ZoneKind::Nexus => { + info!( + self.log, "replacing out-of-date Nexus zone"; + "sled_id" => %sled_id, + "zone_id" => %zone.id, + ); + self.blueprint.sled_expunge_zone(sled_id, zone.id)?; + self.blueprint.sled_add_zone_nexus(sled_id)?; + } + ZoneKind::Oximeter => { + info!( + self.log, "replacing out-of-date Oximeter zone"; + "sled_id" => %sled_id, + "zone_id" => %zone.id, + ); + 
self.blueprint.sled_expunge_zone(sled_id, zone.id)?; + self.blueprint.sled_add_zone_oximeter(sled_id)?; + } + } + Ok(()) + } + fn do_plan_cockroachdb_settings(&mut self) { // Figure out what we should set the CockroachDB "preserve downgrade // option" setting to based on the planning input. @@ -972,6 +1151,20 @@ impl<'a> Planner<'a> { } } +/// Sort in place a vector of zone configurations by API dependencies +/// (RFD 565 §6). For now, we encode a simplified dependency graph manually; +/// down the road, we should programmatically import it from `ls-apis`. +fn sort_zones_by_deps(zones: &mut Vec<(SledUuid, BlueprintZoneConfig)>) { + zones.sort_by(|a, b| match (a.1.zone_type.kind(), b.1.zone_type.kind()) { + (s, t) if s == t => Ordering::Equal, + (ZoneKind::CruciblePantry, _) => Ordering::Less, + (_, ZoneKind::CruciblePantry) => Ordering::Greater, + (_, ZoneKind::Nexus) => Ordering::Less, + (ZoneKind::Nexus, _) => Ordering::Greater, + (_, _) => Ordering::Equal, + }) +} + /// The reason a sled's zones need to be expunged. /// /// This is used only for introspection and logging -- it's not part of the @@ -1000,6 +1193,8 @@ pub(crate) mod test { use nexus_types::deployment::BlueprintDiffSummary; use nexus_types::deployment::BlueprintPhysicalDiskDisposition; use nexus_types::deployment::BlueprintZoneDisposition; + use nexus_types::deployment::BlueprintZoneImageSource; + use nexus_types::deployment::BlueprintZoneImageVersion; use nexus_types::deployment::BlueprintZoneType; use nexus_types::deployment::ClickhouseMode; use nexus_types::deployment::ClickhousePolicy; @@ -1010,16 +1205,25 @@ pub(crate) mod test { use nexus_types::external_api::views::SledProvisionPolicy; use nexus_types::external_api::views::SledState; use omicron_common::api::external::Generation; + use omicron_common::api::external::TufArtifactMeta; + use omicron_common::api::external::TufRepoDescription; + use omicron_common::api::external::TufRepoMeta; use omicron_common::disk::DatasetKind; use omicron_common::disk::DiskIdentity; use omicron_common::policy::CRUCIBLE_PANTRY_REDUNDANCY; + use omicron_common::update::ArtifactId; use omicron_test_utils::dev::test_setup_log; use omicron_uuid_kinds::PhysicalDiskUuid; use omicron_uuid_kinds::ZpoolUuid; + use semver::Version; use slog_error_chain::InlineErrorChain; use std::collections::BTreeMap; use std::collections::HashMap; use std::net::IpAddr; + use tufaceous_artifact::ArtifactHash; + use tufaceous_artifact::ArtifactKind; + use tufaceous_artifact::ArtifactVersion; + use tufaceous_artifact::KnownArtifactKind; use typed_rng::TypedUuidRng; // Generate a ClickhousePolicy ignoring fields we don't care about for @@ -4337,4 +4541,354 @@ pub(crate) mod test { logctx.cleanup_successful(); } + + #[test] + fn test_sort_zones_by_deps() { + static TEST_NAME: &str = "sort_zones_by_deps"; + let logctx = test_setup_log(TEST_NAME); + let log = logctx.log.clone(); + + // Collect zone configs from our example system. + let (_collection, _input, blueprint) = example(&log, TEST_NAME); + let mut zones = blueprint + .all_omicron_zones(BlueprintZoneDisposition::any) + .map(|(sled_id, z)| (sled_id, z.clone())) + .collect::>(); + + // Sort them and verify the ordering constraints. 
+ sort_zones_by_deps(&mut zones); + let mut pantry = false; + let mut nexus = false; + for kind in zones.iter().map(|(_, z)| z.zone_type.kind()) { + match kind { + ZoneKind::CruciblePantry => { + assert!(!nexus); + pantry = true; + } + ZoneKind::Nexus => { + assert!(pantry); + nexus = true; + } + _ => { + assert!(pantry); + assert!(!nexus); + } + } + } + + logctx.cleanup_successful(); + } + + #[test] + fn test_plan_tuf_repo() { + static TEST_NAME: &str = "plan_tuf_repo"; + let logctx = test_setup_log(TEST_NAME); + let log = logctx.log.clone(); + + // Use our example system. + let mut rng = SimRngState::from_seed(TEST_NAME); + let (mut example, blueprint1) = ExampleSystemBuilder::new_with_rng( + &logctx.log, + rng.next_system_rng(), + ) + .build(); + let input = example.input; + let collection = example.collection; + verify_blueprint(&blueprint1); + + // We'll be manually updating our inventory collection's zones + // from blueprints. + let mut update_collection_from_blueprint = + |blueprint: &Blueprint| -> Collection { + for (&sled_id, config) in blueprint.sleds.iter() { + example + .system + .sled_set_omicron_zones( + sled_id, + config + .clone() + .into_in_service_sled_config() + .zones_config, + ) + .expect("can't set omicron zones for sled"); + } + example.system.to_collection_builder().unwrap().build() + }; + + // We should start with no specified TUF repo and nothing to do. + assert!(input.tuf_repo().is_none()); + assert_planning_makes_no_changes( + &logctx.log, + &blueprint1, + &input, + &collection, + TEST_NAME, + ); + + // All zones should be sourced from the install dataset by default. + assert!( + blueprint1 + .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .all(|(_, z)| matches!( + z.image_source, + BlueprintZoneImageSource::InstallDataset + )) + ); + + // Manually specify a trivial TUF repo. + let mut input_builder = input.into_builder(); + input_builder.policy_mut().tuf_repo = Some(TufRepoDescription { + repo: TufRepoMeta { + hash: ArtifactHash([0; 32]), + targets_role_version: 0, + valid_until: Utc::now(), + system_version: Version::new(0, 0, 0), + file_name: String::from(""), + }, + artifacts: vec![], + }); + let input = input_builder.build(); + let blueprint2 = Planner::new_based_on( + log.clone(), + &blueprint1, + &input, + "test_blueprint2", + &collection, + ) + .expect("can't create planner") + .with_rng(PlannerRng::from_seed((TEST_NAME, "bp2"))) + .plan() + .expect("plan for trivial TUF repo"); + + // All zones should still be sourced from the install dataset. + assert!( + blueprint1 + .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .all(|(_, z)| matches!( + z.image_source, + BlueprintZoneImageSource::InstallDataset + )) + ); + + // Manually specify a TUF repo with some fake zone images. + // Only the name and kind of the artifacts matter. 
+ let mut input_builder = input.into_builder(); + let version = ArtifactVersion::new_static("1.0.0-freeform") + .expect("can't parse artifact version"); + let fake_hash = ArtifactHash([0; 32]); + let image_source = BlueprintZoneImageSource::Artifact { + version: BlueprintZoneImageVersion::Available { + version: version.clone(), + }, + hash: fake_hash, + }; + let artifacts = vec![ + TufArtifactMeta { + id: ArtifactId { + name: String::from("crucible-pantry-zone"), + version: version.clone(), + kind: ArtifactKind::from_known(KnownArtifactKind::Zone), + }, + hash: fake_hash, + size: 0, + }, + TufArtifactMeta { + id: ArtifactId { + name: String::from("nexus"), + version: version.clone(), + kind: ArtifactKind::from_known(KnownArtifactKind::Zone), + }, + hash: fake_hash, + size: 0, + }, + ]; + input_builder.policy_mut().tuf_repo = Some(TufRepoDescription { + repo: TufRepoMeta { + hash: fake_hash, + targets_role_version: 0, + valid_until: Utc::now(), + system_version: Version::new(1, 0, 0), + file_name: String::from(""), + }, + artifacts, + }); + + // Some helper predicates for the assertions below. + let is_up_to_date = |zone: &BlueprintZoneConfig| -> bool { + zone.image_source == image_source + }; + let is_up_to_date_nexus = |zone: &BlueprintZoneConfig| -> bool { + zone.zone_type.is_nexus() && is_up_to_date(zone) + }; + let is_up_to_date_pantry = |zone: &BlueprintZoneConfig| -> bool { + zone.zone_type.is_crucible_pantry() && is_up_to_date(zone) + }; + + // Add a new Nexus zone. + input_builder.policy_mut().target_nexus_zone_count = + input_builder.policy_mut().target_nexus_zone_count + 1; + + let input = input_builder.build(); + let blueprint3 = Planner::new_based_on( + log.clone(), + &blueprint1, + &input, + "test_blueprint3", + &collection, + ) + .expect("can't create planner") + .with_rng(PlannerRng::from_seed((TEST_NAME, "bp3"))) + .plan() + .expect("plan for non-trivial TUF repo"); + + // There should be new Nexus & Crucible Pantry zones that use the new artifact. + let summary = blueprint3.diff_since_blueprint(&blueprint2); + let mut added_count = 0; + for sled_cfg in summary.diff.sleds.modified_values_diff() { + added_count += sled_cfg.zones.added.len(); + for z in sled_cfg.zones.added.values() { + assert!(is_up_to_date_nexus(z) || is_up_to_date_pantry(z)); + } + } + assert_eq!(added_count, 2); + + // Mutate the inventory to include the updated Nexus & Pantry zones. + let collection = update_collection_from_blueprint(&blueprint3); + + // Replan with the new inventory. + let blueprint4 = Planner::new_based_on( + log.clone(), + &blueprint3, + &input, + "test_blueprint4", + &collection, + ) + .expect("can't create planner") + .with_rng(PlannerRng::from_seed((TEST_NAME, "bp4"))) + .plan() + .expect("replan with updated inventory"); + + // Now two Crucible Pantry zones should use the new artifact. + assert_eq!( + blueprint4 + .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .filter(|(_, z)| is_up_to_date_pantry(z)) + .count(), + 2 + ); + + // One more cycle for the third Crucible Pantry to update. 
+ let collection = update_collection_from_blueprint(&blueprint4); + let blueprint5 = Planner::new_based_on( + log.clone(), + &blueprint4, + &input, + "test_blueprint5", + &collection, + ) + .expect("can't create planner") + .with_rng(PlannerRng::from_seed((TEST_NAME, "bp5"))) + .plan() + .expect("replan with updated inventory"); + + assert_eq!( + blueprint5 + .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .filter(|(_, z)| is_up_to_date_pantry(z)) + .count(), + CRUCIBLE_PANTRY_REDUNDANCY + ); + + // The next few planning cycles should get us up-to-date Nexus zones. + let collection = update_collection_from_blueprint(&blueprint5); + let blueprint6 = Planner::new_based_on( + log.clone(), + &blueprint5, + &input, + "test_blueprint6", + &collection, + ) + .expect("can't create planner") + .with_rng(PlannerRng::from_seed((TEST_NAME, "bp6"))) + .plan() + .expect("replan with updated inventory"); + assert_eq!( + blueprint6 + .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .filter(|(_, z)| is_up_to_date_nexus(z)) + .count(), + 2, + ); + + let collection = update_collection_from_blueprint(&blueprint6); + let blueprint7 = Planner::new_based_on( + log.clone(), + &blueprint6, + &input, + "test_blueprint7", + &collection, + ) + .expect("can't create planner") + .with_rng(PlannerRng::from_seed((TEST_NAME, "bp7"))) + .plan() + .expect("replan with updated inventory"); + assert_eq!( + blueprint7 + .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .filter(|(_, z)| is_up_to_date_nexus(z)) + .count(), + 3, + ); + + let collection = update_collection_from_blueprint(&blueprint7); + let blueprint8 = Planner::new_based_on( + log.clone(), + &blueprint7, + &input, + "test_blueprint8", + &collection, + ) + .expect("can't create planner") + .with_rng(PlannerRng::from_seed((TEST_NAME, "bp8"))) + .plan() + .expect("replan with updated inventory"); + assert_eq!( + blueprint8 + .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .filter(|(_, z)| is_up_to_date_nexus(z)) + .count(), + 4, + ); + + // And the last pass is just to expunge an old Nexus zone. + let collection = update_collection_from_blueprint(&blueprint8); + let blueprint9 = Planner::new_based_on( + log.clone(), + &blueprint8, + &input, + "test_blueprint9", + &collection, + ) + .expect("can't create planner") + .with_rng(PlannerRng::from_seed((TEST_NAME, "bp9"))) + .plan() + .expect("replan with updated inventory"); + + let summary = blueprint9.diff_since_blueprint(&blueprint8); + assert_eq!(summary.diff.sleds.added.len(), 0); + assert_eq!(summary.diff.sleds.removed.len(), 0); + assert_eq!(summary.diff.sleds.modified().count(), 1); + + // Everything's up-to-date in Kansas City! 
+ let collection = update_collection_from_blueprint(&blueprint9); + assert_planning_makes_no_changes( + &logctx.log, + &blueprint9, + &input, + &collection, + TEST_NAME, + ); + + logctx.cleanup_successful(); + } } From 85e94bb67cd146d2e5bf210a372da0c24a958f68 Mon Sep 17 00:00:00 2001 From: Alex Plotnick Date: Wed, 23 Apr 2025 13:42:25 -0600 Subject: [PATCH 04/32] =?UTF-8?q?Plan=20according=20to=20RFD=20565=20?= =?UTF-8?q?=C2=A79?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- common/src/api/external/mod.rs | 4 + nexus-sled-agent-shared/src/inventory.rs | 20 +- .../src/db/datastore/target_release.rs | 23 +- nexus/reconfigurator/execution/src/dns.rs | 1 + .../planning/src/blueprint_builder/builder.rs | 71 ++- nexus/reconfigurator/planning/src/planner.rs | 523 +++++++++--------- .../planning/src/planner/update_sequence.rs | 108 ++++ nexus/reconfigurator/planning/src/system.rs | 3 + nexus/reconfigurator/preparation/src/lib.rs | 23 + nexus/types/src/deployment.rs | 12 + nexus/types/src/deployment/planning_input.rs | 13 +- 11 files changed, 509 insertions(+), 292 deletions(-) create mode 100644 nexus/reconfigurator/planning/src/planner/update_sequence.rs diff --git a/common/src/api/external/mod.rs b/common/src/api/external/mod.rs index 60325218f1c..cc4ac7fd11d 100644 --- a/common/src/api/external/mod.rs +++ b/common/src/api/external/mod.rs @@ -748,6 +748,10 @@ impl Generation { ); Generation(next_gen) } + + pub const fn prev(&self) -> Option { + if self.0 > 1 { Some(Generation(self.0 - 1)) } else { None } + } } impl<'de> Deserialize<'de> for Generation { diff --git a/nexus-sled-agent-shared/src/inventory.rs b/nexus-sled-agent-shared/src/inventory.rs index 29c5924ed10..a5f7bc3e438 100644 --- a/nexus-sled-agent-shared/src/inventory.rs +++ b/nexus-sled-agent-shared/src/inventory.rs @@ -16,6 +16,7 @@ use omicron_common::{ DatasetManagementStatus, DatasetsConfig, DiskManagementStatus, DiskVariant, OmicronPhysicalDisksConfig, }, + update::ArtifactId, zpool_name::ZpoolName, }; use omicron_uuid_kinds::{DatasetUuid, OmicronZoneUuid}; @@ -26,7 +27,7 @@ use serde::{Deserialize, Serialize}; // depend on sled-hardware-types. pub use sled_hardware_types::Baseboard; use strum::EnumIter; -use tufaceous_artifact::ArtifactHash; +use tufaceous_artifact::{ArtifactHash, KnownArtifactKind}; /// Identifies information about disks which may be attached to Sleds. #[derive(Clone, Debug, Deserialize, JsonSchema, Serialize)] @@ -492,13 +493,14 @@ impl OmicronZoneType { /// /// # String representations of this type /// -/// There are no fewer than four string representations for this type, all +/// There are no fewer than five string representations for this type, all /// slightly different from each other. /// /// 1. [`Self::zone_prefix`]: Used to construct zone names. /// 2. [`Self::service_prefix`]: Used to construct SMF service names. /// 3. [`Self::name_prefix`]: Used to construct `Name` instances. /// 4. [`Self::report_str`]: Used for reporting and testing. +/// 5. [`Self::artifact_name`]: Used to match TUF artifact names. /// /// There is no `Display` impl to ensure that users explicitly choose the /// representation they want. (Please play close attention to this! The @@ -636,6 +638,20 @@ impl ZoneKind { ZoneKind::Oximeter => "oximeter", } } + + /// Return true if an artifact represents a control plane zone image + /// of this kind. 
+ pub fn is_control_plane_zone_artifact( + self, + artifact_id: &ArtifactId, + ) -> bool { + artifact_id + .kind + .to_known() + .map(|kind| matches!(kind, KnownArtifactKind::Zone)) + .unwrap_or(false) + && artifact_id.name == self.artifact_name() + } } /// Where Sled Agent should get the image for a zone. diff --git a/nexus/db-queries/src/db/datastore/target_release.rs b/nexus/db-queries/src/db/datastore/target_release.rs index e98a5ce5209..e5886699ac7 100644 --- a/nexus/db-queries/src/db/datastore/target_release.rs +++ b/nexus/db-queries/src/db/datastore/target_release.rs @@ -7,7 +7,9 @@ use super::DataStore; use crate::authz; use crate::context::OpContext; -use crate::db::model::{SemverVersion, TargetRelease, TargetReleaseSource}; +use crate::db::model::{ + Generation, SemverVersion, TargetRelease, TargetReleaseSource, +}; use async_bb8_diesel::AsyncRunQueryDsl as _; use diesel::insert_into; use diesel::prelude::*; @@ -44,6 +46,25 @@ impl DataStore { Ok(current) } + /// Fetch a target release by generation number. + pub async fn target_release_get_generation( + &self, + opctx: &OpContext, + generation: Generation, + ) -> LookupResult> { + opctx + .authorize(authz::Action::Read, &authz::TARGET_RELEASE_CONFIG) + .await?; + let conn = self.pool_connection_authorized(opctx).await?; + dsl::target_release + .select(TargetRelease::as_select()) + .filter(dsl::generation.eq(generation)) + .first_async(&*conn) + .await + .optional() + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + } + /// Insert a new target release row and return it. It will only become /// the current target release if its generation is larger than any /// existing row. diff --git a/nexus/reconfigurator/execution/src/dns.rs b/nexus/reconfigurator/execution/src/dns.rs index 2c77e99145e..5e226828658 100644 --- a/nexus/reconfigurator/execution/src/dns.rs +++ b/nexus/reconfigurator/execution/src/dns.rs @@ -1425,6 +1425,7 @@ mod test { clickhouse_policy: None, oximeter_read_policy: OximeterReadPolicy::new(1), tuf_repo: None, + old_repo: None, log, } .build() diff --git a/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs b/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs index 46989d3f75a..d3f7854d089 100644 --- a/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs +++ b/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs @@ -14,6 +14,7 @@ use crate::blueprint_editor::ExternalSnatNetworkingChoice; use crate::blueprint_editor::NoAvailableDnsSubnets; use crate::blueprint_editor::SledEditError; use crate::blueprint_editor::SledEditor; +use crate::planner::OrderedComponent; use crate::planner::ZoneExpungeReason; use crate::planner::rng::PlannerRng; use anyhow::Context as _; @@ -33,7 +34,6 @@ use nexus_types::deployment::BlueprintSledConfig; use nexus_types::deployment::BlueprintZoneConfig; use nexus_types::deployment::BlueprintZoneDisposition; use nexus_types::deployment::BlueprintZoneImageSource; -use nexus_types::deployment::BlueprintZoneImageVersion; use nexus_types::deployment::BlueprintZoneType; use nexus_types::deployment::ClickhouseClusterConfig; use nexus_types::deployment::CockroachDbPreserveDowngrade; @@ -57,6 +57,7 @@ use omicron_common::address::DNS_PORT; use omicron_common::address::NTP_PORT; use omicron_common::address::ReservedRackSubnet; use omicron_common::api::external::Generation; +use omicron_common::api::external::TufRepoDescription; use omicron_common::api::external::Vni; use omicron_common::api::internal::shared::NetworkInterface; use 
omicron_common::api::internal::shared::NetworkInterfaceKind; @@ -83,7 +84,6 @@ use std::net::Ipv6Addr; use std::net::SocketAddr; use std::net::SocketAddrV6; use thiserror::Error; -use tufaceous_artifact::KnownArtifactKind; use super::ClickhouseZonesThatShouldBeRunning; use super::clickhouse::ClickhouseAllocator; @@ -1903,34 +1903,55 @@ impl<'a> BlueprintBuilder<'a> { self.pending_mgs_updates.remove(baseboard_id); } - /// Try to find an artifact in the release repo that contains an image - /// for a zone of the given kind. Defaults to the install dataset. + fn zone_image_artifact( + repo: Option<&TufRepoDescription>, + zone_kind: ZoneKind, + ) -> BlueprintZoneImageSource { + repo.and_then(|repo| { + repo.artifacts + .iter() + .find(|artifact| { + zone_kind.is_control_plane_zone_artifact(&artifact.id) + }) + .map(BlueprintZoneImageSource::from_available_artifact) + }) + .unwrap_or(BlueprintZoneImageSource::InstallDataset) + } + + /// Try to find an artifact in either the current or previous release repo + /// that contains an image for a zone of the given kind; see RFD 565 §9. + /// Defaults to the install dataset. pub(crate) fn zone_image_source( &self, zone_kind: ZoneKind, ) -> BlueprintZoneImageSource { - self.input - .tuf_repo() - .and_then(|repo| { - repo.artifacts - .iter() - .find(|artifact| { - artifact - .id - .kind - .to_known() - .map(|kind| matches!(kind, KnownArtifactKind::Zone)) - .unwrap_or(false) - && artifact.id.name == zone_kind.artifact_name() - }) - .map(|artifact| BlueprintZoneImageSource::Artifact { - version: BlueprintZoneImageVersion::Available { - version: artifact.id.version.clone(), - }, - hash: artifact.hash, + let new_repo = self.input.tuf_repo(); + let old_repo = self.input.old_repo(); + let new_artifact = Self::zone_image_artifact(new_repo, zone_kind); + let old_artifact = Self::zone_image_artifact(old_repo, zone_kind); + if let Some(prev) = OrderedComponent::from(zone_kind).prev() { + if prev >= OrderedComponent::ControlPlaneZone + && self.sled_ids_with_zones().any(|sled_id| { + self.current_sled_zones( + sled_id, + BlueprintZoneDisposition::is_in_service, + ) + .any(|z| { + let kind = z.zone_type.kind(); + let old_artifact = + Self::zone_image_artifact(old_repo, kind); + OrderedComponent::from(kind) == prev + && z.image_source == old_artifact }) - }) - .unwrap_or(BlueprintZoneImageSource::InstallDataset) + }) + { + old_artifact + } else { + new_artifact + } + } else { + new_artifact + } } } diff --git a/nexus/reconfigurator/planning/src/planner.rs b/nexus/reconfigurator/planning/src/planner.rs index 58b7d8c8857..dd39b53fb7c 100644 --- a/nexus/reconfigurator/planning/src/planner.rs +++ b/nexus/reconfigurator/planning/src/planner.rs @@ -37,7 +37,6 @@ use omicron_uuid_kinds::PhysicalDiskUuid; use omicron_uuid_kinds::SledUuid; use slog::error; use slog::{Logger, info, warn}; -use std::cmp::Ordering; use std::collections::BTreeMap; use std::collections::BTreeSet; use std::str::FromStr; @@ -47,9 +46,11 @@ use self::omicron_zone_placement::OmicronZonePlacement; use self::omicron_zone_placement::OmicronZonePlacementSledState; pub use self::rng::PlannerRng; pub use self::rng::SledPlannerRng; +pub(crate) use self::update_sequence::OrderedComponent; mod omicron_zone_placement; pub(crate) mod rng; +mod update_sequence; pub struct Planner<'a> { log: Logger, @@ -116,7 +117,7 @@ impl<'a> Planner<'a> { self.do_plan_expunge()?; self.do_plan_add()?; self.do_plan_decommission()?; - self.do_plan_zone_image_source()?; + self.do_plan_zone_updates()?; 
self.do_plan_cockroachdb_settings(); Ok(()) } @@ -878,7 +879,7 @@ impl<'a> Planner<'a> { } /// Update at most one existing zone to use a new image source. - fn do_plan_zone_image_source(&mut self) -> Result<(), Error> { + fn do_plan_zone_updates(&mut self) -> Result<(), Error> { // We are only interested in non-decommissioned sleds. let sleds = self .input @@ -938,7 +939,7 @@ impl<'a> Planner<'a> { sort_zones_by_deps(&mut out_of_date_zones); if let Some((sled_id, zone)) = out_of_date_zones.first() { - return self.update_zone_image_source(*sled_id, zone); + return self.update_or_expunge_zone(*sled_id, zone); } info!(self.log, "all zones up-to-date"); @@ -946,8 +947,8 @@ impl<'a> Planner<'a> { } /// Update a zone to use a new image source, either in-place or by - /// expunging & replacing it (depending on the kind of zone). - fn update_zone_image_source( + /// expunging it and letting it be replaced in a future iteration. + fn update_or_expunge_zone( &mut self, sled_id: SledUuid, zone: &BlueprintZoneConfig, @@ -962,6 +963,7 @@ impl<'a> Planner<'a> { self.log, "updating zone image source in-place"; "sled_id" => %sled_id, "zone_id" => %zone.id, + "kind" => ?zone.zone_type.kind(), "image_source" => %image_source, ); self.blueprint.sled_set_zone_source( @@ -970,87 +972,14 @@ impl<'a> Planner<'a> { image_source, )?; } - ZoneKind::BoundaryNtp => { + _ => { info!( - self.log, "replacing out-of-date BoundaryNtp zone"; + self.log, "expunging out-of-date zone"; "sled_id" => %sled_id, "zone_id" => %zone.id, + "kind" => ?zone.zone_type.kind(), ); self.blueprint.sled_expunge_zone(sled_id, zone.id)?; - self.blueprint - .sled_promote_internal_ntp_to_boundary_ntp(sled_id)?; - } - ZoneKind::ClickhouseKeeper => { - info!( - self.log, "replacing out-of-date ClickhouseKeeper zone"; - "sled_id" => %sled_id, - "zone_id" => %zone.id, - ); - self.blueprint.sled_expunge_zone(sled_id, zone.id)?; - self.blueprint.sled_add_zone_clickhouse_keeper(sled_id)?; - } - ZoneKind::ClickhouseServer => { - info!( - self.log, "replacing out-of-date ClickhouseServer zone"; - "sled_id" => %sled_id, - "zone_id" => %zone.id, - ); - self.blueprint.sled_expunge_zone(sled_id, zone.id)?; - self.blueprint.sled_add_zone_clickhouse_server(sled_id)?; - } - ZoneKind::CruciblePantry => { - info!( - self.log, "replacing out-of-date CruciblePantry zone"; - "sled_id" => %sled_id, - "zone_id" => %zone.id, - ); - self.blueprint.sled_expunge_zone(sled_id, zone.id)?; - self.blueprint.sled_add_zone_crucible_pantry(sled_id)?; - } - ZoneKind::ExternalDns => { - info!( - self.log, "replacing out-of-date ExternalDns zone"; - "sled_id" => %sled_id, - "zone_id" => %zone.id, - ); - self.blueprint.sled_expunge_zone(sled_id, zone.id)?; - self.blueprint.sled_add_zone_external_dns(sled_id)?; - } - ZoneKind::InternalDns => { - info!( - self.log, "replacing out-of-date InternalDns zone"; - "sled_id" => %sled_id, - "zone_id" => %zone.id, - ); - self.blueprint.sled_expunge_zone(sled_id, zone.id)?; - self.blueprint.sled_add_zone_internal_dns(sled_id)?; - } - ZoneKind::InternalNtp => { - info!( - self.log, "replacing out-of-date InternalNtp zone"; - "sled_id" => %sled_id, - "zone_id" => %zone.id, - ); - self.blueprint.sled_expunge_zone(sled_id, zone.id)?; - self.blueprint.sled_ensure_zone_ntp(sled_id)?; - } - ZoneKind::Nexus => { - info!( - self.log, "replacing out-of-date Nexus zone"; - "sled_id" => %sled_id, - "zone_id" => %zone.id, - ); - self.blueprint.sled_expunge_zone(sled_id, zone.id)?; - self.blueprint.sled_add_zone_nexus(sled_id)?; - } - ZoneKind::Oximeter => 
{ - info!( - self.log, "replacing out-of-date Oximeter zone"; - "sled_id" => %sled_id, - "zone_id" => %zone.id, - ); - self.blueprint.sled_expunge_zone(sled_id, zone.id)?; - self.blueprint.sled_add_zone_oximeter(sled_id)?; } } Ok(()) @@ -1151,17 +1080,12 @@ impl<'a> Planner<'a> { } } -/// Sort in place a vector of zone configurations by API dependencies -/// (RFD 565 §6). For now, we encode a simplified dependency graph manually; -/// down the road, we should programmatically import it from `ls-apis`. +/// Sort in place a vector of sled-specific zone configurations by API +/// dependencies (RFD 565 §6). fn sort_zones_by_deps(zones: &mut Vec<(SledUuid, BlueprintZoneConfig)>) { - zones.sort_by(|a, b| match (a.1.zone_type.kind(), b.1.zone_type.kind()) { - (s, t) if s == t => Ordering::Equal, - (ZoneKind::CruciblePantry, _) => Ordering::Less, - (_, ZoneKind::CruciblePantry) => Ordering::Greater, - (_, ZoneKind::Nexus) => Ordering::Less, - (ZoneKind::Nexus, _) => Ordering::Greater, - (_, _) => Ordering::Equal, + zones.sort_by(|a, b| { + OrderedComponent::from(a.1.zone_type.kind()) + .cmp(&OrderedComponent::from(b.1.zone_type.kind())) }) } @@ -1179,6 +1103,7 @@ pub(crate) enum ZoneExpungeReason { pub(crate) mod test { use super::*; use crate::blueprint_builder::test::verify_blueprint; + use crate::example::ExampleSystem; use crate::example::ExampleSystemBuilder; use crate::example::SimRngState; use crate::example::example; @@ -1211,6 +1136,7 @@ pub(crate) mod test { use omicron_common::disk::DatasetKind; use omicron_common::disk::DiskIdentity; use omicron_common::policy::CRUCIBLE_PANTRY_REDUNDANCY; + use omicron_common::policy::NEXUS_REDUNDANCY; use omicron_common::update::ArtifactId; use omicron_test_utils::dev::test_setup_log; use omicron_uuid_kinds::PhysicalDiskUuid; @@ -4557,31 +4483,44 @@ pub(crate) mod test { // Sort them and verify the ordering constraints. sort_zones_by_deps(&mut zones); - let mut pantry = false; let mut nexus = false; for kind in zones.iter().map(|(_, z)| z.zone_type.kind()) { match kind { - ZoneKind::CruciblePantry => { - assert!(!nexus); - pantry = true; - } ZoneKind::Nexus => { - assert!(pantry); nexus = true; } _ => { - assert!(pantry); assert!(!nexus); } } } + assert!(nexus); logctx.cleanup_successful(); } + /// Manually update the example system's inventory collection's zones + /// from a blueprint. + fn update_collection_from_blueprint( + example: &mut ExampleSystem, + blueprint: &Blueprint, + ) { + for (&sled_id, config) in blueprint.sleds.iter() { + example + .system + .sled_set_omicron_zones( + sled_id, + config.clone().into_in_service_sled_config().zones_config, + ) + .expect("can't set omicron zones for sled"); + } + example.collection = + example.system.to_collection_builder().unwrap().build(); + } + #[test] - fn test_plan_tuf_repo() { - static TEST_NAME: &str = "plan_tuf_repo"; + fn test_update_crucible_pantry() { + static TEST_NAME: &str = "update_crucible_pantry"; let logctx = test_setup_log(TEST_NAME); let log = logctx.log.clone(); @@ -4592,36 +4531,15 @@ pub(crate) mod test { rng.next_system_rng(), ) .build(); - let input = example.input; - let collection = example.collection; verify_blueprint(&blueprint1); - // We'll be manually updating our inventory collection's zones - // from blueprints. 
- let mut update_collection_from_blueprint = - |blueprint: &Blueprint| -> Collection { - for (&sled_id, config) in blueprint.sleds.iter() { - example - .system - .sled_set_omicron_zones( - sled_id, - config - .clone() - .into_in_service_sled_config() - .zones_config, - ) - .expect("can't set omicron zones for sled"); - } - example.system.to_collection_builder().unwrap().build() - }; - // We should start with no specified TUF repo and nothing to do. - assert!(input.tuf_repo().is_none()); + assert!(example.input.tuf_repo().is_none()); assert_planning_makes_no_changes( &logctx.log, &blueprint1, - &input, - &collection, + &example.input, + &example.collection, TEST_NAME, ); @@ -4636,7 +4554,7 @@ pub(crate) mod test { ); // Manually specify a trivial TUF repo. - let mut input_builder = input.into_builder(); + let mut input_builder = example.input.clone().into_builder(); input_builder.policy_mut().tuf_repo = Some(TufRepoDescription { repo: TufRepoMeta { hash: ArtifactHash([0; 32]), @@ -4653,7 +4571,7 @@ pub(crate) mod test { &blueprint1, &input, "test_blueprint2", - &collection, + &example.collection, ) .expect("can't create planner") .with_rng(PlannerRng::from_seed((TEST_NAME, "bp2"))) @@ -4662,7 +4580,7 @@ pub(crate) mod test { // All zones should still be sourced from the install dataset. assert!( - blueprint1 + blueprint2 .all_omicron_zones(BlueprintZoneDisposition::is_in_service) .all(|(_, z)| matches!( z.image_source, @@ -4670,8 +4588,9 @@ pub(crate) mod test { )) ); - // Manually specify a TUF repo with some fake zone images. - // Only the name and kind of the artifacts matter. + // Manually specify a TUF repo with fake zone images for Crucible Pantry + // and Nexus. Only the name and kind of the artifacts matter. The Nexus + // artifact is only there to make sure the planner *doesn't* use it. let mut input_builder = input.into_builder(); let version = ArtifactVersion::new_static("1.0.0-freeform") .expect("can't parse artifact version"); @@ -4714,181 +4633,259 @@ pub(crate) mod test { }); // Some helper predicates for the assertions below. - let is_up_to_date = |zone: &BlueprintZoneConfig| -> bool { - zone.image_source == image_source + let is_old_nexus = |zone: &BlueprintZoneConfig| -> bool { + zone.zone_type.is_nexus() + && matches!( + zone.image_source, + BlueprintZoneImageSource::InstallDataset + ) }; - let is_up_to_date_nexus = |zone: &BlueprintZoneConfig| -> bool { - zone.zone_type.is_nexus() && is_up_to_date(zone) + let is_old_pantry = |zone: &BlueprintZoneConfig| -> bool { + zone.zone_type.is_crucible_pantry() + && matches!( + zone.image_source, + BlueprintZoneImageSource::InstallDataset + ) }; let is_up_to_date_pantry = |zone: &BlueprintZoneConfig| -> bool { - zone.zone_type.is_crucible_pantry() && is_up_to_date(zone) + zone.zone_type.is_crucible_pantry() + && zone.image_source == image_source }; - // Add a new Nexus zone. + // Request another Nexus zone. This should *not* use the new artifact, + // since not all of its dependencies can be updated. input_builder.policy_mut().target_nexus_zone_count = input_builder.policy_mut().target_nexus_zone_count + 1; - let input = input_builder.build(); - let blueprint3 = Planner::new_based_on( - log.clone(), - &blueprint1, - &input, - "test_blueprint3", - &collection, - ) - .expect("can't create planner") - .with_rng(PlannerRng::from_seed((TEST_NAME, "bp3"))) - .plan() - .expect("plan for non-trivial TUF repo"); - // There should be new Nexus & Crucible Pantry zones that use the new artifact. 
- let summary = blueprint3.diff_since_blueprint(&blueprint2); - let mut added_count = 0; - for sled_cfg in summary.diff.sleds.modified_values_diff() { - added_count += sled_cfg.zones.added.len(); - for z in sled_cfg.zones.added.values() { - assert!(is_up_to_date_nexus(z) || is_up_to_date_pantry(z)); - } + // We should now have three sets of expunge/add iterations for the + // Crucible Pantry zones. + let mut parent = blueprint2; + for i in 3..=8 { + let blueprint_name = format!("blueprint_{i}"); + update_collection_from_blueprint(&mut example, &parent); + let blueprint = Planner::new_based_on( + log.clone(), + &parent, + &input, + &blueprint_name, + &example.collection, + ) + .expect("can't create planner") + .with_rng(PlannerRng::from_seed((TEST_NAME, &blueprint_name))) + .plan() + .unwrap_or_else(|_| panic!("can't re-plan after {i} iterations")); + parent = blueprint; } - assert_eq!(added_count, 2); - - // Mutate the inventory to include the updated Nexus & Pantry zones. - let collection = update_collection_from_blueprint(&blueprint3); - // Replan with the new inventory. - let blueprint4 = Planner::new_based_on( - log.clone(), - &blueprint3, - &input, - "test_blueprint4", - &collection, - ) - .expect("can't create planner") - .with_rng(PlannerRng::from_seed((TEST_NAME, "bp4"))) - .plan() - .expect("replan with updated inventory"); - - // Now two Crucible Pantry zones should use the new artifact. + // All Crucible Pantries should now be updated. assert_eq!( - blueprint4 + parent .all_omicron_zones(BlueprintZoneDisposition::is_in_service) .filter(|(_, z)| is_up_to_date_pantry(z)) .count(), - 2 + CRUCIBLE_PANTRY_REDUNDANCY ); - // One more cycle for the third Crucible Pantry to update. - let collection = update_collection_from_blueprint(&blueprint4); - let blueprint5 = Planner::new_based_on( + // One more iteration for the last old zone to be expunged. + update_collection_from_blueprint(&mut example, &parent); + let blueprint = Planner::new_based_on( log.clone(), - &blueprint4, + &parent, &input, - "test_blueprint5", - &collection, + "last_blueprint", + &example.collection, ) .expect("can't create planner") - .with_rng(PlannerRng::from_seed((TEST_NAME, "bp5"))) + .with_rng(PlannerRng::from_seed((TEST_NAME, "last_bp"))) .plan() - .expect("replan with updated inventory"); + .expect("replan for last blueprint"); assert_eq!( - blueprint5 - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) - .filter(|(_, z)| is_up_to_date_pantry(z)) + blueprint + .all_omicron_zones(BlueprintZoneDisposition::is_expunged) + .filter(|(_, z)| is_old_pantry(z)) .count(), CRUCIBLE_PANTRY_REDUNDANCY ); - // The next few planning cycles should get us up-to-date Nexus zones. - let collection = update_collection_from_blueprint(&blueprint5); - let blueprint6 = Planner::new_based_on( - log.clone(), - &blueprint5, - &input, - "test_blueprint6", - &collection, - ) - .expect("can't create planner") - .with_rng(PlannerRng::from_seed((TEST_NAME, "bp6"))) - .plan() - .expect("replan with updated inventory"); + // We won't be able to update Nexus until *all* of its dependent zones + // are updated. Stay tuned for the next test! assert_eq!( - blueprint6 + parent .all_omicron_zones(BlueprintZoneDisposition::is_in_service) - .filter(|(_, z)| is_up_to_date_nexus(z)) + .filter(|(_, z)| is_old_nexus(z)) .count(), - 2, + NEXUS_REDUNDANCY + 1, ); - let collection = update_collection_from_blueprint(&blueprint6); - let blueprint7 = Planner::new_based_on( - log.clone(), - &blueprint6, + // Everything's up-to-date in Kansas City! 
+ update_collection_from_blueprint(&mut example, &blueprint); + assert_planning_makes_no_changes( + &logctx.log, + &blueprint, &input, - "test_blueprint7", - &collection, - ) - .expect("can't create planner") - .with_rng(PlannerRng::from_seed((TEST_NAME, "bp7"))) - .plan() - .expect("replan with updated inventory"); - assert_eq!( - blueprint7 - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) - .filter(|(_, z)| is_up_to_date_nexus(z)) - .count(), - 3, + &example.collection, + TEST_NAME, ); - let collection = update_collection_from_blueprint(&blueprint7); - let blueprint8 = Planner::new_based_on( - log.clone(), - &blueprint7, - &input, - "test_blueprint8", - &collection, + logctx.cleanup_successful(); + } + + #[test] + fn test_update_all_zones() { + static TEST_NAME: &str = "update_all_zones"; + let logctx = test_setup_log(TEST_NAME); + let log = logctx.log.clone(); + + // Use our example system. + let mut rng = SimRngState::from_seed(TEST_NAME); + let (mut example, blueprint1) = ExampleSystemBuilder::new_with_rng( + &logctx.log, + rng.next_system_rng(), ) - .expect("can't create planner") - .with_rng(PlannerRng::from_seed((TEST_NAME, "bp8"))) - .plan() - .expect("replan with updated inventory"); - assert_eq!( - blueprint8 + .build(); + verify_blueprint(&blueprint1); + + // All zones should be sourced from the install dataset by default. + assert!( + blueprint1 .all_omicron_zones(BlueprintZoneDisposition::is_in_service) - .filter(|(_, z)| is_up_to_date_nexus(z)) - .count(), - 4, + .all(|(_, z)| matches!( + z.image_source, + BlueprintZoneImageSource::InstallDataset + )) ); - // And the last pass is just to expunge an old Nexus zone. - let collection = update_collection_from_blueprint(&blueprint8); - let blueprint9 = Planner::new_based_on( - log.clone(), - &blueprint8, - &input, - "test_blueprint9", - &collection, - ) - .expect("can't create planner") - .with_rng(PlannerRng::from_seed((TEST_NAME, "bp9"))) - .plan() - .expect("replan with updated inventory"); + // Manually specify a TUF repo with fake images for all zones. + // Only the name and kind of the artifacts matter. + let mut input_builder = example.input.clone().into_builder(); + let version = ArtifactVersion::new_static("2.0.0-freeform") + .expect("can't parse artifact version"); + let fake_hash = ArtifactHash([0; 32]); + let image_source = BlueprintZoneImageSource::Artifact { + version: BlueprintZoneImageVersion::Available { + version: version.clone(), + }, + hash: fake_hash, + }; + macro_rules! 
zone_artifact { + ($name:expr) => { + TufArtifactMeta { + id: ArtifactId { + name: String::from($name), + version: version.clone(), + kind: ArtifactKind::from_known(KnownArtifactKind::Zone), + }, + hash: fake_hash, + size: 0, + } + }; + } + let tuf_repo = TufRepoDescription { + repo: TufRepoMeta { + hash: fake_hash, + targets_role_version: 0, + valid_until: Utc::now(), + system_version: Version::new(1, 0, 0), + file_name: String::from(""), + }, + artifacts: vec![ + zone_artifact!("ntp"), + zone_artifact!("clickhouse"), + zone_artifact!("clickhouse_keeper"), + zone_artifact!("cockroachdb"), + zone_artifact!("crucible-zone"), + zone_artifact!("crucible-pantry-zone"), + zone_artifact!("external-dns"), + zone_artifact!("internal-dns"), + zone_artifact!("ntp"), + zone_artifact!("nexus"), + zone_artifact!("oximeter"), + ], + }; + input_builder.policy_mut().tuf_repo = Some(tuf_repo); + let input = input_builder.build(); - let summary = blueprint9.diff_since_blueprint(&blueprint8); - assert_eq!(summary.diff.sleds.added.len(), 0); - assert_eq!(summary.diff.sleds.removed.len(), 0); - assert_eq!(summary.diff.sleds.modified().count(), 1); + let mut comp = OrderedComponent::ControlPlaneZone; + let mut parent = blueprint1; + const MAX_PLANNING_ITERATIONS: usize = 100; + for i in 2..=MAX_PLANNING_ITERATIONS { + let blueprint_name = format!("blueprint_{i}"); + update_collection_from_blueprint(&mut example, &parent); + let blueprint = Planner::new_based_on( + log.clone(), + &parent, + &input, + &blueprint_name, + &example.collection, + ) + .expect("can't create planner") + .with_rng(PlannerRng::from_seed((TEST_NAME, &blueprint_name))) + .plan() + .unwrap_or_else(|_| panic!("can't re-plan after {i} iterations")); - // Everything's up-to-date in Kansas City! - let collection = update_collection_from_blueprint(&blueprint9); - assert_planning_makes_no_changes( - &logctx.log, - &blueprint9, - &input, - &collection, - TEST_NAME, - ); + let summary = blueprint.diff_since_blueprint(&parent); + if summary.total_zones_added() == 0 + && summary.total_zones_removed() == 0 + && summary.total_zones_modified() == 0 + { + println!("planning converged after {i} iterations"); + assert!( + blueprint + .all_omicron_zones( + BlueprintZoneDisposition::is_in_service + ) + .all(|(_, zone)| zone.image_source == image_source) + ); + logctx.cleanup_successful(); + return; + } - logctx.cleanup_successful(); + match comp { + OrderedComponent::HostOs | OrderedComponent::SpRot => { + unreachable!("can only update zones") + } + OrderedComponent::ControlPlaneZone => { + assert!( + blueprint + .all_omicron_zones( + BlueprintZoneDisposition::is_in_service + ) + .all(|(_, zone)| match zone.zone_type.kind() { + ZoneKind::Nexus => { + if zone.image_source == image_source { + comp = comp.next().unwrap(); + true + } else { + zone.image_source + == BlueprintZoneImageSource::InstallDataset + } + } + _ => zone.image_source + == BlueprintZoneImageSource::InstallDataset + || zone.image_source == image_source, + }) + ); + } + OrderedComponent::NexusZone => { + assert!( + blueprint + .all_omicron_zones( + BlueprintZoneDisposition::is_in_service + ) + .all(|(_, zone)| match zone.zone_type.kind() { + ZoneKind::Nexus => zone.image_source + == BlueprintZoneImageSource::InstallDataset + || zone.image_source == image_source, + _ => zone.image_source == image_source, + }) + ); + } + } + + parent = blueprint; + } + panic!("did not converge after {MAX_PLANNING_ITERATIONS} iterations"); } } diff --git 
a/nexus/reconfigurator/planning/src/planner/update_sequence.rs b/nexus/reconfigurator/planning/src/planner/update_sequence.rs new file mode 100644 index 00000000000..b675dd3dec8 --- /dev/null +++ b/nexus/reconfigurator/planning/src/planner/update_sequence.rs @@ -0,0 +1,108 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Updatable components ordered by dependencies (RFD 565). + +use nexus_sled_agent_shared::inventory::ZoneKind; +use strum::{EnumIter, IntoEnumIterator as _}; + +/// Update sequence as defined by RFD 565 §6. +#[derive( + Debug, Clone, Copy, PartialEq, Eq, Hash, EnumIter, PartialOrd, Ord, +)] +pub enum OrderedComponent { + HostOs, + SpRot, + ControlPlaneZone, + NexusZone, +} + +/// To implement the checks in RFD 565 §9, we need to go backwards +/// (and mabye forwards) through the component ordering. The simple +/// linear scans here are fine as long as `OrderedComponent` has only +/// a few variants; if it starts getting large (e.g., programatically +/// generated from `ls-apis`), we can switch to something faster. +impl OrderedComponent { + #[allow(dead_code)] + pub fn next(&self) -> Option { + let mut found = false; + for comp in OrderedComponent::iter() { + if found { + return Some(comp); + } else if comp == *self { + found = true; + } + } + None + } + + pub fn prev(&self) -> Option { + let mut prev = None; + for comp in OrderedComponent::iter() { + if comp == *self { + return prev; + } + prev = Some(comp); + } + prev + } +} + +impl From for OrderedComponent { + fn from(zone_kind: ZoneKind) -> Self { + match zone_kind { + ZoneKind::Nexus => Self::NexusZone, + _ => Self::ControlPlaneZone, + } + } +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn ordered_component_next_prev() { + // Exhaustive checks ok for a few variants, revisit if it grows. 
+ assert_eq!(OrderedComponent::HostOs.prev(), None); + assert_eq!( + OrderedComponent::HostOs.next(), + Some(OrderedComponent::SpRot) + ); + assert_eq!( + OrderedComponent::SpRot.prev(), + Some(OrderedComponent::HostOs) + ); + assert_eq!( + OrderedComponent::SpRot.next(), + Some(OrderedComponent::ControlPlaneZone) + ); + assert_eq!( + OrderedComponent::ControlPlaneZone.prev(), + Some(OrderedComponent::SpRot) + ); + assert_eq!( + OrderedComponent::ControlPlaneZone.next(), + Some(OrderedComponent::NexusZone) + ); + assert_eq!( + OrderedComponent::NexusZone.prev(), + Some(OrderedComponent::ControlPlaneZone) + ); + assert_eq!(OrderedComponent::NexusZone.next(), None); + assert!(OrderedComponent::HostOs < OrderedComponent::NexusZone); + } + + #[test] + fn ordered_component_from_zone_kind() { + assert!(matches!( + OrderedComponent::from(ZoneKind::CruciblePantry), + OrderedComponent::ControlPlaneZone + )); + assert!(matches!( + OrderedComponent::from(ZoneKind::Nexus), + OrderedComponent::NexusZone + )); + } +} diff --git a/nexus/reconfigurator/planning/src/system.rs b/nexus/reconfigurator/planning/src/system.rs index f4b947ecc59..de3e93e821d 100644 --- a/nexus/reconfigurator/planning/src/system.rs +++ b/nexus/reconfigurator/planning/src/system.rs @@ -98,6 +98,7 @@ pub struct SystemDescription { clickhouse_policy: Option, oximeter_read_policy: OximeterReadPolicy, tuf_repo: Option, + old_repo: Option, } impl SystemDescription { @@ -178,6 +179,7 @@ impl SystemDescription { clickhouse_policy: None, oximeter_read_policy: OximeterReadPolicy::new(1), tuf_repo: None, + old_repo: None, } } @@ -431,6 +433,7 @@ impl SystemDescription { clickhouse_policy: self.clickhouse_policy.clone(), oximeter_read_policy: self.oximeter_read_policy.clone(), tuf_repo: self.tuf_repo.clone(), + old_repo: self.old_repo.clone(), }; let mut builder = PlanningInputBuilder::new( policy, diff --git a/nexus/reconfigurator/preparation/src/lib.rs b/nexus/reconfigurator/preparation/src/lib.rs index 48da2704e69..dea9e9b7791 100644 --- a/nexus/reconfigurator/preparation/src/lib.rs +++ b/nexus/reconfigurator/preparation/src/lib.rs @@ -7,6 +7,7 @@ use anyhow::Context; use futures::StreamExt; use nexus_db_model::DnsGroup; +use nexus_db_model::Generation; use nexus_db_queries::context::OpContext; use nexus_db_queries::db::DataStore; use nexus_db_queries::db::datastore::DataStoreDnsTest; @@ -80,6 +81,7 @@ pub struct PlanningInputFromDb<'a> { pub clickhouse_policy: Option, pub oximeter_read_policy: OximeterReadPolicy, pub tuf_repo: Option, + pub old_repo: Option, pub log: &'a Logger, } @@ -160,6 +162,25 @@ impl PlanningInputFromDb<'_> { .into_external(), ), }; + let prev_release = if let Some(prev) = target_release.generation.prev() + { + datastore + .target_release_get_generation(opctx, Generation(prev)) + .await + .internal_context("fetching current target release")? + } else { + None + }; + let old_repo = match prev_release.and_then(|r| r.tuf_repo_id) { + None => None, + Some(repo_id) => Some( + datastore + .update_tuf_repo_get_by_id(opctx, repo_id.into()) + .await + .internal_context("fetching target release repo")? 
+ .into_external(), + ), + }; let oximeter_read_policy = datastore .oximeter_read_policy_get_latest(opctx) @@ -187,6 +208,7 @@ impl PlanningInputFromDb<'_> { clickhouse_policy, oximeter_read_policy, tuf_repo, + old_repo, } .build() .internal_context("assembling planning_input")?; @@ -211,6 +233,7 @@ impl PlanningInputFromDb<'_> { clickhouse_policy: self.clickhouse_policy.clone(), oximeter_read_policy: self.oximeter_read_policy.clone(), tuf_repo: self.tuf_repo.clone(), + old_repo: self.old_repo.clone(), }; let mut builder = PlanningInputBuilder::new( policy, diff --git a/nexus/types/src/deployment.rs b/nexus/types/src/deployment.rs index 16502409892..8eadfa9f01b 100644 --- a/nexus/types/src/deployment.rs +++ b/nexus/types/src/deployment.rs @@ -27,6 +27,7 @@ use nexus_sled_agent_shared::inventory::OmicronZonesConfig; use nexus_sled_agent_shared::inventory::ZoneKind; use omicron_common::api::external::ByteCount; use omicron_common::api::external::Generation; +use omicron_common::api::external::TufArtifactMeta; use omicron_common::api::internal::shared::DatasetKind; use omicron_common::disk::CompressionAlgorithm; use omicron_common::disk::DatasetConfig; @@ -999,6 +1000,17 @@ pub enum BlueprintZoneImageSource { Artifact { version: BlueprintZoneImageVersion, hash: ArtifactHash }, } +impl BlueprintZoneImageSource { + pub fn from_available_artifact(artifact: &TufArtifactMeta) -> Self { + BlueprintZoneImageSource::Artifact { + version: BlueprintZoneImageVersion::Available { + version: artifact.id.version.clone(), + }, + hash: artifact.hash, + } + } +} + impl From for OmicronZoneImageSource { fn from(source: BlueprintZoneImageSource) -> Self { match source { diff --git a/nexus/types/src/deployment/planning_input.rs b/nexus/types/src/deployment/planning_input.rs index e4b300b39bb..86e9fb02d94 100644 --- a/nexus/types/src/deployment/planning_input.rs +++ b/nexus/types/src/deployment/planning_input.rs @@ -157,6 +157,10 @@ impl PlanningInput { self.policy.tuf_repo.as_ref() } + pub fn old_repo(&self) -> Option<&TufRepoDescription> { + self.policy.old_repo.as_ref() + } + pub fn service_ip_pool_ranges(&self) -> &[IpRange] { &self.policy.service_ip_pool_ranges } @@ -926,10 +930,16 @@ pub struct Policy { /// Desired system software release repository. /// - /// New zones will use artifacts in this repo as their image sources, + /// New zones may use artifacts in this repo as their image sources, /// and at most one extant zone may be modified to use it or replaced /// with one that does. pub tuf_repo: Option, + + /// Previous system software release repository. + /// + /// New zones deployed mid-update may use artifacts in this repo as + /// their image sources. See RFD 565 §9. 
+ pub old_repo: Option, } #[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)] @@ -1100,6 +1110,7 @@ impl PlanningInputBuilder { clickhouse_policy: None, oximeter_read_policy: OximeterReadPolicy::new(1), tuf_repo: None, + old_repo: None, }, internal_dns_version: Generation::new(), external_dns_version: Generation::new(), From 194a8a3d3631acec4a52080333ed94694294adb3 Mon Sep 17 00:00:00 2001 From: Alex Plotnick Date: Mon, 28 Apr 2025 10:01:51 -0600 Subject: [PATCH 05/32] Don't update an already up-to-date zone --- nexus/reconfigurator/planning/src/planner.rs | 61 ++++++++++++-------- 1 file changed, 36 insertions(+), 25 deletions(-) diff --git a/nexus/reconfigurator/planning/src/planner.rs b/nexus/reconfigurator/planning/src/planner.rs index dd39b53fb7c..98d1c1b7f98 100644 --- a/nexus/reconfigurator/planning/src/planner.rs +++ b/nexus/reconfigurator/planning/src/planner.rs @@ -955,31 +955,42 @@ impl<'a> Planner<'a> { ) -> Result<(), Error> { let zone_kind = zone.zone_type.kind(); let image_source = self.blueprint.zone_image_source(zone_kind); - match zone_kind { - ZoneKind::Crucible - | ZoneKind::CockroachDb - | ZoneKind::Clickhouse => { - info!( - self.log, "updating zone image source in-place"; - "sled_id" => %sled_id, - "zone_id" => %zone.id, - "kind" => ?zone.zone_type.kind(), - "image_source" => %image_source, - ); - self.blueprint.sled_set_zone_source( - sled_id, - zone.id, - image_source, - )?; - } - _ => { - info!( - self.log, "expunging out-of-date zone"; - "sled_id" => %sled_id, - "zone_id" => %zone.id, - "kind" => ?zone.zone_type.kind(), - ); - self.blueprint.sled_expunge_zone(sled_id, zone.id)?; + if zone.image_source == image_source { + // This should only happen in the event of a planning error above. + warn!( + self.log, "zone is already up-to-date"; + "sled_id" => %sled_id, + "zone_id" => %zone.id, + "kind" => ?zone.zone_type.kind(), + "image_source" => %image_source, + ); + } else { + match zone_kind { + ZoneKind::Crucible + | ZoneKind::CockroachDb + | ZoneKind::Clickhouse => { + info!( + self.log, "updating zone image source in-place"; + "sled_id" => %sled_id, + "zone_id" => %zone.id, + "kind" => ?zone.zone_type.kind(), + "image_source" => %image_source, + ); + self.blueprint.sled_set_zone_source( + sled_id, + zone.id, + image_source, + )?; + } + _ => { + info!( + self.log, "expunging out-of-date zone"; + "sled_id" => %sled_id, + "zone_id" => %zone.id, + "kind" => ?zone.zone_type.kind(), + ); + self.blueprint.sled_expunge_zone(sled_id, zone.id)?; + } } } Ok(()) From 50c116974728de6371106fd532d9884dfb8039a6 Mon Sep 17 00:00:00 2001 From: Alex Plotnick Date: Thu, 1 May 2025 20:33:37 -0600 Subject: [PATCH 06/32] Don't trust inventory zones' image_source --- nexus/db-model/src/inventory.rs | 1 + nexus/reconfigurator/planning/src/planner.rs | 16 ++++++---------- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/nexus/db-model/src/inventory.rs b/nexus/db-model/src/inventory.rs index 71ebe4e145d..e76bd4be617 100644 --- a/nexus/db-model/src/inventory.rs +++ b/nexus/db-model/src/inventory.rs @@ -1632,6 +1632,7 @@ impl InvOmicronZone { .filesystem_pool .map(|id| ZpoolName::new_external(id.into())), zone_type, + // FIXME image_source: OmicronZoneImageSource::InstallDataset, }) } diff --git a/nexus/reconfigurator/planning/src/planner.rs b/nexus/reconfigurator/planning/src/planner.rs index 98d1c1b7f98..b50db73eca3 100644 --- a/nexus/reconfigurator/planning/src/planner.rs +++ b/nexus/reconfigurator/planning/src/planner.rs @@ -887,27 +887,23 @@ impl<'a> Planner<'a> { 
.map(|(id, _details)| id) .collect::>(); - // Wait for all current zones to become up-to-date. + // Wait for all current zones to appear in the inventory. + // It would be nice if we could check their image source, + // but the inventory doesn't report that correctly: see + // . let inventory_zones = self .inventory .all_omicron_zones() .map(|z| (z.id, z)) .collect::>(); for sled_id in sleds.iter().cloned() { - if self + if !self .blueprint .current_sled_zones( sled_id, BlueprintZoneDisposition::is_in_service, ) - .any(|zone| { - inventory_zones - .get(&zone.id) - .map(|z| { - z.image_source != zone.image_source.clone().into() - }) - .unwrap_or(false) - }) + .all(|zone| inventory_zones.contains_key(&zone.id)) { info!( self.log, "zones not yet up-to-date"; From ad7103e76316cb5c5507afd03b58feec0d3d536d Mon Sep 17 00:00:00 2001 From: Alex Plotnick Date: Sat, 3 May 2025 02:55:30 +0000 Subject: [PATCH 07/32] Fix failing tests --- .../output/cmd-expunge-newly-added-stdout | 2 +- nexus/reconfigurator/planning/src/planner.rs | 84 ++++++++++++++++--- 2 files changed, 74 insertions(+), 12 deletions(-) diff --git a/dev-tools/reconfigurator-cli/tests/output/cmd-expunge-newly-added-stdout b/dev-tools/reconfigurator-cli/tests/output/cmd-expunge-newly-added-stdout index c1c471f0476..1e1b1fe879d 100644 --- a/dev-tools/reconfigurator-cli/tests/output/cmd-expunge-newly-added-stdout +++ b/dev-tools/reconfigurator-cli/tests/output/cmd-expunge-newly-added-stdout @@ -628,7 +628,7 @@ INFO sufficient InternalDns zones exist in plan, desired_count: 3, current_count INFO added zone to sled, sled_id: a88790de-5962-4871-8686-61c1fd5b7094, kind: ExternalDns INFO sufficient Nexus zones exist in plan, desired_count: 3, current_count: 3 INFO sufficient Oximeter zones exist in plan, desired_count: 0, current_count: 0 -INFO all zones up-to-date +INFO zones not yet up-to-date, sled_id: a88790de-5962-4871-8686-61c1fd5b7094 INFO will ensure cockroachdb setting, setting: cluster.preserve_downgrade_option, value: DoNotModify generated blueprint 9c998c1d-1a7b-440a-ae0c-40f781dea6e2 based on parent blueprint 366b0b68-d80e-4bc1-abd3-dc69837847e0 diff --git a/nexus/reconfigurator/planning/src/planner.rs b/nexus/reconfigurator/planning/src/planner.rs index b50db73eca3..8ca2810b0f3 100644 --- a/nexus/reconfigurator/planning/src/planner.rs +++ b/nexus/reconfigurator/planning/src/planner.rs @@ -4542,13 +4542,13 @@ pub(crate) mod test { // We should start with no specified TUF repo and nothing to do. assert!(example.input.tuf_repo().is_none()); - assert_planning_makes_no_changes( - &logctx.log, - &blueprint1, - &example.input, - &example.collection, - TEST_NAME, - ); + // assert_planning_makes_no_changes( + // &logctx.log, + // &blueprint1, + // &example.input, + // &example.collection, + // TEST_NAME, + // ); // All zones should be sourced from the install dataset by default. assert!( @@ -4659,16 +4659,46 @@ pub(crate) mod test { && zone.image_source == image_source }; - // Request another Nexus zone. This should *not* use the new artifact, - // since not all of its dependencies can be updated. + // Request another Nexus zone. input_builder.policy_mut().target_nexus_zone_count = input_builder.policy_mut().target_nexus_zone_count + 1; let input = input_builder.build(); + // Check that there is a new nexus zone that does *not* use the new + // artifact (since not all of its dependencies are updated yet). 
+ update_collection_from_blueprint(&mut example, &blueprint2); + let blueprint3 = Planner::new_based_on( + log.clone(), + &blueprint2, + &input, + "test_blueprint3", + &example.collection, + ) + .expect("can't create planner") + .with_rng(PlannerRng::from_seed((TEST_NAME, "bp3"))) + .plan() + .expect("can't re-plan for new Nexus zone"); + { + let summary = blueprint3.diff_since_blueprint(&blueprint2); + for sled in summary.diff.sleds.modified_values_diff() { + assert!(sled.zones.removed.is_empty()); + assert_eq!(sled.zones.added.len(), 1); + let added = sled.zones.added.values().next().unwrap(); + assert!(matches!( + &added.zone_type, + BlueprintZoneType::Nexus(_) + )); + assert!(matches!( + &added.image_source, + BlueprintZoneImageSource::InstallDataset + )); + } + } + // We should now have three sets of expunge/add iterations for the // Crucible Pantry zones. - let mut parent = blueprint2; - for i in 3..=8 { + let mut parent = blueprint3; + for i in 4..=9 { let blueprint_name = format!("blueprint_{i}"); update_collection_from_blueprint(&mut example, &parent); let blueprint = Planner::new_based_on( @@ -4682,6 +4712,38 @@ pub(crate) mod test { .with_rng(PlannerRng::from_seed((TEST_NAME, &blueprint_name))) .plan() .unwrap_or_else(|_| panic!("can't re-plan after {i} iterations")); + + let summary = blueprint.diff_since_blueprint(&parent); + for sled in summary.diff.sleds.modified_values_diff() { + if i % 2 == 0 { + assert!(sled.zones.added.is_empty()); + assert!(sled.zones.removed.is_empty()); + assert_eq!( + sled.zones + .common + .iter() + .filter(|(_, z)| matches!( + z.after.zone_type, + BlueprintZoneType::CruciblePantry(_) + ) && matches!( + z.after.disposition, + BlueprintZoneDisposition::Expunged { .. } + )) + .count(), + 1 + ); + } else { + assert!(sled.zones.removed.is_empty()); + assert_eq!(sled.zones.added.len(), 1); + let added = sled.zones.added.values().next().unwrap(); + assert!(matches!( + &added.zone_type, + BlueprintZoneType::CruciblePantry(_) + )); + assert_eq!(added.image_source, image_source); + } + } + parent = blueprint; } From 55b8e0ef597676861f4edf1b6410066a481e528f Mon Sep 17 00:00:00 2001 From: Alex Plotnick Date: Sat, 10 May 2025 11:00:11 -0600 Subject: [PATCH 08/32] =?UTF-8?q?Rename=20datastore=20methods:=20update=5F?= =?UTF-8?q?tuf=5F*=20=E2=86=92=20tuf=5F*?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- nexus/db-queries/src/db/datastore/deployment.rs | 2 +- nexus/db-queries/src/db/datastore/target_release.rs | 2 +- nexus/db-queries/src/db/datastore/update.rs | 10 +++++----- nexus/reconfigurator/preparation/src/lib.rs | 4 ++-- .../app/background/tasks/tuf_artifact_replication.rs | 9 ++------- nexus/src/app/update.rs | 4 ++-- nexus/src/external_api/http_entrypoints.rs | 2 +- nexus/tests/integration_tests/updates.rs | 8 ++++---- 8 files changed, 18 insertions(+), 23 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/deployment.rs b/nexus/db-queries/src/db/datastore/deployment.rs index f4af7a58a10..3349c9a20c7 100644 --- a/nexus/db-queries/src/db/datastore/deployment.rs +++ b/nexus/db-queries/src/db/datastore/deployment.rs @@ -2243,7 +2243,7 @@ mod tests { const SYSTEM_HASH: ArtifactHash = ArtifactHash([3; 32]); datastore - .update_tuf_repo_insert( + .tuf_repo_insert( opctx, &TufRepoDescription { repo: TufRepoMeta { diff --git a/nexus/db-queries/src/db/datastore/target_release.rs b/nexus/db-queries/src/db/datastore/target_release.rs index e5886699ac7..a6f82edff99 100644 --- 
a/nexus/db-queries/src/db/datastore/target_release.rs +++ b/nexus/db-queries/src/db/datastore/target_release.rs @@ -210,7 +210,7 @@ mod test { .parse() .expect("SHA256('')"); let repo = datastore - .update_tuf_repo_insert( + .tuf_repo_insert( opctx, &TufRepoDescription { repo: TufRepoMeta { diff --git a/nexus/db-queries/src/db/datastore/update.rs b/nexus/db-queries/src/db/datastore/update.rs index 6a9b62f2ef0..c27d44d797a 100644 --- a/nexus/db-queries/src/db/datastore/update.rs +++ b/nexus/db-queries/src/db/datastore/update.rs @@ -76,7 +76,7 @@ impl DataStore { /// `TufRepoDescription` if one was already found. (This is not an upsert, /// because if we know about an existing repo but with different contents, /// we reject that.) - pub async fn update_tuf_repo_insert( + pub async fn tuf_repo_insert( &self, opctx: &OpContext, description: &external::TufRepoDescription, @@ -108,7 +108,7 @@ impl DataStore { } /// Returns a TUF repo description. - pub async fn update_tuf_repo_get_by_id( + pub async fn tuf_repo_get_by_id( &self, opctx: &OpContext, repo_id: TypedUuid, @@ -141,7 +141,7 @@ impl DataStore { } /// Returns the TUF repo description corresponding to this system version. - pub async fn update_tuf_repo_get_by_version( + pub async fn tuf_repo_get_by_version( &self, opctx: &OpContext, system_version: SemverVersion, @@ -174,7 +174,7 @@ impl DataStore { } /// Returns the list of all TUF repo artifacts known to the system. - pub async fn update_tuf_artifact_list( + pub async fn tuf_list_repos( &self, opctx: &OpContext, generation: Generation, @@ -194,7 +194,7 @@ impl DataStore { } /// Returns the current TUF repo generation number. - pub async fn update_tuf_generation_get( + pub async fn tuf_get_generation( &self, opctx: &OpContext, ) -> LookupResult { diff --git a/nexus/reconfigurator/preparation/src/lib.rs b/nexus/reconfigurator/preparation/src/lib.rs index dea9e9b7791..59e6a8ab6aa 100644 --- a/nexus/reconfigurator/preparation/src/lib.rs +++ b/nexus/reconfigurator/preparation/src/lib.rs @@ -156,7 +156,7 @@ impl PlanningInputFromDb<'_> { None => None, Some(repo_id) => Some( datastore - .update_tuf_repo_get_by_id(opctx, repo_id.into()) + .tuf_repo_get_by_id(opctx, repo_id.into()) .await .internal_context("fetching target release repo")? .into_external(), @@ -175,7 +175,7 @@ impl PlanningInputFromDb<'_> { None => None, Some(repo_id) => Some( datastore - .update_tuf_repo_get_by_id(opctx, repo_id.into()) + .tuf_repo_get_by_id(opctx, repo_id.into()) .await .internal_context("fetching target release repo")? 
.into_external(), diff --git a/nexus/src/app/background/tasks/tuf_artifact_replication.rs b/nexus/src/app/background/tasks/tuf_artifact_replication.rs index 0ede39b6bdc..4f7fd5dd70d 100644 --- a/nexus/src/app/background/tasks/tuf_artifact_replication.rs +++ b/nexus/src/app/background/tasks/tuf_artifact_replication.rs @@ -590,18 +590,13 @@ impl ArtifactReplication { &self, opctx: &OpContext, ) -> Result<(ArtifactConfig, Inventory)> { - let generation = - self.datastore.update_tuf_generation_get(opctx).await?; + let generation = self.datastore.tuf_get_generation(opctx).await?; let mut inventory = Inventory::default(); let mut paginator = Paginator::new(SQL_BATCH_SIZE); while let Some(p) = paginator.next() { let batch = self .datastore - .update_tuf_artifact_list( - opctx, - generation, - &p.current_pagparams(), - ) + .tuf_list_repos(opctx, generation, &p.current_pagparams()) .await?; paginator = p.found_batch(&batch, &|a| a.id.into_untyped_uuid()); for artifact in batch { diff --git a/nexus/src/app/update.rs b/nexus/src/app/update.rs index 5ba1b9e0a40..a843fb072b6 100644 --- a/nexus/src/app/update.rs +++ b/nexus/src/app/update.rs @@ -43,7 +43,7 @@ impl super::Nexus { // Now store the artifacts in the database. let response = self .db_datastore - .update_tuf_repo_insert(opctx, artifacts_with_plan.description()) + .tuf_repo_insert(opctx, artifacts_with_plan.description()) .await .map_err(HttpError::from)?; @@ -88,7 +88,7 @@ impl super::Nexus { let tuf_repo_description = self .db_datastore - .update_tuf_repo_get_by_version(opctx, system_version.into()) + .tuf_repo_get_by_version(opctx, system_version.into()) .await .map_err(HttpError::from)?; diff --git a/nexus/src/external_api/http_entrypoints.rs b/nexus/src/external_api/http_entrypoints.rs index ed0089f4c8d..adafe160f61 100644 --- a/nexus/src/external_api/http_entrypoints.rs +++ b/nexus/src/external_api/http_entrypoints.rs @@ -6575,7 +6575,7 @@ impl NexusExternalApi for NexusExternalApiImpl { // Fetch the TUF repo metadata and update the target release. let tuf_repo_id = nexus .datastore() - .update_tuf_repo_get_by_version(&opctx, system_version.into()) + .tuf_repo_get_by_version(&opctx, system_version.into()) .await? .repo .id; diff --git a/nexus/tests/integration_tests/updates.rs b/nexus/tests/integration_tests/updates.rs index 9e2a6ed95a2..e030a6342b1 100644 --- a/nexus/tests/integration_tests/updates.rs +++ b/nexus/tests/integration_tests/updates.rs @@ -114,7 +114,7 @@ async fn test_repo_upload() -> Result<()> { let opctx = OpContext::for_tests(cptestctx.logctx.log.new(o!()), datastore.clone()); assert_eq!( - datastore.update_tuf_generation_get(&opctx).await.unwrap(), + datastore.tuf_get_generation(&opctx).await.unwrap(), 1u32.into() ); @@ -162,7 +162,7 @@ async fn test_repo_upload() -> Result<()> { })); // The generation number should now be 2. assert_eq!( - datastore.update_tuf_generation_get(&opctx).await.unwrap(), + datastore.tuf_get_generation(&opctx).await.unwrap(), 2u32.into() ); @@ -215,7 +215,7 @@ async fn test_repo_upload() -> Result<()> { // We didn't insert a new repo, so the generation number should still be 2. assert_eq!( - datastore.update_tuf_generation_get(&opctx).await.unwrap(), + datastore.tuf_get_generation(&opctx).await.unwrap(), 2u32.into() ); @@ -361,7 +361,7 @@ async fn test_repo_upload() -> Result<()> { } // No artifacts changed, so the generation number should still be 2... 
assert_eq!( - datastore.update_tuf_generation_get(&opctx).await.unwrap(), + datastore.tuf_get_generation(&opctx).await.unwrap(), 2u32.into() ); // ... and the task should have nothing to do and should immediately From a7c1e1752972e95bf0f61774a6febf03df394130 Mon Sep 17 00:00:00 2001 From: Alex Plotnick Date: Sat, 10 May 2025 11:15:57 -0600 Subject: [PATCH 09/32] Fix typed UUID --- nexus/db-queries/src/db/datastore/update.rs | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/update.rs b/nexus/db-queries/src/db/datastore/update.rs index c27d44d797a..f38aa73d5d7 100644 --- a/nexus/db-queries/src/db/datastore/update.rs +++ b/nexus/db-queries/src/db/datastore/update.rs @@ -17,7 +17,9 @@ use diesel::result::Error as DieselError; use nexus_db_errors::OptionalError; use nexus_db_errors::{ErrorHandler, public_error_from_diesel}; use nexus_db_lookup::DbConnection; -use nexus_db_model::{ArtifactHash, TufArtifact, TufRepo, TufRepoDescription}; +use nexus_db_model::{ + ArtifactHash, TufArtifact, TufRepo, TufRepoDescription, to_db_typed_uuid, +}; use omicron_common::api::external::{ self, CreateResult, DataPageParams, Generation, ListResultVec, LookupResult, LookupType, ResourceType, TufRepoInsertStatus, @@ -118,9 +120,8 @@ impl DataStore { use nexus_db_schema::schema::tuf_repo::dsl; let conn = self.pool_connection_authorized(opctx).await?; - let repo_id = repo_id.into_untyped_uuid(); let repo = dsl::tuf_repo - .filter(dsl::id.eq(repo_id)) + .filter(dsl::id.eq(to_db_typed_uuid(repo_id))) .select(TufRepo::as_select()) .first_async::(&*conn) .await @@ -129,7 +130,7 @@ impl DataStore { e, ErrorHandler::NotFoundByLookup( ResourceType::TufRepo, - LookupType::ById(repo_id), + LookupType::ById(repo_id.into_untyped_uuid()), ), ) })?; From 57f7154f60dc1fc0bfcbaf0b9df725137cc0777b Mon Sep 17 00:00:00 2001 From: Alex Plotnick Date: Sat, 10 May 2025 11:31:09 -0600 Subject: [PATCH 10/32] =?UTF-8?q?Rename=20ControlPlaneZone=20=E2=86=92=20N?= =?UTF-8?q?onNexusOmicronZone?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../planning/src/blueprint_builder/builder.rs | 2 +- nexus/reconfigurator/planning/src/planner.rs | 4 ++-- .../planning/src/planner/update_sequence.rs | 14 +++++++------- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs b/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs index d3f7854d089..6b33dcd6633 100644 --- a/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs +++ b/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs @@ -1930,7 +1930,7 @@ impl<'a> BlueprintBuilder<'a> { let new_artifact = Self::zone_image_artifact(new_repo, zone_kind); let old_artifact = Self::zone_image_artifact(old_repo, zone_kind); if let Some(prev) = OrderedComponent::from(zone_kind).prev() { - if prev >= OrderedComponent::ControlPlaneZone + if prev >= OrderedComponent::NonNexusOmicronZone && self.sled_ids_with_zones().any(|sled_id| { self.current_sled_zones( sled_id, diff --git a/nexus/reconfigurator/planning/src/planner.rs b/nexus/reconfigurator/planning/src/planner.rs index 8ca2810b0f3..e94bffd6093 100644 --- a/nexus/reconfigurator/planning/src/planner.rs +++ b/nexus/reconfigurator/planning/src/planner.rs @@ -4876,7 +4876,7 @@ pub(crate) mod test { input_builder.policy_mut().tuf_repo = Some(tuf_repo); let input = input_builder.build(); - let mut comp = OrderedComponent::ControlPlaneZone; + let mut comp = 
OrderedComponent::NonNexusOmicronZone; let mut parent = blueprint1; const MAX_PLANNING_ITERATIONS: usize = 100; for i in 2..=MAX_PLANNING_ITERATIONS { @@ -4915,7 +4915,7 @@ pub(crate) mod test { OrderedComponent::HostOs | OrderedComponent::SpRot => { unreachable!("can only update zones") } - OrderedComponent::ControlPlaneZone => { + OrderedComponent::NonNexusOmicronZone => { assert!( blueprint .all_omicron_zones( diff --git a/nexus/reconfigurator/planning/src/planner/update_sequence.rs b/nexus/reconfigurator/planning/src/planner/update_sequence.rs index b675dd3dec8..873ec520751 100644 --- a/nexus/reconfigurator/planning/src/planner/update_sequence.rs +++ b/nexus/reconfigurator/planning/src/planner/update_sequence.rs @@ -14,7 +14,7 @@ use strum::{EnumIter, IntoEnumIterator as _}; pub enum OrderedComponent { HostOs, SpRot, - ControlPlaneZone, + NonNexusOmicronZone, NexusZone, } @@ -53,7 +53,7 @@ impl From for OrderedComponent { fn from(zone_kind: ZoneKind) -> Self { match zone_kind { ZoneKind::Nexus => Self::NexusZone, - _ => Self::ControlPlaneZone, + _ => Self::NonNexusOmicronZone, } } } @@ -76,19 +76,19 @@ mod test { ); assert_eq!( OrderedComponent::SpRot.next(), - Some(OrderedComponent::ControlPlaneZone) + Some(OrderedComponent::NonNexusOmicronZone) ); assert_eq!( - OrderedComponent::ControlPlaneZone.prev(), + OrderedComponent::NonNexusOmicronZone.prev(), Some(OrderedComponent::SpRot) ); assert_eq!( - OrderedComponent::ControlPlaneZone.next(), + OrderedComponent::NonNexusOmicronZone.next(), Some(OrderedComponent::NexusZone) ); assert_eq!( OrderedComponent::NexusZone.prev(), - Some(OrderedComponent::ControlPlaneZone) + Some(OrderedComponent::NonNexusOmicronZone) ); assert_eq!(OrderedComponent::NexusZone.next(), None); assert!(OrderedComponent::HostOs < OrderedComponent::NexusZone); @@ -98,7 +98,7 @@ mod test { fn ordered_component_from_zone_kind() { assert!(matches!( OrderedComponent::from(ZoneKind::CruciblePantry), - OrderedComponent::ControlPlaneZone + OrderedComponent::NonNexusOmicronZone )); assert!(matches!( OrderedComponent::from(ZoneKind::Nexus), From 77b21563f48fd868565136fa212a4428fa56a683 Mon Sep 17 00:00:00 2001 From: Alex Plotnick Date: Sat, 10 May 2025 12:14:21 -0600 Subject: [PATCH 11/32] Simplify OrderedComponent logic --- .../planning/src/blueprint_builder/builder.rs | 36 ++++---- nexus/reconfigurator/planning/src/planner.rs | 2 +- .../planning/src/planner/update_sequence.rs | 86 +------------------ 3 files changed, 23 insertions(+), 101 deletions(-) diff --git a/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs b/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs index 6b33dcd6633..c64fe5bc6bd 100644 --- a/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs +++ b/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs @@ -1929,28 +1929,32 @@ impl<'a> BlueprintBuilder<'a> { let old_repo = self.input.old_repo(); let new_artifact = Self::zone_image_artifact(new_repo, zone_kind); let old_artifact = Self::zone_image_artifact(old_repo, zone_kind); - if let Some(prev) = OrderedComponent::from(zone_kind).prev() { - if prev >= OrderedComponent::NonNexusOmicronZone - && self.sled_ids_with_zones().any(|sled_id| { + match OrderedComponent::from(zone_kind) { + // Nexus can only be updated if all non-Nexus zones have been updated. 
+ OrderedComponent::NexusZone => { + if self.sled_ids_with_zones().any(|sled_id| { self.current_sled_zones( sled_id, BlueprintZoneDisposition::is_in_service, ) - .any(|z| { - let kind = z.zone_type.kind(); - let old_artifact = - Self::zone_image_artifact(old_repo, kind); - OrderedComponent::from(kind) == prev - && z.image_source == old_artifact + .filter(|z| { + OrderedComponent::from(z.zone_type.kind()) + == OrderedComponent::NonNexusOmicronZone }) - }) - { - old_artifact - } else { - new_artifact + .any(|z| z.image_source != new_artifact) + }) { + // Some dependent zone is not up-to-date. + old_artifact + } else { + // All dependent zones are up-to-date. + new_artifact + } + } + // It's always safe to use the new artifact for non-Nexus zones. + OrderedComponent::NonNexusOmicronZone => new_artifact, + OrderedComponent::HostOs | OrderedComponent::SpRot => { + unreachable!("can't get an OS or SP/RoT image from a zone") } - } else { - new_artifact } } } diff --git a/nexus/reconfigurator/planning/src/planner.rs b/nexus/reconfigurator/planning/src/planner.rs index e94bffd6093..5ea6497e81d 100644 --- a/nexus/reconfigurator/planning/src/planner.rs +++ b/nexus/reconfigurator/planning/src/planner.rs @@ -4924,7 +4924,7 @@ pub(crate) mod test { .all(|(_, zone)| match zone.zone_type.kind() { ZoneKind::Nexus => { if zone.image_source == image_source { - comp = comp.next().unwrap(); + comp = OrderedComponent::NexusZone; true } else { zone.image_source diff --git a/nexus/reconfigurator/planning/src/planner/update_sequence.rs b/nexus/reconfigurator/planning/src/planner/update_sequence.rs index 873ec520751..066bc18d9b6 100644 --- a/nexus/reconfigurator/planning/src/planner/update_sequence.rs +++ b/nexus/reconfigurator/planning/src/planner/update_sequence.rs @@ -5,12 +5,10 @@ //! Updatable components ordered by dependencies (RFD 565). use nexus_sled_agent_shared::inventory::ZoneKind; -use strum::{EnumIter, IntoEnumIterator as _}; /// Update sequence as defined by RFD 565 §6. -#[derive( - Debug, Clone, Copy, PartialEq, Eq, Hash, EnumIter, PartialOrd, Ord, -)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)] +#[allow(dead_code)] pub enum OrderedComponent { HostOs, SpRot, @@ -18,37 +16,6 @@ pub enum OrderedComponent { NexusZone, } -/// To implement the checks in RFD 565 §9, we need to go backwards -/// (and mabye forwards) through the component ordering. The simple -/// linear scans here are fine as long as `OrderedComponent` has only -/// a few variants; if it starts getting large (e.g., programatically -/// generated from `ls-apis`), we can switch to something faster. -impl OrderedComponent { - #[allow(dead_code)] - pub fn next(&self) -> Option { - let mut found = false; - for comp in OrderedComponent::iter() { - if found { - return Some(comp); - } else if comp == *self { - found = true; - } - } - None - } - - pub fn prev(&self) -> Option { - let mut prev = None; - for comp in OrderedComponent::iter() { - if comp == *self { - return prev; - } - prev = Some(comp); - } - prev - } -} - impl From for OrderedComponent { fn from(zone_kind: ZoneKind) -> Self { match zone_kind { @@ -57,52 +24,3 @@ impl From for OrderedComponent { } } } - -#[cfg(test)] -mod test { - use super::*; - - #[test] - fn ordered_component_next_prev() { - // Exhaustive checks ok for a few variants, revisit if it grows. 
- assert_eq!(OrderedComponent::HostOs.prev(), None); - assert_eq!( - OrderedComponent::HostOs.next(), - Some(OrderedComponent::SpRot) - ); - assert_eq!( - OrderedComponent::SpRot.prev(), - Some(OrderedComponent::HostOs) - ); - assert_eq!( - OrderedComponent::SpRot.next(), - Some(OrderedComponent::NonNexusOmicronZone) - ); - assert_eq!( - OrderedComponent::NonNexusOmicronZone.prev(), - Some(OrderedComponent::SpRot) - ); - assert_eq!( - OrderedComponent::NonNexusOmicronZone.next(), - Some(OrderedComponent::NexusZone) - ); - assert_eq!( - OrderedComponent::NexusZone.prev(), - Some(OrderedComponent::NonNexusOmicronZone) - ); - assert_eq!(OrderedComponent::NexusZone.next(), None); - assert!(OrderedComponent::HostOs < OrderedComponent::NexusZone); - } - - #[test] - fn ordered_component_from_zone_kind() { - assert!(matches!( - OrderedComponent::from(ZoneKind::CruciblePantry), - OrderedComponent::NonNexusOmicronZone - )); - assert!(matches!( - OrderedComponent::from(ZoneKind::Nexus), - OrderedComponent::NexusZone - )); - } -} From e31fc60985db98643ba702fbf2e9c8279915e62e Mon Sep 17 00:00:00 2001 From: Alex Plotnick Date: Sat, 10 May 2025 12:41:49 -0600 Subject: [PATCH 12/32] Simplify out-of-date zone collection --- nexus/reconfigurator/planning/src/planner.rs | 22 +++++++++----------- 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/nexus/reconfigurator/planning/src/planner.rs b/nexus/reconfigurator/planning/src/planner.rs index 5ea6497e81d..384a442ebb9 100644 --- a/nexus/reconfigurator/planning/src/planner.rs +++ b/nexus/reconfigurator/planning/src/planner.rs @@ -37,7 +37,6 @@ use omicron_uuid_kinds::PhysicalDiskUuid; use omicron_uuid_kinds::SledUuid; use slog::error; use slog::{Logger, info, warn}; -use std::collections::BTreeMap; use std::collections::BTreeSet; use std::str::FromStr; @@ -888,22 +887,22 @@ impl<'a> Planner<'a> { .collect::>(); // Wait for all current zones to appear in the inventory. - // It would be nice if we could check their image source, - // but the inventory doesn't report that correctly: see + // TODO-correctness: We should check their image source, + // but the inventory doesn't report that yet; see // . 
let inventory_zones = self .inventory .all_omicron_zones() - .map(|z| (z.id, z)) - .collect::>(); - for sled_id in sleds.iter().cloned() { + .map(|z| z.id) + .collect::>(); + for &sled_id in &sleds { if !self .blueprint .current_sled_zones( sled_id, BlueprintZoneDisposition::is_in_service, ) - .all(|zone| inventory_zones.contains_key(&zone.id)) + .all(|zone| inventory_zones.contains(&zone.id)) { info!( self.log, "zones not yet up-to-date"; @@ -917,19 +916,18 @@ impl<'a> Planner<'a> { let mut out_of_date_zones = sleds .into_iter() .flat_map(|sled_id| { - self.blueprint + let blueprint = &self.blueprint; + blueprint .current_sled_zones( sled_id, BlueprintZoneDisposition::is_in_service, ) - .filter_map(|zone| { + .filter_map(move |zone| { (zone.image_source - != self - .blueprint + != blueprint .zone_image_source(zone.zone_type.kind())) .then(|| (sled_id, zone.clone())) }) - .collect::>() }) .collect::>(); From 7f1f52244c6a4d9eac2ad677323a8903f32065f5 Mon Sep 17 00:00:00 2001 From: Alex Plotnick Date: Sat, 10 May 2025 12:43:56 -0600 Subject: [PATCH 13/32] Make planning error an error --- nexus/reconfigurator/planning/src/blueprint_builder/builder.rs | 2 ++ nexus/reconfigurator/planning/src/planner.rs | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs b/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs index c64fe5bc6bd..50686c096d0 100644 --- a/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs +++ b/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs @@ -125,6 +125,8 @@ pub enum Error { AllocateExternalNetworking(#[from] ExternalNetworkingError), #[error("can only have {INTERNAL_DNS_REDUNDANCY} internal DNS servers")] PolicySpecifiesTooManyInternalDnsServers, + #[error("zone is already up-to-date and should not be updated")] + ZoneAlreadyUpToDate, } /// Describes the result of an idempotent "ensure" operation diff --git a/nexus/reconfigurator/planning/src/planner.rs b/nexus/reconfigurator/planning/src/planner.rs index 384a442ebb9..02f8b5abe1f 100644 --- a/nexus/reconfigurator/planning/src/planner.rs +++ b/nexus/reconfigurator/planning/src/planner.rs @@ -951,13 +951,14 @@ impl<'a> Planner<'a> { let image_source = self.blueprint.zone_image_source(zone_kind); if zone.image_source == image_source { // This should only happen in the event of a planning error above. 
- warn!( + error!( self.log, "zone is already up-to-date"; "sled_id" => %sled_id, "zone_id" => %zone.id, "kind" => ?zone.zone_type.kind(), "image_source" => %image_source, ); + return Err(Error::ZoneAlreadyUpToDate); } else { match zone_kind { ZoneKind::Crucible From 050933ef758a22e473be7f0b18091e3565d20e74 Mon Sep 17 00:00:00 2001 From: Alex Plotnick Date: Sat, 10 May 2025 12:47:26 -0600 Subject: [PATCH 14/32] Explicitly list which zone kinds get in-place updates --- nexus/reconfigurator/planning/src/planner.rs | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/nexus/reconfigurator/planning/src/planner.rs b/nexus/reconfigurator/planning/src/planner.rs index 02f8b5abe1f..21d2d5b7605 100644 --- a/nexus/reconfigurator/planning/src/planner.rs +++ b/nexus/reconfigurator/planning/src/planner.rs @@ -962,8 +962,10 @@ impl<'a> Planner<'a> { } else { match zone_kind { ZoneKind::Crucible - | ZoneKind::CockroachDb - | ZoneKind::Clickhouse => { + | ZoneKind::Clickhouse + | ZoneKind::ClickhouseKeeper + | ZoneKind::ClickhouseServer + | ZoneKind::CockroachDb => { info!( self.log, "updating zone image source in-place"; "sled_id" => %sled_id, @@ -977,7 +979,13 @@ impl<'a> Planner<'a> { image_source, )?; } - _ => { + ZoneKind::BoundaryNtp + | ZoneKind::CruciblePantry + | ZoneKind::ExternalDns + | ZoneKind::InternalDns + | ZoneKind::InternalNtp + | ZoneKind::Nexus + | ZoneKind::Oximeter => { info!( self.log, "expunging out-of-date zone"; "sled_id" => %sled_id, From 2daabf5126ad1c83ac231c04aa75182e7cbaeea7 Mon Sep 17 00:00:00 2001 From: Alex Plotnick Date: Sat, 10 May 2025 12:50:39 -0600 Subject: [PATCH 15/32] Uncomment test assertion --- nexus/reconfigurator/planning/src/planner.rs | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/nexus/reconfigurator/planning/src/planner.rs b/nexus/reconfigurator/planning/src/planner.rs index 21d2d5b7605..9f6fd623a90 100644 --- a/nexus/reconfigurator/planning/src/planner.rs +++ b/nexus/reconfigurator/planning/src/planner.rs @@ -4549,13 +4549,13 @@ pub(crate) mod test { // We should start with no specified TUF repo and nothing to do. assert!(example.input.tuf_repo().is_none()); - // assert_planning_makes_no_changes( - // &logctx.log, - // &blueprint1, - // &example.input, - // &example.collection, - // TEST_NAME, - // ); + assert_planning_makes_no_changes( + &logctx.log, + &blueprint1, + &example.input, + &example.collection, + TEST_NAME, + ); // All zones should be sourced from the install dataset by default. assert!( From 88733d257718496cd1492da26c34dd409eb82c42 Mon Sep 17 00:00:00 2001 From: Alex Plotnick Date: Sat, 10 May 2025 13:00:53 -0600 Subject: [PATCH 16/32] Type-safe fake-zone names --- nexus/reconfigurator/planning/src/planner.rs | 70 ++++++++------------ 1 file changed, 28 insertions(+), 42 deletions(-) diff --git a/nexus/reconfigurator/planning/src/planner.rs b/nexus/reconfigurator/planning/src/planner.rs index 9f6fd623a90..66d3ffffb76 100644 --- a/nexus/reconfigurator/planning/src/planner.rs +++ b/nexus/reconfigurator/planning/src/planner.rs @@ -4532,6 +4532,20 @@ pub(crate) mod test { example.system.to_collection_builder().unwrap().build(); } + macro_rules! 
fake_zone_artifact { + ($kind: ident, $version: expr) => { + TufArtifactMeta { + id: ArtifactId { + name: ZoneKind::$kind.artifact_name().to_string(), + version: $version, + kind: ArtifactKind::from_known(KnownArtifactKind::Zone), + }, + hash: ArtifactHash([0; 32]), + size: 0, + } + }; + } + #[test] fn test_update_crucible_pantry() { static TEST_NAME: &str = "update_crucible_pantry"; @@ -4616,24 +4630,8 @@ pub(crate) mod test { hash: fake_hash, }; let artifacts = vec![ - TufArtifactMeta { - id: ArtifactId { - name: String::from("crucible-pantry-zone"), - version: version.clone(), - kind: ArtifactKind::from_known(KnownArtifactKind::Zone), - }, - hash: fake_hash, - size: 0, - }, - TufArtifactMeta { - id: ArtifactId { - name: String::from("nexus"), - version: version.clone(), - kind: ArtifactKind::from_known(KnownArtifactKind::Zone), - }, - hash: fake_hash, - size: 0, - }, + fake_zone_artifact!(CruciblePantry, version.clone()), + fake_zone_artifact!(Nexus, version.clone()), ]; input_builder.policy_mut().tuf_repo = Some(TufRepoDescription { repo: TufRepoMeta { @@ -4845,19 +4843,6 @@ pub(crate) mod test { }, hash: fake_hash, }; - macro_rules! zone_artifact { - ($name:expr) => { - TufArtifactMeta { - id: ArtifactId { - name: String::from($name), - version: version.clone(), - kind: ArtifactKind::from_known(KnownArtifactKind::Zone), - }, - hash: fake_hash, - size: 0, - } - }; - } let tuf_repo = TufRepoDescription { repo: TufRepoMeta { hash: fake_hash, @@ -4867,17 +4852,18 @@ pub(crate) mod test { file_name: String::from(""), }, artifacts: vec![ - zone_artifact!("ntp"), - zone_artifact!("clickhouse"), - zone_artifact!("clickhouse_keeper"), - zone_artifact!("cockroachdb"), - zone_artifact!("crucible-zone"), - zone_artifact!("crucible-pantry-zone"), - zone_artifact!("external-dns"), - zone_artifact!("internal-dns"), - zone_artifact!("ntp"), - zone_artifact!("nexus"), - zone_artifact!("oximeter"), + fake_zone_artifact!(BoundaryNtp, version.clone()), + fake_zone_artifact!(Clickhouse, version.clone()), + fake_zone_artifact!(ClickhouseKeeper, version.clone()), + fake_zone_artifact!(ClickhouseServer, version.clone()), + fake_zone_artifact!(CockroachDb, version.clone()), + fake_zone_artifact!(Crucible, version.clone()), + fake_zone_artifact!(CruciblePantry, version.clone()), + fake_zone_artifact!(ExternalDns, version.clone()), + fake_zone_artifact!(InternalDns, version.clone()), + fake_zone_artifact!(InternalNtp, version.clone()), + fake_zone_artifact!(Nexus, version.clone()), + fake_zone_artifact!(Oximeter, version.clone()), ], }; input_builder.policy_mut().tuf_repo = Some(tuf_repo); From 3ae171ef4d07b9be399c1c68838741001c8d5b5a Mon Sep 17 00:00:00 2001 From: Alex Plotnick Date: Sat, 10 May 2025 13:53:53 -0600 Subject: [PATCH 17/32] Fix doc bug from renamed method --- nexus/db-queries/src/db/datastore/update.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nexus/db-queries/src/db/datastore/update.rs b/nexus/db-queries/src/db/datastore/update.rs index f38aa73d5d7..6dcee53e5c8 100644 --- a/nexus/db-queries/src/db/datastore/update.rs +++ b/nexus/db-queries/src/db/datastore/update.rs @@ -31,7 +31,7 @@ use swrite::{SWrite, swrite}; use tufaceous_artifact::ArtifactVersion; use uuid::Uuid; -/// The return value of [`DataStore::update_tuf_repo_insert`]. +/// The return value of [`DataStore::tuf_repo_insert`]. /// /// This is similar to [`external::TufRepoInsertResponse`], but uses /// nexus-db-model's types instead of external types. 
From 3611b428c99ed9d2b993b2eb7db2f4dc57b2aeb1 Mon Sep 17 00:00:00 2001 From: Alex Plotnick Date: Wed, 21 May 2025 12:03:16 -0600 Subject: [PATCH 18/32] Refactor zone update readiness check --- .../planning/src/blueprint_builder/builder.rs | 64 +++++---- nexus/reconfigurator/planning/src/planner.rs | 131 +++++++++++------- .../planning/src/planner/update_sequence.rs | 8 +- 3 files changed, 120 insertions(+), 83 deletions(-) diff --git a/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs b/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs index a8352019acf..c63cb78e5c1 100644 --- a/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs +++ b/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs @@ -1947,34 +1947,48 @@ impl<'a> BlueprintBuilder<'a> { ) -> BlueprintZoneImageSource { let new_repo = self.input.tuf_repo(); let old_repo = self.input.old_repo(); - let new_artifact = Self::zone_image_artifact(new_repo, zone_kind); - let old_artifact = Self::zone_image_artifact(old_repo, zone_kind); + Self::zone_image_artifact( + if self.zone_is_ready_for_update(zone_kind, new_repo) { + new_repo + } else { + old_repo + }, + zone_kind, + ) + } + + /// Return `true` iff a zone of the given kind is ready to be updated; + /// i.e., its dependencies have been updated, or its data sufficiently + /// replicated, etc. + fn zone_is_ready_for_update( + &self, + zone_kind: ZoneKind, + new_repo: Option<&TufRepoDescription>, + ) -> bool { match OrderedComponent::from(zone_kind) { - // Nexus can only be updated if all non-Nexus zones have been updated. - OrderedComponent::NexusZone => { - if self.sled_ids_with_zones().any(|sled_id| { - self.current_sled_zones( - sled_id, - BlueprintZoneDisposition::is_in_service, - ) - .filter(|z| { - OrderedComponent::from(z.zone_type.kind()) - == OrderedComponent::NonNexusOmicronZone - }) - .any(|z| z.image_source != new_artifact) - }) { - // Some dependent zone is not up-to-date. - old_artifact - } else { - // All dependent zones are up-to-date. - new_artifact - } - } - // It's always safe to use the new artifact for non-Nexus zones. - OrderedComponent::NonNexusOmicronZone => new_artifact, OrderedComponent::HostOs | OrderedComponent::SpRot => { - unreachable!("can't get an OS or SP/RoT image from a zone") + todo!("can't yet update Host OS or SP/RoT") } + OrderedComponent::OmicronZone(kind) => match kind { + ZoneKind::Nexus => { + // Nexus can only be updated if all non-Nexus zones have been updated, + // i.e., their image source is an artifact from the new repo. + self.sled_ids_with_zones().all(|sled_id| { + self.current_sled_zones( + sled_id, + BlueprintZoneDisposition::is_in_service, + ) + .filter(|z| z.zone_type.kind() != ZoneKind::Nexus) + .all(|z| { + z.image_source + == Self::zone_image_artifact(new_repo, z.kind()) + }) + }) + } + // + // ZoneKind::CockroachDb => todo!("check cluster status in inventory"), + _ => true, // other zone kinds have no special dependencies + }, } } diff --git a/nexus/reconfigurator/planning/src/planner.rs b/nexus/reconfigurator/planning/src/planner.rs index 655a44b7004..02387f7aedd 100644 --- a/nexus/reconfigurator/planning/src/planner.rs +++ b/nexus/reconfigurator/planning/src/planner.rs @@ -4548,6 +4548,8 @@ pub(crate) mod test { }; } + /// Ensure that dependent zones (here just Crucible Pantry) are updated + /// before Nexus. 
#[test] fn test_update_crucible_pantry() { static TEST_NAME: &str = "update_crucible_pantry"; @@ -4654,6 +4656,9 @@ pub(crate) mod test { BlueprintZoneImageSource::InstallDataset ) }; + let is_up_to_date_nexus = |zone: &BlueprintZoneConfig| -> bool { + zone.zone_type.is_nexus() && zone.image_source == image_source + }; let is_old_pantry = |zone: &BlueprintZoneConfig| -> bool { zone.zone_type.is_crucible_pantry() && matches!( @@ -4765,7 +4770,7 @@ pub(crate) mod test { // One more iteration for the last old zone to be expunged. update_collection_from_blueprint(&mut example, &parent); - let blueprint = Planner::new_based_on( + let blueprint10 = Planner::new_based_on( log.clone(), &parent, &input, @@ -4777,16 +4782,17 @@ pub(crate) mod test { .plan() .expect("replan for last blueprint"); + // All old Pantry zones should now be expunged. assert_eq!( - blueprint + blueprint10 .all_omicron_zones(BlueprintZoneDisposition::is_expunged) .filter(|(_, z)| is_old_pantry(z)) .count(), CRUCIBLE_PANTRY_REDUNDANCY ); - // We won't be able to update Nexus until *all* of its dependent zones - // are updated. Stay tuned for the next test! + // Now we can update Nexus, because all of its dependent zones + // are up-to-date w/r/t the new repo. assert_eq!( parent .all_omicron_zones(BlueprintZoneDisposition::is_in_service) @@ -4794,12 +4800,58 @@ pub(crate) mod test { .count(), NEXUS_REDUNDANCY + 1, ); + let mut parent = blueprint10; + for i in 11..=17 { + let blueprint_name = format!("blueprint_{i}"); + update_collection_from_blueprint(&mut example, &parent); + + let blueprint = Planner::new_based_on( + log.clone(), + &parent, + &input, + &blueprint_name, + &example.collection, + ) + .expect("can't create planner") + .with_rng(PlannerRng::from_seed((TEST_NAME, &blueprint_name))) + .plan() + .unwrap_or_else(|_| panic!("can't re-plan after {i} iterations")); + + let summary = blueprint.diff_since_blueprint(&parent); + eprintln!("{}", summary.display()); + for sled in summary.diff.sleds.modified_values_diff() { + if i % 2 == 0 { + assert!(sled.zones.added.is_empty()); + assert!(sled.zones.removed.is_empty()); + } else { + assert!(sled.zones.removed.is_empty()); + assert_eq!(sled.zones.added.len(), 1); + let added = sled.zones.added.values().next().unwrap(); + assert!(matches!( + &added.zone_type, + BlueprintZoneType::Nexus(_) + )); + assert_eq!(added.image_source, image_source); + } + } + + parent = blueprint; + } // Everything's up-to-date in Kansas City! - update_collection_from_blueprint(&mut example, &blueprint); + let blueprint17 = parent; + assert_eq!( + blueprint17 + .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .filter(|(_, z)| is_up_to_date_nexus(z)) + .count(), + NEXUS_REDUNDANCY + 1, + ); + + update_collection_from_blueprint(&mut example, &blueprint17); assert_planning_makes_no_changes( &logctx.log, - &blueprint, + &blueprint17, &input, &example.collection, TEST_NAME, @@ -4808,6 +4860,7 @@ pub(crate) mod test { logctx.cleanup_successful(); } + /// Ensure that planning to update all zones terminates. #[test] fn test_update_all_zones() { static TEST_NAME: &str = "update_all_zones"; @@ -4871,9 +4924,17 @@ pub(crate) mod test { input_builder.policy_mut().tuf_repo = Some(tuf_repo); let input = input_builder.build(); - let mut comp = OrderedComponent::NonNexusOmicronZone; - let mut parent = blueprint1; + /// Expected number of planner iterations required to converge. 
+ /// If incidental planner work changes this value occasionally, + /// that's fine; but if we find we're changing it all the time, + /// we should probably drop it and keep just the maximum below. + const EXP_PLANNING_ITERATIONS: usize = 57; + + /// Planning must not take more than this number of iterations. const MAX_PLANNING_ITERATIONS: usize = 100; + assert!(EXP_PLANNING_ITERATIONS < MAX_PLANNING_ITERATIONS); + + let mut parent = blueprint1; for i in 2..=MAX_PLANNING_ITERATIONS { let blueprint_name = format!("blueprint_{i}"); update_collection_from_blueprint(&mut example, &parent); @@ -4894,62 +4955,28 @@ pub(crate) mod test { && summary.total_zones_removed() == 0 && summary.total_zones_modified() == 0 { - println!("planning converged after {i} iterations"); assert!( blueprint .all_omicron_zones( BlueprintZoneDisposition::is_in_service ) - .all(|(_, zone)| zone.image_source == image_source) + .all(|(_, zone)| zone.image_source == image_source), + "failed to update all zones" + ); + + assert_eq!( + i, EXP_PLANNING_ITERATIONS, + "expected {EXP_PLANNING_ITERATIONS} iterations but converged in {i}" ); + println!("planning converged after {i} iterations"); + logctx.cleanup_successful(); return; } - match comp { - OrderedComponent::HostOs | OrderedComponent::SpRot => { - unreachable!("can only update zones") - } - OrderedComponent::NonNexusOmicronZone => { - assert!( - blueprint - .all_omicron_zones( - BlueprintZoneDisposition::is_in_service - ) - .all(|(_, zone)| match zone.zone_type.kind() { - ZoneKind::Nexus => { - if zone.image_source == image_source { - comp = OrderedComponent::NexusZone; - true - } else { - zone.image_source - == BlueprintZoneImageSource::InstallDataset - } - } - _ => zone.image_source - == BlueprintZoneImageSource::InstallDataset - || zone.image_source == image_source, - }) - ); - } - OrderedComponent::NexusZone => { - assert!( - blueprint - .all_omicron_zones( - BlueprintZoneDisposition::is_in_service - ) - .all(|(_, zone)| match zone.zone_type.kind() { - ZoneKind::Nexus => zone.image_source - == BlueprintZoneImageSource::InstallDataset - || zone.image_source == image_source, - _ => zone.image_source == image_source, - }) - ); - } - } - parent = blueprint; } + panic!("did not converge after {MAX_PLANNING_ITERATIONS} iterations"); } } diff --git a/nexus/reconfigurator/planning/src/planner/update_sequence.rs b/nexus/reconfigurator/planning/src/planner/update_sequence.rs index 066bc18d9b6..38d28d06e2a 100644 --- a/nexus/reconfigurator/planning/src/planner/update_sequence.rs +++ b/nexus/reconfigurator/planning/src/planner/update_sequence.rs @@ -12,15 +12,11 @@ use nexus_sled_agent_shared::inventory::ZoneKind; pub enum OrderedComponent { HostOs, SpRot, - NonNexusOmicronZone, - NexusZone, + OmicronZone(ZoneKind), } impl From for OrderedComponent { fn from(zone_kind: ZoneKind) -> Self { - match zone_kind { - ZoneKind::Nexus => Self::NexusZone, - _ => Self::NonNexusOmicronZone, - } + Self::OmicronZone(zone_kind) } } From 70edce738cb7f8e3bb2071d3ca650d8fa033e885 Mon Sep 17 00:00:00 2001 From: Alex Plotnick Date: Thu, 22 May 2025 15:30:30 -0600 Subject: [PATCH 19/32] Check zone image source in inventory Also remove now-superfluous `OrderedComponent` enum. 
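In rough terms, the readiness check this patch introduces can be sketched as follows (a paraphrase of the planner.rs hunk below; `blueprint_zones` here is a stand-in for the builder's `current_sled_zones(..)` iterator over in-service zones):

    // Map every ledgered zone in inventory to the image source it reports.
    let inventory_zones: BTreeMap<_, _> = inventory
        .all_ledgered_omicron_zones()
        .map(|z| (z.id, z.image_source.clone()))
        .collect();

    // A sled only counts as ready once every in-service zone in the
    // blueprint shows up in inventory with the expected image source.
    let sled_ready = blueprint_zones.all(|zone| {
        let expected = zone.image_source.clone().into();
        inventory_zones.get(&zone.id) == Some(&expected)
    });

This closes the TODO-correctness note carried by the old code, which could only wait for zone IDs to appear in inventory because inventory did not yet report image sources.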
--- .../output/cmds-expunge-newly-added-stdout | 2 +- .../planning/src/blueprint_builder/builder.rs | 42 +++---- nexus/reconfigurator/planning/src/planner.rs | 110 +++++------------- .../planning/src/planner/update_sequence.rs | 22 ---- 4 files changed, 45 insertions(+), 131 deletions(-) delete mode 100644 nexus/reconfigurator/planning/src/planner/update_sequence.rs diff --git a/dev-tools/reconfigurator-cli/tests/output/cmds-expunge-newly-added-stdout b/dev-tools/reconfigurator-cli/tests/output/cmds-expunge-newly-added-stdout index e6f2c202c8a..c5068c3b170 100644 --- a/dev-tools/reconfigurator-cli/tests/output/cmds-expunge-newly-added-stdout +++ b/dev-tools/reconfigurator-cli/tests/output/cmds-expunge-newly-added-stdout @@ -628,7 +628,7 @@ INFO sufficient InternalDns zones exist in plan, desired_count: 3, current_count INFO added zone to sled, sled_id: a88790de-5962-4871-8686-61c1fd5b7094, kind: ExternalDns INFO sufficient Nexus zones exist in plan, desired_count: 3, current_count: 3 INFO sufficient Oximeter zones exist in plan, desired_count: 0, current_count: 0 -INFO zones not yet up-to-date, sled_id: a88790de-5962-4871-8686-61c1fd5b7094 +INFO some zones not yet up-to-date, sled_id: a88790de-5962-4871-8686-61c1fd5b7094 INFO will ensure cockroachdb setting, setting: cluster.preserve_downgrade_option, value: DoNotModify generated blueprint 9c998c1d-1a7b-440a-ae0c-40f781dea6e2 based on parent blueprint 366b0b68-d80e-4bc1-abd3-dc69837847e0 diff --git a/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs b/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs index cd08fb46725..458156d7982 100644 --- a/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs +++ b/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs @@ -14,7 +14,6 @@ use crate::blueprint_editor::ExternalSnatNetworkingChoice; use crate::blueprint_editor::NoAvailableDnsSubnets; use crate::blueprint_editor::SledEditError; use crate::blueprint_editor::SledEditor; -use crate::planner::OrderedComponent; use crate::planner::ZoneExpungeReason; use crate::planner::rng::PlannerRng; use anyhow::Context as _; @@ -1965,30 +1964,25 @@ impl<'a> BlueprintBuilder<'a> { zone_kind: ZoneKind, new_repo: Option<&TufRepoDescription>, ) -> bool { - match OrderedComponent::from(zone_kind) { - OrderedComponent::HostOs | OrderedComponent::SpRot => { - todo!("can't yet update Host OS or SP/RoT") - } - OrderedComponent::OmicronZone(kind) => match kind { - ZoneKind::Nexus => { - // Nexus can only be updated if all non-Nexus zones have been updated, - // i.e., their image source is an artifact from the new repo. - self.sled_ids_with_zones().all(|sled_id| { - self.current_sled_zones( - sled_id, - BlueprintZoneDisposition::is_in_service, - ) - .filter(|z| z.zone_type.kind() != ZoneKind::Nexus) - .all(|z| { - z.image_source - == Self::zone_image_artifact(new_repo, z.kind()) - }) + match zone_kind { + ZoneKind::Nexus => { + // Nexus can only be updated if all non-Nexus zones have been updated, + // i.e., their image source is an artifact from the new repo. 
+ self.sled_ids_with_zones().all(|sled_id| { + self.current_sled_zones( + sled_id, + BlueprintZoneDisposition::is_in_service, + ) + .filter(|z| z.zone_type.kind() != ZoneKind::Nexus) + .all(|z| { + z.image_source + == Self::zone_image_artifact(new_repo, z.kind()) }) - } - // - // ZoneKind::CockroachDb => todo!("check cluster status in inventory"), - _ => true, // other zone kinds have no special dependencies - }, + }) + } + // + // ZoneKind::CockroachDb => todo!("check cluster status in inventory"), + _ => true, // other zone kinds have no special dependencies } } diff --git a/nexus/reconfigurator/planning/src/planner.rs b/nexus/reconfigurator/planning/src/planner.rs index 74127702f77..4cb65c02546 100644 --- a/nexus/reconfigurator/planning/src/planner.rs +++ b/nexus/reconfigurator/planning/src/planner.rs @@ -37,6 +37,7 @@ use omicron_uuid_kinds::PhysicalDiskUuid; use omicron_uuid_kinds::SledUuid; use slog::error; use slog::{Logger, info, warn}; +use std::collections::BTreeMap; use std::collections::BTreeSet; use std::str::FromStr; @@ -45,11 +46,9 @@ use self::omicron_zone_placement::OmicronZonePlacement; use self::omicron_zone_placement::OmicronZonePlacementSledState; pub use self::rng::PlannerRng; pub use self::rng::SledPlannerRng; -pub(crate) use self::update_sequence::OrderedComponent; mod omicron_zone_placement; pub(crate) mod rng; -mod update_sequence; pub struct Planner<'a> { log: Logger, @@ -905,15 +904,12 @@ impl<'a> Planner<'a> { .map(|(id, _details)| id) .collect::>(); - // Wait for all current zones to appear in the inventory. - // TODO-correctness: We should check their image source, - // but the inventory doesn't report that yet; see - // . + // Wait for zones to appear up-to-date in the inventory. let inventory_zones = self .inventory - .all_omicron_zones() - .map(|z| z.id) - .collect::>(); + .all_ledgered_omicron_zones() + .map(|z| (z.id, z.image_source.clone())) + .collect::>(); for &sled_id in &sleds { if !self .blueprint @@ -921,18 +917,21 @@ impl<'a> Planner<'a> { sled_id, BlueprintZoneDisposition::is_in_service, ) - .all(|zone| inventory_zones.contains(&zone.id)) + .all(|zone| { + let image_source = zone.image_source.clone().into(); + inventory_zones.get(&zone.id) == Some(&image_source) + }) { info!( - self.log, "zones not yet up-to-date"; + self.log, "some zones not yet up-to-date"; "sled_id" => %sled_id, ); return Ok(()); } } - // Choose among the out-of-date zones one with the fewest dependencies. - let mut out_of_date_zones = sleds + // Update the first out-of-date zone. + let out_of_date_zones = sleds .into_iter() .flat_map(|sled_id| { let blueprint = &self.blueprint; @@ -949,8 +948,6 @@ impl<'a> Planner<'a> { }) }) .collect::>(); - - sort_zones_by_deps(&mut out_of_date_zones); if let Some((sled_id, zone)) = out_of_date_zones.first() { return self.update_or_expunge_zone(*sled_id, zone); } @@ -1113,15 +1110,6 @@ impl<'a> Planner<'a> { } } -/// Sort in place a vector of sled-specific zone configurations by API -/// dependencies (RFD 565 §6). -fn sort_zones_by_deps(zones: &mut Vec<(SledUuid, BlueprintZoneConfig)>) { - zones.sort_by(|a, b| { - OrderedComponent::from(a.1.zone_type.kind()) - .cmp(&OrderedComponent::from(b.1.zone_type.kind())) - }) -} - /// The reason a sled's zones need to be expunged. 
/// /// This is used only for introspection and logging -- it's not part of the @@ -4511,37 +4499,6 @@ pub(crate) mod test { logctx.cleanup_successful(); } - #[test] - fn test_sort_zones_by_deps() { - static TEST_NAME: &str = "sort_zones_by_deps"; - let logctx = test_setup_log(TEST_NAME); - let log = logctx.log.clone(); - - // Collect zone configs from our example system. - let (_collection, _input, blueprint) = example(&log, TEST_NAME); - let mut zones = blueprint - .all_omicron_zones(BlueprintZoneDisposition::any) - .map(|(sled_id, z)| (sled_id, z.clone())) - .collect::>(); - - // Sort them and verify the ordering constraints. - sort_zones_by_deps(&mut zones); - let mut nexus = false; - for kind in zones.iter().map(|(_, z)| z.zone_type.kind()) { - match kind { - ZoneKind::Nexus => { - nexus = true; - } - _ => { - assert!(!nexus); - } - } - } - assert!(nexus); - - logctx.cleanup_successful(); - } - /// Manually update the example system's inventory collection's zones /// from a blueprint. fn update_collection_from_blueprint( @@ -4549,15 +4506,13 @@ pub(crate) mod test { blueprint: &Blueprint, ) { for (&sled_id, config) in blueprint.sleds.iter() { - let sled_config = config.clone().into_in_service_sled_config(); - let zones_config = OmicronZonesConfig { - generation: sled_config.generation, - zones: sled_config.zones.into_iter().collect(), - }; example .system - .sled_set_omicron_zones(sled_id, zones_config) - .expect("can't set omicron zones for sled"); + .sled_set_omicron_config( + sled_id, + config.clone().into_in_service_sled_config(), + ) + .expect("can't set sled config"); } example.collection = example.system.to_collection_builder().unwrap().build(); @@ -4787,33 +4742,20 @@ pub(crate) mod test { parent = blueprint; } + let blueprint9 = parent; // All Crucible Pantries should now be updated. assert_eq!( - parent + blueprint9 .all_omicron_zones(BlueprintZoneDisposition::is_in_service) .filter(|(_, z)| is_up_to_date_pantry(z)) .count(), CRUCIBLE_PANTRY_REDUNDANCY ); - // One more iteration for the last old zone to be expunged. - update_collection_from_blueprint(&mut example, &parent); - let blueprint10 = Planner::new_based_on( - log.clone(), - &parent, - &input, - "last_blueprint", - &example.collection, - ) - .expect("can't create planner") - .with_rng(PlannerRng::from_seed((TEST_NAME, "last_bp"))) - .plan() - .expect("replan for last blueprint"); - // All old Pantry zones should now be expunged. assert_eq!( - blueprint10 + blueprint9 .all_omicron_zones(BlueprintZoneDisposition::is_expunged) .filter(|(_, z)| is_old_pantry(z)) .count(), @@ -4823,17 +4765,17 @@ pub(crate) mod test { // Now we can update Nexus, because all of its dependent zones // are up-to-date w/r/t the new repo. 
assert_eq!( - parent + blueprint9 .all_omicron_zones(BlueprintZoneDisposition::is_in_service) .filter(|(_, z)| is_old_nexus(z)) .count(), NEXUS_REDUNDANCY + 1, ); - let mut parent = blueprint10; - for i in 11..=17 { - let blueprint_name = format!("blueprint_{i}"); + let mut parent = blueprint9; + for i in 10..=17 { update_collection_from_blueprint(&mut example, &parent); + let blueprint_name = format!("blueprint{i}"); let blueprint = Planner::new_based_on( log.clone(), &parent, @@ -4847,7 +4789,6 @@ pub(crate) mod test { .unwrap_or_else(|_| panic!("can't re-plan after {i} iterations")); let summary = blueprint.diff_since_blueprint(&parent); - eprintln!("{}", summary.display()); for sled in summary.diff.sleds.modified_values_diff() { if i % 2 == 0 { assert!(sled.zones.added.is_empty()); @@ -4965,8 +4906,9 @@ pub(crate) mod test { let mut parent = blueprint1; for i in 2..=MAX_PLANNING_ITERATIONS { - let blueprint_name = format!("blueprint_{i}"); update_collection_from_blueprint(&mut example, &parent); + + let blueprint_name = format!("blueprint{i}"); let blueprint = Planner::new_based_on( log.clone(), &parent, diff --git a/nexus/reconfigurator/planning/src/planner/update_sequence.rs b/nexus/reconfigurator/planning/src/planner/update_sequence.rs deleted file mode 100644 index 38d28d06e2a..00000000000 --- a/nexus/reconfigurator/planning/src/planner/update_sequence.rs +++ /dev/null @@ -1,22 +0,0 @@ -// This Source Code Form is subject to the terms of the Mozilla Public -// License, v. 2.0. If a copy of the MPL was not distributed with this -// file, You can obtain one at https://mozilla.org/MPL/2.0/. - -//! Updatable components ordered by dependencies (RFD 565). - -use nexus_sled_agent_shared::inventory::ZoneKind; - -/// Update sequence as defined by RFD 565 §6. 
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)] -#[allow(dead_code)] -pub enum OrderedComponent { - HostOs, - SpRot, - OmicronZone(ZoneKind), -} - -impl From for OrderedComponent { - fn from(zone_kind: ZoneKind) -> Self { - Self::OmicronZone(zone_kind) - } -} From 29d20936a2c515f94177ab3a6a83302f0c957c8f Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Wed, 28 May 2025 11:43:29 -0700 Subject: [PATCH 20/32] initial draft: planning SP, some pieces missing --- Cargo.lock | 2 + nexus/reconfigurator/planning/Cargo.toml | 2 + nexus/reconfigurator/planning/src/lib.rs | 1 + .../planning/src/mgs_updates/mod.rs | 1197 +++++++++++++++++ nexus/types/src/deployment.rs | 1 - 5 files changed, 1202 insertions(+), 1 deletion(-) create mode 100644 nexus/reconfigurator/planning/src/mgs_updates/mod.rs diff --git a/Cargo.lock b/Cargo.lock index 4e7338cd2fc..4bbbb4baac5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6466,6 +6466,7 @@ dependencies = [ "clickhouse-admin-types", "daft", "debug-ignore", + "dropshot", "expectorate", "gateway-client", "id-map", @@ -6488,6 +6489,7 @@ dependencies = [ "oxnet", "proptest", "rand 0.8.5", + "semver 1.0.26", "sled-agent-client", "slog", "slog-error-chain", diff --git a/nexus/reconfigurator/planning/Cargo.toml b/nexus/reconfigurator/planning/Cargo.toml index 695e65eb584..d45ffec496c 100644 --- a/nexus/reconfigurator/planning/Cargo.toml +++ b/nexus/reconfigurator/planning/Cargo.toml @@ -42,9 +42,11 @@ uuid.workspace = true omicron-workspace-hack.workspace = true [dev-dependencies] +dropshot.workspace = true expectorate.workspace = true maplit.workspace = true omicron-common = { workspace = true, features = ["testing"] } omicron-test-utils.workspace = true proptest.workspace = true +semver.workspace = true test-strategy.workspace = true diff --git a/nexus/reconfigurator/planning/src/lib.rs b/nexus/reconfigurator/planning/src/lib.rs index 8262380075c..a11c5e4132c 100644 --- a/nexus/reconfigurator/planning/src/lib.rs +++ b/nexus/reconfigurator/planning/src/lib.rs @@ -9,5 +9,6 @@ pub mod blueprint_builder; pub mod blueprint_editor; pub mod example; +pub mod mgs_updates; pub mod planner; pub mod system; diff --git a/nexus/reconfigurator/planning/src/mgs_updates/mod.rs b/nexus/reconfigurator/planning/src/mgs_updates/mod.rs new file mode 100644 index 00000000000..241b05f7e6c --- /dev/null +++ b/nexus/reconfigurator/planning/src/mgs_updates/mod.rs @@ -0,0 +1,1197 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! 
Facilities for making choices about MGS-managed updates + +use nexus_types::deployment::ExpectedVersion; +use nexus_types::deployment::PendingMgsUpdate; +use nexus_types::deployment::PendingMgsUpdateDetails; +use nexus_types::deployment::PendingMgsUpdates; +use nexus_types::inventory::BaseboardId; +use nexus_types::inventory::CabooseWhich; +use nexus_types::inventory::Collection; +use omicron_common::api::external::TufRepoDescription; +use slog::{debug, error, info, warn}; +use slog_error_chain::InlineErrorChain; +use std::collections::BTreeSet; +use std::sync::Arc; +use thiserror::Error; +use tufaceous_artifact::ArtifactVersion; +use tufaceous_artifact::KnownArtifactKind; + +pub fn plan_mgs_updates( + log: &slog::Logger, + inventory: &Collection, + current_boards: &BTreeSet>, + current_updates: &PendingMgsUpdates, + current_artifacts: Option<&TufRepoDescription>, + nmax_updates: usize, +) -> PendingMgsUpdates { + let mut rv = PendingMgsUpdates::new(); + let mut boards_preferred = BTreeSet::new(); + + // Determine the status of all currently pending updates by comparing what + // they were trying to do (and their preconditions) against the current + // state (from inventory). + // + // If a pending update is either done or impossible, we'll prioritize + // evaluating the same board for any further updates. For the "done" case, + // this will cause us to update one board's SP, RoT, etc. before moving onto + // another board. For the "impossible" case, this should just fix up the + // update request with updated preconditions so that it can complete. + // + // If a pending update is in-progress or if we cannot determine its status + // because inventory is incomplete, then we'll "keep" it (that is, we copy + // the same update into the `PendingMgsUpdates` that we're returning). + for update in current_updates { + match mgs_update_status(log, inventory, update) { + Ok(MgsUpdateStatus::Done) => { + info!( + log, + "SP update completed \ + (will remove it and re-evaluate board)"; + update + ); + boards_preferred.insert(update.baseboard_id.clone()); + } + Ok(MgsUpdateStatus::Impossible) => { + info!( + log, + "SP update impossible + (will remove it and re-evaluate board)"; + update + ); + boards_preferred.insert(update.baseboard_id.clone()); + } + Ok(MgsUpdateStatus::NotDone) => { + info!( + log, + "SP update not yet completed (will keep it)"; + update + ); + rv.insert(update.clone()); + } + Err(error) => { + info!( + log, + "cannot determine SP update status (will keep it)"; + update, + InlineErrorChain::new(&error) + ); + rv.insert(update.clone()); + } + } + } + + // If we don't have any current artifacts (i.e., there is no target release + // set), then we cannot configure more updates. + let Some(current_artifacts) = ¤t_artifacts else { + warn!(log, "cannot issue more SP updates (no current artifacts)"); + return rv; + }; + + // Next, configure new updates for any boards that need an update, up to + // `nmax_updates`. + // + // For the reasons mentioned above, we'll start with the boards that just + // had an in-progress update that we elected not to keep. Then we'll look + // at all the other boards. Note that using `extend` here will cause the + // boards that we're prioritizing to appear twice in this list. 
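    // A hypothetical walk-through of the ordering below: with
    // current_boards = {sled_0, sled_1, switch_0} and boards_preferred =
    // {sled_1}, `candidates` yields sled_1 first, then sled_0 and switch_0;
    // the `filter` that builds `non_preferred` means a preferred board is
    // visited only once even though it also appears in `current_boards`.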
+ let non_preferred = + current_boards.iter().filter(|b| !boards_preferred.contains(*b)); + let candidates = boards_preferred.iter().chain(non_preferred); + for board in candidates { + if rv.len() == nmax_updates { + info!( + log, + "reached maximum number of pending SP updates"; + "max" => nmax_updates + ); + return rv; + } + + match try_make_update(log, board, inventory, current_artifacts) { + Some(update) => { + info!(log, "configuring SP update"; &update); + rv.insert(update); + } + None => { + info!(log, "skipping board for SP update"; board); + } + } + } + + info!(log, "ran out of boards for SP update"); + rv +} + +#[derive(Debug)] +enum MgsUpdateStatus { + /// the requested update has completed (i.e., the active slot is running the + /// requested version) + Done, + /// the requested update has not completed, but remains possible + /// (i.e., the active slot is not running the requested version and the + /// preconditions remain true) + NotDone, + /// the requested update has not completed and the preconditions are not + /// currently met + Impossible, +} + +#[derive(Debug, Error)] +enum MgsUpdateStatusError { + #[error("no SP info found in inventory")] + MissingSpInfo, + #[error("no caboose found for active slot in inventory")] + MissingActiveCaboose, + #[error("not yet implemented")] + NotYetImplemented, +} + +/// Determine the status of a single MGS update based on what's in inventory for +/// that board. +fn mgs_update_status( + log: &slog::Logger, + inventory: &Collection, + update: &PendingMgsUpdate, +) -> Result { + let baseboard_id = &update.baseboard_id; + let desired_version = &update.artifact_version; + + // Check the contents of the cabooses against what we expect either before + // or after the update. + // + // We check this before anything else because if we get back + // `MgsUpdateStatus::Done`, then we're done no matter what else is true. + let caboose_status = match &update.details { + PendingMgsUpdateDetails::Sp { + expected_active_version, + expected_inactive_version, + } => { + let Some(active_caboose) = + inventory.caboose_for(CabooseWhich::SpSlot0, baseboard_id) + else { + return Err(MgsUpdateStatusError::MissingActiveCaboose); + }; + + let found_inactive_version = inventory + .caboose_for(CabooseWhich::SpSlot1, baseboard_id) + .map(|c| c.caboose.version.as_ref()); + + Ok(mgs_update_status_sp( + desired_version, + expected_active_version, + expected_inactive_version, + &active_caboose.caboose.version, + found_inactive_version, + )) + } + PendingMgsUpdateDetails::Rot { .. } => { + return Err(MgsUpdateStatusError::NotYetImplemented); + } + }; + + // If we're able to reach a clear determination based on the caboose status + // alone, great. Return that. + if matches!( + caboose_status, + Err(_) | Ok(MgsUpdateStatus::Done) | Ok(MgsUpdateStatus::Impossible) + ) { + return caboose_status; + } + + // If based on the caboose we're only able to determine that the update is + // not yet done, there's another "impossible" case to consider: that the + // baseboard has moved in the rack. + let sp_info = inventory + .sps + .get(baseboard_id) + .ok_or(MgsUpdateStatusError::MissingSpInfo)?; + if sp_info.sp_type != update.sp_type { + // This should be impossible. This same baseboard has somehow changed + // its type (e.g., sled vs. switch vs. PSC). This doesn't affect what + // we do here but definitely raises a red flag. 
+ error!( + log, + "baseboard appears to have changed board type"; + "sp_info" => #?sp_info, + update, + ); + Ok(MgsUpdateStatus::Impossible) + } else if u32::from(sp_info.sp_slot) != update.slot_id { + warn!( + log, + "baseboard with in-progress SP update has moved"; + "sp_info" => #?sp_info, + update, + ); + Ok(MgsUpdateStatus::Impossible) + } else { + caboose_status + } +} + +fn mgs_update_status_sp( + desired_version: &ArtifactVersion, + expected_active_version: &ArtifactVersion, + expected_inactive_version: &ExpectedVersion, + found_active_version: &str, + found_inactive_version: Option<&str>, +) -> MgsUpdateStatus { + if found_active_version == desired_version.as_str() { + // If we find the desired version in the active slot, we're done. + return MgsUpdateStatus::Done; + } + + // The update hasn't completed. + // + // Check to make sure the contents of the active slot are still what they + // were when we configured this update. If not, then this update cannot + // proceed as currently configured. It will fail its precondition check. + if found_active_version != expected_active_version.as_str() { + return MgsUpdateStatus::Impossible; + } + + // Similarly, check the contents of the inactive slot to determine if it + // still matches what we saw when we configured this update. If not, then + // this update cannot proceed as currently configured. It will fail its + // precondition check. + // + // This logic is more complex than for the active slot because unlike the + // active slot, it's possible for both the found contents and the expected + // contents to be missing and that's not necessarily an error. + match (found_inactive_version, expected_inactive_version) { + (Some(_), ExpectedVersion::NoValidVersion) => { + // We expected nothing in the inactive slot, but found something. + MgsUpdateStatus::Impossible + } + (Some(found), ExpectedVersion::Version(expected)) => { + if found == expected.as_str() { + // We found something in the inactive slot that matches what we + // expected. + MgsUpdateStatus::NotDone + } else { + // We found something in the inactive slot that differs from + // what we expected. + MgsUpdateStatus::Impossible + } + } + (None, ExpectedVersion::Version(_)) => { + // We expected something in the inactive slot, but found nothing. + // This case is tricky because we can't tell from the inventory + // whether we transiently failed to fetch the caboose for some + // reason or whether the caboose is actually garbage. We choose to + // assume that it's actually garbage, which would mean that this + // update as-configured is impossible. This will cause us to + // generate a new update that expects garbage in the inactive slot. + // If we're right, great. If we're wrong, then *that* update will + // be impossible to complete, but we should fix this again if the + // transient error goes away. + // + // If we instead assumed that this was a transient error, we'd do + // nothing here instead. But if the caboose was really missing, + // then we'd get stuck forever waiting for something that would + // never happen. + MgsUpdateStatus::Impossible + } + (None, ExpectedVersion::NoValidVersion) => { + // We expected nothing in the inactive slot and found nothing there. + // No problem! 
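    // For quick reference, the arms of this match boil down to
    // (found inactive, expected inactive) -> status:
    //
    //   Some(_)   NoValidVersion          -> Impossible
    //   Some(v)   Version(e), v == e      -> NotDone
    //   Some(v)   Version(e), v != e      -> Impossible
    //   None      Version(_)              -> Impossible
    //   None      NoValidVersion          -> NotDone (this arm)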
+ MgsUpdateStatus::NotDone + } + } +} + +fn try_make_update( + log: &slog::Logger, + baseboard_id: &Arc, + inventory: &Collection, + current_artifacts: &TufRepoDescription, +) -> Option { + // TODO When we add support for planning RoT, RoT bootloader, and host OS + // updates, we'll try these in a hardcoded priority order until any of them + // returns `Some`. For now, we only plan SP updates. + try_make_update_sp(log, baseboard_id, inventory, current_artifacts) +} + +fn try_make_update_sp( + log: &slog::Logger, + baseboard_id: &Arc, + inventory: &Collection, + current_artifacts: &TufRepoDescription, +) -> Option { + let Some(sp_info) = inventory.sps.get(baseboard_id) else { + warn!( + log, + "cannot configure SP update for board \ + (missing SP info from inventory)"; + baseboard_id + ); + return None; + }; + + let Some(active_caboose) = + inventory.caboose_for(CabooseWhich::SpSlot0, baseboard_id) + else { + warn!( + log, + "cannot configure SP update for board \ + (missing active caboose from inventory)"; + baseboard_id, + ); + return None; + }; + + let Ok(expected_active_version) = active_caboose.caboose.version.parse() + else { + warn!( + log, + "cannot configure SP update for board \ + (cannot parse current active version as an ArtifactVersion)"; + baseboard_id, + "found_version" => &active_caboose.caboose.version, + ); + return None; + }; + + let board = &active_caboose.caboose.board; + let matching_artifacts: Vec<_> = current_artifacts + .artifacts + .iter() + .filter(|a| { + // A matching SP artifact will have: + // + // - "name" matching the board name (found above from caboose) + // - "kind" matching one of the known SP kinds + + if a.id.name != *board { + return false; + } + + match a.id.kind.to_known() { + None => false, + Some( + KnownArtifactKind::GimletSp + | KnownArtifactKind::PscSp + | KnownArtifactKind::SwitchSp, + ) => true, + Some( + KnownArtifactKind::GimletRot + | KnownArtifactKind::Host + | KnownArtifactKind::Trampoline + | KnownArtifactKind::ControlPlane + | KnownArtifactKind::Zone + | KnownArtifactKind::PscRot + | KnownArtifactKind::SwitchRot + | KnownArtifactKind::GimletRotBootloader + | KnownArtifactKind::PscRotBootloader + | KnownArtifactKind::SwitchRotBootloader, + ) => false, + } + }) + .collect(); + if matching_artifacts.is_empty() { + warn!( + log, + "cannot configure SP update for board (no matching artifact)"; + baseboard_id, + ); + return None; + } + + if matching_artifacts.len() > 1 { + // This is noteworthy, but not a problem. We'll just pick one. + warn!(log, "found more than one matching artifact for SP update"); + } + + let artifact = matching_artifacts[0]; + + // If the artifact's version matches what's deployed, then no update is + // needed. + if artifact.id.version == expected_active_version { + debug!(log, "no SP update needed for board"; baseboard_id); + return None; + } + + // Begin configuring an update. 
+ let expected_inactive_version = match inventory + .caboose_for(CabooseWhich::SpSlot1, baseboard_id) + .map(|c| c.caboose.version.parse::()) + .transpose() + { + Ok(None) => ExpectedVersion::NoValidVersion, + Ok(Some(v)) => ExpectedVersion::Version(v), + Err(_) => { + warn!( + log, + "cannot configure SP update for board + (found inactive slot contents but version was not valid)"; + baseboard_id + ); + return None; + } + }; + + Some(PendingMgsUpdate { + baseboard_id: baseboard_id.clone(), + sp_type: sp_info.sp_type, + slot_id: u32::from(sp_info.sp_slot), + details: PendingMgsUpdateDetails::Sp { + expected_active_version, + expected_inactive_version, + }, + artifact_hash: artifact.hash, + artifact_version: artifact.id.version.clone(), + }) +} + +#[cfg(test)] +mod test { + use crate::mgs_updates::plan_mgs_updates; + use chrono::Utc; + use dropshot::ConfigLogging; + use dropshot::ConfigLoggingLevel; + use gateway_client::types::PowerState; + use gateway_client::types::RotState; + use gateway_client::types::SpComponentCaboose; + use gateway_client::types::SpState; + use gateway_client::types::SpType; + use nexus_types::deployment::ExpectedVersion; + use nexus_types::deployment::PendingMgsUpdate; + use nexus_types::deployment::PendingMgsUpdateDetails; + use nexus_types::deployment::PendingMgsUpdates; + use nexus_types::inventory::CabooseWhich; + use nexus_types::inventory::Collection; + use nexus_types::inventory::RotSlot; + use omicron_common::api::external::TufArtifactMeta; + use omicron_common::api::external::TufRepoDescription; + use omicron_common::api::external::TufRepoMeta; + use omicron_common::update::ArtifactId; + use omicron_test_utils::dev::LogContext; + use std::collections::BTreeMap; + use std::collections::BTreeSet; + use tufaceous_artifact::ArtifactHash; + use tufaceous_artifact::ArtifactVersion; + use tufaceous_artifact::KnownArtifactKind; + + /// Version that will be used for all artifacts in the TUF repo + const ARTIFACT_VERSION_2: ArtifactVersion = + ArtifactVersion::new_const("2.0.0"); + /// Version that will be "deployed" in the SP we want to update + const ARTIFACT_VERSION_1: ArtifactVersion = + ArtifactVersion::new_const("1.0.0"); + /// Version that's different from the other two + const ARTIFACT_VERSION_1_5: ArtifactVersion = + ArtifactVersion::new_const("1.5.0"); + + /// Hash of fake artifact for fake gimlet-e SP + const ARTIFACT_HASH_SP_GIMLET_E: ArtifactHash = ArtifactHash([1; 32]); + /// Hash of fake artifact for fake gimlet-d SP + const ARTIFACT_HASH_SP_GIMLET_D: ArtifactHash = ArtifactHash([2; 32]); + /// Hash of fake artifact for fake sidecar-b SP + const ARTIFACT_HASH_SP_SIDECAR_B: ArtifactHash = ArtifactHash([5; 32]); + /// Hash of fake artifact for fake sidecar-c SP + const ARTIFACT_HASH_SP_SIDECAR_C: ArtifactHash = ArtifactHash([6; 32]); + /// Hash of fake artifact for fake psc-b SP + const ARTIFACT_HASH_SP_PSC_B: ArtifactHash = ArtifactHash([9; 32]); + /// Hash of fake artifact for fake psc-c SP + const ARTIFACT_HASH_SP_PSC_C: ArtifactHash = ArtifactHash([10; 32]); + + // unused artifact hashes + + const ARTIFACT_HASH_CONTROL_PLANE: ArtifactHash = ArtifactHash([33; 32]); + const ARTIFACT_HASH_NEXUS: ArtifactHash = ArtifactHash([34; 32]); + const ARTIFACT_HASH_HOST_OS: ArtifactHash = ArtifactHash([35; 32]); + + fn test_artifact_for_board(board: &str) -> ArtifactHash { + match board { + "gimlet-d" => ARTIFACT_HASH_SP_GIMLET_D, + "gimlet-e" => ARTIFACT_HASH_SP_GIMLET_E, + "sidecar-b" => ARTIFACT_HASH_SP_SIDECAR_B, + "sidecar-c" => ARTIFACT_HASH_SP_SIDECAR_C, + 
"psc-b" => ARTIFACT_HASH_SP_PSC_B, + "psc-c" => ARTIFACT_HASH_SP_PSC_C, + _ => panic!("test bug: no artifact for board {board:?}"), + } + } + + /// Describes the SPs in the environment used in these tests + /// + /// There will be: + /// + /// - 4 sled SPs + /// - 2 switch SPs + /// - 2 PSC SPs + /// + /// The specific set of hardware (boards) vary and are hardcoded: + /// + /// - sled 0: gimlet-d + /// - other sleds: gimlet-e + /// - switch 0: sidecar-b + /// - switch 1: sidecar-c + /// - psc 0: psc-b + /// - psc 1: psc-c + fn test_config() -> BTreeMap<(SpType, u32), (&'static str, &'static str)> { + BTreeMap::from([ + ((SpType::Sled, 0), ("sled_0", "gimlet-d")), + ((SpType::Sled, 1), ("sled_1", "gimlet-e")), + ((SpType::Sled, 2), ("sled_2", "gimlet-e")), + ((SpType::Sled, 3), ("sled_3", "gimlet-e")), + ((SpType::Switch, 0), ("switch_0", "sidecar-b")), + ((SpType::Switch, 1), ("switch_1", "sidecar-c")), + ((SpType::Power, 0), ("power_0", "psc-b")), + ((SpType::Power, 1), ("power_1", "psc-c")), + ]) + } + + /// Returns a TufRepoDescription that we can use to exercise the planning + /// code. + fn make_tuf_repo() -> TufRepoDescription { + const SYSTEM_VERSION: semver::Version = semver::Version::new(0, 0, 1); + const SYSTEM_HASH: ArtifactHash = ArtifactHash([3; 32]); + + // Include a bunch of SP-related artifacts, as well as a few others just + // to make sure those are properly ignored. + let artifacts = vec![ + make_artifact( + "control-plane", + KnownArtifactKind::ControlPlane, + ARTIFACT_HASH_CONTROL_PLANE, + ), + make_artifact( + "nexus", + KnownArtifactKind::Zone, + ARTIFACT_HASH_NEXUS, + ), + make_artifact( + "host-os", + KnownArtifactKind::Host, + ARTIFACT_HASH_HOST_OS, + ), + make_artifact( + "gimlet-d", + KnownArtifactKind::GimletSp, + test_artifact_for_board("gimlet-d"), + ), + make_artifact( + "gimlet-e", + KnownArtifactKind::GimletSp, + test_artifact_for_board("gimlet-e"), + ), + make_artifact( + "sidecar-b", + KnownArtifactKind::SwitchSp, + test_artifact_for_board("sidecar-b"), + ), + make_artifact( + "sidecar-c", + KnownArtifactKind::SwitchSp, + test_artifact_for_board("sidecar-c"), + ), + make_artifact( + "psc-b", + KnownArtifactKind::PscSp, + test_artifact_for_board("psc-b"), + ), + make_artifact( + "psc-c", + KnownArtifactKind::PscSp, + test_artifact_for_board("psc-c"), + ), + ]; + + TufRepoDescription { + repo: TufRepoMeta { + hash: SYSTEM_HASH, + targets_role_version: 0, + valid_until: Utc::now(), + system_version: SYSTEM_VERSION, + file_name: String::new(), + }, + artifacts, + } + } + + fn make_artifact( + name: &str, + kind: KnownArtifactKind, + hash: ArtifactHash, + ) -> TufArtifactMeta { + TufArtifactMeta { + id: ArtifactId { + name: name.to_string(), + version: ARTIFACT_VERSION_2, + kind: kind.into(), + }, + hash, + size: 0, // unused here + } + } + + // Construct inventory for an environment suitable for our testing. + // + // See test_config() for information about the hardware. All SPs will + // appear to be running version `active_version` except those identified in + // `active_version_exceptions`. All SPs will appear to have + // `inactive_version` in the inactive slot. 
+ fn make_collection( + active_version: ArtifactVersion, + active_version_exceptions: &BTreeMap<(SpType, u32), ArtifactVersion>, + inactive_version: ExpectedVersion, + ) -> Collection { + let mut builder = nexus_inventory::CollectionBuilder::new( + "planning_mgs_updates_basic", + ); + + let dummy_sp_state = SpState { + base_mac_address: [0; 6], + hubris_archive_id: String::from("unused"), + model: String::from("unused"), + power_state: PowerState::A0, + revision: 0, + rot: RotState::V3 { + active: RotSlot::A, + pending_persistent_boot_preference: None, + persistent_boot_preference: RotSlot::A, + slot_a_error: None, + slot_a_fwid: Default::default(), + slot_b_error: None, + slot_b_fwid: Default::default(), + stage0_error: None, + stage0_fwid: Default::default(), + stage0next_error: None, + stage0next_fwid: Default::default(), + transient_boot_preference: None, + }, + serial_number: String::from("unused"), + }; + + let test_config = test_config(); + for ((sp_type, sp_slot), (serial, caboose_board)) in test_config { + let sp_state = SpState { + model: format!("dummy_{}", sp_type), + serial_number: serial.to_string(), + ..dummy_sp_state.clone() + }; + + let baseboard_id = builder + .found_sp_state("test", sp_type, sp_slot, sp_state) + .unwrap(); + let active_version = active_version_exceptions + .get(&(sp_type, sp_slot)) + .unwrap_or(&active_version); + + builder + .found_caboose( + &baseboard_id, + CabooseWhich::SpSlot0, + "test", + SpComponentCaboose { + board: caboose_board.to_string(), + epoch: None, + git_commit: String::from("unused"), + name: caboose_board.to_string(), + sign: None, + version: active_version.as_str().to_string(), + }, + ) + .unwrap(); + + if let ExpectedVersion::Version(inactive_version) = + &inactive_version + { + builder + .found_caboose( + &baseboard_id, + CabooseWhich::SpSlot1, + "test", + SpComponentCaboose { + board: caboose_board.to_string(), + epoch: None, + git_commit: String::from("unused"), + name: caboose_board.to_string(), + sign: None, + version: inactive_version.as_str().to_string(), + }, + ) + .unwrap(); + } + } + + builder.build() + } + + // Short hand-rolled update sequence that exercises some basic behavior. + #[test] + fn test_basic() { + let logctx = LogContext::new( + "planning_mgs_updates_basic", + &ConfigLogging::StderrTerminal { level: ConfigLoggingLevel::Debug }, + ); + let log = &logctx.log; + + // Test that with no updates pending and no TUF repo specified, there + // will remain no updates pending. + let collection = make_collection( + ARTIFACT_VERSION_2, + &BTreeMap::from([((SpType::Sled, 0), ARTIFACT_VERSION_1)]), + ExpectedVersion::NoValidVersion, + ); + let current_boards = &collection.baseboards; + let initial_updates = PendingMgsUpdates::new(); + let nmax_updates = 1; + let updates = plan_mgs_updates( + log, + &collection, + current_boards, + &initial_updates, + None, + nmax_updates, + ); + assert!(updates.is_empty()); + + // Test that when a TUF repo is specified and one SP is outdated, then + // it's configured with an update (and the update looks correct). 
+ let repo = make_tuf_repo(); + let updates = plan_mgs_updates( + log, + &collection, + current_boards, + &initial_updates, + Some(&repo), + nmax_updates, + ); + assert_eq!(updates.len(), 1); + let first_update = updates.iter().next().expect("at least one update"); + assert_eq!(first_update.baseboard_id.serial_number, "sled_0"); + assert_eq!(first_update.sp_type, SpType::Sled); + assert_eq!(first_update.slot_id, 0); + assert_eq!(first_update.artifact_hash, ARTIFACT_HASH_SP_GIMLET_D); + assert_eq!(first_update.artifact_version, ARTIFACT_VERSION_2); + + // Test that when an update is already pending, and nothing changes + // about the state of the world (i.e., the inventory), then the planner + // makes no changes. + let later_updates = plan_mgs_updates( + log, + &collection, + current_boards, + &updates, + Some(&repo), + nmax_updates, + ); + assert_eq!(updates, later_updates); + + // Test that when two updates are needed, but one is already pending, + // then the other one is *not* started (because it exceeds + // nmax_updates). + let later_collection = make_collection( + ARTIFACT_VERSION_2, + &BTreeMap::from([ + ((SpType::Sled, 0), ARTIFACT_VERSION_1), + ((SpType::Switch, 1), ARTIFACT_VERSION_1), + ]), + ExpectedVersion::NoValidVersion, + ); + let later_updates = plan_mgs_updates( + log, + &later_collection, + current_boards, + &updates, + Some(&repo), + nmax_updates, + ); + assert_eq!(updates, later_updates); + + // At this point, we're ready to test that when the first update + // completes, then th esecond one *is* started. This tests two + // different things: first that we noticed the first one completed, and + // second that we noticed another thing needed an update + let later_collection = make_collection( + ARTIFACT_VERSION_2, + &BTreeMap::from([((SpType::Switch, 1), ARTIFACT_VERSION_1)]), + ExpectedVersion::NoValidVersion, + ); + let later_updates = plan_mgs_updates( + log, + &later_collection, + current_boards, + &updates, + Some(&repo), + nmax_updates, + ); + assert_eq!(later_updates.len(), 1); + let next_update = + later_updates.iter().next().expect("at least one update"); + assert_ne!(first_update, next_update); + assert_eq!(next_update.baseboard_id.serial_number, "switch_1"); + assert_eq!(next_update.sp_type, SpType::Switch); + assert_eq!(next_update.slot_id, 1); + assert_eq!(next_update.artifact_hash, ARTIFACT_HASH_SP_SIDECAR_C); + assert_eq!(next_update.artifact_version, ARTIFACT_VERSION_2); + + // Finally, test that when all SPs are in spec, then no updates are + // configured. + let updated_collection = make_collection( + ARTIFACT_VERSION_2, + &BTreeMap::new(), + ExpectedVersion::NoValidVersion, + ); + let later_updates = plan_mgs_updates( + log, + &updated_collection, + current_boards, + &later_updates, + Some(&repo), + nmax_updates, + ); + assert!(later_updates.is_empty()); + + // Test that we don't try to update boards that aren't in + // `current_boards`, even if they're in inventory and outdated. + let collection = make_collection( + ARTIFACT_VERSION_2, + &BTreeMap::from([((SpType::Sled, 0), ARTIFACT_VERSION_1)]), + ExpectedVersion::NoValidVersion, + ); + let updates = plan_mgs_updates( + log, + &collection, + &BTreeSet::new(), + &PendingMgsUpdates::new(), + Some(&repo), + nmax_updates, + ); + assert!(updates.is_empty()); + let updates = plan_mgs_updates( + log, + &collection, + &collection.baseboards, + &PendingMgsUpdates::new(), + Some(&repo), + nmax_updates, + ); + // We verified most of the details above. 
Here we're just double + // checking that the baseboard being missing is the only reason that no + // update was generated. + assert_eq!(updates.len(), 1); + + // Verify the precondition details of an ordinary update. + let old_update = + updates.into_iter().next().expect("at least one update"); + let PendingMgsUpdateDetails::Sp { + expected_active_version: old_expected_active_version, + expected_inactive_version: old_expected_inactive_version, + } = &old_update.details + else { + panic!("expected SP update"); + }; + assert_eq!(ARTIFACT_VERSION_1, *old_expected_active_version); + assert_eq!( + ExpectedVersion::NoValidVersion, + *old_expected_inactive_version + ); + + // Test that if the inactive slot contents have changed, then we'll get + // a new update reflecting that. + let collection = make_collection( + ARTIFACT_VERSION_2, + &BTreeMap::from([((SpType::Sled, 0), ARTIFACT_VERSION_1)]), + ExpectedVersion::Version(ARTIFACT_VERSION_1), + ); + let new_updates = plan_mgs_updates( + log, + &collection, + &collection.baseboards, + &updates, + Some(&repo), + nmax_updates, + ); + assert_ne!(updates, new_updates); + assert_eq!(new_updates.len(), 1); + let new_update = + new_updates.into_iter().next().expect("at least one update"); + assert_eq!(old_update.baseboard_id, new_update.baseboard_id); + assert_eq!(old_update.sp_type, new_update.sp_type); + assert_eq!(old_update.slot_id, new_update.slot_id); + assert_eq!(old_update.artifact_hash, new_update.artifact_hash); + assert_eq!(old_update.artifact_version, new_update.artifact_version); + let PendingMgsUpdateDetails::Sp { + expected_active_version: new_expected_active_version, + expected_inactive_version: new_expected_inactive_version, + } = &new_update.details + else { + panic!("expected SP update"); + }; + assert_eq!(ARTIFACT_VERSION_1, *new_expected_active_version); + assert_eq!( + ExpectedVersion::Version(ARTIFACT_VERSION_1), + *new_expected_inactive_version + ); + + // Test that if instead it's the active slot whose contents have changed + // to something other than the new expected version, then we'll also get + // a new update reflecting that. 
+ let collection = make_collection( + ARTIFACT_VERSION_2, + &BTreeMap::from([((SpType::Sled, 0), ARTIFACT_VERSION_1_5)]), + ExpectedVersion::NoValidVersion, + ); + let new_updates = plan_mgs_updates( + log, + &collection, + &collection.baseboards, + &updates, + Some(&repo), + nmax_updates, + ); + assert_ne!(updates, new_updates); + assert_eq!(new_updates.len(), 1); + let new_update = + new_updates.into_iter().next().expect("at least one update"); + assert_eq!(old_update.baseboard_id, new_update.baseboard_id); + assert_eq!(old_update.sp_type, new_update.sp_type); + assert_eq!(old_update.slot_id, new_update.slot_id); + assert_eq!(old_update.artifact_hash, new_update.artifact_hash); + assert_eq!(old_update.artifact_version, new_update.artifact_version); + let PendingMgsUpdateDetails::Sp { + expected_active_version: new_expected_active_version, + expected_inactive_version: new_expected_inactive_version, + } = &new_update.details + else { + panic!("expected SP update"); + }; + assert_eq!(ARTIFACT_VERSION_1_5, *new_expected_active_version); + assert_eq!( + ExpectedVersion::NoValidVersion, + *new_expected_inactive_version + ); + + logctx.cleanup_successful(); + } + + // Updates a whole system's SPs one at a time + #[test] + fn test_whole_system_sequential() { + let logctx = LogContext::new( + "planning_mgs_updates_whole_system_sequential", + &ConfigLogging::StderrTerminal { level: ConfigLoggingLevel::Debug }, + ); + let log = &logctx.log; + let mut latest_updates = PendingMgsUpdates::new(); + let repo = make_tuf_repo(); + let nmax_updates = 1; + + // Maintain a map of SPs that we've updated. We'll use this to + // configure the inventory collection that we create at each step. + let mut exceptions = BTreeMap::new(); + + // We do not control the order of updates. But we expect to update each + // of the SPs in this map. When we do, we expect to find the given + // artifact. + let mut expected_updates: BTreeMap<_, _> = test_config() + .into_iter() + .map(|(k, (serial, board_name))| { + (k, (serial, test_artifact_for_board(board_name))) + }) + .collect(); + + for _ in 0..expected_updates.len() { + // Generate an inventory collection reflecting that everything is at + // version 1 except for what we've already updated. + let collection = make_collection( + ARTIFACT_VERSION_1, + &exceptions, + ExpectedVersion::NoValidVersion, + ); + + // For this test, all systems that are found in inventory are part + // of the control plane. + let current_boards = &collection.baseboards; + + // Run the planner and verify that we got one of our expected + // updates. + let new_updates = plan_mgs_updates( + log, + &collection, + current_boards, + &latest_updates, + Some(&repo), + nmax_updates, + ); + assert_eq!(new_updates.len(), 1); + let update = + new_updates.iter().next().expect("at least one update"); + verify_one_sp_update(&mut expected_updates, update); + + // Update `exceptions` for the next iteration. + let sp_type = update.sp_type; + let sp_slot = update.slot_id; + assert!( + exceptions + .insert((sp_type, sp_slot), ARTIFACT_VERSION_2) + .is_none() + ); + latest_updates = new_updates; + } + + // Take one more lap. It should reflect zero updates. 
+ let collection = make_collection( + ARTIFACT_VERSION_1, + &exceptions, + ExpectedVersion::NoValidVersion, + ); + let last_updates = plan_mgs_updates( + log, + &collection, + &collection.baseboards, + &latest_updates, + Some(&repo), + nmax_updates, + ); + assert!(last_updates.is_empty()); + + logctx.cleanup_successful(); + } + + // Updates a whole system's SPs at once + #[test] + fn test_whole_system_simultaneous() { + let logctx = LogContext::new( + "planning_mgs_updates_whole_system_simultaneous", + &ConfigLogging::StderrTerminal { level: ConfigLoggingLevel::Debug }, + ); + let log = &logctx.log; + let repo = make_tuf_repo(); + + let mut expected_updates: BTreeMap<_, _> = test_config() + .into_iter() + .map(|(k, (serial, board_name))| { + (k, (serial, test_artifact_for_board(board_name))) + }) + .collect(); + + // Update the whole system at once. + let collection = make_collection( + ARTIFACT_VERSION_1, + &BTreeMap::new(), + ExpectedVersion::NoValidVersion, + ); + let all_updates = plan_mgs_updates( + log, + &collection, + &collection.baseboards, + &PendingMgsUpdates::new(), + Some(&repo), + usize::MAX, + ); + assert_eq!(all_updates.len(), expected_updates.len()); + for update in &all_updates { + verify_one_sp_update(&mut expected_updates, update); + } + + // Now, notice when they've all been updated, even if the limit is only + // one. + let collection = make_collection( + ARTIFACT_VERSION_2, + &BTreeMap::new(), + ExpectedVersion::NoValidVersion, + ); + let all_updates_done = plan_mgs_updates( + log, + &collection, + &collection.baseboards, + &all_updates, + Some(&repo), + 1, + ); + assert!(all_updates_done.is_empty()); + + logctx.cleanup_successful(); + } + + fn verify_one_sp_update( + expected_updates: &mut BTreeMap<(SpType, u32), (&str, ArtifactHash)>, + update: &PendingMgsUpdate, + ) { + let sp_type = update.sp_type; + let sp_slot = update.slot_id; + println!("found update: {} slot {}", sp_type, sp_slot); + let (expected_serial, expected_artifact) = expected_updates + .remove(&(sp_type, sp_slot)) + .expect("unexpected update"); + assert_eq!(update.artifact_hash, expected_artifact); + assert_eq!(update.artifact_version, ARTIFACT_VERSION_2); + assert_eq!(update.baseboard_id.serial_number, *expected_serial); + let PendingMgsUpdateDetails::Sp { + expected_active_version, + expected_inactive_version, + } = &update.details + else { + panic!("expected SP update"); + }; + assert_eq!(*expected_active_version, ARTIFACT_VERSION_1); + assert_eq!(*expected_inactive_version, ExpectedVersion::NoValidVersion); + } + + // Tests the case where an SP appears to move while an update is pending + #[test] + fn test_sp_move() { + let logctx = LogContext::new( + "planning_mgs_updates_whole_system_simultaneous", + &ConfigLogging::StderrTerminal { level: ConfigLoggingLevel::Debug }, + ); + + // Configure an update for one SP. + let log = &logctx.log; + let repo = make_tuf_repo(); + let mut collection = make_collection( + ARTIFACT_VERSION_2, + &BTreeMap::from([((SpType::Sled, 0), ARTIFACT_VERSION_1)]), + ExpectedVersion::NoValidVersion, + ); + let updates = plan_mgs_updates( + log, + &collection, + &collection.baseboards, + &PendingMgsUpdates::new(), + Some(&repo), + 1, + ); + assert!(!updates.is_empty()); + let update = updates.into_iter().next().expect("at least one update"); + + // Move an SP (as if someone had moved the sled to a different cubby). + // This is awful, but at least it's easy. 
+ let sp_info = collection + .sps + .values_mut() + .find(|sp| sp.sp_type == SpType::Sled && sp.sp_slot == 0) + .expect("missing sled 0 SP"); + sp_info.sp_slot = 9; + + // Plan again. The configured update should be updated to reflect the + // new location. + let new_updates = plan_mgs_updates( + log, + &collection, + &collection.baseboards, + &updates, + Some(&repo), + 1, + ); + assert!(!new_updates.is_empty()); + let new_update = + new_updates.into_iter().next().expect("at least one update"); + assert_ne!(new_update.slot_id, update.slot_id); + assert_eq!(new_update.baseboard_id, update.baseboard_id); + assert_eq!(new_update.sp_type, update.sp_type); + assert_eq!(new_update.artifact_hash, update.artifact_hash); + assert_eq!(new_update.artifact_version, update.artifact_version); + assert_eq!(new_update.details, update.details); + + logctx.cleanup_successful(); + } +} diff --git a/nexus/types/src/deployment.rs b/nexus/types/src/deployment.rs index 6a02dd05c87..b761ac901db 100644 --- a/nexus/types/src/deployment.rs +++ b/nexus/types/src/deployment.rs @@ -1188,7 +1188,6 @@ pub struct PendingMgsUpdate { pub details: PendingMgsUpdateDetails, /// which artifact to apply to this device - /// (implies which component is being updated) pub artifact_hash: ArtifactHash, pub artifact_version: ArtifactVersion, } From 38dc53a758be4d2ad3b09c7b300e9854e20dcf14 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Tue, 3 Jun 2025 13:56:40 -0700 Subject: [PATCH 21/32] add docs --- .../planning/src/mgs_updates/mod.rs | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/nexus/reconfigurator/planning/src/mgs_updates/mod.rs b/nexus/reconfigurator/planning/src/mgs_updates/mod.rs index 241b05f7e6c..1e56ad14c29 100644 --- a/nexus/reconfigurator/planning/src/mgs_updates/mod.rs +++ b/nexus/reconfigurator/planning/src/mgs_updates/mod.rs @@ -20,6 +20,19 @@ use thiserror::Error; use tufaceous_artifact::ArtifactVersion; use tufaceous_artifact::KnownArtifactKind; +/// Generates a new set of `PendingMgsUpdates` based on: +/// +/// * `inventory`: the latest inventory +/// * `current_boards`: a set of baseboards to consider updating +/// (it is possible to have baseboards in inventory that would never be +/// updated because they're not considered part of the current system) +/// * `current_updates`: the most recent set of configured `PendingMgsUpdates` +/// * `current_artifacts`: information about artifacts frmo the current target +/// release (if any) +/// * `nmax_updates`: the maximum number of updates allowed at once +/// +/// By current policy, `nmax_updates` is always 1, but the implementation here +/// supports more than one update per invocation. pub fn plan_mgs_updates( log: &slog::Logger, inventory: &Collection, @@ -233,6 +246,8 @@ fn mgs_update_status( } } +/// Compares a configured SP update with information from inventory and +/// determines the current status of the update. See `MgsUpdateStatus`. fn mgs_update_status_sp( desired_version: &ArtifactVersion, expected_active_version: &ArtifactVersion, @@ -304,6 +319,8 @@ fn mgs_update_status_sp( } } +/// Determine if the given baseboard needs any MGS-driven update (e.g., update +/// to its SP, RoT, etc.). If so, returns the update. If not, returns `None`. fn try_make_update( log: &slog::Logger, baseboard_id: &Arc, @@ -316,6 +333,7 @@ fn try_make_update( try_make_update_sp(log, baseboard_id, inventory, current_artifacts) } +/// Determine if the given baseboard needs an SP update and, if so, returns it. 
fn try_make_update_sp( log: &slog::Logger, baseboard_id: &Arc, From f28b929eb3c4c4411020dc2bcd37367be882593c Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Wed, 4 Jun 2025 07:05:19 -0700 Subject: [PATCH 22/32] fix openapi --- openapi/nexus-internal.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openapi/nexus-internal.json b/openapi/nexus-internal.json index cf5954d825e..851df54d050 100644 --- a/openapi/nexus-internal.json +++ b/openapi/nexus-internal.json @@ -5385,7 +5385,7 @@ "type": "object", "properties": { "artifact_hash": { - "description": "which artifact to apply to this device (implies which component is being updated)", + "description": "which artifact to apply to this device", "type": "string", "format": "hex string (32 bytes)" }, From bd7bcd66097bdaa1528a3101cd9bef07f3ead783 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Wed, 4 Jun 2025 07:11:52 -0700 Subject: [PATCH 23/32] update comments based on review feedback --- .../reconfigurator/planning/src/mgs_updates/mod.rs | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/nexus/reconfigurator/planning/src/mgs_updates/mod.rs b/nexus/reconfigurator/planning/src/mgs_updates/mod.rs index 1e56ad14c29..3102c116de7 100644 --- a/nexus/reconfigurator/planning/src/mgs_updates/mod.rs +++ b/nexus/reconfigurator/planning/src/mgs_updates/mod.rs @@ -150,6 +150,13 @@ enum MgsUpdateStatus { NotDone, /// the requested update has not completed and the preconditions are not /// currently met + /// + /// The only way for this update to complete as-written is if reality + /// changes such that the preconditions become met. But the only normal + /// operation that could make that happen is blueprint execution, which + /// won't do anything while the preconditions aren't met. So if an update + /// is in this state, generally the planner must remove this update (and + /// presumably add one with updated preconditions). Impossible, } @@ -329,7 +336,8 @@ fn try_make_update( ) -> Option { // TODO When we add support for planning RoT, RoT bootloader, and host OS // updates, we'll try these in a hardcoded priority order until any of them - // returns `Some`. For now, we only plan SP updates. + // returns `Some`. The order is described in RFD 565 section "Update + // Sequence". For now, we only plan SP updates. try_make_update_sp(log, baseboard_id, inventory, current_artifacts) } @@ -420,7 +428,9 @@ fn try_make_update_sp( } if matching_artifacts.len() > 1 { - // This is noteworthy, but not a problem. We'll just pick one. + // This should be impossible unless we shipped a TUF repo with multiple + // artifacts for the same board. But it doesn't prevent us from picking + // one and proceeding. Make a note and proceed. 
warn!(log, "found more than one matching artifact for SP update"); } From bf2ab7cac4f6390f4215b590b079268bb20b02e1 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Wed, 4 Jun 2025 09:58:45 -0700 Subject: [PATCH 24/32] review feedback --- nexus/reconfigurator/planning/src/mgs_updates/mod.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/nexus/reconfigurator/planning/src/mgs_updates/mod.rs b/nexus/reconfigurator/planning/src/mgs_updates/mod.rs index 3102c116de7..ca6a953a3a8 100644 --- a/nexus/reconfigurator/planning/src/mgs_updates/mod.rs +++ b/nexus/reconfigurator/planning/src/mgs_updates/mod.rs @@ -27,7 +27,7 @@ use tufaceous_artifact::KnownArtifactKind; /// (it is possible to have baseboards in inventory that would never be /// updated because they're not considered part of the current system) /// * `current_updates`: the most recent set of configured `PendingMgsUpdates` -/// * `current_artifacts`: information about artifacts frmo the current target +/// * `current_artifacts`: information about artifacts from the current target /// release (if any) /// * `nmax_updates`: the maximum number of updates allowed at once /// @@ -71,7 +71,7 @@ pub fn plan_mgs_updates( Ok(MgsUpdateStatus::Impossible) => { info!( log, - "SP update impossible + "SP update impossible \ (will remove it and re-evaluate board)"; update ); @@ -115,7 +115,7 @@ pub fn plan_mgs_updates( current_boards.iter().filter(|b| !boards_preferred.contains(*b)); let candidates = boards_preferred.iter().chain(non_preferred); for board in candidates { - if rv.len() == nmax_updates { + if rv.len() >= nmax_updates { info!( log, "reached maximum number of pending SP updates"; @@ -454,7 +454,7 @@ fn try_make_update_sp( Err(_) => { warn!( log, - "cannot configure SP update for board + "cannot configure SP update for board \ (found inactive slot contents but version was not valid)"; baseboard_id ); @@ -834,7 +834,7 @@ mod test { assert_eq!(updates, later_updates); // At this point, we're ready to test that when the first update - // completes, then th esecond one *is* started. This tests two + // completes, then the second one *is* started. 
This tests two // different things: first that we noticed the first one completed, and // second that we noticed another thing needed an update let later_collection = make_collection( From 8182a462bd7fe9aad42ac85662859e2f90e0bdab Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Wed, 4 Jun 2025 11:30:02 -0700 Subject: [PATCH 25/32] incorporate SP updates into planner --- .../db-queries/src/db/datastore/deployment.rs | 5 ++ .../planning/src/blueprint_builder/builder.rs | 7 ++ nexus/reconfigurator/planning/src/planner.rs | 72 ++++++++++++++++++- nexus/reconfigurator/planning/src/system.rs | 12 ++++ nexus/reconfigurator/preparation/src/lib.rs | 5 ++ nexus/types/src/deployment/planning_input.rs | 11 +++ 6 files changed, 111 insertions(+), 1 deletion(-) diff --git a/nexus/db-queries/src/db/datastore/deployment.rs b/nexus/db-queries/src/db/datastore/deployment.rs index e280cefd09c..d085d0de0ed 100644 --- a/nexus/db-queries/src/db/datastore/deployment.rs +++ b/nexus/db-queries/src/db/datastore/deployment.rs @@ -1881,6 +1881,7 @@ mod tests { use nexus_types::external_api::views::PhysicalDiskState; use nexus_types::external_api::views::SledPolicy; use nexus_types::external_api::views::SledState; + use nexus_types::inventory::BaseboardId; use nexus_types::inventory::Collection; use omicron_common::address::IpRange; use omicron_common::address::Ipv6Subnet; @@ -1999,6 +2000,10 @@ mod tests { policy: SledPolicy::provisionable(), state: SledState::Active, resources, + baseboard_id: BaseboardId { + part_number: String::from("unused"), + serial_number: String::from("unused"), + }, } } diff --git a/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs b/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs index c86c0d45698..dfa21b01927 100644 --- a/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs +++ b/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs @@ -1927,6 +1927,13 @@ impl<'a> BlueprintBuilder<'a> { Ok(self.resource_allocator()?.inject_untracked_external_dns_ip(addr)?) } + pub fn pending_mgs_updates_replace_all( + &mut self, + updates: nexus_types::deployment::PendingMgsUpdates, + ) { + self.pending_mgs_updates = updates; + } + pub fn pending_mgs_update_insert( &mut self, update: nexus_types::deployment::PendingMgsUpdate, diff --git a/nexus/reconfigurator/planning/src/planner.rs b/nexus/reconfigurator/planning/src/planner.rs index 4cb65c02546..1452a574d0a 100644 --- a/nexus/reconfigurator/planning/src/planner.rs +++ b/nexus/reconfigurator/planning/src/planner.rs @@ -13,7 +13,9 @@ use crate::blueprint_builder::Error; use crate::blueprint_builder::Operation; use crate::blueprint_editor::DisksEditError; use crate::blueprint_editor::SledEditError; +use crate::mgs_updates::plan_mgs_updates; use crate::planner::omicron_zone_placement::PlacementError; +use gateway_client::types::SpType; use nexus_sled_agent_shared::inventory::OmicronZoneType; use nexus_sled_agent_shared::inventory::ZoneKind; use nexus_types::deployment::Blueprint; @@ -50,6 +52,11 @@ pub use self::rng::SledPlannerRng; mod omicron_zone_placement; pub(crate) mod rng; +enum UpdateStepResult { + ContinueToNextStep, + Waiting, +} + pub struct Planner<'a> { log: Logger, input: &'a PlanningInput, @@ -115,7 +122,11 @@ impl<'a> Planner<'a> { self.do_plan_expunge()?; self.do_plan_add()?; self.do_plan_decommission()?; - self.do_plan_zone_updates()?; + if let UpdateStepResult::ContinueToNextStep = + self.do_plan_mgs_updates()? 
+ { + self.do_plan_zone_updates()?; + } self.do_plan_cockroachdb_settings(); Ok(()) } @@ -895,6 +906,65 @@ impl<'a> Planner<'a> { Ok(()) } + /// Update at most one MGS-managed device (SP, RoT, etc.), if any are out of + /// date. + fn do_plan_mgs_updates(&mut self) -> Result { + // Determine which baseboards we will consider updating. + // + // Sleds may be present but not adopted as part of the control plane. + // In deployed systems, this would probably only happen if a sled was + // about to be added. In dev/test environments, it's common to leave + // some number of sleds out of the control plane for various reasons. + // Inventory will still report them, but we don't want to touch them. + // + // For better or worse, switches and PSCs do not have the same idea of + // being adopted into the control plane. If they're present, they're + // part of the system, and we will update them. + let included_sled_baseboards: BTreeSet<_> = self + .input + .all_sleds(SledFilter::SpsUpdatedByReconfigurator) + .map(|(_sled_id, details)| &details.baseboard_id) + .collect(); + let included_baseboards = self + .inventory + .sps + .iter() + .filter_map(|(baseboard_id, sp_state)| { + let do_include = match sp_state.sp_type { + SpType::Sled => { + included_sled_baseboards.contains(baseboard_id.as_ref()) + } + SpType::Power => true, + SpType::Switch => true, + }; + do_include.then_some(baseboard_id.clone()) + }) + .collect(); + + // Compute the new set of PendingMgsUpdates. + let current_updates = + &self.blueprint.parent_blueprint().pending_mgs_updates; + let current_artifacts = self.input.tuf_repo(); + let next = plan_mgs_updates( + &self.log, + &self.inventory, + &included_baseboards, + ¤t_updates, + current_artifacts, + 1, + ); + + // XXX-dap this is not quite right. It's possible that no updates were + // planned but SPs aren't yet updated. + let rv = if next.is_empty() { + UpdateStepResult::ContinueToNextStep + } else { + UpdateStepResult::Waiting + }; + self.blueprint.pending_mgs_updates_replace_all(next); + Ok(rv) + } + /// Update at most one existing zone to use a new image source. fn do_plan_zone_updates(&mut self) -> Result<(), Error> { // We are only interested in non-decommissioned sleds. 
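A minimal sketch of the gating that do_plan_mgs_updates introduces, written in the style of the mgs_updates tests earlier in this series and assuming their helpers and imports are in scope (make_tuf_repo, make_collection, plan_mgs_updates, ARTIFACT_VERSION_1/2, a slog Logger named log); the setup is illustrative, not taken from the tree:

    // One sled's SP still reports the old version, so plan_mgs_updates
    // configures an update for it; the planner maps a non-empty result to
    // UpdateStepResult::Waiting and skips do_plan_zone_updates() this pass.
    let repo = make_tuf_repo();
    let collection = make_collection(
        ARTIFACT_VERSION_2,
        &BTreeMap::from([((SpType::Sled, 0), ARTIFACT_VERSION_1)]),
        ExpectedVersion::NoValidVersion,
    );
    let pending = plan_mgs_updates(
        log,
        &collection,
        &collection.baseboards,
        &PendingMgsUpdates::new(),
        Some(&repo),
        1,
    );
    assert!(!pending.is_empty()); // planner would report Waiting

    // Once inventory shows every SP on the new version, the pending set
    // drains and the planner continues on to zone updates.
    let collection = make_collection(
        ARTIFACT_VERSION_2,
        &BTreeMap::new(),
        ExpectedVersion::NoValidVersion,
    );
    let pending = plan_mgs_updates(
        log,
        &collection,
        &collection.baseboards,
        &pending,
        Some(&repo),
        1,
    );
    assert!(pending.is_empty()); // planner would report ContinueToNextStep
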
diff --git a/nexus/reconfigurator/planning/src/system.rs b/nexus/reconfigurator/planning/src/system.rs index 9a43c6c2564..dc4a5416ec7 100644 --- a/nexus/reconfigurator/planning/src/system.rs +++ b/nexus/reconfigurator/planning/src/system.rs @@ -481,6 +481,18 @@ impl SystemDescription { policy: sled.policy, state: sled.state, resources: sled.resources.clone(), + baseboard_id: BaseboardId { + part_number: sled + .inventory_sled_agent + .baseboard + .model() + .to_owned(), + serial_number: sled + .inventory_sled_agent + .baseboard + .identifier() + .to_owned(), + }, }; builder.add_sled(sled.sled_id, sled_details)?; } diff --git a/nexus/reconfigurator/preparation/src/lib.rs b/nexus/reconfigurator/preparation/src/lib.rs index 59e6a8ab6aa..b0fe41ce1c1 100644 --- a/nexus/reconfigurator/preparation/src/lib.rs +++ b/nexus/reconfigurator/preparation/src/lib.rs @@ -33,6 +33,7 @@ use nexus_types::deployment::SledResources; use nexus_types::deployment::UnstableReconfiguratorState; use nexus_types::identity::Asset; use nexus_types::identity::Resource; +use nexus_types::inventory::BaseboardId; use nexus_types::inventory::Collection; use omicron_common::address::IpRange; use omicron_common::address::Ipv6Subnet; @@ -274,6 +275,10 @@ impl PlanningInputFromDb<'_> { policy: sled_row.policy(), state: sled_row.state().into(), resources: SledResources { subnet, zpools }, + baseboard_id: BaseboardId { + part_number: sled_row.part_number().to_owned(), + serial_number: sled_row.serial_number().to_owned(), + }, }; // TODO-cleanup use `TypedUuid` everywhere let sled_id = SledUuid::from_untyped_uuid(sled_id); diff --git a/nexus/types/src/deployment/planning_input.rs b/nexus/types/src/deployment/planning_input.rs index 86e9fb02d94..51ff668b0d2 100644 --- a/nexus/types/src/deployment/planning_input.rs +++ b/nexus/types/src/deployment/planning_input.rs @@ -14,6 +14,7 @@ use crate::external_api::views::PhysicalDiskState; use crate::external_api::views::SledPolicy; use crate::external_api::views::SledProvisionPolicy; use crate::external_api::views::SledState; +use crate::inventory::BaseboardId; use chrono::DateTime; use chrono::Utc; use clap::ValueEnum; @@ -724,6 +725,9 @@ pub enum SledFilter { /// Sleds which should have TUF repo artifacts replicated onto them. 
TufArtifactReplication, + + /// Sleds whose SPs should be updated by Reconfigurator + SpsUpdatedByReconfigurator, } impl SledFilter { @@ -780,6 +784,7 @@ impl SledPolicy { SledFilter::VpcRouting => true, SledFilter::VpcFirewall => true, SledFilter::TufArtifactReplication => true, + SledFilter::SpsUpdatedByReconfigurator => true, }, SledPolicy::InService { provision_policy: SledProvisionPolicy::NonProvisionable, @@ -794,6 +799,7 @@ impl SledPolicy { SledFilter::VpcRouting => true, SledFilter::VpcFirewall => true, SledFilter::TufArtifactReplication => true, + SledFilter::SpsUpdatedByReconfigurator => true, }, SledPolicy::Expunged => match filter { SledFilter::All => true, @@ -806,6 +812,7 @@ impl SledPolicy { SledFilter::VpcRouting => false, SledFilter::VpcFirewall => false, SledFilter::TufArtifactReplication => false, + SledFilter::SpsUpdatedByReconfigurator => false, }, } } @@ -840,6 +847,7 @@ impl SledState { SledFilter::VpcRouting => true, SledFilter::VpcFirewall => true, SledFilter::TufArtifactReplication => true, + SledFilter::SpsUpdatedByReconfigurator => true, }, SledState::Decommissioned => match filter { SledFilter::All => true, @@ -852,6 +860,7 @@ impl SledState { SledFilter::VpcRouting => false, SledFilter::VpcFirewall => false, SledFilter::TufArtifactReplication => false, + SledFilter::SpsUpdatedByReconfigurator => false, }, } } @@ -1058,6 +1067,8 @@ pub struct SledDetails { pub state: SledState, /// current resources allocated to this sled pub resources: SledResources, + /// baseboard id for this sled + pub baseboard_id: BaseboardId, } #[derive(Debug, thiserror::Error)] From ad88157ea9e6f9d1e156fb6b26c0957545f08c4b Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Wed, 4 Jun 2025 14:55:07 -0700 Subject: [PATCH 26/32] reconfigurator-cli could let you set SP versions --- dev-tools/reconfigurator-cli/src/lib.rs | 73 +++++++- .../reconfigurator-cli/tests/input/cmds.txt | 12 ++ .../tests/output/cmds-example-stdout | 6 + ...ds-expunge-newly-added-external-dns-stdout | 1 + ...ds-expunge-newly-added-internal-dns-stdout | 1 + .../tests/output/cmds-stderr | 7 + .../tests/output/cmds-stdout | 171 +++++++++++++++++- nexus/reconfigurator/planning/src/system.rs | 158 ++++++++++++++++ nexus/reconfigurator/simulation/src/system.rs | 12 +- nexus/types/src/deployment.rs | 9 + 10 files changed, 441 insertions(+), 9 deletions(-) diff --git a/dev-tools/reconfigurator-cli/src/lib.rs b/dev-tools/reconfigurator-cli/src/lib.rs index 3308888b9f2..1f32c9aea90 100644 --- a/dev-tools/reconfigurator-cli/src/lib.rs +++ b/dev-tools/reconfigurator-cli/src/lib.rs @@ -207,6 +207,7 @@ fn process_command( Commands::SledRemove(args) => cmd_sled_remove(sim, args), Commands::SledShow(args) => cmd_sled_show(sim, args), Commands::SledSetPolicy(args) => cmd_sled_set_policy(sim, args), + Commands::SledUpdateSp(args) => cmd_sled_update_sp(sim, args), Commands::SiloList => cmd_silo_list(sim), Commands::SiloAdd(args) => cmd_silo_add(sim, args), Commands::SiloRemove(args) => cmd_silo_remove(sim, args), @@ -260,6 +261,8 @@ enum Commands { SledShow(SledArgs), /// set a sled's policy SledSetPolicy(SledSetPolicyArgs), + /// simulate updating the sled's SP versions + SledUpdateSp(SledUpdateSpArgs), /// list silos SiloList, @@ -371,6 +374,20 @@ impl From for SledPolicy { } } +#[derive(Debug, Args)] +struct SledUpdateSpArgs { + /// id of the sled + sled_id: SledUuid, + + /// sets the version reported for the SP active slot + #[clap(long, required_unless_present_any = &["inactive"])] + active: Option, + + /// sets the version 
reported for the SP inactive slot + #[clap(long, required_unless_present_any = &["active"])] + inactive: Option, +} + #[derive(Debug, Args)] struct SledRemoveArgs { /// id of the sled @@ -837,18 +854,22 @@ fn cmd_sled_show( args: SledArgs, ) -> anyhow::Result> { let state = sim.current_state(); - let planning_input = state - .system() - .description() + let description = state.system().description(); + let sled_id = args.sled_id; + let sp_active_version = description.sled_sp_active_version(sled_id)?; + let sp_inactive_version = description.sled_sp_inactive_version(sled_id)?; + let planning_input = description .to_planning_input_builder() .context("failed to generate planning_input builder")? .build(); - let sled_id = args.sled_id; - let sled_resources = - &planning_input.sled_lookup(args.filter, sled_id)?.resources; + let sled = planning_input.sled_lookup(args.filter, sled_id)?; + let sled_resources = &sled.resources; let mut s = String::new(); swriteln!(s, "sled {}", sled_id); + swriteln!(s, "serial {}", sled.baseboard_id.serial_number); swriteln!(s, "subnet {}", sled_resources.subnet.net()); + swriteln!(s, "SP active version: {:?}", sp_active_version); + swriteln!(s, "SP inactive version: {:?}", sp_inactive_version); swriteln!(s, "zpools ({}):", sled_resources.zpools.len()); for (zpool, disk) in &sled_resources.zpools { swriteln!(s, " {:?}", zpool); @@ -876,6 +897,46 @@ fn cmd_sled_set_policy( Ok(Some(format!("set sled {} policy to {}", args.sled_id, args.policy))) } +fn cmd_sled_update_sp( + sim: &mut ReconfiguratorSim, + args: SledUpdateSpArgs, +) -> anyhow::Result> { + let mut labels = Vec::new(); + if let Some(active) = &args.active { + labels.push(format!("active -> {}", active)); + } + if let Some(inactive) = &args.inactive { + labels.push(format!("inactive -> {}", inactive)); + } + + assert!( + !labels.is_empty(), + "clap configuration requires that at least one argument is specified" + ); + + let mut state = sim.current_state().to_mut(); + state.system_mut().description_mut().sled_update_sp_versions( + args.sled_id, + args.active, + args.inactive, + )?; + + sim.commit_and_bump( + format!( + "reconfigurator-cli sled-update-sp: {}: {}", + args.sled_id, + labels.join(", "), + ), + state, + ); + + Ok(Some(format!( + "set sled {} SP versions: {}", + args.sled_id, + labels.join(", ") + ))) +} + fn cmd_inventory_list( sim: &mut ReconfiguratorSim, ) -> anyhow::Result> { diff --git a/dev-tools/reconfigurator-cli/tests/input/cmds.txt b/dev-tools/reconfigurator-cli/tests/input/cmds.txt index 12e4245dc23..d8d15011399 100644 --- a/dev-tools/reconfigurator-cli/tests/input/cmds.txt +++ b/dev-tools/reconfigurator-cli/tests/input/cmds.txt @@ -13,6 +13,18 @@ sled-add 90c1102a-b9f5-4d88-92a2-60d54a2d98cc sled-add 04ef3330-c682-4a08-8def-fcc4bef31bcd sled-list +sled-update-sp dde1c0e2-b10d-4621-b420-f179f7a7a00a +sled-update-sp dde1c0e2-b10d-4621-b420-f179f7a7a00a --active 1.0.0 +sled-show dde1c0e2-b10d-4621-b420-f179f7a7a00a +sled-update-sp dde1c0e2-b10d-4621-b420-f179f7a7a00a --inactive 2.0.0 +sled-show dde1c0e2-b10d-4621-b420-f179f7a7a00a +sled-update-sp dde1c0e2-b10d-4621-b420-f179f7a7a00a --active 3.0.0 +sled-show dde1c0e2-b10d-4621-b420-f179f7a7a00a +sled-update-sp dde1c0e2-b10d-4621-b420-f179f7a7a00a --active 4.0.0 --inactive invalid +sled-show dde1c0e2-b10d-4621-b420-f179f7a7a00a +sled-update-sp dde1c0e2-b10d-4621-b420-f179f7a7a00a --active 4.0.0 --inactive 5.0.0 +sled-show dde1c0e2-b10d-4621-b420-f179f7a7a00a + inventory-generate inventory-list diff --git 
a/dev-tools/reconfigurator-cli/tests/output/cmds-example-stdout b/dev-tools/reconfigurator-cli/tests/output/cmds-example-stdout index 1f92791db7c..d07113d52aa 100644 --- a/dev-tools/reconfigurator-cli/tests/output/cmds-example-stdout +++ b/dev-tools/reconfigurator-cli/tests/output/cmds-example-stdout @@ -37,7 +37,10 @@ T ENA ID PARENT > sled-show 2eb69596-f081-4e2d-9425-9994926e0832 sled 2eb69596-f081-4e2d-9425-9994926e0832 +serial serial1 subnet fd00:1122:3344:102::/64 +SP active version: Some("0.0.1") +SP inactive version: None zpools (10): 088ed702-551e-453b-80d7-57700372a844 (zpool) SledDisk { disk_identity: DiskIdentity { vendor: "fake-vendor", model: "fake-model", serial: "serial-088ed702-551e-453b-80d7-57700372a844" }, disk_id: b2850ccb-4ac7-4034-aeab-b1cd582d407b (physical_disk), policy: InService, state: Active } @@ -394,7 +397,10 @@ T ENA ID PARENT > sled-show 89d02b1b-478c-401a-8e28-7a26f74fa41b sled 89d02b1b-478c-401a-8e28-7a26f74fa41b +serial serial0 subnet fd00:1122:3344:101::/64 +SP active version: Some("0.0.1") +SP inactive version: None zpools (4): 44fa7024-c2bc-4d2c-b478-c4997e4aece8 (zpool) SledDisk { disk_identity: DiskIdentity { vendor: "fake-vendor", model: "fake-model", serial: "serial-44fa7024-c2bc-4d2c-b478-c4997e4aece8" }, disk_id: 2a15b33c-dd0e-45b7-aba9-d05f40f030ff (physical_disk), policy: InService, state: Active } diff --git a/dev-tools/reconfigurator-cli/tests/output/cmds-expunge-newly-added-external-dns-stdout b/dev-tools/reconfigurator-cli/tests/output/cmds-expunge-newly-added-external-dns-stdout index 390ad31b60f..a3ea1852524 100644 --- a/dev-tools/reconfigurator-cli/tests/output/cmds-expunge-newly-added-external-dns-stdout +++ b/dev-tools/reconfigurator-cli/tests/output/cmds-expunge-newly-added-external-dns-stdout @@ -1160,6 +1160,7 @@ INFO sufficient InternalDns zones exist in plan, desired_count: 3, current_count INFO added zone to sled, sled_id: a88790de-5962-4871-8686-61c1fd5b7094, kind: ExternalDns INFO sufficient Nexus zones exist in plan, desired_count: 3, current_count: 3 INFO sufficient Oximeter zones exist in plan, desired_count: 0, current_count: 0 +WARN cannot issue more SP updates (no current artifacts) INFO some zones not yet up-to-date, sled_id: a88790de-5962-4871-8686-61c1fd5b7094 INFO will ensure cockroachdb setting, setting: cluster.preserve_downgrade_option, value: DoNotModify generated blueprint 9c998c1d-1a7b-440a-ae0c-40f781dea6e2 based on parent blueprint 366b0b68-d80e-4bc1-abd3-dc69837847e0 diff --git a/dev-tools/reconfigurator-cli/tests/output/cmds-expunge-newly-added-internal-dns-stdout b/dev-tools/reconfigurator-cli/tests/output/cmds-expunge-newly-added-internal-dns-stdout index 31cd80e8e74..374bb711182 100644 --- a/dev-tools/reconfigurator-cli/tests/output/cmds-expunge-newly-added-internal-dns-stdout +++ b/dev-tools/reconfigurator-cli/tests/output/cmds-expunge-newly-added-internal-dns-stdout @@ -1379,6 +1379,7 @@ INFO added zone to sled, sled_id: d81c6a84-79b8-4958-ae41-ea46c9b19763, kind: In INFO sufficient ExternalDns zones exist in plan, desired_count: 3, current_count: 3 INFO sufficient Nexus zones exist in plan, desired_count: 3, current_count: 3 INFO sufficient Oximeter zones exist in plan, desired_count: 0, current_count: 0 +WARN cannot issue more SP updates (no current artifacts) INFO some zones not yet up-to-date, sled_id: d81c6a84-79b8-4958-ae41-ea46c9b19763 INFO will ensure cockroachdb setting, setting: cluster.preserve_downgrade_option, value: DoNotModify generated blueprint af934083-59b5-4bf6-8966-6fb5292c29e1 
based on parent blueprint 58d5e830-0884-47d8-a7cd-b2b3751adeb4 diff --git a/dev-tools/reconfigurator-cli/tests/output/cmds-stderr b/dev-tools/reconfigurator-cli/tests/output/cmds-stderr index e69de29bb2d..cf184b190ab 100644 --- a/dev-tools/reconfigurator-cli/tests/output/cmds-stderr +++ b/dev-tools/reconfigurator-cli/tests/output/cmds-stderr @@ -0,0 +1,7 @@ +error: the following required arguments were not provided: + --active + --inactive + +Usage: sled-update-sp --active --inactive + +For more information, try '--help'. diff --git a/dev-tools/reconfigurator-cli/tests/output/cmds-stdout b/dev-tools/reconfigurator-cli/tests/output/cmds-stdout index 2ad9fd53316..412e0baa5d0 100644 --- a/dev-tools/reconfigurator-cli/tests/output/cmds-stdout +++ b/dev-tools/reconfigurator-cli/tests/output/cmds-stdout @@ -15,7 +15,7 @@ T ENA ID PARENT TIME_CREATED > sled-show dde1c0e2-b10d-4621-b420-f179f7a7a00a -error: sled dde1c0e2-b10d-4621-b420-f179f7a7a00a was not found in the planning input +error: attempted to access sled dde1c0e2-b10d-4621-b420-f179f7a7a00a not found in system > sled-add dde1c0e2-b10d-4621-b420-f179f7a7a00a added sled dde1c0e2-b10d-4621-b420-f179f7a7a00a @@ -26,7 +26,10 @@ dde1c0e2-b10d-4621-b420-f179f7a7a00a 10 fd00:1122:3344:101::/64 > sled-show dde1c0e2-b10d-4621-b420-f179f7a7a00a sled dde1c0e2-b10d-4621-b420-f179f7a7a00a +serial serial0 subnet fd00:1122:3344:101::/64 +SP active version: Some("0.0.1") +SP inactive version: None zpools (10): 0f3f1de0-7e5a-4032-a73a-74fbdabbd2fa (zpool) SledDisk { disk_identity: DiskIdentity { vendor: "fake-vendor", model: "fake-model", serial: "serial-0f3f1de0-7e5a-4032-a73a-74fbdabbd2fa" }, disk_id: 2dbf19d4-7b7d-48d5-9d1c-64ac2922093b (physical_disk), policy: InService, state: Active } @@ -63,6 +66,169 @@ ID NZPOOLS SUBNET dde1c0e2-b10d-4621-b420-f179f7a7a00a 10 fd00:1122:3344:101::/64 +> sled-update-sp dde1c0e2-b10d-4621-b420-f179f7a7a00a + +> sled-update-sp dde1c0e2-b10d-4621-b420-f179f7a7a00a --active 1.0.0 +set sled dde1c0e2-b10d-4621-b420-f179f7a7a00a SP versions: active -> 1.0.0 + +> sled-show dde1c0e2-b10d-4621-b420-f179f7a7a00a +sled dde1c0e2-b10d-4621-b420-f179f7a7a00a +serial serial0 +subnet fd00:1122:3344:101::/64 +SP active version: Some("1.0.0") +SP inactive version: None +zpools (10): + 0f3f1de0-7e5a-4032-a73a-74fbdabbd2fa (zpool) + SledDisk { disk_identity: DiskIdentity { vendor: "fake-vendor", model: "fake-model", serial: "serial-0f3f1de0-7e5a-4032-a73a-74fbdabbd2fa" }, disk_id: 2dbf19d4-7b7d-48d5-9d1c-64ac2922093b (physical_disk), policy: InService, state: Active } + 104f891f-e018-4787-a346-3cfaa6cc7e9d (zpool) + SledDisk { disk_identity: DiskIdentity { vendor: "fake-vendor", model: "fake-model", serial: "serial-104f891f-e018-4787-a346-3cfaa6cc7e9d" }, disk_id: 301ab9e6-bdc1-4287-a37d-2604893712f8 (physical_disk), policy: InService, state: Active } + 111f7a4e-5696-4be8-b13d-8ef314bc83e0 (zpool) + SledDisk { disk_identity: DiskIdentity { vendor: "fake-vendor", model: "fake-model", serial: "serial-111f7a4e-5696-4be8-b13d-8ef314bc83e0" }, disk_id: 1f77c099-8205-41b3-ac34-3807f3bbaf56 (physical_disk), policy: InService, state: Active } + 5a1786e9-770d-4ac9-b291-4501398170b5 (zpool) + SledDisk { disk_identity: DiskIdentity { vendor: "fake-vendor", model: "fake-model", serial: "serial-5a1786e9-770d-4ac9-b291-4501398170b5" }, disk_id: b111a961-be34-4ede-80e2-ef92af5e0a1f (physical_disk), policy: InService, state: Active } + 658fef3f-c3cd-4e6d-8823-79f9a0bec4c0 (zpool) + SledDisk { disk_identity: DiskIdentity { vendor: "fake-vendor", 
model: "fake-model", serial: "serial-658fef3f-c3cd-4e6d-8823-79f9a0bec4c0" }, disk_id: b3a01997-9894-4abd-83ad-e2d520d4c3a0 (physical_disk), policy: InService, state: Active } + 73ce66f5-a39a-4dd1-ad84-5647a5038d35 (zpool) + SledDisk { disk_identity: DiskIdentity { vendor: "fake-vendor", model: "fake-model", serial: "serial-73ce66f5-a39a-4dd1-ad84-5647a5038d35" }, disk_id: 48568b33-8f21-4537-b330-666aa3334236 (physical_disk), policy: InService, state: Active } + 7480aa69-3a3d-478d-bbdb-ba1fb74752ef (zpool) + SledDisk { disk_identity: DiskIdentity { vendor: "fake-vendor", model: "fake-model", serial: "serial-7480aa69-3a3d-478d-bbdb-ba1fb74752ef" }, disk_id: 9a968677-4da7-40b3-9579-9c54a7620b58 (physical_disk), policy: InService, state: Active } + 9ff438c6-00bb-4daf-9013-87969c892b02 (zpool) + SledDisk { disk_identity: DiskIdentity { vendor: "fake-vendor", model: "fake-model", serial: "serial-9ff438c6-00bb-4daf-9013-87969c892b02" }, disk_id: cc22404e-8a30-4b98-9552-790e84a162bd (physical_disk), policy: InService, state: Active } + ad0602bf-f577-401a-a28b-687c3d86f6bb (zpool) + SledDisk { disk_identity: DiskIdentity { vendor: "fake-vendor", model: "fake-model", serial: "serial-ad0602bf-f577-401a-a28b-687c3d86f6bb" }, disk_id: 32baf388-4cd9-4435-b70b-d8b2e515d918 (physical_disk), policy: InService, state: Active } + da6e6a21-8d32-46f9-a2b3-635f6700c3f0 (zpool) + SledDisk { disk_identity: DiskIdentity { vendor: "fake-vendor", model: "fake-model", serial: "serial-da6e6a21-8d32-46f9-a2b3-635f6700c3f0" }, disk_id: 1e7ee543-fe10-4ba7-b8f3-d579e8e0803a (physical_disk), policy: InService, state: Active } + + +> sled-update-sp dde1c0e2-b10d-4621-b420-f179f7a7a00a --inactive 2.0.0 +set sled dde1c0e2-b10d-4621-b420-f179f7a7a00a SP versions: inactive -> 2.0.0 + +> sled-show dde1c0e2-b10d-4621-b420-f179f7a7a00a +sled dde1c0e2-b10d-4621-b420-f179f7a7a00a +serial serial0 +subnet fd00:1122:3344:101::/64 +SP active version: Some("1.0.0") +SP inactive version: Some("2.0.0") +zpools (10): + 0f3f1de0-7e5a-4032-a73a-74fbdabbd2fa (zpool) + SledDisk { disk_identity: DiskIdentity { vendor: "fake-vendor", model: "fake-model", serial: "serial-0f3f1de0-7e5a-4032-a73a-74fbdabbd2fa" }, disk_id: 2dbf19d4-7b7d-48d5-9d1c-64ac2922093b (physical_disk), policy: InService, state: Active } + 104f891f-e018-4787-a346-3cfaa6cc7e9d (zpool) + SledDisk { disk_identity: DiskIdentity { vendor: "fake-vendor", model: "fake-model", serial: "serial-104f891f-e018-4787-a346-3cfaa6cc7e9d" }, disk_id: 301ab9e6-bdc1-4287-a37d-2604893712f8 (physical_disk), policy: InService, state: Active } + 111f7a4e-5696-4be8-b13d-8ef314bc83e0 (zpool) + SledDisk { disk_identity: DiskIdentity { vendor: "fake-vendor", model: "fake-model", serial: "serial-111f7a4e-5696-4be8-b13d-8ef314bc83e0" }, disk_id: 1f77c099-8205-41b3-ac34-3807f3bbaf56 (physical_disk), policy: InService, state: Active } + 5a1786e9-770d-4ac9-b291-4501398170b5 (zpool) + SledDisk { disk_identity: DiskIdentity { vendor: "fake-vendor", model: "fake-model", serial: "serial-5a1786e9-770d-4ac9-b291-4501398170b5" }, disk_id: b111a961-be34-4ede-80e2-ef92af5e0a1f (physical_disk), policy: InService, state: Active } + 658fef3f-c3cd-4e6d-8823-79f9a0bec4c0 (zpool) + SledDisk { disk_identity: DiskIdentity { vendor: "fake-vendor", model: "fake-model", serial: "serial-658fef3f-c3cd-4e6d-8823-79f9a0bec4c0" }, disk_id: b3a01997-9894-4abd-83ad-e2d520d4c3a0 (physical_disk), policy: InService, state: Active } + 73ce66f5-a39a-4dd1-ad84-5647a5038d35 (zpool) + SledDisk { disk_identity: DiskIdentity { vendor: 
"fake-vendor", model: "fake-model", serial: "serial-73ce66f5-a39a-4dd1-ad84-5647a5038d35" }, disk_id: 48568b33-8f21-4537-b330-666aa3334236 (physical_disk), policy: InService, state: Active } + 7480aa69-3a3d-478d-bbdb-ba1fb74752ef (zpool) + SledDisk { disk_identity: DiskIdentity { vendor: "fake-vendor", model: "fake-model", serial: "serial-7480aa69-3a3d-478d-bbdb-ba1fb74752ef" }, disk_id: 9a968677-4da7-40b3-9579-9c54a7620b58 (physical_disk), policy: InService, state: Active } + 9ff438c6-00bb-4daf-9013-87969c892b02 (zpool) + SledDisk { disk_identity: DiskIdentity { vendor: "fake-vendor", model: "fake-model", serial: "serial-9ff438c6-00bb-4daf-9013-87969c892b02" }, disk_id: cc22404e-8a30-4b98-9552-790e84a162bd (physical_disk), policy: InService, state: Active } + ad0602bf-f577-401a-a28b-687c3d86f6bb (zpool) + SledDisk { disk_identity: DiskIdentity { vendor: "fake-vendor", model: "fake-model", serial: "serial-ad0602bf-f577-401a-a28b-687c3d86f6bb" }, disk_id: 32baf388-4cd9-4435-b70b-d8b2e515d918 (physical_disk), policy: InService, state: Active } + da6e6a21-8d32-46f9-a2b3-635f6700c3f0 (zpool) + SledDisk { disk_identity: DiskIdentity { vendor: "fake-vendor", model: "fake-model", serial: "serial-da6e6a21-8d32-46f9-a2b3-635f6700c3f0" }, disk_id: 1e7ee543-fe10-4ba7-b8f3-d579e8e0803a (physical_disk), policy: InService, state: Active } + + +> sled-update-sp dde1c0e2-b10d-4621-b420-f179f7a7a00a --active 3.0.0 +set sled dde1c0e2-b10d-4621-b420-f179f7a7a00a SP versions: active -> 3.0.0 + +> sled-show dde1c0e2-b10d-4621-b420-f179f7a7a00a +sled dde1c0e2-b10d-4621-b420-f179f7a7a00a +serial serial0 +subnet fd00:1122:3344:101::/64 +SP active version: Some("3.0.0") +SP inactive version: Some("2.0.0") +zpools (10): + 0f3f1de0-7e5a-4032-a73a-74fbdabbd2fa (zpool) + SledDisk { disk_identity: DiskIdentity { vendor: "fake-vendor", model: "fake-model", serial: "serial-0f3f1de0-7e5a-4032-a73a-74fbdabbd2fa" }, disk_id: 2dbf19d4-7b7d-48d5-9d1c-64ac2922093b (physical_disk), policy: InService, state: Active } + 104f891f-e018-4787-a346-3cfaa6cc7e9d (zpool) + SledDisk { disk_identity: DiskIdentity { vendor: "fake-vendor", model: "fake-model", serial: "serial-104f891f-e018-4787-a346-3cfaa6cc7e9d" }, disk_id: 301ab9e6-bdc1-4287-a37d-2604893712f8 (physical_disk), policy: InService, state: Active } + 111f7a4e-5696-4be8-b13d-8ef314bc83e0 (zpool) + SledDisk { disk_identity: DiskIdentity { vendor: "fake-vendor", model: "fake-model", serial: "serial-111f7a4e-5696-4be8-b13d-8ef314bc83e0" }, disk_id: 1f77c099-8205-41b3-ac34-3807f3bbaf56 (physical_disk), policy: InService, state: Active } + 5a1786e9-770d-4ac9-b291-4501398170b5 (zpool) + SledDisk { disk_identity: DiskIdentity { vendor: "fake-vendor", model: "fake-model", serial: "serial-5a1786e9-770d-4ac9-b291-4501398170b5" }, disk_id: b111a961-be34-4ede-80e2-ef92af5e0a1f (physical_disk), policy: InService, state: Active } + 658fef3f-c3cd-4e6d-8823-79f9a0bec4c0 (zpool) + SledDisk { disk_identity: DiskIdentity { vendor: "fake-vendor", model: "fake-model", serial: "serial-658fef3f-c3cd-4e6d-8823-79f9a0bec4c0" }, disk_id: b3a01997-9894-4abd-83ad-e2d520d4c3a0 (physical_disk), policy: InService, state: Active } + 73ce66f5-a39a-4dd1-ad84-5647a5038d35 (zpool) + SledDisk { disk_identity: DiskIdentity { vendor: "fake-vendor", model: "fake-model", serial: "serial-73ce66f5-a39a-4dd1-ad84-5647a5038d35" }, disk_id: 48568b33-8f21-4537-b330-666aa3334236 (physical_disk), policy: InService, state: Active } + 7480aa69-3a3d-478d-bbdb-ba1fb74752ef (zpool) + SledDisk { disk_identity: DiskIdentity { 
vendor: "fake-vendor", model: "fake-model", serial: "serial-7480aa69-3a3d-478d-bbdb-ba1fb74752ef" }, disk_id: 9a968677-4da7-40b3-9579-9c54a7620b58 (physical_disk), policy: InService, state: Active } + 9ff438c6-00bb-4daf-9013-87969c892b02 (zpool) + SledDisk { disk_identity: DiskIdentity { vendor: "fake-vendor", model: "fake-model", serial: "serial-9ff438c6-00bb-4daf-9013-87969c892b02" }, disk_id: cc22404e-8a30-4b98-9552-790e84a162bd (physical_disk), policy: InService, state: Active } + ad0602bf-f577-401a-a28b-687c3d86f6bb (zpool) + SledDisk { disk_identity: DiskIdentity { vendor: "fake-vendor", model: "fake-model", serial: "serial-ad0602bf-f577-401a-a28b-687c3d86f6bb" }, disk_id: 32baf388-4cd9-4435-b70b-d8b2e515d918 (physical_disk), policy: InService, state: Active } + da6e6a21-8d32-46f9-a2b3-635f6700c3f0 (zpool) + SledDisk { disk_identity: DiskIdentity { vendor: "fake-vendor", model: "fake-model", serial: "serial-da6e6a21-8d32-46f9-a2b3-635f6700c3f0" }, disk_id: 1e7ee543-fe10-4ba7-b8f3-d579e8e0803a (physical_disk), policy: InService, state: Active } + + +> sled-update-sp dde1c0e2-b10d-4621-b420-f179f7a7a00a --active 4.0.0 --inactive invalid +set sled dde1c0e2-b10d-4621-b420-f179f7a7a00a SP versions: active -> 4.0.0, inactive -> invalid + +> sled-show dde1c0e2-b10d-4621-b420-f179f7a7a00a +sled dde1c0e2-b10d-4621-b420-f179f7a7a00a +serial serial0 +subnet fd00:1122:3344:101::/64 +SP active version: Some("4.0.0") +SP inactive version: None +zpools (10): + 0f3f1de0-7e5a-4032-a73a-74fbdabbd2fa (zpool) + SledDisk { disk_identity: DiskIdentity { vendor: "fake-vendor", model: "fake-model", serial: "serial-0f3f1de0-7e5a-4032-a73a-74fbdabbd2fa" }, disk_id: 2dbf19d4-7b7d-48d5-9d1c-64ac2922093b (physical_disk), policy: InService, state: Active } + 104f891f-e018-4787-a346-3cfaa6cc7e9d (zpool) + SledDisk { disk_identity: DiskIdentity { vendor: "fake-vendor", model: "fake-model", serial: "serial-104f891f-e018-4787-a346-3cfaa6cc7e9d" }, disk_id: 301ab9e6-bdc1-4287-a37d-2604893712f8 (physical_disk), policy: InService, state: Active } + 111f7a4e-5696-4be8-b13d-8ef314bc83e0 (zpool) + SledDisk { disk_identity: DiskIdentity { vendor: "fake-vendor", model: "fake-model", serial: "serial-111f7a4e-5696-4be8-b13d-8ef314bc83e0" }, disk_id: 1f77c099-8205-41b3-ac34-3807f3bbaf56 (physical_disk), policy: InService, state: Active } + 5a1786e9-770d-4ac9-b291-4501398170b5 (zpool) + SledDisk { disk_identity: DiskIdentity { vendor: "fake-vendor", model: "fake-model", serial: "serial-5a1786e9-770d-4ac9-b291-4501398170b5" }, disk_id: b111a961-be34-4ede-80e2-ef92af5e0a1f (physical_disk), policy: InService, state: Active } + 658fef3f-c3cd-4e6d-8823-79f9a0bec4c0 (zpool) + SledDisk { disk_identity: DiskIdentity { vendor: "fake-vendor", model: "fake-model", serial: "serial-658fef3f-c3cd-4e6d-8823-79f9a0bec4c0" }, disk_id: b3a01997-9894-4abd-83ad-e2d520d4c3a0 (physical_disk), policy: InService, state: Active } + 73ce66f5-a39a-4dd1-ad84-5647a5038d35 (zpool) + SledDisk { disk_identity: DiskIdentity { vendor: "fake-vendor", model: "fake-model", serial: "serial-73ce66f5-a39a-4dd1-ad84-5647a5038d35" }, disk_id: 48568b33-8f21-4537-b330-666aa3334236 (physical_disk), policy: InService, state: Active } + 7480aa69-3a3d-478d-bbdb-ba1fb74752ef (zpool) + SledDisk { disk_identity: DiskIdentity { vendor: "fake-vendor", model: "fake-model", serial: "serial-7480aa69-3a3d-478d-bbdb-ba1fb74752ef" }, disk_id: 9a968677-4da7-40b3-9579-9c54a7620b58 (physical_disk), policy: InService, state: Active } + 9ff438c6-00bb-4daf-9013-87969c892b02 (zpool) + SledDisk 
{ disk_identity: DiskIdentity { vendor: "fake-vendor", model: "fake-model", serial: "serial-9ff438c6-00bb-4daf-9013-87969c892b02" }, disk_id: cc22404e-8a30-4b98-9552-790e84a162bd (physical_disk), policy: InService, state: Active } + ad0602bf-f577-401a-a28b-687c3d86f6bb (zpool) + SledDisk { disk_identity: DiskIdentity { vendor: "fake-vendor", model: "fake-model", serial: "serial-ad0602bf-f577-401a-a28b-687c3d86f6bb" }, disk_id: 32baf388-4cd9-4435-b70b-d8b2e515d918 (physical_disk), policy: InService, state: Active } + da6e6a21-8d32-46f9-a2b3-635f6700c3f0 (zpool) + SledDisk { disk_identity: DiskIdentity { vendor: "fake-vendor", model: "fake-model", serial: "serial-da6e6a21-8d32-46f9-a2b3-635f6700c3f0" }, disk_id: 1e7ee543-fe10-4ba7-b8f3-d579e8e0803a (physical_disk), policy: InService, state: Active } + + +> sled-update-sp dde1c0e2-b10d-4621-b420-f179f7a7a00a --active 4.0.0 --inactive 5.0.0 +set sled dde1c0e2-b10d-4621-b420-f179f7a7a00a SP versions: active -> 4.0.0, inactive -> 5.0.0 + +> sled-show dde1c0e2-b10d-4621-b420-f179f7a7a00a +sled dde1c0e2-b10d-4621-b420-f179f7a7a00a +serial serial0 +subnet fd00:1122:3344:101::/64 +SP active version: Some("4.0.0") +SP inactive version: Some("5.0.0") +zpools (10): + 0f3f1de0-7e5a-4032-a73a-74fbdabbd2fa (zpool) + SledDisk { disk_identity: DiskIdentity { vendor: "fake-vendor", model: "fake-model", serial: "serial-0f3f1de0-7e5a-4032-a73a-74fbdabbd2fa" }, disk_id: 2dbf19d4-7b7d-48d5-9d1c-64ac2922093b (physical_disk), policy: InService, state: Active } + 104f891f-e018-4787-a346-3cfaa6cc7e9d (zpool) + SledDisk { disk_identity: DiskIdentity { vendor: "fake-vendor", model: "fake-model", serial: "serial-104f891f-e018-4787-a346-3cfaa6cc7e9d" }, disk_id: 301ab9e6-bdc1-4287-a37d-2604893712f8 (physical_disk), policy: InService, state: Active } + 111f7a4e-5696-4be8-b13d-8ef314bc83e0 (zpool) + SledDisk { disk_identity: DiskIdentity { vendor: "fake-vendor", model: "fake-model", serial: "serial-111f7a4e-5696-4be8-b13d-8ef314bc83e0" }, disk_id: 1f77c099-8205-41b3-ac34-3807f3bbaf56 (physical_disk), policy: InService, state: Active } + 5a1786e9-770d-4ac9-b291-4501398170b5 (zpool) + SledDisk { disk_identity: DiskIdentity { vendor: "fake-vendor", model: "fake-model", serial: "serial-5a1786e9-770d-4ac9-b291-4501398170b5" }, disk_id: b111a961-be34-4ede-80e2-ef92af5e0a1f (physical_disk), policy: InService, state: Active } + 658fef3f-c3cd-4e6d-8823-79f9a0bec4c0 (zpool) + SledDisk { disk_identity: DiskIdentity { vendor: "fake-vendor", model: "fake-model", serial: "serial-658fef3f-c3cd-4e6d-8823-79f9a0bec4c0" }, disk_id: b3a01997-9894-4abd-83ad-e2d520d4c3a0 (physical_disk), policy: InService, state: Active } + 73ce66f5-a39a-4dd1-ad84-5647a5038d35 (zpool) + SledDisk { disk_identity: DiskIdentity { vendor: "fake-vendor", model: "fake-model", serial: "serial-73ce66f5-a39a-4dd1-ad84-5647a5038d35" }, disk_id: 48568b33-8f21-4537-b330-666aa3334236 (physical_disk), policy: InService, state: Active } + 7480aa69-3a3d-478d-bbdb-ba1fb74752ef (zpool) + SledDisk { disk_identity: DiskIdentity { vendor: "fake-vendor", model: "fake-model", serial: "serial-7480aa69-3a3d-478d-bbdb-ba1fb74752ef" }, disk_id: 9a968677-4da7-40b3-9579-9c54a7620b58 (physical_disk), policy: InService, state: Active } + 9ff438c6-00bb-4daf-9013-87969c892b02 (zpool) + SledDisk { disk_identity: DiskIdentity { vendor: "fake-vendor", model: "fake-model", serial: "serial-9ff438c6-00bb-4daf-9013-87969c892b02" }, disk_id: cc22404e-8a30-4b98-9552-790e84a162bd (physical_disk), policy: InService, state: Active } + 
ad0602bf-f577-401a-a28b-687c3d86f6bb (zpool) + SledDisk { disk_identity: DiskIdentity { vendor: "fake-vendor", model: "fake-model", serial: "serial-ad0602bf-f577-401a-a28b-687c3d86f6bb" }, disk_id: 32baf388-4cd9-4435-b70b-d8b2e515d918 (physical_disk), policy: InService, state: Active } + da6e6a21-8d32-46f9-a2b3-635f6700c3f0 (zpool) + SledDisk { disk_identity: DiskIdentity { vendor: "fake-vendor", model: "fake-model", serial: "serial-da6e6a21-8d32-46f9-a2b3-635f6700c3f0" }, disk_id: 1e7ee543-fe10-4ba7-b8f3-d579e8e0803a (physical_disk), policy: InService, state: Active } + + + > inventory-generate generated inventory collection 6e066695-94bc-4250-bd63-fd799c166cc1 from configured sleds @@ -99,7 +265,10 @@ result: > sled-show dde1c0e2-b10d-4621-b420-f179f7a7a00a sled dde1c0e2-b10d-4621-b420-f179f7a7a00a +serial serial0 subnet fd00:1122:3344:101::/64 +SP active version: Some("4.0.0") +SP inactive version: Some("5.0.0") zpools (10): 0f3f1de0-7e5a-4032-a73a-74fbdabbd2fa (zpool) SledDisk { disk_identity: DiskIdentity { vendor: "fake-vendor", model: "fake-model", serial: "serial-0f3f1de0-7e5a-4032-a73a-74fbdabbd2fa" }, disk_id: 2dbf19d4-7b7d-48d5-9d1c-64ac2922093b (physical_disk), policy: InService, state: Active } diff --git a/nexus/reconfigurator/planning/src/system.rs b/nexus/reconfigurator/planning/src/system.rs index dc4a5416ec7..9ef7702a480 100644 --- a/nexus/reconfigurator/planning/src/system.rs +++ b/nexus/reconfigurator/planning/src/system.rs @@ -7,6 +7,7 @@ use anyhow::{Context, anyhow, bail, ensure}; use gateway_client::types::RotState; +use gateway_client::types::SpComponentCaboose; use gateway_client::types::SpState; use indexmap::IndexMap; use ipnet::Ipv6Net; @@ -23,6 +24,7 @@ use nexus_sled_agent_shared::inventory::SledRole; use nexus_types::deployment::ClickhousePolicy; use nexus_types::deployment::CockroachDbClusterVersion; use nexus_types::deployment::CockroachDbSettings; +use nexus_types::deployment::ExpectedVersion; use nexus_types::deployment::OximeterReadPolicy; use nexus_types::deployment::PlanningInputBuilder; use nexus_types::deployment::Policy; @@ -35,6 +37,8 @@ use nexus_types::external_api::views::SledPolicy; use nexus_types::external_api::views::SledProvisionPolicy; use nexus_types::external_api::views::SledState; use nexus_types::inventory::BaseboardId; +use nexus_types::inventory::Caboose; +use nexus_types::inventory::CabooseWhich; use nexus_types::inventory::PowerState; use nexus_types::inventory::RotSlot; use nexus_types::inventory::SpType; @@ -58,6 +62,7 @@ use std::fmt::Debug; use std::net::Ipv4Addr; use std::net::Ipv6Addr; use std::sync::Arc; +use tufaceous_artifact::ArtifactVersion; /// Describes an actual or synthetic Oxide rack for planning and testing /// @@ -415,6 +420,43 @@ impl SystemDescription { Ok(self) } + /// Update the SP versions reported for a sled. + /// + /// Where `None` is provided, no changes are made. 
+ pub fn sled_update_sp_versions( + &mut self, + sled_id: SledUuid, + active_version: Option, + inactive_version: Option, + ) -> anyhow::Result<&mut Self> { + let sled = self.sleds.get_mut(&sled_id).with_context(|| { + format!("attempted to access sled {} not found in system", sled_id) + })?; + let sled = Arc::make_mut(sled); + sled.set_sp_versions(active_version, inactive_version); + Ok(self) + } + + pub fn sled_sp_active_version( + &self, + sled_id: SledUuid, + ) -> anyhow::Result> { + let sled = self.sleds.get(&sled_id).with_context(|| { + format!("attempted to access sled {} not found in system", sled_id) + })?; + Ok(sled.sp_active_caboose().map(|c| c.version.as_ref())) + } + + pub fn sled_sp_inactive_version( + &self, + sled_id: SledUuid, + ) -> anyhow::Result> { + let sled = self.sleds.get(&sled_id).with_context(|| { + format!("attempted to access sled {} not found in system", sled_id) + })?; + Ok(sled.sp_inactive_caboose().map(|c| c.version.as_ref())) + } + pub fn to_collection_builder(&self) -> anyhow::Result { let collector_label = self .collector @@ -433,6 +475,46 @@ impl SystemDescription { sp_state.clone(), ) .context("recording SP state")?; + + let baseboard_id = BaseboardId { + part_number: sp_state.model.clone(), + serial_number: sp_state.serial_number.clone(), + }; + if let Some(active) = &s.sp_active_caboose() { + builder + .found_caboose( + &baseboard_id, + CabooseWhich::SpSlot0, + "fake MGS 1", + SpComponentCaboose { + board: active.board.clone(), + epoch: None, + git_commit: active.git_commit.clone(), + name: active.name.clone(), + sign: active.sign.clone(), + version: active.version.clone(), + }, + ) + .context("recording SP active caboose")?; + } + + if let Some(inactive) = &s.sp_inactive_caboose() { + builder + .found_caboose( + &baseboard_id, + CabooseWhich::SpSlot1, + "fake MGS 1", + SpComponentCaboose { + board: inactive.board.clone(), + epoch: None, + git_commit: inactive.git_commit.clone(), + name: inactive.name.clone(), + sign: inactive.sign.clone(), + version: inactive.version.clone(), + }, + ) + .context("recording SP inactive caboose")?; + } } builder @@ -597,6 +679,8 @@ pub struct SledHwInventory<'a> { pub baseboard_id: &'a BaseboardId, pub sp: &'a nexus_types::inventory::ServiceProcessor, pub rot: &'a nexus_types::inventory::RotState, + pub sp_active: Option>, + pub sp_inactive: Option>, } /// Our abstract description of a `Sled` @@ -611,6 +695,8 @@ pub struct Sled { policy: SledPolicy, state: SledState, resources: SledResources, + sp_active_caboose: Option>, + sp_inactive_caboose: Option>, } impl Sled { @@ -750,6 +836,10 @@ impl Sled { }, state: SledState::Active, resources: SledResources { subnet: sled_subnet, zpools }, + sp_active_caboose: Some(Arc::new(Self::default_sp_caboose( + String::from("0.0.1"), + ))), + sp_inactive_caboose: None, } } @@ -780,6 +870,10 @@ impl Sled { }) .unwrap_or(Baseboard::Unknown); + let sp_active_caboose = + inventory_sp.as_ref().and_then(|hw| hw.sp_active.clone()); + let sp_inactive_caboose = + inventory_sp.as_ref().and_then(|hw| hw.sp_inactive.clone()); let inventory_sp = inventory_sp.map(|sledhw| { // RotStateV3 unconditionally sets all of these let sp_state = if sledhw.rot.slot_a_sha3_256_digest.is_some() @@ -887,6 +981,8 @@ impl Sled { policy: sled_policy, state: sled_state, resources: sled_resources, + sp_active_caboose, + sp_inactive_caboose, } } @@ -916,6 +1012,68 @@ impl Sled { fn sled_agent_inventory(&self) -> &Inventory { &self.inventory_sled_agent } + + fn sp_active_caboose(&self) -> Option<&Caboose> { + 
self.sp_active_caboose.as_deref() + } + + fn sp_inactive_caboose(&self) -> Option<&Caboose> { + self.sp_inactive_caboose.as_deref() + } + + /// Update the reported SP versions + /// + /// If either field is `None`, that field is _unchanged_. + // Note that this means there's no way to _unset_ the version. + fn set_sp_versions( + &mut self, + active_version: Option, + inactive_version: Option, + ) { + if let Some(active_version) = active_version { + match &mut self.sp_active_caboose { + Some(caboose) => { + Arc::make_mut(caboose).version = active_version.to_string() + } + new @ None => { + *new = Some(Arc::new(Self::default_sp_caboose( + active_version.to_string(), + ))); + } + } + } + + if let Some(inactive_version) = inactive_version { + match inactive_version { + ExpectedVersion::NoValidVersion => { + self.sp_inactive_caboose = None; + } + ExpectedVersion::Version(v) => { + match &mut self.sp_inactive_caboose { + Some(caboose) => { + Arc::make_mut(caboose).version = v.to_string() + } + new @ None => { + *new = Some(Arc::new(Self::default_sp_caboose( + v.to_string(), + ))); + } + } + } + } + } + } + + fn default_sp_caboose(version: String) -> Caboose { + let board = "SimGimletSp".to_string(); // XXX-dap constant + Caboose { + board: board.clone(), + git_commit: String::from("unknown"), + name: board, + version: version.to_string(), + sign: None, + } + } } #[derive(Clone, Copy, Debug)] diff --git a/nexus/reconfigurator/simulation/src/system.rs b/nexus/reconfigurator/simulation/src/system.rs index 137b7026abb..5b5662d5cb5 100644 --- a/nexus/reconfigurator/simulation/src/system.rs +++ b/nexus/reconfigurator/simulation/src/system.rs @@ -17,7 +17,7 @@ use nexus_types::{ Blueprint, BlueprintTarget, SledFilter, UnstableReconfiguratorState, }, internal_api::params::{DnsConfigParams, DnsConfigZone}, - inventory::Collection, + inventory::{CabooseWhich, Collection}, }; use omicron_common::{address::IpRange, api::external::Generation}; use omicron_uuid_kinds::{BlueprintUuid, CollectionUuid, SledUuid}; @@ -687,7 +687,7 @@ impl SimSystemBuilderInner { else { res.warnings.push(format!( "sled {}: skipped (no inventory found for sled agent in \ - collection {}", + collection {})", sled_id, primary_collection_id )); continue; @@ -699,11 +699,19 @@ impl SimSystemBuilderInner { .and_then(|baseboard_id| { let inv_sp = primary_collection.sps.get(baseboard_id); let inv_rot = primary_collection.rots.get(baseboard_id); + let sp_active = primary_collection + .caboose_for(CabooseWhich::SpSlot0, baseboard_id) + .map(|c| c.caboose.clone()); + let sp_inactive = primary_collection + .caboose_for(CabooseWhich::SpSlot1, baseboard_id) + .map(|c| c.caboose.clone()); if let (Some(inv_sp), Some(inv_rot)) = (inv_sp, inv_rot) { Some(SledHwInventory { baseboard_id: &baseboard_id, sp: inv_sp, rot: inv_rot, + sp_active, + sp_inactive, }) } else { None diff --git a/nexus/types/src/deployment.rs b/nexus/types/src/deployment.rs index 16ed5a6e74e..840a70180b9 100644 --- a/nexus/types/src/deployment.rs +++ b/nexus/types/src/deployment.rs @@ -1381,6 +1381,15 @@ impl FromStr for ExpectedVersion { } } +impl fmt::Display for ExpectedVersion { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + ExpectedVersion::NoValidVersion => f.write_str("invalid"), + ExpectedVersion::Version(v) => v.fmt(f), + } + } +} + /// Describes the expected active RoT slot, and the version we expect to find for it #[derive( Clone, Debug, Eq, PartialEq, JsonSchema, Deserialize, Serialize, Diffable, From 
7705d1cc1ef8c13922ab2953a6a1329d99d70e8b Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Wed, 4 Jun 2025 15:36:24 -0700 Subject: [PATCH 27/32] rustfmt --- nexus/reconfigurator/planning/src/planner.rs | 29 ++++++++++---------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/nexus/reconfigurator/planning/src/planner.rs b/nexus/reconfigurator/planning/src/planner.rs index 1452a574d0a..afbfc50118c 100644 --- a/nexus/reconfigurator/planning/src/planner.rs +++ b/nexus/reconfigurator/planning/src/planner.rs @@ -925,21 +925,20 @@ impl<'a> Planner<'a> { .all_sleds(SledFilter::SpsUpdatedByReconfigurator) .map(|(_sled_id, details)| &details.baseboard_id) .collect(); - let included_baseboards = self - .inventory - .sps - .iter() - .filter_map(|(baseboard_id, sp_state)| { - let do_include = match sp_state.sp_type { - SpType::Sled => { - included_sled_baseboards.contains(baseboard_id.as_ref()) - } - SpType::Power => true, - SpType::Switch => true, - }; - do_include.then_some(baseboard_id.clone()) - }) - .collect(); + let included_baseboards = + self.inventory + .sps + .iter() + .filter_map(|(baseboard_id, sp_state)| { + let do_include = match sp_state.sp_type { + SpType::Sled => included_sled_baseboards + .contains(baseboard_id.as_ref()), + SpType::Power => true, + SpType::Switch => true, + }; + do_include.then_some(baseboard_id.clone()) + }) + .collect(); // Compute the new set of PendingMgsUpdates. let current_updates = From ceaaec39ceebb2be2cbab611eff3b2abf57966e9 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Wed, 4 Jun 2025 15:45:54 -0700 Subject: [PATCH 28/32] fix tests --- dev-tools/omdb/tests/usage_errors.out | 27 ++++++++++--------- ...ds-expunge-newly-added-external-dns-stdout | 1 + ...ds-expunge-newly-added-internal-dns-stdout | 1 + 3 files changed, 17 insertions(+), 12 deletions(-) diff --git a/dev-tools/omdb/tests/usage_errors.out b/dev-tools/omdb/tests/usage_errors.out index 68ff3c59272..16540a10234 100644 --- a/dev-tools/omdb/tests/usage_errors.out +++ b/dev-tools/omdb/tests/usage_errors.out @@ -507,19 +507,22 @@ Options: Show sleds that match the given filter Possible values: - - all: All sleds in the system, regardless of policy or state - - commissioned: All sleds that are currently part of the control plane cluster - - decommissioned: All sleds that were previously part of the control plane + - all: All sleds in the system, regardless of policy or state + - commissioned: All sleds that are currently part of the control plane + cluster + - decommissioned: All sleds that were previously part of the control plane cluster but have been decommissioned - - discretionary: Sleds that are eligible for discretionary services - - in-service: Sleds that are in service (even if they might not be eligible - for discretionary services) - - query-during-inventory: Sleds whose sled agents should be queried for inventory - - reservation-create: Sleds on which reservations can be created - - vpc-routing: Sleds which should be sent OPTE V2P mappings and Routing rules - - vpc-firewall: Sleds which should be sent VPC firewall rules - - tuf-artifact-replication: Sleds which should have TUF repo artifacts replicated onto - them + - discretionary: Sleds that are eligible for discretionary services + - in-service: Sleds that are in service (even if they might not be + eligible for discretionary services) + - query-during-inventory: Sleds whose sled agents should be queried for inventory + - reservation-create: Sleds on which reservations can be created + - vpc-routing: Sleds which should 
be sent OPTE V2P mappings and Routing + rules + - vpc-firewall: Sleds which should be sent VPC firewall rules + - tuf-artifact-replication: Sleds which should have TUF repo artifacts replicated + onto them + - sps-updated-by-reconfigurator: Sleds whose SPs should be updated by Reconfigurator --log-level log level filter diff --git a/dev-tools/reconfigurator-cli/tests/output/cmds-expunge-newly-added-external-dns-stdout b/dev-tools/reconfigurator-cli/tests/output/cmds-expunge-newly-added-external-dns-stdout index 390ad31b60f..a3ea1852524 100644 --- a/dev-tools/reconfigurator-cli/tests/output/cmds-expunge-newly-added-external-dns-stdout +++ b/dev-tools/reconfigurator-cli/tests/output/cmds-expunge-newly-added-external-dns-stdout @@ -1160,6 +1160,7 @@ INFO sufficient InternalDns zones exist in plan, desired_count: 3, current_count INFO added zone to sled, sled_id: a88790de-5962-4871-8686-61c1fd5b7094, kind: ExternalDns INFO sufficient Nexus zones exist in plan, desired_count: 3, current_count: 3 INFO sufficient Oximeter zones exist in plan, desired_count: 0, current_count: 0 +WARN cannot issue more SP updates (no current artifacts) INFO some zones not yet up-to-date, sled_id: a88790de-5962-4871-8686-61c1fd5b7094 INFO will ensure cockroachdb setting, setting: cluster.preserve_downgrade_option, value: DoNotModify generated blueprint 9c998c1d-1a7b-440a-ae0c-40f781dea6e2 based on parent blueprint 366b0b68-d80e-4bc1-abd3-dc69837847e0 diff --git a/dev-tools/reconfigurator-cli/tests/output/cmds-expunge-newly-added-internal-dns-stdout b/dev-tools/reconfigurator-cli/tests/output/cmds-expunge-newly-added-internal-dns-stdout index 31cd80e8e74..374bb711182 100644 --- a/dev-tools/reconfigurator-cli/tests/output/cmds-expunge-newly-added-internal-dns-stdout +++ b/dev-tools/reconfigurator-cli/tests/output/cmds-expunge-newly-added-internal-dns-stdout @@ -1379,6 +1379,7 @@ INFO added zone to sled, sled_id: d81c6a84-79b8-4958-ae41-ea46c9b19763, kind: In INFO sufficient ExternalDns zones exist in plan, desired_count: 3, current_count: 3 INFO sufficient Nexus zones exist in plan, desired_count: 3, current_count: 3 INFO sufficient Oximeter zones exist in plan, desired_count: 0, current_count: 0 +WARN cannot issue more SP updates (no current artifacts) INFO some zones not yet up-to-date, sled_id: d81c6a84-79b8-4958-ae41-ea46c9b19763 INFO will ensure cockroachdb setting, setting: cluster.preserve_downgrade_option, value: DoNotModify generated blueprint af934083-59b5-4bf6-8966-6fb5292c29e1 based on parent blueprint 58d5e830-0884-47d8-a7cd-b2b3751adeb4 From 4c6d9c1bb614b94e909cd2b5085f0cbfd30cd7c5 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Thu, 5 Jun 2025 12:25:31 -0700 Subject: [PATCH 29/32] use SIM_GIMLET_BOARD constant --- Cargo.lock | 1 + nexus/reconfigurator/planning/Cargo.toml | 1 + nexus/reconfigurator/planning/src/system.rs | 2 +- 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/Cargo.lock b/Cargo.lock index 4bbbb4baac5..e2120ae81d3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6493,6 +6493,7 @@ dependencies = [ "sled-agent-client", "slog", "slog-error-chain", + "sp-sim", "static_assertions", "strum", "test-strategy", diff --git a/nexus/reconfigurator/planning/Cargo.toml b/nexus/reconfigurator/planning/Cargo.toml index e7520fc8e3b..756d7dd604c 100644 --- a/nexus/reconfigurator/planning/Cargo.toml +++ b/nexus/reconfigurator/planning/Cargo.toml @@ -33,6 +33,7 @@ semver.workspace = true sled-agent-client.workspace = true slog.workspace = true slog-error-chain.workspace = true +sp-sim.workspace = 
true static_assertions.workspace = true strum.workspace = true thiserror.workspace = true diff --git a/nexus/reconfigurator/planning/src/system.rs b/nexus/reconfigurator/planning/src/system.rs index 9ef7702a480..841efd6702b 100644 --- a/nexus/reconfigurator/planning/src/system.rs +++ b/nexus/reconfigurator/planning/src/system.rs @@ -1065,7 +1065,7 @@ impl Sled { } fn default_sp_caboose(version: String) -> Caboose { - let board = "SimGimletSp".to_string(); // XXX-dap constant + let board = sp_sim::SIM_GIMLET_BOARD.to_string(); Caboose { board: board.clone(), git_commit: String::from("unknown"), From 709ae6df6857e0202749ba97998feba5b0dd55c8 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Thu, 5 Jun 2025 12:34:30 -0700 Subject: [PATCH 30/32] remove XXX --- nexus/reconfigurator/planning/src/planner.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/nexus/reconfigurator/planning/src/planner.rs b/nexus/reconfigurator/planning/src/planner.rs index afbfc50118c..4b50f9396cd 100644 --- a/nexus/reconfigurator/planning/src/planner.rs +++ b/nexus/reconfigurator/planning/src/planner.rs @@ -953,8 +953,7 @@ impl<'a> Planner<'a> { 1, ); - // XXX-dap this is not quite right. It's possible that no updates were - // planned but SPs aren't yet updated. + // TODO This is not quite right. See oxidecomputer/omicron#8285. let rv = if next.is_empty() { UpdateStepResult::ContinueToNextStep } else { From ac66c429d7b3589242f7d42d7c3172bdc5e99e11 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Wed, 11 Jun 2025 21:51:01 -0700 Subject: [PATCH 31/32] fix test --- dev-tools/reconfigurator-cli/tests/output/cmds-example-stdout | 1 + 1 file changed, 1 insertion(+) diff --git a/dev-tools/reconfigurator-cli/tests/output/cmds-example-stdout b/dev-tools/reconfigurator-cli/tests/output/cmds-example-stdout index c77200a56b1..eee75c23d89 100644 --- a/dev-tools/reconfigurator-cli/tests/output/cmds-example-stdout +++ b/dev-tools/reconfigurator-cli/tests/output/cmds-example-stdout @@ -497,6 +497,7 @@ WARN failed to place all new desired InternalDns zones, placed: 0, wanted_to_pla INFO sufficient ExternalDns zones exist in plan, desired_count: 0, current_count: 0 WARN failed to place all new desired Nexus zones, placed: 0, wanted_to_place: 3 INFO sufficient Oximeter zones exist in plan, desired_count: 0, current_count: 0 +WARN cannot issue more SP updates (no current artifacts) INFO some zones not yet up-to-date, sled_id: 89d02b1b-478c-401a-8e28-7a26f74fa41b INFO will ensure cockroachdb setting, setting: cluster.preserve_downgrade_option, value: DoNotModify generated blueprint 86db3308-f817-4626-8838-4085949a6a41 based on parent blueprint ade5749d-bdf3-4fab-a8ae-00bea01b3a5a From 29df3f65dd453ed3240bf3aaa6ab01269dd3e3ae Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Fri, 13 Jun 2025 10:40:07 -0700 Subject: [PATCH 32/32] review feedback --- nexus/reconfigurator/planning/src/planner.rs | 35 +++++++++++++++++--- 1 file changed, 30 insertions(+), 5 deletions(-) diff --git a/nexus/reconfigurator/planning/src/planner.rs b/nexus/reconfigurator/planning/src/planner.rs index 93574119172..9dab7e10139 100644 --- a/nexus/reconfigurator/planning/src/planner.rs +++ b/nexus/reconfigurator/planning/src/planner.rs @@ -52,6 +52,32 @@ pub use self::rng::SledPlannerRng; mod omicron_zone_placement; pub(crate) mod rng; +/// Maximum number of MGS-managed updates (updates to SP, RoT, RoT bootloader, +/// or host OS) that we allow to be pending across the whole system at one time +/// +/// For now, we limit this to 1 for safety. 
That's for a few reasons:
+///
+/// - SP updates reboot the corresponding host. Thus, if we have one of these
+///   updates outstanding, we should assume that host may be offline. Most
+///   control plane services are designed to survive multiple failures (e.g.,
+///   the Cockroach cluster can sustain two failures and stay online), but
+///   having one sled offline eats into that margin. And some services like
+///   Crucible volumes can only sustain one failure. Taking down two sleds
+///   would render unavailable any Crucible volumes with regions on those two
+///   sleds.
+///
+/// - There is unfortunately some risk in updating the RoT bootloader, in that
+///   there's a window where a failure could render the device unbootable. See
+///   oxidecomputer/omicron#7819 for more on this. Updating only one at a time
+///   helps mitigate this risk.
+///
+/// More sophisticated schemes are certainly possible (e.g., allocate Crucible
+/// regions in such a way that there are at least pairs of sleds we could update
+/// concurrently without taking volumes down; and/or be willing to update
+/// multiple sleds as long as they don't have overlapping control plane
+/// services, etc.).
+const NUM_CONCURRENT_MGS_UPDATES: usize = 1;
+
 enum UpdateStepResult {
     ContinueToNextStep,
     Waiting,
@@ -122,8 +148,7 @@ impl<'a> Planner<'a> {
         self.do_plan_expunge()?;
         self.do_plan_add()?;
         self.do_plan_decommission()?;
-        if let UpdateStepResult::ContinueToNextStep =
-            self.do_plan_mgs_updates()?
+        if let UpdateStepResult::ContinueToNextStep = self.do_plan_mgs_updates()
         {
             self.do_plan_zone_updates()?;
         }
@@ -914,7 +939,7 @@ impl<'a> Planner<'a> {

     /// Update at most one MGS-managed device (SP, RoT, etc.), if any are out of
     /// date.
-    fn do_plan_mgs_updates(&mut self) -> Result<UpdateStepResult, Error> {
+    fn do_plan_mgs_updates(&mut self) -> UpdateStepResult {
         // Determine which baseboards we will consider updating.
         //
         // Sleds may be present but not adopted as part of the control plane.
@@ -956,7 +981,7 @@ impl<'a> Planner<'a> {
             &included_baseboards,
             &current_updates,
             current_artifacts,
-            1,
+            NUM_CONCURRENT_MGS_UPDATES,
         );

         // TODO This is not quite right. See oxidecomputer/omicron#8285.
@@ -966,7 +991,7 @@ impl<'a> Planner<'a> {
             UpdateStepResult::Waiting
         };
         self.blueprint.pending_mgs_updates_replace_all(next);
-        Ok(rv)
+        rv
     }

     /// Update at most one existing zone to use a new image source.
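
The planner gating introduced in [PATCH 32/32] can also be read outside the diff context: a planning pass only moves on to zone image updates when the MGS-managed update step reports that nothing is pending, and that step plans at most NUM_CONCURRENT_MGS_UPDATES devices per pass. The Rust sketch below is a simplified, hypothetical stand-in rather than the actual omicron planner (which works against a blueprint builder, inventory, and planning input): the Planner struct, its pending_mgs_updates field, and the method bodies are illustrative assumptions, while UpdateStepResult, NUM_CONCURRENT_MGS_UPDATES, do_plan_mgs_updates, and do_plan_zone_updates mirror names from the patches above.

// Simplified sketch of the planning-pass gating (hypothetical stand-ins; not
// the real omicron planner API).

const NUM_CONCURRENT_MGS_UPDATES: usize = 1;

enum UpdateStepResult {
    // No MGS-managed updates are pending; zone updates may proceed.
    ContinueToNextStep,
    // At least one SP/RoT/host OS update is still pending; hold off.
    Waiting,
}

struct Planner {
    // Stand-in for the pending MGS-managed updates computed during planning.
    pending_mgs_updates: Vec<String>,
}

impl Planner {
    // Plan at most NUM_CONCURRENT_MGS_UPDATES MGS-managed updates and report
    // whether the planner may move on to zone (image source) updates.
    fn do_plan_mgs_updates(&mut self) -> UpdateStepResult {
        // Stand-in for plan_mgs_updates(): cap the pending set at the limit.
        self.pending_mgs_updates.truncate(NUM_CONCURRENT_MGS_UPDATES);
        if self.pending_mgs_updates.is_empty() {
            UpdateStepResult::ContinueToNextStep
        } else {
            UpdateStepResult::Waiting
        }
    }

    fn do_plan_zone_updates(&mut self) -> Result<(), String> {
        // Stand-in: the real step updates at most one zone's image source.
        Ok(())
    }

    fn do_plan(&mut self) -> Result<(), String> {
        // Zone updates are gated on the MGS-update step reporting completion,
        // matching the `if let` in the diff above.
        if let UpdateStepResult::ContinueToNextStep = self.do_plan_mgs_updates()
        {
            self.do_plan_zone_updates()?;
        }
        Ok(())
    }
}

fn main() {
    // With one SP update pending, this pass plans it and skips zone updates.
    let mut planner =
        Planner { pending_mgs_updates: vec![String::from("sled-0 SP")] };
    let _ = planner.do_plan();
}

The last hunks of the patch also drop the Result wrapper from do_plan_mgs_updates (returning rv instead of Ok(rv)), since the step no longer has an error path in this version; the Waiting/ContinueToNextStep distinction alone decides whether zone updates run.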