From 65f6ffef62a436d570c8b481f4214e41c6ae6e2b Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 30 Oct 2025 13:09:32 -0700 Subject: [PATCH 01/23] [nexus] initial schema for sitrep tables --- nexus/db-schema/src/schema.rs | 24 ++++++++++ schema/crdb/dbinit.sql | 82 +++++++++++++++++++++++++++++++++++ 2 files changed, 106 insertions(+) diff --git a/nexus/db-schema/src/schema.rs b/nexus/db-schema/src/schema.rs index ae62b1a327..db682f05d7 100644 --- a/nexus/db-schema/src/schema.rs +++ b/nexus/db-schema/src/schema.rs @@ -2812,3 +2812,27 @@ table! { bearer_token -> Text, } } + +table! { + fm_sitrep (id) { + id -> Uuid, + parent_sitrep_id -> Nullable, + inv_collection_id -> Uuid, + time_created -> Timestamptz, + creator_id -> Uuid, + comment -> Text, + } +} + +allow_tables_to_appear_in_same_query!(fm_sitrep, inv_collection); + +table! { + fm_current_sitrep (version) { + version -> Int8, + sitrep_id -> Uuid, + response_authorized -> Bool, + time_made_current -> Timestamptz, + } +} + +allow_tables_to_appear_in_same_query!(fm_sitrep, fm_current_sitrep); diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index ca22be8fd9..62a19eff08 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -6767,6 +6767,88 @@ ON omicron.public.host_ereport ( ) WHERE time_deleted IS NULL; +/* + * Fault management situation reports (and accessories) + * + * See RFD 603 for details: + * https://rfd.shared.oxide.computer/rfd/603 +*/ + +CREATE TABLE IF NOT EXISTS omicron.public.fm_sitrep ( + -- The ID of this sitrep. + id UUID PRIMARY KEY, + -- The ID of the parent sitrep. + -- + -- A sitrep's _parent_ is the sitrep that was current when the planning + -- phase that produced that sitrep ran. The parent sitrep is a planning + -- input that produced this sitrep. + -- + -- This is effectively a foreign key back to this table; however, it is + -- allowed to be NULL: the initial sitrep has no parent. 
Additionally, + -- it may be non-NULL but no longer reference a row in this table: once a + -- child sitrep has been created from a parent, it's possible for the + -- parent to be deleted. We do not NULL out this field on such a deletion, + -- so we can always see that there had been a particular parent even if + -- it's now gone. + parent_sitrep_id UUID, + -- The ID of the inventory collection that was used as input to this + -- sitrep. + -- + -- This is a foreign key that references a row in the `inv_collection` + -- table (and other inventory records associated with that collection). + -- + -- Note that inventory collections are pruned on a separate schedule + -- from sitreps, so the inventory collection records may not exist. + inv_collection_id UUID NOT NULL, + + -- These fields are not semantically meaningful and are intended + -- for debugging purposes. + + -- The time at which this sitrep was created. + time_created TIMESTAMPTZ NOT NULL, + -- The Omicron zone UUID of the Nexus instance that created this + -- sitrep. + creator_id UUID NOT NULL, + -- A human-readable description of the changes represented by this + -- sitrep. + comment TEXT NOT NULL +); + +-- The history of current sitreps. +-- +-- The sitrep with the highest `version` in this table is the current sitrep. +CREATE TABLE IF NOT EXISTS omicron.public.fm_current_sitrep ( + -- Monotonically increasing version for all FM sitreps. + version INT8 PRIMARY KEY, + + -- Effectively a foreign key into the `fm_sitrep` table, but may + -- reference a fm_sitrep that has been deleted (if this sitrep is + -- no longer current; the current sitrep must not be deleted). + sitrep_id UUID NOT NULL, + + -- Whether potentially-destructive automated response actions + -- are cleared hot. + -- + -- This is similar to the `enabled` column in `bp_target`, in + -- that it allows automated response to be disabled by an + -- operator in case of danger. 
+ -- + -- However, it's a bit different from its blueprint counterpart: + -- it only disables potentially destructive automated response + -- actions. The FM system will still continue to request polling + -- and diagnose active problems. It seems important to still be + -- able to both detect and diagnose Active Problems even when + -- disabling destructive automated response. + response_authorized BOOL NOT NULL, + + -- Timestamp for when this sitrep was made current. + time_made_current TIMESTAMPTZ NOT NULL +); + +CREATE UNIQUE INDEX IF NOT EXISTS + lookup_sitrep_history_by_id +ON omicron.public.fm_sitrep_history (sitrep_id); + -- Metadata for the schema itself. -- -- This table may be read by Nexuses with different notions of "what the schema should be". From be4b7fff020a026d072d35ac3089340ecfba97d8 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Sun, 19 Oct 2025 12:13:42 -0700 Subject: [PATCH 02/23] [nexus] sitrep types --- nexus/db-model/src/fm.rs | 29 +++++++++++++++++++++++++++++ nexus/types/src/fm.rs | 20 ++++++++++++++++++++ nexus/types/src/lib.rs | 1 + uuid-kinds/src/lib.rs | 1 + 4 files changed, 51 insertions(+) create mode 100644 nexus/db-model/src/fm.rs create mode 100644 nexus/types/src/fm.rs diff --git a/nexus/db-model/src/fm.rs b/nexus/db-model/src/fm.rs new file mode 100644 index 0000000000..acb9ba5e80 --- /dev/null +++ b/nexus/db-model/src/fm.rs @@ -0,0 +1,29 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Types for representing fault management sitreps in the database. 
+ +use crate::SqlU32; +use crate::typed_uuid::DbTypedUuid; +use chrono::{DateTime, Utc}; +use omicron_uuid_kinds::{CollectionKind, OmicronZoneKind, SitrepKind}; + +#[derive(Queryable, Insertable, Clone, Debug, Selectable)] +#[diesel(table_name = fm_sitrep)] +pub struct SitrepMetadata { + pub id: DbTypedUuid, + pub parent_sitrep_id: Option>, + pub inv_collection_id: DbTypedUuid, + pub creator_id: DbTypedUuid, + pub comment: String, +} + +#[derive(Queryable, Clone, Debug, Selectable, Insertable)] +#[diesel(table_name = fm_current_sitrep)] +pub struct CurrentSitrep { + pub version: SqlU32, + pub sitrep_id: DbTypedUuid, + pub response_authorized: bool, + pub time_made_current: DateTime, +} diff --git a/nexus/types/src/fm.rs b/nexus/types/src/fm.rs new file mode 100644 index 0000000000..9578e62fe0 --- /dev/null +++ b/nexus/types/src/fm.rs @@ -0,0 +1,20 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Fault management types. 
+ +use chrono::{DateTime, Utc}; +use omicron_uuid_kinds::{CollectionUuid, OmicronZoneUuid, SitrepUuid}; +use schemars::JsonSchema; +use serde::{Deserialize, Serialize}; + +#[derive(Clone, Debug, Eq, PartialEq, JsonSchema, Deserialize, Serialize)] +pub struct SitrepMetadata { + pub id: SitrepUuid, + pub parent_sitrep_id: Option, + pub inv_collection_id: CollectionUuid, + pub creator_id: OmicronZoneUuid, + pub comment: String, + pub time_created: DateTime, +} diff --git a/nexus/types/src/lib.rs b/nexus/types/src/lib.rs index 3a4056f7f4..fd1f2243bf 100644 --- a/nexus/types/src/lib.rs +++ b/nexus/types/src/lib.rs @@ -32,6 +32,7 @@ pub mod authn; pub mod deployment; pub mod external_api; +pub mod fm; pub mod identity; pub mod internal_api; pub mod inventory; diff --git a/uuid-kinds/src/lib.rs b/uuid-kinds/src/lib.rs index 354c677149..54c497a5d2 100644 --- a/uuid-kinds/src/lib.rs +++ b/uuid-kinds/src/lib.rs @@ -74,6 +74,7 @@ impl_typed_uuid_kinds! { Region = {}, SiloGroup = {}, SiloUser = {}, + Sitrep = {}, Sled = {}, SpUpdate = {}, SupportBundle = {}, From c3b36f029a20d87be5495e7a6ab4e75d29794ed6 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 24 Oct 2025 12:41:33 -0700 Subject: [PATCH 03/23] [nexus] finish loader task, other stuff --- nexus-config/src/nexus_config.rs | 17 ++ nexus/background-task-interface/src/init.rs | 1 + nexus/db-model/src/fm.rs | 29 --- nexus/db-model/src/fm_sitrep.rs | 63 ++++++ nexus/db-model/src/lib.rs | 2 + nexus/db-queries/src/db/datastore/fm.rs | 92 +++++++++ nexus/db-queries/src/db/datastore/mod.rs | 1 + nexus/db-schema/src/schema.rs | 5 +- nexus/src/app/background/init.rs | 24 ++- .../app/background/tasks/fm_sitrep_load.rs | 179 ++++++++++++++++++ nexus/src/app/background/tasks/mod.rs | 1 + nexus/src/app/mod.rs | 10 + nexus/types/src/fm.rs | 13 ++ nexus/types/src/internal_api/background.rs | 13 ++ schema/crdb/dbinit.sql | 17 +- 15 files changed, 418 insertions(+), 49 deletions(-) delete mode 100644 nexus/db-model/src/fm.rs create 
mode 100644 nexus/db-model/src/fm_sitrep.rs create mode 100644 nexus/db-queries/src/db/datastore/fm.rs create mode 100644 nexus/src/app/background/tasks/fm_sitrep_load.rs diff --git a/nexus-config/src/nexus_config.rs b/nexus-config/src/nexus_config.rs index 588374407b..37c4446019 100644 --- a/nexus-config/src/nexus_config.rs +++ b/nexus-config/src/nexus_config.rs @@ -441,6 +441,8 @@ pub struct BackgroundTaskConfig { pub webhook_deliverator: WebhookDeliveratorConfig, /// configuration for SP ereport ingester task pub sp_ereport_ingester: SpEreportIngesterConfig, + /// configuration for fault management background tasks + pub fm: FmTasksConfig, } #[serde_as] @@ -870,6 +872,21 @@ impl Default for SpEreportIngesterConfig { } } +#[serde_as] +#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)] +pub struct FmTasksConfig { + /// period (in seconds) for periodic activations of the background task that + /// reads the latest fault management sitrep from the database. + #[serde_as(as = "DurationSeconds")] + pub sitrep_load_period_secs: Duration, +} + +impl Default for FmTasksConfig { + fn default() -> Self { + Self { sitrep_load_period_secs: Duration::from_secs(15) } + } +} + /// Configuration for a nexus server #[derive(Clone, Debug, Deserialize, PartialEq, Serialize)] pub struct PackageConfig { diff --git a/nexus/background-task-interface/src/init.rs b/nexus/background-task-interface/src/init.rs index bc71033bf3..8ea150eaef 100644 --- a/nexus/background-task-interface/src/init.rs +++ b/nexus/background-task-interface/src/init.rs @@ -51,6 +51,7 @@ pub struct BackgroundTasks { pub task_webhook_deliverator: Activator, pub task_sp_ereport_ingester: Activator, pub task_reconfigurator_config_loader: Activator, + pub task_fm_sitrep_loader: Activator, // Handles to activate background tasks that do not get used by Nexus // at-large. 
These background tasks are implementation details as far as diff --git a/nexus/db-model/src/fm.rs b/nexus/db-model/src/fm.rs deleted file mode 100644 index acb9ba5e80..0000000000 --- a/nexus/db-model/src/fm.rs +++ /dev/null @@ -1,29 +0,0 @@ -// This Source Code Form is subject to the terms of the Mozilla Public -// License, v. 2.0. If a copy of the MPL was not distributed with this -// file, You can obtain one at https://mozilla.org/MPL/2.0/. - -//! Types for representing fault management sitreps in the database. - -use crate::SqlU32; -use crate::typed_uuid::DbTypedUuid; -use chrono::{DateTime, Utc}; -use omicron_uuid_kinds::{CollectionKind, OmicronZoneKind, SitrepKind}; - -#[derive(Queryable, Insertable, Clone, Debug, Selectable)] -#[diesel(table_name = fm_sitrep)] -pub struct SitrepMetadata { - pub id: DbTypedUuid, - pub parent_sitrep_id: Option>, - pub inv_collection_id: DbTypedUuid, - pub creator_id: DbTypedUuid, - pub comment: String, -} - -#[derive(Queryable, Clone, Debug, Selectable, Insertable)] -#[diesel(table_name = fm_current_sitrep)] -pub struct CurrentSitrep { - pub version: SqlU32, - pub sitrep_id: DbTypedUuid, - pub response_authorized: bool, - pub time_made_current: DateTime, -} diff --git a/nexus/db-model/src/fm_sitrep.rs b/nexus/db-model/src/fm_sitrep.rs new file mode 100644 index 0000000000..a22232b2bb --- /dev/null +++ b/nexus/db-model/src/fm_sitrep.rs @@ -0,0 +1,63 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Types for representing fault management sitreps in the database. 
+ +use crate::SqlU32; +use crate::typed_uuid::DbTypedUuid; +use chrono::{DateTime, Utc}; +use nexus_db_schema::schema::{fm_sitrep, fm_sitrep_version}; +use omicron_uuid_kinds::{CollectionKind, OmicronZoneKind, SitrepKind}; + +#[derive(Queryable, Insertable, Clone, Debug, Selectable)] +#[diesel(table_name = fm_sitrep)] +pub struct SitrepMetadata { + pub id: DbTypedUuid, + pub parent_sitrep_id: Option>, + pub inv_collection_id: DbTypedUuid, + pub creator_id: DbTypedUuid, + pub comment: String, + pub time_created: DateTime, +} + +impl From for nexus_types::fm::SitrepMetadata { + fn from(db_meta: SitrepMetadata) -> Self { + let SitrepMetadata { + id, + parent_sitrep_id, + inv_collection_id, + creator_id, + comment, + time_created, + } = db_meta; + Self { + id: id.into(), + parent_sitrep_id: parent_sitrep_id.map(Into::into), + inv_collection_id: inv_collection_id.into(), + creator_id: creator_id.into(), + comment, + time_created, + } + } +} + +#[derive(Queryable, Clone, Debug, Selectable, Insertable)] +#[diesel(table_name = fm_sitrep_version)] +pub struct SitrepVersion { + pub version: SqlU32, + pub sitrep_id: DbTypedUuid, + pub time_made_current: DateTime, +} + +impl From for nexus_types::fm::SitrepVersion { + fn from(db_version: SitrepVersion) -> Self { + let SitrepVersion { sitrep_id, version, time_made_current } = + db_version; + Self { + id: sitrep_id.into(), + version: version.into(), + time_made_current, + } + } +} diff --git a/nexus/db-model/src/lib.rs b/nexus/db-model/src/lib.rs index 32e4d4747d..31803623c8 100644 --- a/nexus/db-model/src/lib.rs +++ b/nexus/db-model/src/lib.rs @@ -89,6 +89,7 @@ mod webhook_rx; // for join-based marker trait generation. 
mod deployment; mod ereport; +mod fm_sitrep; pub mod nat_entry; mod omicron_zone_config; mod quota; @@ -181,6 +182,7 @@ pub use dns::*; pub use downstairs::*; pub use ereport::*; pub use external_ip::*; +pub use fm_sitrep::*; pub use generation::*; pub use identity_provider::*; pub use image::*; diff --git a/nexus/db-queries/src/db/datastore/fm.rs b/nexus/db-queries/src/db/datastore/fm.rs new file mode 100644 index 0000000000..3339c2e01d --- /dev/null +++ b/nexus/db-queries/src/db/datastore/fm.rs @@ -0,0 +1,92 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! [`DataStore`] methods on fault management internal data. + +use super::DataStore; +use crate::authz; +use crate::context::OpContext; +use crate::db::model::SitrepMetadata; +use crate::db::model::SitrepVersion; +use async_bb8_diesel::AsyncRunQueryDsl; +use diesel::prelude::*; +use nexus_db_errors::ErrorHandler; +use nexus_db_errors::public_error_from_diesel; +use nexus_db_lookup::DbConnection; +use nexus_db_schema::schema::fm_sitrep::dsl as sitrep_dsl; +use nexus_db_schema::schema::fm_sitrep_version::dsl as current_sitrep_dsl; +use nexus_types::fm::Sitrep; +use omicron_common::api::external::Error; +use omicron_uuid_kinds::GenericUuid; +use omicron_uuid_kinds::SitrepUuid; + +impl DataStore { + pub async fn fm_get_current_sitrep_version( + &self, + opctx: &OpContext, + ) -> Result, Error> { + opctx.authorize(authz::Action::ListChildren, &authz::FLEET).await?; + let conn = self.pool_connection_authorized(opctx).await?; + self.fm_get_current_sitrep_version_on_conn(&conn).await + } + + async fn fm_get_current_sitrep_version_on_conn( + &self, + conn: &async_bb8_diesel::Connection, + ) -> Result, Error> { + current_sitrep_dsl::fm_sitrep_version + .order_by(current_sitrep_dsl::version.desc()) + .select(SitrepVersion::as_select()) + .first_async(conn) + 
.await + .optional() + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + } + + pub async fn fm_sitrep_metadata_read( + &self, + opctx: &OpContext, + id: SitrepUuid, + ) -> Result, Error> { + opctx.authorize(authz::Action::ListChildren, &authz::FLEET).await?; + let conn = self.pool_connection_authorized(opctx).await?; + self.fm_sitrep_metadata_read_on_conn(id, &conn).await + } + + async fn fm_sitrep_metadata_read_on_conn( + &self, + id: SitrepUuid, + conn: &async_bb8_diesel::Connection, + ) -> Result, Error> { + sitrep_dsl::fm_sitrep + .filter(sitrep_dsl::id.eq(id.into_untyped_uuid())) + .select(SitrepMetadata::as_select()) + .first_async(conn) + .await + .optional() + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + } + + pub async fn fm_sitrep_read( + &self, + opctx: &OpContext, + id: SitrepUuid, + ) -> Result { + opctx.authorize(authz::Action::ListChildren, &authz::FLEET).await?; + let conn = self.pool_connection_authorized(opctx).await?; + + let metadata = self + .fm_sitrep_metadata_read_on_conn(id, &conn) + .await? + .ok_or_else(|| { + Error::non_resourcetype_not_found(format!("sitrep {id:?}")) + })? + .into(); + + // TODO(eliza): this is where we would read all the other sitrep data, + // if there was any. + + Ok(Sitrep { metadata }) + } +} diff --git a/nexus/db-queries/src/db/datastore/mod.rs b/nexus/db-queries/src/db/datastore/mod.rs index 3667323d49..fd434eff4e 100644 --- a/nexus/db-queries/src/db/datastore/mod.rs +++ b/nexus/db-queries/src/db/datastore/mod.rs @@ -71,6 +71,7 @@ mod disk; mod dns; mod ereport; mod external_ip; +mod fm; mod identity_provider; mod image; pub mod instance; diff --git a/nexus/db-schema/src/schema.rs b/nexus/db-schema/src/schema.rs index db682f05d7..16f1b6869e 100644 --- a/nexus/db-schema/src/schema.rs +++ b/nexus/db-schema/src/schema.rs @@ -2827,12 +2827,11 @@ table! { allow_tables_to_appear_in_same_query!(fm_sitrep, inv_collection); table! 
{ - fm_current_sitrep (version) { + fm_sitrep_version (version) { version -> Int8, sitrep_id -> Uuid, - response_authorized -> Bool, time_made_current -> Timestamptz, } } -allow_tables_to_appear_in_same_query!(fm_sitrep, fm_current_sitrep); +allow_tables_to_appear_in_same_query!(fm_sitrep, fm_sitrep_version); diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs index c4a788f78f..de240a5056 100644 --- a/nexus/src/app/background/init.rs +++ b/nexus/src/app/background/init.rs @@ -103,6 +103,7 @@ use super::tasks::dns_propagation; use super::tasks::dns_servers; use super::tasks::ereport_ingester; use super::tasks::external_endpoints; +use super::tasks::fm_sitrep_load; use super::tasks::instance_reincarnation; use super::tasks::instance_updater; use super::tasks::instance_watcher; @@ -145,6 +146,7 @@ use nexus_db_queries::db::DataStore; use nexus_types::deployment::Blueprint; use nexus_types::deployment::BlueprintTarget; use nexus_types::deployment::PendingMgsUpdates; +use nexus_types::fm; use nexus_types::inventory::Collection; use omicron_uuid_kinds::OmicronZoneUuid; use oximeter::types::ProducerRegistry; @@ -254,6 +256,7 @@ impl BackgroundTasksInitializer { task_webhook_deliverator: Activator::new(), task_sp_ereport_ingester: Activator::new(), task_reconfigurator_config_loader: Activator::new(), + task_fm_sitrep_loader: Activator::new(), task_internal_dns_propagation: Activator::new(), task_external_dns_propagation: Activator::new(), @@ -334,6 +337,7 @@ impl BackgroundTasksInitializer { task_webhook_deliverator, task_sp_ereport_ingester, task_reconfigurator_config_loader, + task_fm_sitrep_loader, // Add new background tasks here. Be sure to use this binding in a // call to `Driver::register()` below. That's what actually wires // up the Activator to the corresponding background task. 
@@ -1045,7 +1049,7 @@ impl BackgroundTasksInitializer { description: "collects error reports from service processors", period: config.sp_ereport_ingester.period_secs, task_impl: Box::new(ereport_ingester::SpEreportIngester::new( - datastore, + datastore.clone(), resolver, nexus_id, config.sp_ereport_ingester.disable, @@ -1055,6 +1059,21 @@ impl BackgroundTasksInitializer { activator: task_sp_ereport_ingester, }); + driver.register(TaskDefinition { + name: "fm_sitrep_loader", + description: + "loads the current fault management situation report from \ + the database", + period: config.fm.sitrep_load_period_secs, + task_impl: Box::new(fm_sitrep_load::SitrepLoader::new( + datastore, + args.sitrep_load_tx, + )), + opctx: opctx.child(BTreeMap::new()), + watchers: vec![], + activator: task_fm_sitrep_loader, + }); + driver } } @@ -1093,6 +1112,9 @@ pub struct BackgroundTasksData { pub mgs_updates_tx: watch::Sender, /// handle for controlling Nexus quiesce pub nexus_quiesce: NexusQuiesceHandle, + /// Channel for exposing the latest loaded fault-management sitrep. + pub sitrep_load_tx: + watch::Sender>>, } /// Starts the three DNS-propagation-related background tasks for either diff --git a/nexus/src/app/background/tasks/fm_sitrep_load.rs b/nexus/src/app/background/tasks/fm_sitrep_load.rs new file mode 100644 index 0000000000..09c7a19973 --- /dev/null +++ b/nexus/src/app/background/tasks/fm_sitrep_load.rs @@ -0,0 +1,179 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Background task for loading the current fault management sitrep +//! 
from the DB + +use crate::app::background::BackgroundTask; +use chrono::Utc; +use futures::future::BoxFuture; +use nexus_db_queries::context::OpContext; +use nexus_db_queries::db::DataStore; +use nexus_types::fm::Sitrep; +use nexus_types::fm::SitrepVersion; +use nexus_types::internal_api::background::SitrepLoadStatus as Status; +use serde_json::json; +use slog_error_chain::InlineErrorChain; +use std::sync::Arc; +use tokio::sync::watch; + +pub struct SitrepLoader { + datastore: Arc, + tx: watch::Sender, +} + +type CurrentSitrep = Option>; + +impl BackgroundTask for SitrepLoader { + fn activate<'a>( + &'a mut self, + opctx: &'a OpContext, + ) -> BoxFuture<'a, serde_json::Value> { + Box::pin(async { + let status = self.load_if_needed(opctx).await; + match serde_json::to_value(status) { + Ok(val) => val, + Err(err) => { + let err = format!( + "could not serialize task status: {}", + InlineErrorChain::new(&err) + ); + json!({ "error": err }) + } + } + }) + } +} + +impl SitrepLoader { + pub fn new( + datastore: Arc, + tx: watch::Sender, + ) -> Self { + Self { datastore, tx } + } + + #[allow(dead_code)] // subsequent PRs will consume this + pub fn watcher(&self) -> watch::Receiver { + self.tx.subscribe() + } + + async fn load_if_needed(&self, opctx: &OpContext) -> Status { + // Set up a logger for this activation that includes metadata about + // the current sitrep. + let (old, log) = match &*self.tx.borrow() { + None => (None, opctx.log.clone()), + Some(old) => { + let (ref old_version, _) = **old; + let log = opctx.log.new(slog::o!( + // since this is a TypedUuid, use `Debug` to avoid + // including () + "original_id" => format!("{:?}", old_version.id), + "original_made_current" => old_version.time_made_current.to_string(), + "original_version" => old_version.version, + )); + (Some(old_version.clone()), log) + } + }; + + // Get the ID of the current sitrep. 
+ let time_loaded = Utc::now(); + let current_version: SitrepVersion = match self + .datastore + .fm_get_current_sitrep_version(opctx) + .await + { + Ok(Some(version)) => version.into(), + Ok(None) => match old { + Some(SitrepVersion { version, id, .. }) => { + // We should never go from "some sitrep" to "no sitrep"; + // pruning should always keep a small number of old sitreps + // around until we have new ones to replace them. + // + // In this case we won't replace our channel contents with + // `None`; we'll keep around whatever old sitrep we had + // loaded. + warn!(log, "previously had a sitrep, but now none exist"); + return Status::Error(format!( + "previously loaded sitrep {id:?} (v{version}), \ + but now no sitreps exist", + )); + } + None => { + // Had no sitrep; still have no sitrep. + return Status::NoSitrep; + } + }, + Err(err) => { + let err = InlineErrorChain::new(&err); + warn!( + log, + "failed to read current sitrep version"; + &err + ); + return Status::Error(format!( + "failed to read current sitrep version: {err}" + )); + } + }; + + // Have we already loaded this sitrep? + match old { + Some(version) if version.id == current_version.id => { + debug!(log, "current sitrep has not changed"); + return Status::Loaded { version, time_loaded }; + } + Some(SitrepVersion { version, id, .. }) + if version == current_version.version + && id != current_version.id => + { + // Well, this is weird! Entries in the `sitrep_version` table + // should not change IDs once they are created, that seems like + // a bug. Nonetheless, we will load the new UUID, but we should + // say something about this, as it's a bit odd. 
+ warn!( + log, + "sitrep ID associated with the current version in the \ + database has changed; this is not supposed to happen!"; + "current_id" => ?current_version.id, + ); + } + _ => (), + } + + let sitrep = match self + .datastore + .fm_sitrep_read(opctx, current_version.id) + .await + { + Ok(sitrep) => sitrep, + Err(err) => { + let err = InlineErrorChain::new(&err); + error!( + log, + "failed to load current sitrep"; + "current_id" => ?current_version.id, + "current_version" => ?current_version.version, + &err + ); + return Status::Error(format!( + "failed to read current sitrep {:?} (v{}): {err}", + current_version.id, current_version.version + )); + } + }; + + let sitrep = Arc::new((current_version.clone(), sitrep)); + self.tx.send_modify(|s| { + *s = Some(sitrep); + }); + + Status::Loaded { version: current_version, time_loaded } + } +} + +#[cfg(test)] +mod test { + // TODO +} diff --git a/nexus/src/app/background/tasks/mod.rs b/nexus/src/app/background/tasks/mod.rs index 6ec34c5b2b..ca9e431463 100644 --- a/nexus/src/app/background/tasks/mod.rs +++ b/nexus/src/app/background/tasks/mod.rs @@ -18,6 +18,7 @@ pub mod dns_propagation; pub mod dns_servers; pub mod ereport_ingester; pub mod external_endpoints; +pub mod fm_sitrep_load; pub mod instance_reincarnation; pub mod instance_updater; pub mod instance_watcher; diff --git a/nexus/src/app/mod.rs b/nexus/src/app/mod.rs index f4cf081305..b977690e8f 100644 --- a/nexus/src/app/mod.rs +++ b/nexus/src/app/mod.rs @@ -30,6 +30,7 @@ use nexus_mgs_updates::ArtifactCache; use nexus_mgs_updates::MgsUpdateDriver; use nexus_types::deployment::PendingMgsUpdates; use nexus_types::deployment::ReconfiguratorConfigParam; +use nexus_types::fm; use omicron_common::address::MGD_PORT; use omicron_common::address::MGS_PORT; use omicron_common::api::external::ByteCount; @@ -286,6 +287,11 @@ pub struct Nexus { #[allow(dead_code)] repo_depot_resolver: Box, + /// Watch channel containing the currently-loaded fault management sitrep. 
+ #[allow(dead_code)] + sitrep_load_rx: + watch::Receiver>>, + /// handle to pull update status data update_status: UpdateStatusHandle, @@ -485,6 +491,8 @@ impl Nexus { let mgs_update_status_rx = mgs_update_driver.status_rx(); let _mgs_driver_task = tokio::spawn(mgs_update_driver.run()); + let (sitrep_load_tx, sitrep_load_rx) = watch::channel(None); + let nexus = Nexus { id: config.deployment.id, rack_id, @@ -540,6 +548,7 @@ impl Nexus { repo_depot_resolver, update_status: UpdateStatusHandle::new(blueprint_load_rx), quiesce, + sitrep_load_rx, }; // TODO-cleanup all the extra Arcs here seems wrong @@ -624,6 +633,7 @@ impl Nexus { tuf_artifact_replication_rx, mgs_updates_tx, blueprint_load_tx, + sitrep_load_tx, }, ); diff --git a/nexus/types/src/fm.rs b/nexus/types/src/fm.rs index 9578e62fe0..f102141a81 100644 --- a/nexus/types/src/fm.rs +++ b/nexus/types/src/fm.rs @@ -9,6 +9,19 @@ use omicron_uuid_kinds::{CollectionUuid, OmicronZoneUuid, SitrepUuid}; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; +#[derive(Clone, Debug, Eq, PartialEq, JsonSchema, Deserialize, Serialize)] +pub struct Sitrep { + pub metadata: SitrepMetadata, + // TODO(eliza): draw the rest of the sitrep +} + +#[derive(Clone, Debug, Eq, PartialEq, JsonSchema, Deserialize, Serialize)] +pub struct SitrepVersion { + pub id: SitrepUuid, + pub version: u32, + pub time_made_current: DateTime, +} + #[derive(Clone, Debug, Eq, PartialEq, JsonSchema, Deserialize, Serialize)] pub struct SitrepMetadata { pub id: SitrepUuid, diff --git a/nexus/types/src/internal_api/background.rs b/nexus/types/src/internal_api/background.rs index 6fcbed2eb7..5e3ccbe0c8 100644 --- a/nexus/types/src/internal_api/background.rs +++ b/nexus/types/src/internal_api/background.rs @@ -757,6 +757,19 @@ pub struct EreporterStatus { pub errors: Vec, } +/// The status of a `fm_sitrep_loader` background task activation. 
+#[derive(Clone, Debug, Deserialize, Serialize, PartialEq, Eq)] +pub enum SitrepLoadStatus { + /// An error occurred. + Error(String), + + /// There is no current sitrep. + NoSitrep, + + /// We've loaded the most recent sitrep as of `time_loaded`. + Loaded { version: crate::fm::SitrepVersion, time_loaded: DateTime }, +} + #[cfg(test)] mod test { use super::TufRepoInfo; diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 62a19eff08..34aa311f88 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -6817,7 +6817,7 @@ CREATE TABLE IF NOT EXISTS omicron.public.fm_sitrep ( -- The history of current sitreps. -- -- The sitrep with the highest `version` in this table is the current sitrep. -CREATE TABLE IF NOT EXISTS omicron.public.fm_current_sitrep ( +CREATE TABLE IF NOT EXISTS omicron.public.fm_sitrep_version ( -- Monotonically increasing version for all FM sitreps. version INT8 PRIMARY KEY, @@ -6826,21 +6826,6 @@ CREATE TABLE IF NOT EXISTS omicron.public.fm_current_sitrep ( -- no longer current; the current sitrep must not be deleted). sitrep_id UUID NOT NULL, - -- Whether potentially-destructive automated response actions - -- are cleared hot. - -- - -- This is similar to the `enabled` column in `bp_target`, in - -- that it allows automated response to be disabled by an - -- operator in case of danger. - -- - -- However, it's a bit different from its blueprint counterpart: - -- it only disables potentially destructive automated response - -- actions. The FM system will still continue to request polling - -- and diagnose active problems. It seems important to still be - -- able to both detect and diagnose Active Problems even when - -- disabling destructive automated response. - response_authorized BOOL NOT NULL, - -- Timestamp for when this sitrep was made current. 
time_made_current TIMESTAMPTZ NOT NULL ); From 72a7c504605b9abcf1f15376140146377eea0176 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 24 Oct 2025 12:42:44 -0700 Subject: [PATCH 04/23] [nexus] s/fm_sitrep_version/fm_sitrep_history --- nexus/db-model/src/fm_sitrep.rs | 4 ++-- nexus/db-queries/src/db/datastore/fm.rs | 4 ++-- nexus/db-schema/src/schema.rs | 4 ++-- schema/crdb/dbinit.sql | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/nexus/db-model/src/fm_sitrep.rs b/nexus/db-model/src/fm_sitrep.rs index a22232b2bb..07d9540512 100644 --- a/nexus/db-model/src/fm_sitrep.rs +++ b/nexus/db-model/src/fm_sitrep.rs @@ -7,7 +7,7 @@ use crate::SqlU32; use crate::typed_uuid::DbTypedUuid; use chrono::{DateTime, Utc}; -use nexus_db_schema::schema::{fm_sitrep, fm_sitrep_version}; +use nexus_db_schema::schema::{fm_sitrep, fm_sitrep_history}; use omicron_uuid_kinds::{CollectionKind, OmicronZoneKind, SitrepKind}; #[derive(Queryable, Insertable, Clone, Debug, Selectable)] @@ -43,7 +43,7 @@ impl From for nexus_types::fm::SitrepMetadata { } #[derive(Queryable, Clone, Debug, Selectable, Insertable)] -#[diesel(table_name = fm_sitrep_version)] +#[diesel(table_name = fm_sitrep_history)] pub struct SitrepVersion { pub version: SqlU32, pub sitrep_id: DbTypedUuid, diff --git a/nexus/db-queries/src/db/datastore/fm.rs b/nexus/db-queries/src/db/datastore/fm.rs index 3339c2e01d..6407daf89b 100644 --- a/nexus/db-queries/src/db/datastore/fm.rs +++ b/nexus/db-queries/src/db/datastore/fm.rs @@ -15,7 +15,7 @@ use nexus_db_errors::ErrorHandler; use nexus_db_errors::public_error_from_diesel; use nexus_db_lookup::DbConnection; use nexus_db_schema::schema::fm_sitrep::dsl as sitrep_dsl; -use nexus_db_schema::schema::fm_sitrep_version::dsl as current_sitrep_dsl; +use nexus_db_schema::schema::fm_sitrep_history::dsl as current_sitrep_dsl; use nexus_types::fm::Sitrep; use omicron_common::api::external::Error; use omicron_uuid_kinds::GenericUuid; @@ -35,7 +35,7 @@ impl DataStore { 
&self, conn: &async_bb8_diesel::Connection, ) -> Result, Error> { - current_sitrep_dsl::fm_sitrep_version + current_sitrep_dsl::fm_sitrep_history .order_by(current_sitrep_dsl::version.desc()) .select(SitrepVersion::as_select()) .first_async(conn) diff --git a/nexus/db-schema/src/schema.rs b/nexus/db-schema/src/schema.rs index 16f1b6869e..1e114683c6 100644 --- a/nexus/db-schema/src/schema.rs +++ b/nexus/db-schema/src/schema.rs @@ -2827,11 +2827,11 @@ table! { allow_tables_to_appear_in_same_query!(fm_sitrep, inv_collection); table! { - fm_sitrep_version (version) { + fm_sitrep_history (version) { version -> Int8, sitrep_id -> Uuid, time_made_current -> Timestamptz, } } -allow_tables_to_appear_in_same_query!(fm_sitrep, fm_sitrep_version); +allow_tables_to_appear_in_same_query!(fm_sitrep, fm_sitrep_history); diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 34aa311f88..9fa533c449 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -6817,7 +6817,7 @@ CREATE TABLE IF NOT EXISTS omicron.public.fm_sitrep ( -- The history of current sitreps. -- -- The sitrep with the highest `version` in this table is the current sitrep. -CREATE TABLE IF NOT EXISTS omicron.public.fm_sitrep_version ( +CREATE TABLE IF NOT EXISTS omicron.public.fm_sitrep_history ( -- Monotonically increasing version for all FM sitreps. version INT8 PRIMARY KEY, @@ -6831,7 +6831,7 @@ CREATE TABLE IF NOT EXISTS omicron.public.fm_sitrep_version ( ); CREATE UNIQUE INDEX IF NOT EXISTS - lookup_sitrep_history_by_id + lookup_sitrep_version_by_id ON omicron.public.fm_sitrep_history (sitrep_id); -- Metadata for the schema itself. 
From 5d0cfa98de41e1389a897ef287af5c759a0411bd Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 24 Oct 2025 13:44:16 -0700 Subject: [PATCH 05/23] start on horrific CTE --- nexus/db-queries/src/db/datastore/fm.rs | 175 ++++++++++++++++++++++++ 1 file changed, 175 insertions(+) diff --git a/nexus/db-queries/src/db/datastore/fm.rs b/nexus/db-queries/src/db/datastore/fm.rs index 6407daf89b..4fc12bf3d4 100644 --- a/nexus/db-queries/src/db/datastore/fm.rs +++ b/nexus/db-queries/src/db/datastore/fm.rs @@ -10,7 +10,10 @@ use crate::context::OpContext; use crate::db::model::SitrepMetadata; use crate::db::model::SitrepVersion; use async_bb8_diesel::AsyncRunQueryDsl; +use chrono::{DateTime, Utc}; use diesel::prelude::*; +use diesel::result::DatabaseErrorKind; +use diesel::result::Error as DieselError; use nexus_db_errors::ErrorHandler; use nexus_db_errors::public_error_from_diesel; use nexus_db_lookup::DbConnection; @@ -90,3 +93,175 @@ impl DataStore { Ok(Sitrep { metadata }) } } + +/// Query to insert a new current sitrep. +/// +/// ```sql +/// WITH +/// -- Subquery to fetch the current sitrep (i.e., the row with the max +/// -- version). +/// current_sitrep AS ( +/// SELECT +/// "version" AS version, +/// "sitrep_id" AS sitrep_id, +/// "inv_collection"."time_done" AS inv_collection_time_done, +/// FROM "sitrep_history" +/// INNER JOIN "sitrep" ON "sitrep_history"."sitrep_id" = "sitrep"."id" +/// LEFT JOIN "inv_collection" ON "sitrep"."inv_collection_id" = "inv_collection"."id" +/// ORDER BY "version" DESC +/// LIMIT 1 +/// ), +/// +/// -- Error checking subquery: This uses similar tricks as elsewhere in +/// -- this crate to `CAST(... AS UUID)` with non-UUID values that result +/// -- in runtime errors in specific cases, allowing us to give accurate +/// -- error messages. +/// -- +/// -- These checks are not required for correct behavior by the insert +/// -- below. If we removed them, the insert would insert 0 rows if +/// -- these checks would have failed. 
But they make it easier to report +/// -- specific problems to our caller. +/// -- +/// -- The specific cases we check here are noted below. +/// check_validity AS MATERIALIZED ( +/// SELECT CAST(IF( +/// -- Check for whether our new sitrep's inventory collection was +/// -- completed earlier than the inventory collection of the parent +/// -- sitrep. If this is the case, the new sitrep was generated based +/// -- on an older inventory correction, and may be invalid. +/// -- +/// -- Note that if the parent's inventory collection no longer exists +/// -- in the inv_collection table, it has been deleted, and therefore +/// -- we just kinda assume it must be older than ours. +/// -- However, if *ours* no longer exists, then it must be older than +/// -- the parent's inventory collection. +/// (SELECT 1 FROM "sitrep" +/// INNER JOIN "inv_collection" ON "sitrep"."inv_collection_id" = "inv_collection"."id" +/// WHERE +/// "id" = AND +/// "inv_collection"."time_done" < current_sitrep.inv_collection_time_done +/// ) = 1, +/// 'inv-collection-time-travelled', +/// IF( +/// -- Check for whether our new sitrep's parent matches our current +/// -- sitrep. There are two cases here: The first is the common case +/// -- (i.e., the new sitrep has a parent: does it match the current +/// -- sitrep ID?). The second is the bootstrapping check: if we're +/// -- trying to insert a new sitrep that does not have a parent, +/// -- we should not have a sitrep target at all. +/// -- +/// -- If either of these cases fails, we return `parent-not-current`. 
+/// ( +/// SELECT "parent_sitrep_id" FROM "sitrep", current_sitrep +/// WHERE +/// "id" = +/// AND current_sitrep.sitrep_id = "parent_sitrep_id" +/// ) IS NOT NULL +/// OR +/// ( +/// SELECT 1 FROM "sitrep" +/// WHERE +/// "id" = +/// AND "parent_sitrep_id" IS NULL +/// AND NOT EXISTS (SELECT version FROM current_sitrep) +/// ) = 1, +/// -- Sometime between v22.1.9 and v22.2.19, Cockroach's type checker +/// -- became too smart for our `CAST(... as UUID)` error checking +/// -- gadget: it can infer that `` must be a UUID, so +/// -- then tries to parse 'parent-not-target' and 'no-such-blueprint' +/// -- as UUIDs _during typechecking_, which causes the query to always +/// -- fail. We can defeat this by casting the UUID to text here, which +/// -- will allow the 'parent-not-target' and 'no-such-blueprint' +/// -- sentinels to survive type checking, making it to query execution +/// -- where they will only be cast to UUIDs at runtime in the failure +/// -- cases they're supposed to catch. +/// CAST( AS text), +/// 'parent-not-current' +/// ) +/// ) AS UUID) +/// ), +/// +/// -- Determine the new version number to use: either 1 if this is the +/// -- first sitrep being made the current sitrep, or 1 higher than +/// -- the previous sitrep's version. +/// -- +/// -- The final clauses of each of these WHERE clauses repeat the +/// -- checks performed above in `check_validity`, and will cause this +/// -- subquery to return no rows if we should not allow the new +/// -- target to be set. +/// new_sitrep AS ( +/// SELECT 1 AS new_version FROM "sitrep" +/// WHERE +/// "id" = +/// AND "parent_sitrep_id" IS NULL +/// AND NOT EXISTS (SELECT version FROM current_sitrep) +/// UNION +/// SELECT current_sitrep.version + 1 FROM current_sitrep, "sitrep" +/// WHERE +/// "id" = +/// AND "parent_sitrep_id" IS NOT NULL +/// AND "parent_sitrep_id" = current_sitrep.sitrep_id +/// ) +/// +/// -- Perform the actual insertion. 
+/// INSERT INTO "sitrep_history"( +/// "version","sitrep_id","time_made_current" +/// ) +/// SELECT +/// new_sitrep.new_version, +/// , +/// NOW() +/// FROM new_sitrep +/// ``` +#[derive(Debug, Clone, Copy)] +struct InsertSitrepVersionQuery { + sitrep_id: SitrepUuid, + time_made_current: DateTime, +} + +#[derive(Debug)] +pub enum InsertSitrepError { + Other(DieselError), + /// The parent sitrep ID is no longer the current sitrep. + ParentNotCurrent(SitrepUuid), + InvCollectionTimeTravelled(SitrepUuid), + InventoryCollectionTimeTravelled, +} + +// Uncastable sentinel used to detect we attempt to make a sitrep current when +// its parent sitrep ID is no longer the current sitrep. +const PARENT_NOT_CURRENT: &str = "parent-not-current"; + +// Uncastable sentinel used to detect we attempt to make a sitrep current when +// its inventory collection ID refers to an inventory collection that is older +// than the parent sitrep's inventory collection. +const INV_COLLECTION_TIME_TRAVELLED: &str = "inv-collection-time-travelled"; + +// Error messages generated from the above sentinel values. 
+const PARENT_NOT_CURRENT_ERROR_MESSAGE: &str = "could not parse \ + \"parent-not-current\" as type uuid: \ + uuid: incorrect UUID length: parent-not-current"; +const INV_COLLECTION_TIME_TRAVELLED_ERROR_MESSAGE: &str = "could not parse \ + \"inv-collection-time-travelled\" as type uuid: \ + uuid: incorrect UUID length: inv-collection-time-travelled"; + +impl InsertSitrepVersionQuery { + fn decode_error(&self, err: DieselError) -> InsertSitrepError { + match err { + DieselError::DatabaseError(DatabaseErrorKind::Unknown, info) + if info.message() == PARENT_NOT_CURRENT_ERROR_MESSAGE => + { + InsertSitrepError::ParentNotCurrent(self.sitrep_id) + } + DieselError::DatabaseError(DatabaseErrorKind::Unknown, info) + if info.message() + == INV_COLLECTION_TIME_TRAVELLED_ERROR_MESSAGE => + { + InsertSitrepError::InventoryCollectionTimeTravelled( + self.sitrep_id, + ) + } + other => InsertSitrepError::Other(other), + } + } +} From 9cf6b8d65e27b991b3a87af8639a09ba0ffd4a0e Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 28 Oct 2025 10:51:22 -0700 Subject: [PATCH 06/23] finish sitrep insert CTE --- nexus-config/src/nexus_config.rs | 3 + nexus/db-model/src/fm_sitrep.rs | 21 + nexus/db-queries/src/db/datastore/fm.rs | 686 ++++++++++++++++++++---- nexus/types/src/fm.rs | 10 + 4 files changed, 617 insertions(+), 103 deletions(-) diff --git a/nexus-config/src/nexus_config.rs b/nexus-config/src/nexus_config.rs index 37c4446019..44c2ccc8a9 100644 --- a/nexus-config/src/nexus_config.rs +++ b/nexus-config/src/nexus_config.rs @@ -1433,6 +1433,9 @@ mod test { period_secs: Duration::from_secs(47), disable: false, }, + fm: FmTasksConfig { + sitrep_load_period_secs: Duration::from_secs(48), + } }, default_region_allocation_strategy: crate::nexus_config::RegionAllocationStrategy::Random { diff --git a/nexus/db-model/src/fm_sitrep.rs b/nexus/db-model/src/fm_sitrep.rs index 07d9540512..0f25f5d4ff 100644 --- a/nexus/db-model/src/fm_sitrep.rs +++ b/nexus/db-model/src/fm_sitrep.rs @@ -42,6 
+42,27 @@ impl From for nexus_types::fm::SitrepMetadata { } } +impl From for SitrepMetadata { + fn from(db_meta: nexus_types::fm::SitrepMetadata) -> Self { + let nexus_types::fm::SitrepMetadata { + id, + parent_sitrep_id, + inv_collection_id, + creator_id, + comment, + time_created, + } = db_meta; + Self { + id: id.into(), + parent_sitrep_id: parent_sitrep_id.map(Into::into), + inv_collection_id: inv_collection_id.into(), + creator_id: creator_id.into(), + comment, + time_created, + } + } +} + #[derive(Queryable, Clone, Debug, Selectable, Insertable)] #[diesel(table_name = fm_sitrep_history)] pub struct SitrepVersion { diff --git a/nexus/db-queries/src/db/datastore/fm.rs b/nexus/db-queries/src/db/datastore/fm.rs index 4fc12bf3d4..b5cc68b4ab 100644 --- a/nexus/db-queries/src/db/datastore/fm.rs +++ b/nexus/db-queries/src/db/datastore/fm.rs @@ -2,45 +2,58 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -//! [`DataStore`] methods on fault management internal data. +//! [`DataStore`] methods on fault management internal data, such as situation +//! reports (sitreps). +//! +//! See [RFD 603](https://rfd.shared.oxide.computer/rfd/0603) for details on the +//! fault management sitrep. 
use super::DataStore; use crate::authz; use crate::context::OpContext; -use crate::db::model::SitrepMetadata; -use crate::db::model::SitrepVersion; +use crate::db::model; use async_bb8_diesel::AsyncRunQueryDsl; -use chrono::{DateTime, Utc}; +use diesel::pg::Pg; use diesel::prelude::*; +use diesel::query_builder::AstPass; +use diesel::query_builder::QueryFragment; +use diesel::query_builder::QueryId; use diesel::result::DatabaseErrorKind; use diesel::result::Error as DieselError; +use diesel::sql_types; use nexus_db_errors::ErrorHandler; use nexus_db_errors::public_error_from_diesel; use nexus_db_lookup::DbConnection; use nexus_db_schema::schema::fm_sitrep::dsl as sitrep_dsl; -use nexus_db_schema::schema::fm_sitrep_history::dsl as current_sitrep_dsl; +use nexus_db_schema::schema::fm_sitrep_history::dsl as history_dsl; +use nexus_types::fm; use nexus_types::fm::Sitrep; use omicron_common::api::external::Error; use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::SitrepUuid; +use uuid::Uuid; impl DataStore { pub async fn fm_get_current_sitrep_version( &self, opctx: &OpContext, - ) -> Result, Error> { + ) -> Result, Error> { opctx.authorize(authz::Action::ListChildren, &authz::FLEET).await?; let conn = self.pool_connection_authorized(opctx).await?; - self.fm_get_current_sitrep_version_on_conn(&conn).await + let version = self + .fm_get_current_sitrep_version_on_conn(&conn) + .await? 
+ .map(Into::into); + Ok(version) } async fn fm_get_current_sitrep_version_on_conn( &self, conn: &async_bb8_diesel::Connection, - ) -> Result, Error> { - current_sitrep_dsl::fm_sitrep_history - .order_by(current_sitrep_dsl::version.desc()) - .select(SitrepVersion::as_select()) + ) -> Result, Error> { + history_dsl::fm_sitrep_history + .order_by(history_dsl::version.desc()) + .select(model::SitrepVersion::as_select()) .first_async(conn) .await .optional() @@ -51,26 +64,44 @@ impl DataStore { &self, opctx: &OpContext, id: SitrepUuid, - ) -> Result, Error> { + ) -> Result, Error> { opctx.authorize(authz::Action::ListChildren, &authz::FLEET).await?; let conn = self.pool_connection_authorized(opctx).await?; - self.fm_sitrep_metadata_read_on_conn(id, &conn).await + let meta = self + .fm_sitrep_metadata_read_on_conn(id, &conn) + .await? + .map(Into::into); + Ok(meta) } async fn fm_sitrep_metadata_read_on_conn( &self, id: SitrepUuid, conn: &async_bb8_diesel::Connection, - ) -> Result, Error> { + ) -> Result, Error> { sitrep_dsl::fm_sitrep .filter(sitrep_dsl::id.eq(id.into_untyped_uuid())) - .select(SitrepMetadata::as_select()) + .select(model::SitrepMetadata::as_select()) .first_async(conn) .await .optional() .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) } + pub async fn fm_sitrep_read_current( + &self, + opctx: &OpContext, + ) -> Result, Error> { + let conn = self.pool_connection_authorized(opctx).await?; + let version: fm::SitrepVersion = + match self.fm_get_current_sitrep_version_on_conn(&conn).await? 
{ + Some(version) => version.into(), + None => return Ok(None), + }; + let sitrep = self.fm_sitrep_read_on_conn(version.id, &conn).await?; + Ok(Some((version, sitrep))) + } + pub async fn fm_sitrep_read( &self, opctx: &OpContext, @@ -78,7 +109,14 @@ impl DataStore { ) -> Result { opctx.authorize(authz::Action::ListChildren, &authz::FLEET).await?; let conn = self.pool_connection_authorized(opctx).await?; + self.fm_sitrep_read_on_conn(id, &conn).await + } + async fn fm_sitrep_read_on_conn( + &self, + id: SitrepUuid, + conn: &async_bb8_diesel::Connection, + ) -> Result { let metadata = self .fm_sitrep_metadata_read_on_conn(id, &conn) .await? @@ -92,9 +130,104 @@ impl DataStore { Ok(Sitrep { metadata }) } + + /// Insert the provided [`Sitrep`] into the database, and attempt to mark it + /// as the current sitrep. + /// + /// If the sitrep's parent is not the current sitrep, the new sitrep is not + /// added to the sitrep history, and an error is returned. See [this + /// section](https://rfd.shared.oxide.computer/rfd/0603#_creating_sitreps) + /// in RFD 603 for details. + /// + /// # Returns + /// + /// - `Ok(())` if the new sitrep was both successfully inserted *and* added + /// to the sitrep history as the current sitrep. + /// + /// - `Err(`[`InsertSitrepError::ParentNotCurrent`]`)` if the sitrep's + /// `parent_sitrep_id` is not the current sitrep, indicating that it was + /// generated based on out of date inputs. + /// + /// This error indicates that the sitrep is orphaned and should be + /// deleted. It is out of date, and another sitrep has already been + /// generated based on the same inputs. + /// + /// - `Err(`[`InsertSitrepError::Other`]`)` if another error occurred while + /// inserting the sitrep. + pub async fn fm_sitrep_insert( + &self, + opctx: &OpContext, + sitrep: &Sitrep, + ) -> Result<(), InsertSitrepError> { + let conn = self.pool_connection_authorized(opctx).await?; + + // TODO(eliza): there should probably be an authz object for the fm sitrep? 
+ opctx.authorize(authz::Action::Modify, &authz::FLEET).await?; + + // Create the sitrep metadata record. + diesel::insert_into(sitrep_dsl::fm_sitrep) + .values(model::SitrepMetadata::from(sitrep.metadata.clone())) + .execute_async(&*conn) + .await + .map_err(|e| { + public_error_from_diesel(e, ErrorHandler::Server) + .internal_context("failed to insert sitrep metadata record") + })?; + + // TODO(eliza): other sitrep records would be inserted here... + + // Now, try to make the sitrep current. + let query = InsertSitrepVersionQuery { sitrep_id: sitrep.id() }; + query + .execute_async(&*conn) + .await + .map_err(|e| query.decode_error(e)) + .map(|_| ()) + } } -/// Query to insert a new current sitrep. +/// Errors returned by [`Datastore::fm_sitrep_insert`]. +#[derive(Debug, thiserror::Error)] +pub enum InsertSitrepError { + #[error(transparent)] + Other(#[from] Error), + /// The parent sitrep ID is no longer the current sitrep. + #[error("sitrep {0}'s parent is not the current sitrep")] + ParentNotCurrent(SitrepUuid), +} + +/// Query to insert a new sitrep version into the `fm_sitrep_history` table, +/// making it the current sitrep. +/// +/// This implements the "compare-and-swap" operation [described in RFD +/// 603](https://rfd.shared.oxide.computer/rfd/0603#_creating_sitreps). In +/// particular, this query will insert a new sitrep version into the +/// `fm_sitrep_history` table IF AND ONLY IF one of the following conditions +/// are true: +/// +/// 1. The new sitrep's parent sitrep ID is the current sitrep (i.e. the sitrep +/// with the highest version number in `fm_sitrep_history`) +/// 2. 
The new sitrep's parent sitrep ID is `NULL`, AND there are no other +/// sitreps in `fm_sitrep_history` (i.e., we are inserting the first-ever +/// sitrep) +/// +/// Upholding these invariants ensures that sitreps are sequentially consistent, +/// and `fm_sitrep_history` always contains a linear history of sitreps which +/// were generated based on the previous current sitrep. +/// +/// The CTE used to perform this operation is based on the one used in the +/// `deployment` module to insert blueprints into the `bp_target` table. It +/// differs in that it does not perform an existence check on the sitrep to be +/// made current. This is because the `db::datastore::deployment` module's +/// public API treats inserting a new blueprint and setting it as the current +/// target as separate operations, so it is possible for a consumer of the API +/// to try and set a blueprint as the target without first having created it. +/// Here, however, we only ever set a sitrep as the current sitrep in the +/// `Datastore::fm_sitrep_insert` method, which also creates the sitrep. So, it +/// is impossible for a consumer of this API to attempt to make a sitrep current +/// without having first created it. +/// +/// The SQL generated for this CTE looks like this: /// /// ```sql /// WITH @@ -104,10 +237,7 @@ impl DataStore { /// SELECT /// "version" AS version, /// "sitrep_id" AS sitrep_id, -/// "inv_collection"."time_done" AS inv_collection_time_done, -/// FROM "sitrep_history" -/// INNER JOIN "sitrep" ON "sitrep_history"."sitrep_id" = "sitrep"."id" -/// LEFT JOIN "inv_collection" ON "sitrep"."inv_collection_id" = "inv_collection"."id" +/// FROM "fm_sitrep_history" /// ORDER BY "version" DESC /// LIMIT 1 /// ), @@ -117,67 +247,44 @@ impl DataStore { /// -- in runtime errors in specific cases, allowing us to give accurate /// -- error messages. /// -- -/// -- These checks are not required for correct behavior by the insert -/// -- below. 
If we removed them, the insert would insert 0 rows if -/// -- these checks would have failed. But they make it easier to report -/// -- specific problems to our caller. -/// -- -/// -- The specific cases we check here are noted below. +/// -- This checks that the sitrep descends directly from the current +/// -- sitrep, and will fail the query if it does not. /// check_validity AS MATERIALIZED ( /// SELECT CAST(IF( -/// -- Check for whether our new sitrep's inventory collection was -/// -- completed earlier than the inventory collection of the parent -/// -- sitrep. If this is the case, the new sitrep was generated based -/// -- on an older inventory correction, and may be invalid. +/// -- Check for whether our new sitrep's parent matches our current +/// -- sitrep. There are two cases here: The first is the common case +/// -- (i.e., the new sitrep has a parent: does it match the current +/// -- sitrep ID?). The second is the bootstrapping check: if we're +/// -- trying to insert a new sitrep that does not have a parent, +/// -- we should not have a sitrep target at all. /// -- -/// -- Note that if the parent's inventory collection no longer exists -/// -- in the inv_collection table, it has been deleted, and therefore -/// -- we just kinda assume it must be older than ours. -/// -- However, if *ours* no longer exists, then it must be older than -/// -- the parent's inventory collection. -/// (SELECT 1 FROM "sitrep" -/// INNER JOIN "inv_collection" ON "sitrep"."inv_collection_id" = "inv_collection"."id" -/// WHERE -/// "id" = AND -/// "inv_collection"."time_done" < current_sitrep.inv_collection_time_done +/// -- If either of these cases fails, we return `parent-not-current`. 
+/// ( +/// SELECT "parent_sitrep_id" FROM "sitrep", current_sitrep +/// WHERE +/// "id" = +/// AND current_sitrep.sitrep_id = "parent_sitrep_id" +/// ) IS NOT NULL +/// OR +/// ( +/// SELECT 1 FROM "sitrep" +/// WHERE +/// "id" = +/// AND "parent_sitrep_id" IS NULL +/// AND NOT EXISTS (SELECT version FROM current_sitrep) /// ) = 1, -/// 'inv-collection-time-travelled', -/// IF( -/// -- Check for whether our new sitrep's parent matches our current -/// -- sitrep. There are two cases here: The first is the common case -/// -- (i.e., the new sitrep has a parent: does it match the current -/// -- sitrep ID?). The second is the bootstrapping check: if we're -/// -- trying to insert a new sitrep that does not have a parent, -/// -- we should not have a sitrep target at all. -/// -- -/// -- If either of these cases fails, we return `parent-not-current`. -/// ( -/// SELECT "parent_sitrep_id" FROM "sitrep", current_sitrep -/// WHERE -/// "id" = -/// AND current_sitrep.sitrep_id = "parent_sitrep_id" -/// ) IS NOT NULL -/// OR -/// ( -/// SELECT 1 FROM "sitrep" -/// WHERE -/// "id" = -/// AND "parent_sitrep_id" IS NULL -/// AND NOT EXISTS (SELECT version FROM current_sitrep) -/// ) = 1, -/// -- Sometime between v22.1.9 and v22.2.19, Cockroach's type checker -/// -- became too smart for our `CAST(... as UUID)` error checking -/// -- gadget: it can infer that `` must be a UUID, so -/// -- then tries to parse 'parent-not-target' and 'no-such-blueprint' -/// -- as UUIDs _during typechecking_, which causes the query to always -/// -- fail. We can defeat this by casting the UUID to text here, which -/// -- will allow the 'parent-not-target' and 'no-such-blueprint' -/// -- sentinels to survive type checking, making it to query execution -/// -- where they will only be cast to UUIDs at runtime in the failure -/// -- cases they're supposed to catch. 
-/// CAST( AS text), -/// 'parent-not-current' -/// ) +/// -- Sometime between v22.1.9 and v22.2.19, Cockroach's type checker +/// -- became too smart for our `CAST(... as UUID)` error checking +/// -- gadget: it can infer that `` must be a UUID, so +/// -- then tries to parse 'parent-not-target' and 'no-such-blueprint' +/// -- as UUIDs _during typechecking_, which causes the query to always +/// -- fail. We can defeat this by casting the UUID to text here, which +/// -- will allow the 'parent-not-target' and 'no-such-blueprint' +/// -- sentinels to survive type checking, making it to query execution +/// -- where they will only be cast to UUIDs at runtime in the failure +/// -- cases they're supposed to catch. +/// CAST( AS text), +/// 'parent-not-current' /// ) AS UUID) /// ), /// @@ -216,34 +323,16 @@ impl DataStore { #[derive(Debug, Clone, Copy)] struct InsertSitrepVersionQuery { sitrep_id: SitrepUuid, - time_made_current: DateTime, -} - -#[derive(Debug)] -pub enum InsertSitrepError { - Other(DieselError), - /// The parent sitrep ID is no longer the current sitrep. - ParentNotCurrent(SitrepUuid), - InvCollectionTimeTravelled(SitrepUuid), - InventoryCollectionTimeTravelled, } // Uncastable sentinel used to detect we attempt to make a sitrep current when // its parent sitrep ID is no longer the current sitrep. const PARENT_NOT_CURRENT: &str = "parent-not-current"; -// Uncastable sentinel used to detect we attempt to make a sitrep current when -// its inventory collection ID refers to an inventory collection that is older -// than the parent sitrep's inventory collection. -const INV_COLLECTION_TIME_TRAVELLED: &str = "inv-collection-time-travelled"; - // Error messages generated from the above sentinel values. 
const PARENT_NOT_CURRENT_ERROR_MESSAGE: &str = "could not parse \ \"parent-not-current\" as type uuid: \ uuid: incorrect UUID length: parent-not-current"; -const INV_COLLECTION_TIME_TRAVELLED_ERROR_MESSAGE: &str = "could not parse \ - \"inv-collection-time-travelled\" as type uuid: \ - uuid: incorrect UUID length: inv-collection-time-travelled"; impl InsertSitrepVersionQuery { fn decode_error(&self, err: DieselError) -> InsertSitrepError { @@ -253,15 +342,406 @@ impl InsertSitrepVersionQuery { { InsertSitrepError::ParentNotCurrent(self.sitrep_id) } - DieselError::DatabaseError(DatabaseErrorKind::Unknown, info) - if info.message() - == INV_COLLECTION_TIME_TRAVELLED_ERROR_MESSAGE => - { - InsertSitrepError::InventoryCollectionTimeTravelled( - self.sitrep_id, - ) + err => { + let err = public_error_from_diesel(err, ErrorHandler::Server) + .internal_context("failed to insert new sitrep version"); + InsertSitrepError::Other(err) } - other => InsertSitrepError::Other(other), } } } + +impl QueryId for InsertSitrepVersionQuery { + type QueryId = (); + const HAS_STATIC_QUERY_ID: bool = false; +} + +impl QueryFragment for InsertSitrepVersionQuery { + fn walk_ast<'a>( + &'a self, + mut out: AstPass<'_, 'a, Pg>, + ) -> diesel::QueryResult<()> { + use nexus_db_schema::schema; + const CURRENT_SITREP: &'static str = "current_sitrep"; + type FromClause = + diesel::internal::table_macro::StaticQueryFragmentInstance; + const SITREP_FROM_CLAUSE: FromClause = + FromClause::new(); + const SITREP_HISTORY_FROM_CLAUSE: FromClause< + schema::fm_sitrep_history::table, + > = FromClause::new(); + + out.push_sql("WITH "); + + out.push_identifier(CURRENT_SITREP)?; + out.push_sql(" AS (SELECT "); + out.push_identifier(history_dsl::version::NAME)?; + out.push_sql(" AS version,"); + out.push_identifier(history_dsl::sitrep_id::NAME)?; + out.push_sql(" AS sitrep_id"); + out.push_sql(" FROM "); + SITREP_HISTORY_FROM_CLAUSE.walk_ast(out.reborrow())?; + out.push_sql(" ORDER BY "); + 
out.push_identifier(history_dsl::version::NAME)?; + out.push_sql(" DESC LIMIT 1),"); + + out.push_sql( + "check_validity AS MATERIALIZED ( \ + SELECT \ + CAST( \ + IF(", + ); + out.push_sql("(SELECT "); + out.push_identifier(sitrep_dsl::parent_sitrep_id::NAME)?; + out.push_sql(" FROM "); + SITREP_FROM_CLAUSE.walk_ast(out.reborrow())?; + out.push_sql(", "); + out.push_identifier(CURRENT_SITREP)?; + out.push_sql(" WHERE "); + out.push_identifier(sitrep_dsl::id::NAME)?; + out.push_sql(" = "); + out.push_bind_param::( + self.sitrep_id.as_untyped_uuid(), + )?; + out.push_sql(" AND "); + out.push_identifier(CURRENT_SITREP)?; + out.push_sql(".sitrep_id = "); + out.push_identifier(sitrep_dsl::parent_sitrep_id::NAME)?; + out.push_sql( + ") IS NOT NULL \ + OR \ + (SELECT 1 FROM ", + ); + SITREP_FROM_CLAUSE.walk_ast(out.reborrow())?; + out.push_sql(" WHERE "); + out.push_identifier(sitrep_dsl::id::NAME)?; + out.push_sql(" = "); + out.push_bind_param::( + self.sitrep_id.as_untyped_uuid(), + )?; + out.push_sql(" AND "); + out.push_identifier(sitrep_dsl::parent_sitrep_id::NAME)?; + out.push_sql( + "IS NULL \ + AND NOT EXISTS ( \ + SELECT version FROM current_sitrep) \ + ) = 1, ", + ); + out.push_sql(" CAST("); + out.push_bind_param::( + self.sitrep_id.as_untyped_uuid(), + )?; + out.push_sql(" AS text), "); + out.push_bind_param::( + &PARENT_NOT_CURRENT, + )?; + out.push_sql( + ") \ + AS UUID) \ + ), ", + ); + + out.push_sql("new_sitrep AS (SELECT 1 AS new_version FROM "); + SITREP_FROM_CLAUSE.walk_ast(out.reborrow())?; + out.push_sql(" WHERE "); + out.push_identifier(sitrep_dsl::id::NAME)?; + out.push_sql(" = "); + out.push_bind_param::( + self.sitrep_id.as_untyped_uuid(), + )?; + out.push_sql(" AND "); + out.push_identifier(sitrep_dsl::parent_sitrep_id::NAME)?; + out.push_sql( + " IS NULL \ + AND NOT EXISTS \ + (SELECT version FROM current_sitrep) \ + UNION \ + SELECT current_sitrep.version + 1 FROM \ + current_sitrep, ", + ); + SITREP_FROM_CLAUSE.walk_ast(out.reborrow())?; + 
out.push_sql(" WHERE "); + out.push_identifier(sitrep_dsl::id::NAME)?; + out.push_sql(" = "); + out.push_bind_param::( + self.sitrep_id.as_untyped_uuid(), + )?; + out.push_sql(" AND "); + out.push_identifier(sitrep_dsl::parent_sitrep_id::NAME)?; + out.push_sql(" IS NOT NULL AND "); + out.push_identifier(sitrep_dsl::parent_sitrep_id::NAME)?; + out.push_sql(" = current_sitrep.sitrep_id) "); + + out.push_sql("INSERT INTO "); + SITREP_HISTORY_FROM_CLAUSE.walk_ast(out.reborrow())?; + out.push_sql("("); + out.push_identifier(history_dsl::version::NAME)?; + out.push_sql(","); + out.push_identifier(history_dsl::sitrep_id::NAME)?; + out.push_sql(","); + out.push_identifier(history_dsl::time_made_current::NAME)?; + out.push_sql(") SELECT new_sitrep.new_version, "); + out.push_bind_param::( + self.sitrep_id.as_untyped_uuid(), + )?; + out.push_sql(", NOW()"); + out.push_sql(" FROM new_sitrep"); + + Ok(()) + } +} + +impl RunQueryDsl for InsertSitrepVersionQuery {} + +#[cfg(test)] +mod tests { + use super::*; + use crate::db::explain::ExplainableAsync; + use crate::db::pub_test_utils::TestDatabase; + use chrono::Utc; + use omicron_test_utils::dev; + use omicron_uuid_kinds::CollectionUuid; + use omicron_uuid_kinds::OmicronZoneUuid; + + #[tokio::test] + async fn explain_insert_sitrep_version_query() { + let logctx = dev::test_setup_log("explain_insert_sitrep_version_query"); + let db = TestDatabase::new_with_pool(&logctx.log).await; + let pool = db.pool(); + let conn = pool.claim().await.unwrap(); + + let query = InsertSitrepVersionQuery { sitrep_id: SitrepUuid::nil() }; + let explanation = query + .explain_async(&conn) + .await + .expect("Failed to explain query - is it valid SQL?"); + eprintln!("{explanation}"); + assert!( + !explanation.contains("FULL SCAN"), + "Found an unexpected FULL SCAN: {}", + explanation + ); + + db.terminate().await; + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn test_insert_sitrep_without_parent() { + // Setup + let logctx = 
dev::test_setup_log("test_insert_sitrep_without_parent"); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + + // Base case: there should be no current sitrep. + let current = datastore.fm_sitrep_read_current(&opctx).await.unwrap(); + assert!(current.is_none()); + + // Okay, let's create a new sitrep. + let sitrep = nexus_types::fm::Sitrep { + metadata: nexus_types::fm::SitrepMetadata { + id: SitrepUuid::new_v4(), + inv_collection_id: CollectionUuid::new_v4(), + creator_id: OmicronZoneUuid::new_v4(), + comment: "TEST SITREP PLEASE IGNORE".to_string(), + time_created: Utc::now(), + parent_sitrep_id: None, + }, + }; + + datastore.fm_sitrep_insert(&opctx, &sitrep).await.unwrap(); + + let current = datastore + .fm_sitrep_read_current(&opctx) + .await + .expect("should successfully read current sitrep"); + let (version, current_sitrep) = current.expect("sitrep should be Some"); + assert_eq!(version.id, sitrep.metadata.id); + assert_eq!(version.version, 1); + assert_eq!(sitrep.id(), current_sitrep.id()); + assert_eq!(sitrep.parent_id(), current_sitrep.parent_id()); + assert_eq!( + sitrep.metadata.creator_id, + current_sitrep.metadata.creator_id + ); + assert_eq!(sitrep.metadata.comment, current_sitrep.metadata.comment); + + // Trying to insert the same sitrep again should fail. + let err = + datastore.fm_sitrep_insert(&opctx, &sitrep).await.unwrap_err(); + assert!(err.to_string().contains("duplicate key")); + + // Clean up. 
+ db.terminate().await; + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn test_insert_sitrep_with_current_parent() { + let logctx = + dev::test_setup_log("test_insert_sitrep_with_current_parent"); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + let creator_id = OmicronZoneUuid::new_v4(); + // Create an initial sitrep (no parent) + let sitrep1 = nexus_types::fm::Sitrep { + metadata: nexus_types::fm::SitrepMetadata { + id: SitrepUuid::new_v4(), + inv_collection_id: CollectionUuid::new_v4(), + creator_id, + comment: "TEST SITREP 1".to_string(), + time_created: Utc::now(), + parent_sitrep_id: None, + }, + }; + datastore.fm_sitrep_insert(&opctx, &sitrep1).await.unwrap(); + + // Create a second sitrep with the first as parent + let sitrep2 = nexus_types::fm::Sitrep { + metadata: nexus_types::fm::SitrepMetadata { + id: SitrepUuid::new_v4(), + inv_collection_id: CollectionUuid::new_v4(), + creator_id, + comment: "TEST SITREP 2".to_string(), + time_created: Utc::now(), + parent_sitrep_id: Some(sitrep1.id()), + }, + }; + datastore.fm_sitrep_insert(&opctx, &sitrep2).await.expect( + "inserting a sitrep whose parent is current should succeed", + ); + + // Verify the second sitrep is now current + let (version, current_sitrep) = datastore + .fm_sitrep_read_current(&opctx) + .await + .unwrap() + .expect("current sitrep should be Some"); + assert_eq!(version.id, sitrep2.id()); + assert_eq!(version.version, 2); + assert_eq!(sitrep2.id(), current_sitrep.id()); + assert_eq!(sitrep2.parent_id(), current_sitrep.parent_id()); + + db.terminate().await; + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn test_insert_sitrep_with_nonexistent_parent_fails() { + let logctx = dev::test_setup_log( + "test_insert_sitrep_with_nonexistent_parent_fails", + ); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + + let creator_id 
= OmicronZoneUuid::new_v4(); + + // Create an initial sitrep (no parent) + let sitrep1 = nexus_types::fm::Sitrep { + metadata: nexus_types::fm::SitrepMetadata { + id: SitrepUuid::new_v4(), + inv_collection_id: CollectionUuid::new_v4(), + creator_id, + comment: "TEST SITREP 1".to_string(), + time_created: Utc::now(), + parent_sitrep_id: None, + }, + }; + datastore.fm_sitrep_insert(&opctx, &sitrep1).await.unwrap(); + + // Try to insert a sitrep with a non-existent parent ID + let nonexistent_id = SitrepUuid::new_v4(); + let sitrep2 = nexus_types::fm::Sitrep { + metadata: nexus_types::fm::SitrepMetadata { + id: SitrepUuid::new_v4(), + inv_collection_id: CollectionUuid::new_v4(), + creator_id, + comment: "TEST SITREP WITH BAD PARENT".to_string(), + time_created: Utc::now(), + parent_sitrep_id: Some(nonexistent_id), + }, + }; + + let result = datastore.fm_sitrep_insert(&opctx, &sitrep2).await; + + // Should fail with ParentNotCurrent error + match result { + Err(super::InsertSitrepError::ParentNotCurrent(_)) => {} + _ => panic!("expected ParentNotCurrent error, got {result:?}"), + } + + db.terminate().await; + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn test_insert_sitrep_with_outdated_parent_fails() { + let logctx = dev::test_setup_log( + "test_insert_sitrep_with_outdated_parent_fails", + ); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + + let creator_id = OmicronZoneUuid::new_v4(); + + // Create an initial sitrep (no parent) + let sitrep1 = nexus_types::fm::Sitrep { + metadata: nexus_types::fm::SitrepMetadata { + id: SitrepUuid::new_v4(), + inv_collection_id: CollectionUuid::new_v4(), + creator_id, + comment: "TEST SITREP 1".to_string(), + time_created: Utc::now(), + parent_sitrep_id: None, + }, + }; + datastore.fm_sitrep_insert(&opctx, &sitrep1).await.unwrap(); + + // Create a second sitrep with the first as parent + let sitrep2 = nexus_types::fm::Sitrep { + metadata: 
nexus_types::fm::SitrepMetadata { + id: SitrepUuid::new_v4(), + inv_collection_id: CollectionUuid::new_v4(), + creator_id, + comment: "TEST SITREP 2".to_string(), + time_created: Utc::now(), + parent_sitrep_id: Some(sitrep1.id()), + }, + }; + datastore.fm_sitrep_insert(&opctx, &sitrep2).await.unwrap(); + + // Try to create a third sitrep with sitrep1 (outdated) as parent. + // This should fail, as sitrep2 is now the current sitrep. + let sitrep3 = nexus_types::fm::Sitrep { + metadata: nexus_types::fm::SitrepMetadata { + id: SitrepUuid::new_v4(), + inv_collection_id: CollectionUuid::new_v4(), + creator_id: OmicronZoneUuid::new_v4(), + comment: "TEST SITREP 3 WITH OUTDATED PARENT".to_string(), + time_created: Utc::now(), + parent_sitrep_id: Some(sitrep1.id()), + }, + }; + let result = datastore.fm_sitrep_insert(&opctx, &sitrep3).await; + + // Should fail with ParentNotCurrent error + match result { + Err(InsertSitrepError::ParentNotCurrent(_)) => {} + _ => panic!("expected ParentNotCurrent error, got {result:?}"), + } + + // Verify sitrep2 is still current + let (version, current_sitrep) = datastore + .fm_sitrep_read_current(&opctx) + .await + .unwrap() + .expect("current sitrep should be Some"); + assert_eq!(version.id, sitrep2.id()); + assert_eq!(version.version, 2); + assert_eq!(sitrep2.id(), current_sitrep.id()); + assert_eq!(sitrep2.parent_id(), current_sitrep.parent_id()); + + db.terminate().await; + logctx.cleanup_successful(); + } +} diff --git a/nexus/types/src/fm.rs b/nexus/types/src/fm.rs index f102141a81..c96712c273 100644 --- a/nexus/types/src/fm.rs +++ b/nexus/types/src/fm.rs @@ -15,6 +15,16 @@ pub struct Sitrep { // TODO(eliza): draw the rest of the sitrep } +impl Sitrep { + pub fn id(&self) -> SitrepUuid { + self.metadata.id + } + + pub fn parent_id(&self) -> Option { + self.metadata.parent_sitrep_id + } +} + #[derive(Clone, Debug, Eq, PartialEq, JsonSchema, Deserialize, Serialize)] pub struct SitrepVersion { pub id: SitrepUuid, From 
6565dcb57ac8f9095968a78c76d29ddae018cbd7 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 29 Oct 2025 10:37:16 -0700 Subject: [PATCH 07/23] wip omdb stuff --- dev-tools/omdb/src/bin/omdb/db.rs | 14 ++ dev-tools/omdb/src/bin/omdb/db/sitrep.rs | 262 +++++++++++++++++++++++ 2 files changed, 276 insertions(+) create mode 100644 dev-tools/omdb/src/bin/omdb/db/sitrep.rs diff --git a/dev-tools/omdb/src/bin/omdb/db.rs b/dev-tools/omdb/src/bin/omdb/db.rs index fe9d81cfce..782a497c51 100644 --- a/dev-tools/omdb/src/bin/omdb/db.rs +++ b/dev-tools/omdb/src/bin/omdb/db.rs @@ -180,6 +180,7 @@ mod blueprints; mod db_metadata; mod ereport; mod saga; +mod sitrep; mod user_data_export; const NO_ACTIVE_PROPOLIS_MSG: &str = ""; @@ -378,6 +379,13 @@ enum DbCommands { RegionSnapshotReplacement(RegionSnapshotReplacementArgs), /// Commands for querying and interacting with sagas Saga(saga::SagaArgs), + /// Commands for querying and interacting with fault management situation + /// reports. + Sitrep(sitrep::SitrepArgs), + /// Show the current history of fault management situation reports. + /// + /// This is an alias for `omdb db sitrep history`. + Sitreps(sitrep::SitrepHistoryArgs), /// Print information about sleds Sleds(SledsArgs), /// Print information about customer instances. 
@@ -1297,6 +1305,12 @@ impl DbArgs { DbCommands::Saga(args) => { args.exec(&omdb, &opctx, &datastore).await } + DbCommands::Sitrep(args) => { + sitrep::cmd_db_sitrep(&opctx, &datastore, &fetch_opts, args).await + } + DbCommands::Sitreps(args) => { + sitrep::cmd_db_sitrep_history(&datastore, &fetch_opts, args).await + } DbCommands::Sleds(args) => { cmd_db_sleds(&opctx, &datastore, &fetch_opts, args).await } diff --git a/dev-tools/omdb/src/bin/omdb/db/sitrep.rs b/dev-tools/omdb/src/bin/omdb/db/sitrep.rs new file mode 100644 index 0000000000..c35d5f0749 --- /dev/null +++ b/dev-tools/omdb/src/bin/omdb/db/sitrep.rs @@ -0,0 +1,262 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! `omdb db sitrep` subcommands + +use crate::db::DbFetchOptions; +use crate::db::check_limit; +use crate::helpers::const_max_len; +use crate::helpers::datetime_rfc3339_concise; +use anyhow::Context; +use async_bb8_diesel::AsyncRunQueryDsl; +use chrono::{DateTime, Utc}; +use clap::Args; +use clap::Subcommand; +use diesel::prelude::*; +use nexus_db_queries::context::OpContext; +use nexus_db_queries::db::DataStore; +use nexus_db_queries::db::model; +use nexus_db_queries::db::pagination::paginated; +use nexus_types::fm; +use omicron_common::api::external::DataPageParams; +use omicron_common::api::external::PaginationOrder; +use omicron_uuid_kinds::GenericUuid; +use omicron_uuid_kinds::SitrepUuid; +use tabled::Tabled; +use uuid::Uuid; + +use nexus_db_schema::schema::fm_sitrep::dsl as sitrep_dsl; +use nexus_db_schema::schema::fm_sitrep_history::dsl as history_dsl; + +#[derive(Debug, Args, Clone)] +pub(super) struct SitrepArgs { + #[command(subcommand)] + command: Commands, +} + +#[derive(Debug, Subcommand, Clone)] +enum Commands { + /// List the current situation report history. 
+ History(SitrepHistoryArgs), + + /// Show the current situation report. + /// + /// This is an alias for `omdb db sitrep info current`. + Current(ShowArgs), + + /// Show details on a situation report. + #[clap(alias = "info")] + Info { + /// The UUID of the sitrep to show, or "current" to show the current + /// sitrep. + sitrep: SitrepIdOrCurrent, + + #[clap(flatten)] + args: ShowArgs, + }, +} + +#[derive(Debug, Args, Clone)] +pub(super) struct SitrepHistoryArgs { + /// If present, start at this sitrep version. + /// + /// If this is not set, the list will start with the current sitrep. This + /// option is useful when the number of sitreps exceeds the database fetch + /// limit. + #[arg(long, short, alias = "starting_at")] + from: Option, +} + +#[derive(Debug, Args, Clone)] +struct ShowArgs {} + +#[derive(Debug, Clone, Copy)] +enum SitrepIdOrCurrent { + Current, + Id(SitrepUuid), +} + +impl std::str::FromStr for SitrepIdOrCurrent { + type Err = omicron_uuid_kinds::ParseError; + + fn from_str(s: &str) -> Result { + let s = s.trim(); + if s.eq_ignore_ascii_case("current") { + Ok(Self::Current) + } else { + let id = s.parse()?; + Ok(Self::Id(id)) + } + } +} + +pub(super) async fn cmd_db_sitrep( + opctx: &OpContext, + datastore: &DataStore, + fetch_opts: &DbFetchOptions, + args: &SitrepArgs, +) -> anyhow::Result<()> { + match args.command { + Commands::History(ref args) => { + cmd_db_sitrep_history(datastore, fetch_opts, args).await + } + Commands::Info { sitrep, ref args } => { + cmd_db_sitrep_show(opctx, datastore, fetch_opts, args, sitrep).await + } + Commands::Current(ref args) => { + cmd_db_sitrep_show( + opctx, + datastore, + fetch_opts, + args, + SitrepIdOrCurrent::Current, + ) + .await + } + } +} + +pub(super) async fn cmd_db_sitrep_history( + datastore: &DataStore, + fetch_opts: &DbFetchOptions, + args: &SitrepHistoryArgs, +) -> anyhow::Result<()> { + let ctx = || { + if let Some(from) = args.from { + format!( + "listing fault management sitrep history 
(starting at {from})" + ) + } else { + "listing fault management sitrep history".to_string() + } + }; + + #[derive(Tabled)] + #[tabled(rename_all = "SCREAMING_SNAKE_CASE")] + struct SitrepRow { + v: u32, + id: Uuid, + #[tabled(display_with = "datetime_rfc3339_concise")] + created_at: DateTime, + comment: String, + } + + let conn = datastore.pool_connection_for_tests().await?; + let marker = args.from.map(model::SqlU32::new); + let pagparams = DataPageParams { + marker: marker.as_ref(), + direction: PaginationOrder::Descending, + limit: fetch_opts.fetch_limit, + }; + let sitreps: Vec<(model::SitrepVersion, model::SitrepMetadata)> = + paginated( + history_dsl::fm_sitrep_history, + history_dsl::version, + &pagparams, + ) + .inner_join( + sitrep_dsl::fm_sitrep.on(history_dsl::sitrep_id.eq(sitrep_dsl::id)), + ) + .select(( + model::SitrepVersion::as_select(), + model::SitrepMetadata::as_select(), + )) + .load_async(&*conn) + .await + .with_context(ctx)?; + + check_limit(&sitreps, fetch_opts.fetch_limit, ctx); + + let rows = sitreps.into_iter().map(|(version, metadata)| { + let model::SitrepMetadata { + id, + time_created, + comment, + creator_id: _, + parent_sitrep_id: _, + inv_collection_id: _, + } = metadata; + SitrepRow { + v: version.version.into(), + id: id.into_untyped_uuid(), + created_at: time_created, + comment: comment, + } + }); + + let table = tabled::Table::new(rows) + .with(tabled::settings::Style::empty()) + .with(tabled::settings::Padding::new(0, 1, 0, 0)) + .to_string(); + println!("{table}"); + + Ok(()) +} + +async fn cmd_db_sitrep_show( + opctx: &OpContext, + datastore: &DataStore, + fetch_opts: &DbFetchOptions, + args: &ShowArgs, + sitrep: SitrepIdOrCurrent, +) -> anyhow::Result<()> { + let ctx = || match sitrep { + SitrepIdOrCurrent::Current => { + "looking up the current fault management sitrep".to_string() + } + SitrepIdOrCurrent::Id(id) => { + format!("looking up fault management sitrep {id:?}") + } + }; + + let (maybe_version, sitrep) = match 
sitrep { + SitrepIdOrCurrent::Id(id) => { + let sitrep = + datastore.fm_sitrep_read(opctx, id).await.with_context(ctx)?; + let conn = datastore + .pool_connection_for_tests() + .await + .with_context(ctx)?; + let version = history_dsl::fm_sitrep_history + .filter(history_dsl::sitrep_id.eq(id.into_untyped_uuid())) + .select(model::SitrepVersion::as_select()) + .first_async(&*conn) + .await + .optional() + .with_context(ctx)? + .map(Into::into); + (version, sitrep) + } + SitrepIdOrCurrent::Current => { + let Some((version, sitrep)) = + datastore.fm_sitrep_read_current(opctx).await? + else { + anyhow::bail!("no current sitrep exists at this time"); + }; + (Some(version), sitrep) + } + }; + + let fm::Sitrep { metadata } = sitrep; + println!("\n{:=<80}", "== FAULT MANAGEMENT SITUATION REPORT "); + let fm::SitrepMetadata { + id, + creator_id, + time_created, + parent_sitrep_id, + inv_collection_id, + comment, + } = metadata; + + const ID: &'static str = "ID"; + const PARENT_SITREP_ID: &'static str = "parent sitrep ID"; + const CREATED_BY: &'static str = "created by"; + const CREATED_AT: &'static str = "created at"; + + const WIDTH: usize = + const_max_len(&[ID, PARENT_SITREP_ID, CREATED_AT, CREATED_BY]); + println!(" {ID:>WIDTH$}: {id}"); + + Ok(()) +} From 05c850414c55574e90fd54daf38ae3380ae08ca3 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 29 Oct 2025 13:13:39 -0700 Subject: [PATCH 08/23] finish omdb sitrep cmds --- dev-tools/omdb/src/bin/omdb/db/sitrep.rs | 113 +++++++++++++++++++++-- dev-tools/omdb/tests/test_all_output.rs | 1 + 2 files changed, 104 insertions(+), 10 deletions(-) diff --git a/dev-tools/omdb/src/bin/omdb/db/sitrep.rs b/dev-tools/omdb/src/bin/omdb/db/sitrep.rs index c35d5f0749..d0702e5668 100644 --- a/dev-tools/omdb/src/bin/omdb/db/sitrep.rs +++ b/dev-tools/omdb/src/bin/omdb/db/sitrep.rs @@ -28,6 +28,7 @@ use uuid::Uuid; use nexus_db_schema::schema::fm_sitrep::dsl as sitrep_dsl; use nexus_db_schema::schema::fm_sitrep_history::dsl as 
history_dsl; +use nexus_db_schema::schema::inv_collection::dsl as inv_collection_dsl; #[derive(Debug, Args, Clone)] pub(super) struct SitrepArgs { @@ -197,8 +198,8 @@ pub(super) async fn cmd_db_sitrep_history( async fn cmd_db_sitrep_show( opctx: &OpContext, datastore: &DataStore, - fetch_opts: &DbFetchOptions, - args: &ShowArgs, + _fetch_opts: &DbFetchOptions, + _args: &ShowArgs, sitrep: SitrepIdOrCurrent, ) -> anyhow::Result<()> { let ctx = || match sitrep { @@ -209,15 +210,12 @@ async fn cmd_db_sitrep_show( format!("looking up fault management sitrep {id:?}") } }; + let conn = datastore.pool_connection_for_tests().await?; let (maybe_version, sitrep) = match sitrep { SitrepIdOrCurrent::Id(id) => { let sitrep = datastore.fm_sitrep_read(opctx, id).await.with_context(ctx)?; - let conn = datastore - .pool_connection_for_tests() - .await - .with_context(ctx)?; let version = history_dsl::fm_sitrep_history .filter(history_dsl::sitrep_id.eq(id.into_untyped_uuid())) .select(model::SitrepVersion::as_select()) @@ -239,7 +237,6 @@ async fn cmd_db_sitrep_show( }; let fm::Sitrep { metadata } = sitrep; - println!("\n{:=<80}", "== FAULT MANAGEMENT SITUATION REPORT "); let fm::SitrepMetadata { id, creator_id, @@ -253,10 +250,106 @@ async fn cmd_db_sitrep_show( const PARENT_SITREP_ID: &'static str = "parent sitrep ID"; const CREATED_BY: &'static str = "created by"; const CREATED_AT: &'static str = "created at"; + const COMMENT: &'static str = "comment"; + const STATUS: &'static str = "status"; + const VERSION: &'static str = " version"; + const MADE_CURRENT_AT: &'static str = " made current at"; + const INV_COLLECTION_ID: &'static str = "inventory collection ID"; + const INV_STARTED_AT: &'static str = " started at"; + const INV_FINISHED_AT: &'static str = " finished at"; + + const WIDTH: usize = const_max_len(&[ + ID, + PARENT_SITREP_ID, + CREATED_AT, + CREATED_BY, + COMMENT, + STATUS, + VERSION, + MADE_CURRENT_AT, + INV_COLLECTION_ID, + INV_STARTED_AT, + INV_FINISHED_AT, + ]); - 
const WIDTH: usize = - const_max_len(&[ID, PARENT_SITREP_ID, CREATED_AT, CREATED_BY]); - println!(" {ID:>WIDTH$}: {id}"); + println!("\n{:=<80}", "== FAULT MANAGEMENT SITUATION REPORT "); + println!(" {ID:>WIDTH$}: {id:?}"); + println!(" {PARENT_SITREP_ID:>WIDTH$}: {parent_sitrep_id:?}"); + println!(" {CREATED_BY:>WIDTH$}: {creator_id}"); + println!(" {CREATED_AT:>WIDTH$}: {time_created}"); + if comment.is_empty() { + println!(" {COMMENT:>WIDTH$}: N/A\n"); + } else { + println!(" {COMMENT:>WIDTH$}:"); + println!("{}\n", textwrap::indent(&comment, " ")); + } + + match maybe_version { + None => println!( + " {STATUS:>WIDTH$}: not committed to the sitrep history" + ), + Some(fm::SitrepVersion { version, time_made_current, .. }) => { + let current_version = + datastore.fm_get_current_sitrep_version(&opctx).await; + if matches!(current_version, Ok(Some(ref v)) if v.id == id) { + println!(" {STATUS:>WIDTH$}: this is the current sitrep!",); + } else { + println!(" {STATUS:>WIDTH$}: in the sitrep history"); + } + println!(" {VERSION:>WIDTH$}: v{version}"); + println!(" {MADE_CURRENT_AT:>WIDTH$}: {time_made_current}"); + match current_version { + Ok(Some(v)) if v.id == id => {} + Ok(Some(fm::SitrepVersion { version, id, .. })) => { + println!( + "(i) note: the current sitrep is {id:?} \ + (at v{version})", + ); + } + Ok(None) => { + eprintln!( + "/!\\ WEIRD: this sitrep is in the sitrep history, \ + but there is no current sitrep. this should not \ + happen!" 
+ ); + } + Err(err) => { + eprintln!( + "/!\\ failed to determine the current sitrep \ + version: {err}" + ); + } + }; + } + } + + println!("\n{:-<80}", "== DIAGNOSIS INPUTS "); + println!(" {INV_COLLECTION_ID:>WIDTH$}: {inv_collection_id:?}"); + let inv_collection = inv_collection_dsl::inv_collection + .filter( + inv_collection_dsl::id.eq(inv_collection_id.into_untyped_uuid()), + ) + .select(model::InvCollection::as_select()) + .first_async(&*conn) + .await + .optional(); + match inv_collection { + Err(err) => { + eprintln!( + "/!\\ failed to fetch inventory collection details: {err}" + ); + } + Ok(Some(model::InvCollection { time_started, time_done, .. })) => { + println!(" {INV_STARTED_AT:>WIDTH$}: {time_started}"); + println!(" {INV_FINISHED_AT:>WIDTH$}: {time_done}"); + } + Ok(None) => { + println!( + " note: this collection no longer exists (perhaps it has \ + been pruned?)" + ) + } + } Ok(()) } diff --git a/dev-tools/omdb/tests/test_all_output.rs b/dev-tools/omdb/tests/test_all_output.rs index 61208f00a9..bd10d1d86d 100644 --- a/dev-tools/omdb/tests/test_all_output.rs +++ b/dev-tools/omdb/tests/test_all_output.rs @@ -92,6 +92,7 @@ async fn test_omdb_usage_errors() { &["db", "ereport", "reporters", "--help"], &["db", "ereport", "info", "--help"], &["db", "sleds", "--help"], + &["db", "sitrep", "--help"], &["db", "saga"], &["db", "snapshots"], &["db", "network"], From bef2996bfe9f635e877de81fdd70d01293e0bab9 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 29 Oct 2025 13:23:14 -0700 Subject: [PATCH 09/23] add bg task details --- dev-tools/omdb/src/bin/omdb/nexus.rs | 32 ++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/dev-tools/omdb/src/bin/omdb/nexus.rs b/dev-tools/omdb/src/bin/omdb/nexus.rs index 23b0cf0701..b46c0b7429 100644 --- a/dev-tools/omdb/src/bin/omdb/nexus.rs +++ b/dev-tools/omdb/src/bin/omdb/nexus.rs @@ -50,6 +50,7 @@ use nexus_types::deployment::ClickhouseMode; use nexus_types::deployment::ClickhousePolicy; use 
nexus_types::deployment::OximeterReadMode; use nexus_types::deployment::OximeterReadPolicy; +use nexus_types::fm; use nexus_types::internal_api::background::AbandonedVmmReaperStatus; use nexus_types::internal_api::background::BlueprintPlannerStatus; use nexus_types::internal_api::background::BlueprintRendezvousStatus; @@ -65,6 +66,7 @@ use nexus_types::internal_api::background::RegionSnapshotReplacementFinishStatus use nexus_types::internal_api::background::RegionSnapshotReplacementGarbageCollectStatus; use nexus_types::internal_api::background::RegionSnapshotReplacementStartStatus; use nexus_types::internal_api::background::RegionSnapshotReplacementStepStatus; +use nexus_types::internal_api::background::SitrepLoadStatus; use nexus_types::internal_api::background::SupportBundleCleanupReport; use nexus_types::internal_api::background::SupportBundleCollectionReport; use nexus_types::internal_api::background::SupportBundleEreportStatus; @@ -1234,6 +1236,9 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) { "webhook_deliverator" => { print_task_webhook_deliverator(details); } + "fm_sitrep_loader" => { + print_task_fm_sitrep_loader(details); + } _ => { println!( "warning: unknown background task: {:?} \ @@ -3098,6 +3103,33 @@ mod ereporter_status_fields { pub const NUM_WIDTH: usize = 4; } +fn print_task_fm_sitrep_loader(details: &serde_json::Value) { + match serde_json::from_value::(details.clone()) { + Err(error) => eprintln!( + "warning: failed to interpret task details: {:?}: {:?}", + error, details + ), + Ok(SitrepLoadStatus::Error(error)) => { + println!(" task did not complete successfully: {error}"); + } + Ok(SitrepLoadStatus::NoSitrep) => { + println!(" no FM situation report available to load"); + } + Ok(SitrepLoadStatus::Loaded { version, time_loaded }) => { + println!( + " loaded latest FM situation report as of {}:", + humantime::format_rfc3339_millis(time_loaded.into()) + ); + let fm::SitrepVersion { id, version, 
time_made_current } = version; + println!(" sitrep {id:?} (v{version})"); + println!( + " made current at: {}", + humantime::format_rfc3339_millis(time_made_current.into()), + ); + } + }; +} + const ERRICON: &str = "/!\\"; fn warn_if_nonzero(n: usize) -> &'static str { From 585b39d9d3dc74f7a7a46342de1637ffc2ee16dd Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 29 Oct 2025 13:27:38 -0700 Subject: [PATCH 10/23] add sitrep_loader to config tomls --- nexus/tests/config.test.toml | 4 ++++ smf/nexus/multi-sled/config-partial.toml | 3 +++ smf/nexus/single-sled/config-partial.toml | 3 +++ 3 files changed, 10 insertions(+) diff --git a/nexus/tests/config.test.toml b/nexus/tests/config.test.toml index e629fd435c..f9abc06cfd 100644 --- a/nexus/tests/config.test.toml +++ b/nexus/tests/config.test.toml @@ -192,6 +192,10 @@ webhook_deliverator.first_retry_backoff_secs = 10 webhook_deliverator.second_retry_backoff_secs = 20 read_only_region_replacement_start.period_secs = 999999 sp_ereport_ingester.period_secs = 30 +# How frequently to check for a new fault management sitrep (made by any Nexus). +# This is cheap, so we should check frequently. +fm.sitrep_load_period_secs = 15 + [default_region_allocation_strategy] # we only have one sled in the test environment, so we need to use the diff --git a/smf/nexus/multi-sled/config-partial.toml b/smf/nexus/multi-sled/config-partial.toml index 14b0281cc2..3107fbd39b 100644 --- a/smf/nexus/multi-sled/config-partial.toml +++ b/smf/nexus/multi-sled/config-partial.toml @@ -94,6 +94,9 @@ read_only_region_replacement_start.period_secs = 30 alert_dispatcher.period_secs = 60 webhook_deliverator.period_secs = 60 sp_ereport_ingester.period_secs = 30 +# How frequently to check for a new fault management sitrep (made by any Nexus). +# This is cheap, so we should check frequently. 
+fm.sitrep_load_period_secs = 15 [default_region_allocation_strategy] # by default, allocate across 3 distinct sleds diff --git a/smf/nexus/single-sled/config-partial.toml b/smf/nexus/single-sled/config-partial.toml index 32e20ee79f..e7e8b98455 100644 --- a/smf/nexus/single-sled/config-partial.toml +++ b/smf/nexus/single-sled/config-partial.toml @@ -94,6 +94,9 @@ read_only_region_replacement_start.period_secs = 30 alert_dispatcher.period_secs = 60 webhook_deliverator.period_secs = 60 sp_ereport_ingester.period_secs = 30 +# How frequently to check for a new fault management sitrep (made by any Nexus). +# This is cheap, so we should check frequently. +fm.sitrep_load_period_secs = 15 [default_region_allocation_strategy] # by default, allocate without requirement for distinct sleds. From c4acdac8dcaa65ac9da0b813f4537f66e024e3a1 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 29 Oct 2025 13:28:33 -0700 Subject: [PATCH 11/23] fix clap --- dev-tools/omdb/src/bin/omdb/db/sitrep.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev-tools/omdb/src/bin/omdb/db/sitrep.rs b/dev-tools/omdb/src/bin/omdb/db/sitrep.rs index d0702e5668..29d3a19b49 100644 --- a/dev-tools/omdb/src/bin/omdb/db/sitrep.rs +++ b/dev-tools/omdb/src/bin/omdb/db/sitrep.rs @@ -47,7 +47,7 @@ enum Commands { Current(ShowArgs), /// Show details on a situation report. - #[clap(alias = "info")] + #[clap(alias = "show")] Info { /// The UUID of the sitrep to show, or "current" to show the current /// sitrep. 
From 18142e3ec8d8a48739c93ef297bd71a20183fa85 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 30 Oct 2025 10:15:34 -0700 Subject: [PATCH 12/23] finally update OMDB tests --- dev-tools/omdb/tests/env.out | 12 ++++++++ dev-tools/omdb/tests/successes.out | 16 +++++++++++ dev-tools/omdb/tests/usage_errors.out | 40 +++++++++++++++++++++++++++ 3 files changed, 68 insertions(+) diff --git a/dev-tools/omdb/tests/env.out b/dev-tools/omdb/tests/env.out index 5a67324313..576b628312 100644 --- a/dev-tools/omdb/tests/env.out +++ b/dev-tools/omdb/tests/env.out @@ -99,6 +99,10 @@ task: "external_endpoints" on each one +task: "fm_sitrep_loader" + loads the current fault management situation report from the database + + task: "instance_reincarnation" schedules start sagas for failed instances that can be automatically restarted @@ -315,6 +319,10 @@ task: "external_endpoints" on each one +task: "fm_sitrep_loader" + loads the current fault management situation report from the database + + task: "instance_reincarnation" schedules start sagas for failed instances that can be automatically restarted @@ -518,6 +526,10 @@ task: "external_endpoints" on each one +task: "fm_sitrep_loader" + loads the current fault management situation report from the database + + task: "instance_reincarnation" schedules start sagas for failed instances that can be automatically restarted diff --git a/dev-tools/omdb/tests/successes.out b/dev-tools/omdb/tests/successes.out index 62a1578d0f..9a0f173bca 100644 --- a/dev-tools/omdb/tests/successes.out +++ b/dev-tools/omdb/tests/successes.out @@ -334,6 +334,10 @@ task: "external_endpoints" on each one +task: "fm_sitrep_loader" + loads the current fault management situation report from the database + + task: "instance_reincarnation" schedules start sagas for failed instances that can be automatically restarted @@ -609,6 +613,12 @@ task: "external_endpoints" TLS certificates: 0 +task: "fm_sitrep_loader" + configured period: every s + last completed 
activation: , triggered by + started at (s ago) and ran for ms + no FM situation report available to load + task: "instance_reincarnation" configured period: every m last completed activation: , triggered by @@ -1144,6 +1154,12 @@ task: "external_endpoints" TLS certificates: 0 +task: "fm_sitrep_loader" + configured period: every s + last completed activation: , triggered by + started at (s ago) and ran for ms + no FM situation report available to load + task: "instance_reincarnation" configured period: every m last completed activation: , triggered by diff --git a/dev-tools/omdb/tests/usage_errors.out b/dev-tools/omdb/tests/usage_errors.out index d920ff0521..b8f1f4d15f 100644 --- a/dev-tools/omdb/tests/usage_errors.out +++ b/dev-tools/omdb/tests/usage_errors.out @@ -132,6 +132,9 @@ Commands: region-snapshot-replacement Query for information about region snapshot replacements, optionally manually triggering one saga Commands for querying and interacting with sagas + sitrep Commands for querying and interacting with fault management situation + reports + sitreps Show the current history of fault management situation reports sleds Print information about sleds instance Print information about customer instances instances Alias to `omdb instance list` @@ -193,6 +196,9 @@ Commands: region-snapshot-replacement Query for information about region snapshot replacements, optionally manually triggering one saga Commands for querying and interacting with sagas + sitrep Commands for querying and interacting with fault management situation + reports + sitreps Show the current history of fault management situation reports sleds Print information about sleds instance Print information about customer instances instances Alias to `omdb instance list` @@ -707,6 +713,40 @@ Safety Options: --------------------------------------------- stderr: ============================================= +EXECUTING COMMAND: omdb ["db", "sitrep", "--help"] +termination: Exited(0) 
+--------------------------------------------- +stdout: +Commands for querying and interacting with fault management situation reports + +Usage: omdb db sitrep [OPTIONS] + +Commands: + history List the current situation report history + current Show the current situation report + info Show details on a situation report + help Print this message or the help of the given subcommand(s) + +Options: + --log-level log level filter [env: LOG_LEVEL=] [default: warn] + --color Color output [default: auto] [possible values: auto, always, never] + -h, --help Print help + +Connection Options: + --db-url URL of the database SQL interface [env: OMDB_DB_URL=] + --dns-server [env: OMDB_DNS_SERVER=] + +Database Options: + --fetch-limit limit to apply to queries that fetch rows [env: + OMDB_FETCH_LIMIT=] [default: 500] + --include-deleted whether to include soft-deleted records when enumerating objects + that can be soft-deleted + +Safety Options: + -w, --destructive Allow potentially-destructive subcommands +--------------------------------------------- +stderr: +============================================= EXECUTING COMMAND: omdb ["db", "saga"] termination: Exited(2) --------------------------------------------- From b9c5c0745502da27258d078b70c88a5e00a74b39 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 30 Oct 2025 10:21:37 -0700 Subject: [PATCH 13/23] migrations --- nexus/db-model/src/schema_versions.rs | 3 ++- schema/crdb/dbinit.sql | 2 +- schema/crdb/fm-sitrep/up01.sql | 39 +++++++++++++++++++++++++++ schema/crdb/fm-sitrep/up02.sql | 15 +++++++++++ schema/crdb/fm-sitrep/up03.sql | 3 +++ 5 files changed, 60 insertions(+), 2 deletions(-) create mode 100644 schema/crdb/fm-sitrep/up01.sql create mode 100644 schema/crdb/fm-sitrep/up02.sql create mode 100644 schema/crdb/fm-sitrep/up03.sql diff --git a/nexus/db-model/src/schema_versions.rs b/nexus/db-model/src/schema_versions.rs index be8d77012f..63517caf05 100644 --- a/nexus/db-model/src/schema_versions.rs +++ 
b/nexus/db-model/src/schema_versions.rs @@ -16,7 +16,7 @@ use std::{collections::BTreeMap, sync::LazyLock}; /// /// This must be updated when you change the database schema. Refer to /// schema/crdb/README.adoc in the root of this repository for details. -pub const SCHEMA_VERSION: Version = Version::new(201, 0, 0); +pub const SCHEMA_VERSION: Version = Version::new(202, 0, 0); /// List of all past database schema versions, in *reverse* order /// @@ -28,6 +28,7 @@ static KNOWN_VERSIONS: LazyLock> = LazyLock::new(|| { // | leaving the first copy as an example for the next person. // v // KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"), + KnownVersion::new(202, "fm-sitrep"), KnownVersion::new(201, "scim-client-bearer-token"), KnownVersion::new(200, "dual-stack-network-interfaces"), KnownVersion::new(199, "multicast-pool-support"), diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 9fa533c449..083e0078c1 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -6921,7 +6921,7 @@ INSERT INTO omicron.public.db_metadata ( version, target_version ) VALUES - (TRUE, NOW(), NOW(), '201.0.0', NULL) + (TRUE, NOW(), NOW(), '202.0.0', NULL) ON CONFLICT DO NOTHING; COMMIT; diff --git a/schema/crdb/fm-sitrep/up01.sql b/schema/crdb/fm-sitrep/up01.sql new file mode 100644 index 0000000000..41296e4232 --- /dev/null +++ b/schema/crdb/fm-sitrep/up01.sql @@ -0,0 +1,39 @@ +CREATE TABLE IF NOT EXISTS omicron.public.fm_sitrep ( + -- The ID of this sitrep. + id UUID PRIMARY KEY, + -- The ID of the parent sitrep. + -- + -- A sitrep's _parent_ is the sitrep that was current when the planning + -- phase that produced that sitrep ran. The parent sitrep is a planning + -- input that produced this sitrep. + -- + -- This is effectively a foreign key back to this table; however, it is + -- allowed to be NULL: the initial sitrep has no parent. 
Additionally, + -- it may be non-NULL but no longer reference a row in this table: once a + -- child sitrep has been created from a parent, it's possible for the + -- parent to be deleted. We do not NULL out this field on such a deletion, + -- so we can always see that there had been a particular parent even if + -- it's now gone. + parent_sitrep_id UUID, + -- The ID of the inventory collection that was used as input to this + -- sitrep. + -- + -- This is a foreign key that references a row in the `inv_collection` + -- table (and other inventory records associated with that collection). + -- + -- Note that inventory collections are pruned on a separate schedule + -- from sitreps, so the inventory collection records may not exist. + inv_collection_id UUID NOT NULL, + + -- These fields are not semantically meaningful and are intended + -- for debugging purposes. + + -- The time at which this sitrep was created. + time_created TIMESTAMPTZ NOT NULL, + -- The Omicron zone UUID of the Nexus instance that created this + -- sitrep. + creator_id UUID NOT NULL, + -- A human-readable description of the changes represented by this + -- sitrep. + comment TEXT NOT NULL +); diff --git a/schema/crdb/fm-sitrep/up02.sql b/schema/crdb/fm-sitrep/up02.sql new file mode 100644 index 0000000000..c78eb85eac --- /dev/null +++ b/schema/crdb/fm-sitrep/up02.sql @@ -0,0 +1,15 @@ +-- The history of current sitreps. +-- +-- The sitrep with the highest `version` in this table is the current sitrep. +CREATE TABLE IF NOT EXISTS omicron.public.fm_sitrep_history ( + -- Monotonically increasing version for all FM sitreps. + version INT8 PRIMARY KEY, + + -- Effectively a foreign key into the `fm_sitrep` table, but may + -- reference a fm_sitrep that has been deleted (if this sitrep is + -- no longer current; the current sitrep must not be deleted). + sitrep_id UUID NOT NULL, + + -- Timestamp for when this sitrep was made current. 
+ time_made_current TIMESTAMPTZ NOT NULL +); diff --git a/schema/crdb/fm-sitrep/up03.sql b/schema/crdb/fm-sitrep/up03.sql new file mode 100644 index 0000000000..91cd68adf6 --- /dev/null +++ b/schema/crdb/fm-sitrep/up03.sql @@ -0,0 +1,3 @@ +CREATE UNIQUE INDEX IF NOT EXISTS + lookup_sitrep_version_by_id +ON omicron.public.fm_sitrep_history (sitrep_id); From 6640ab6f361cc0c8d6ef4ef05a83b637d6fa0742 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 30 Oct 2025 10:49:18 -0700 Subject: [PATCH 14/23] commentary + a few API tweaks --- dev-tools/omdb/src/bin/omdb/db/sitrep.rs | 2 +- nexus/db-model/src/fm_sitrep.rs | 11 ++- nexus/db-queries/src/db/datastore/fm.rs | 48 +++++++------ .../app/background/tasks/fm_sitrep_load.rs | 2 +- nexus/types/src/fm.rs | 67 +++++++++++++++++-- 5 files changed, 101 insertions(+), 29 deletions(-) diff --git a/dev-tools/omdb/src/bin/omdb/db/sitrep.rs b/dev-tools/omdb/src/bin/omdb/db/sitrep.rs index 29d3a19b49..06a0cf8a7e 100644 --- a/dev-tools/omdb/src/bin/omdb/db/sitrep.rs +++ b/dev-tools/omdb/src/bin/omdb/db/sitrep.rs @@ -290,7 +290,7 @@ async fn cmd_db_sitrep_show( ), Some(fm::SitrepVersion { version, time_made_current, .. }) => { let current_version = - datastore.fm_get_current_sitrep_version(&opctx).await; + datastore.fm_current_sitrep_version(&opctx).await; if matches!(current_version, Ok(Some(ref v)) if v.id == id) { println!(" {STATUS:>WIDTH$}: this is the current sitrep!",); } else { diff --git a/nexus/db-model/src/fm_sitrep.rs b/nexus/db-model/src/fm_sitrep.rs index 0f25f5d4ff..287685fb44 100644 --- a/nexus/db-model/src/fm_sitrep.rs +++ b/nexus/db-model/src/fm_sitrep.rs @@ -2,7 +2,16 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -//! Types for representing fault management sitreps in the database. +//! Types for representing fault management situation reports (sitreps) in the +//! database. +//! +//! 
The fault management sitrep, and the ways in which it is represented in +//! CRDB, is described in detail in [RFD +//! 603](https://rfd.shared.oxide.computer/rfd/0603). +//! +//! These types are used when inserting and reading sitreps in CRDB; when in +//! use, the sitrep is represented as a [`nexus_types::fm::Sitrep`]. See the +//! documentation in [`nexus_types::fm`] for more information. use crate::SqlU32; use crate::typed_uuid::DbTypedUuid; diff --git a/nexus/db-queries/src/db/datastore/fm.rs b/nexus/db-queries/src/db/datastore/fm.rs index b5cc68b4ab..48df23bebe 100644 --- a/nexus/db-queries/src/db/datastore/fm.rs +++ b/nexus/db-queries/src/db/datastore/fm.rs @@ -34,20 +34,23 @@ use omicron_uuid_kinds::SitrepUuid; use uuid::Uuid; impl DataStore { - pub async fn fm_get_current_sitrep_version( + /// Reads the current [sitrep version](fm::SitrepVersion) from CRDB. + /// + /// If no sitreps have been generated, this returns `None`. + pub async fn fm_current_sitrep_version( &self, opctx: &OpContext, ) -> Result, Error> { opctx.authorize(authz::Action::ListChildren, &authz::FLEET).await?; let conn = self.pool_connection_authorized(opctx).await?; let version = self - .fm_get_current_sitrep_version_on_conn(&conn) + .fm_current_sitrep_version_on_conn(&conn) .await? .map(Into::into); Ok(version) } - async fn fm_get_current_sitrep_version_on_conn( + async fn fm_current_sitrep_version_on_conn( &self, conn: &async_bb8_diesel::Connection, ) -> Result, Error> { @@ -60,17 +63,17 @@ impl DataStore { .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) } + /// Reads the [`fm::SitrepMetadata`] describing the sitrep with the given + /// ID, if one exists. pub async fn fm_sitrep_metadata_read( &self, opctx: &OpContext, id: SitrepUuid, - ) -> Result, Error> { + ) -> Result { opctx.authorize(authz::Action::ListChildren, &authz::FLEET).await?; let conn = self.pool_connection_authorized(opctx).await?; - let meta = self - .fm_sitrep_metadata_read_on_conn(id, &conn) - .await? 
- .map(Into::into); + let meta = + self.fm_sitrep_metadata_read_on_conn(id, &conn).await?.into(); Ok(meta) } @@ -78,23 +81,34 @@ impl DataStore { &self, id: SitrepUuid, conn: &async_bb8_diesel::Connection, - ) -> Result, Error> { + ) -> Result { sitrep_dsl::fm_sitrep .filter(sitrep_dsl::id.eq(id.into_untyped_uuid())) .select(model::SitrepMetadata::as_select()) .first_async(conn) .await .optional() - .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))? + .ok_or_else(|| { + Error::non_resourcetype_not_found(format!("sitrep {id:?}")) + }) } + /// Reads the *entire* current sitrep, along with its version. + /// + /// This is equivalent to reading the current sitrep version using + /// [`DataStore::fm_current_sitrep_version`], and then reading the sitrep + /// itself using [`DataStore::fm_sitrep_read_on_conn`]. + /// + /// If this method returns `None`, there is no current sitrep, meaning that + /// no sitreps have been created. pub async fn fm_sitrep_read_current( &self, opctx: &OpContext, - ) -> Result, Error> { + ) -> Result, Error> { let conn = self.pool_connection_authorized(opctx).await?; let version: fm::SitrepVersion = - match self.fm_get_current_sitrep_version_on_conn(&conn).await? { + match self.fm_current_sitrep_version_on_conn(&conn).await? { Some(version) => version.into(), None => return Ok(None), }; @@ -102,6 +116,7 @@ impl DataStore { Ok(Some((version, sitrep))) } + /// Reads the entire content of the sitrep with the provided ID, if one exists. pub async fn fm_sitrep_read( &self, opctx: &OpContext, @@ -117,13 +132,8 @@ impl DataStore { id: SitrepUuid, conn: &async_bb8_diesel::Connection, ) -> Result { - let metadata = self - .fm_sitrep_metadata_read_on_conn(id, &conn) - .await? - .ok_or_else(|| { - Error::non_resourcetype_not_found(format!("sitrep {id:?}")) - })? 
- .into(); + let metadata = + self.fm_sitrep_metadata_read_on_conn(id, &conn).await?.into(); // TODO(eliza): this is where we would read all the other sitrep data, // if there was any. diff --git a/nexus/src/app/background/tasks/fm_sitrep_load.rs b/nexus/src/app/background/tasks/fm_sitrep_load.rs index 09c7a19973..fabef9ce8f 100644 --- a/nexus/src/app/background/tasks/fm_sitrep_load.rs +++ b/nexus/src/app/background/tasks/fm_sitrep_load.rs @@ -81,7 +81,7 @@ impl SitrepLoader { let time_loaded = Utc::now(); let current_version: SitrepVersion = match self .datastore - .fm_get_current_sitrep_version(opctx) + .fm_current_sitrep_version(opctx) .await { Ok(Some(version)) => version.into(), diff --git a/nexus/types/src/fm.rs b/nexus/types/src/fm.rs index c96712c273..466a277edc 100644 --- a/nexus/types/src/fm.rs +++ b/nexus/types/src/fm.rs @@ -3,14 +3,34 @@ // file, You can obtain one at https://mozilla.org/MPL/2.0/. //! Fault management types. +//! +//! Of particular importance is the [`Sitrep`], which is the top-level data +//! structure containing fault management state. use chrono::{DateTime, Utc}; use omicron_uuid_kinds::{CollectionUuid, OmicronZoneUuid, SitrepUuid}; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; +/// A fault management situation report, or _sitrep_. +/// +/// The sitrep is a data structure that represents a snapshot of the state of +/// the system as understood by the control plane's fault management subsystem. +/// At any point in time, a single sitrep is considered the "current" sitrep. +/// Each sitrep records a _parent sitrep ID_, which indicates the sitrep that +/// was current at the time that the sitrep was created. +/// A sitrep may only be made current if its parent is the current sitrep. +/// This ensures that there is a sequentially consistent history of sitreps. +/// The fault management subsystem only considers data from the current sitrep +/// when making decisions and diagnoses. 
+/// +/// The sitrep, how it is represented in the database, and how the fault +/// management subsystem creates and interacts with sitreps, are described in +/// detail in [RFD 603](https://rfd.shared.oxide.computer/rfd/0603). #[derive(Clone, Debug, Eq, PartialEq, JsonSchema, Deserialize, Serialize)] pub struct Sitrep { + /// Metadata describing this sitrep, when it was created, its parent sitrep + /// ID, and which Nexus produced it. pub metadata: SitrepMetadata, // TODO(eliza): draw the rest of the sitrep } @@ -25,19 +45,52 @@ impl Sitrep { } } -#[derive(Clone, Debug, Eq, PartialEq, JsonSchema, Deserialize, Serialize)] -pub struct SitrepVersion { - pub id: SitrepUuid, - pub version: u32, - pub time_made_current: DateTime, -} - +/// Metadata describing a sitrep. +/// +/// This corresponds to the records stored in the `fm_sitrep` database table. #[derive(Clone, Debug, Eq, PartialEq, JsonSchema, Deserialize, Serialize)] pub struct SitrepMetadata { + /// The ID of this sitrep. pub id: SitrepUuid, + + /// The ID of the parent sitrep. + /// + /// A sitrep's _parent_ is the sitrep that was current when the planning + /// phase that produced that sitrep ran. The parent sitrep is a planning + /// input that produced this sitrep. + /// + /// The parent sitrep ID is optional, because this sitrep _may_ be the first + /// sitrep ever generated by the system. However, once a current sitrep has + /// been set, no subsequent sitrep should be created without a parent. pub parent_sitrep_id: Option, + + /// The ID of the inventory collection that was used as planning input to + /// this sitrep. + /// + /// When generating a new sitrep, the fault manager should ensure that the + /// inventory collection it uses as input is at least as new as the parent + /// sitrep's inventory collection. pub inv_collection_id: CollectionUuid, + + /// The Omicron zone UUID of the Nexus that generated this sitrep. + /// + /// This is intended for debugging purposes. 
pub creator_id: OmicronZoneUuid, + + /// A human-readable (but mechanically generated) string describing the + /// reason(s) this sitrep was created. + /// + /// This is intended for debugging purposes. pub comment: String, + + /// The time at which this sitrep was created. pub time_created: DateTime, } + +/// An entry in the sitrep version history. +#[derive(Clone, Debug, Eq, PartialEq, JsonSchema, Deserialize, Serialize)] +pub struct SitrepVersion { + pub id: SitrepUuid, + pub version: u32, + pub time_made_current: DateTime, +} From 218056e6fc8a9da77b12532bb308f592f75115f7 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 30 Oct 2025 13:03:37 -0700 Subject: [PATCH 15/23] clippy tidiness --- dev-tools/omdb/src/bin/omdb/db/sitrep.rs | 2 +- nexus/src/app/background/tasks/fm_sitrep_load.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/dev-tools/omdb/src/bin/omdb/db/sitrep.rs b/dev-tools/omdb/src/bin/omdb/db/sitrep.rs index 06a0cf8a7e..c65865e385 100644 --- a/dev-tools/omdb/src/bin/omdb/db/sitrep.rs +++ b/dev-tools/omdb/src/bin/omdb/db/sitrep.rs @@ -182,7 +182,7 @@ pub(super) async fn cmd_db_sitrep_history( v: version.version.into(), id: id.into_untyped_uuid(), created_at: time_created, - comment: comment, + comment, } }); diff --git a/nexus/src/app/background/tasks/fm_sitrep_load.rs b/nexus/src/app/background/tasks/fm_sitrep_load.rs index fabef9ce8f..10f87dea9f 100644 --- a/nexus/src/app/background/tasks/fm_sitrep_load.rs +++ b/nexus/src/app/background/tasks/fm_sitrep_load.rs @@ -84,7 +84,7 @@ impl SitrepLoader { .fm_current_sitrep_version(opctx) .await { - Ok(Some(version)) => version.into(), + Ok(Some(version)) => version, Ok(None) => match old { Some(SitrepVersion { version, id, .. 
}) => { // We should never go from "some sitrep" to "no sitrep"; From e9d87b24997081ccd6b539c7bddc7f028be47a10 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 30 Oct 2025 13:07:13 -0700 Subject: [PATCH 16/23] fixup docs --- nexus/db-queries/src/db/datastore/fm.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/fm.rs b/nexus/db-queries/src/db/datastore/fm.rs index 48df23bebe..47600c9bdc 100644 --- a/nexus/db-queries/src/db/datastore/fm.rs +++ b/nexus/db-queries/src/db/datastore/fm.rs @@ -98,7 +98,7 @@ impl DataStore { /// /// This is equivalent to reading the current sitrep version using /// [`DataStore::fm_current_sitrep_version`], and then reading the sitrep - /// itself using [`DataStore::fm_sitrep_read_on_conn`]. + /// itself using [`DataStore::fm_sitrep_read`]. /// /// If this method returns `None`, there is no current sitrep, meaning that /// no sitreps have been created. @@ -196,7 +196,7 @@ impl DataStore { } } -/// Errors returned by [`Datastore::fm_sitrep_insert`]. +/// Errors returned by [`DataStore::fm_sitrep_insert`]. 
#[derive(Debug, thiserror::Error)] pub enum InsertSitrepError { #[error(transparent)] From 410889976b40ab6bf979f820dc0004e714ddffec Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 30 Oct 2025 13:08:07 -0700 Subject: [PATCH 17/23] whoops make module public so the error is visible --- nexus/db-queries/src/db/datastore/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nexus/db-queries/src/db/datastore/mod.rs b/nexus/db-queries/src/db/datastore/mod.rs index fd434eff4e..24a58fb39a 100644 --- a/nexus/db-queries/src/db/datastore/mod.rs +++ b/nexus/db-queries/src/db/datastore/mod.rs @@ -71,7 +71,7 @@ mod disk; mod dns; mod ereport; mod external_ip; -mod fm; +pub mod fm; mod identity_provider; mod image; pub mod instance; From 570dda5d17af43ea4f83c4da48b797fb8fd7c417 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 31 Oct 2025 11:24:07 -0700 Subject: [PATCH 18/23] you gotta remember to update the config tests --- nexus-config/src/nexus_config.rs | 2 ++ nexus/examples/config-second.toml | 5 +++++ nexus/examples/config.toml | 5 +++++ smf/nexus/multi-sled/config-partial.toml | 3 ++- smf/nexus/single-sled/config-partial.toml | 3 ++- 5 files changed, 16 insertions(+), 2 deletions(-) diff --git a/nexus-config/src/nexus_config.rs b/nexus-config/src/nexus_config.rs index 44c2ccc8a9..58ef5b6ce7 100644 --- a/nexus-config/src/nexus_config.rs +++ b/nexus-config/src/nexus_config.rs @@ -1189,6 +1189,7 @@ mod test { webhook_deliverator.first_retry_backoff_secs = 45 webhook_deliverator.second_retry_backoff_secs = 46 sp_ereport_ingester.period_secs = 47 + fm.sitrep_load_period_secs = 48 [default_region_allocation_strategy] type = "random" seed = 0 @@ -1534,6 +1535,7 @@ mod test { alert_dispatcher.period_secs = 42 webhook_deliverator.period_secs = 43 sp_ereport_ingester.period_secs = 44 + fm.sitrep_load_period_secs = 45 [default_region_allocation_strategy] type = "random" diff --git a/nexus/examples/config-second.toml b/nexus/examples/config-second.toml 
index 2de7bb187c..c70b1c8eaf 100644 --- a/nexus/examples/config-second.toml +++ b/nexus/examples/config-second.toml @@ -170,6 +170,11 @@ alert_dispatcher.period_secs = 60 webhook_deliverator.period_secs = 60 read_only_region_replacement_start.period_secs = 30 sp_ereport_ingester.period_secs = 30 +# How frequently to check for a new fault management sitrep (made by any +# Nexus). +# This is cheap, so we should check frequently. +fm.sitrep_load_period_secs = 15 + [default_region_allocation_strategy] # allocate region on 3 random distinct zpools, on 3 random distinct sleds. diff --git a/nexus/examples/config.toml b/nexus/examples/config.toml index e95e40496d..f2aa7f6d84 100644 --- a/nexus/examples/config.toml +++ b/nexus/examples/config.toml @@ -154,6 +154,11 @@ alert_dispatcher.period_secs = 60 webhook_deliverator.period_secs = 60 read_only_region_replacement_start.period_secs = 30 sp_ereport_ingester.period_secs = 30 +# How frequently to check for a new fault management sitrep (made by any +# Nexus). +# This is cheap, so we should check frequently. +fm.sitrep_load_period_secs = 15 + [default_region_allocation_strategy] # allocate region on 3 random distinct zpools, on 3 random distinct sleds. diff --git a/smf/nexus/multi-sled/config-partial.toml b/smf/nexus/multi-sled/config-partial.toml index 3107fbd39b..859539452a 100644 --- a/smf/nexus/multi-sled/config-partial.toml +++ b/smf/nexus/multi-sled/config-partial.toml @@ -94,7 +94,8 @@ read_only_region_replacement_start.period_secs = 30 alert_dispatcher.period_secs = 60 webhook_deliverator.period_secs = 60 sp_ereport_ingester.period_secs = 30 -# How frequently to check for a new fault management sitrep (made by any Nexus). +# How frequently to check for a new fault management sitrep (made by any +# Nexus). # This is cheap, so we should check frequently. 
fm.sitrep_load_period_secs = 15 diff --git a/smf/nexus/single-sled/config-partial.toml b/smf/nexus/single-sled/config-partial.toml index e7e8b98455..9aa67515d7 100644 --- a/smf/nexus/single-sled/config-partial.toml +++ b/smf/nexus/single-sled/config-partial.toml @@ -94,7 +94,8 @@ read_only_region_replacement_start.period_secs = 30 alert_dispatcher.period_secs = 60 webhook_deliverator.period_secs = 60 sp_ereport_ingester.period_secs = 30 -# How frequently to check for a new fault management sitrep (made by any Nexus). +# How frequently to check for a new fault management sitrep (made by any +# Nexus). # This is cheap, so we should check frequently. fm.sitrep_load_period_secs = 15 From aae8d15dc646ddf82b8f47f6d3747f3a5cff0e7c Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Sat, 1 Nov 2025 11:32:01 -0700 Subject: [PATCH 19/23] reorder fields --- nexus/db-model/src/fm_sitrep.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nexus/db-model/src/fm_sitrep.rs b/nexus/db-model/src/fm_sitrep.rs index 287685fb44..d9d7ac3c2d 100644 --- a/nexus/db-model/src/fm_sitrep.rs +++ b/nexus/db-model/src/fm_sitrep.rs @@ -25,9 +25,9 @@ pub struct SitrepMetadata { pub id: DbTypedUuid, pub parent_sitrep_id: Option>, pub inv_collection_id: DbTypedUuid, + pub time_created: DateTime, pub creator_id: DbTypedUuid, pub comment: String, - pub time_created: DateTime, } impl From for nexus_types::fm::SitrepMetadata { From 8522c68e26112e237b60a94827f793b5c4d55139 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Sat, 1 Nov 2025 12:35:48 -0700 Subject: [PATCH 20/23] add sitrep loader test --- .../app/background/tasks/fm_sitrep_load.rs | 141 +++++++++++++++++- 1 file changed, 140 insertions(+), 1 deletion(-) diff --git a/nexus/src/app/background/tasks/fm_sitrep_load.rs b/nexus/src/app/background/tasks/fm_sitrep_load.rs index 10f87dea9f..3f1545a0be 100644 --- a/nexus/src/app/background/tasks/fm_sitrep_load.rs +++ b/nexus/src/app/background/tasks/fm_sitrep_load.rs @@ -175,5 
+175,144 @@ impl SitrepLoader { #[cfg(test)] mod test { - // TODO + use super::*; + use crate::app::background::BackgroundTask; + use nexus_db_queries::db::pub_test_utils::TestDatabase; + use nexus_types::fm::SitrepMetadata; + use omicron_test_utils::dev; + use omicron_uuid_kinds::CollectionUuid; + use omicron_uuid_kinds::OmicronZoneUuid; + use omicron_uuid_kinds::SitrepUuid; + + #[tokio::test] + async fn test_load_sitreps() { + let logctx = dev::test_setup_log("test_inventory_loader"); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + + let (tx, mut sitrep_rx) = watch::channel(None); + let mut task = SitrepLoader::new(datastore.clone(), tx); + + // Initially, there should be no sitrep. + let status = task.activate(&opctx).await; + assert_eq!(*sitrep_rx.borrow_and_update(), None); + let status = serde_json::from_value::(status).unwrap(); + assert_eq!(status, Status::NoSitrep); + + // Now, create an initial sitrep. + let sitrep1_id = SitrepUuid::new_v4(); + let sitrep1 = Sitrep { + metadata: SitrepMetadata { + id: sitrep1_id, + inv_collection_id: CollectionUuid::new_v4(), + parent_sitrep_id: None, + creator_id: OmicronZoneUuid::new_v4(), + comment: "test sitrep 1".to_string(), + time_created: Utc::now(), + }, + }; + datastore + .fm_sitrep_insert(&opctx, &sitrep1) + .await + .expect("sitrep should be inserted successfully"); + + // It should be loaded. + let status = task.activate(&opctx).await; + assert_eq!( + true, + sitrep_rx.has_changed().unwrap(), + "sitrep watch should have changed when a sitrep was loaded" + ); + let snapshot = sitrep_rx + .borrow_and_update() + .clone() + .expect("the new sitrep should have been loaded"); + let (ref loaded_version1, ref loaded_sitrep) = *snapshot; + // N.B.: we just compare the IDs here as comparing the whole struct may + // not be equal, since the `time_created` field may have been rounded in + // CRDB. Which is a shame, but whatever. 
:/ + assert_eq!(loaded_sitrep.metadata.id, sitrep1.metadata.id); + dbg!(loaded_version1); + let status = serde_json::from_value::(status).unwrap(); + match status { + Status::Loaded { version, .. } => { + assert_eq!(&version, loaded_version1); + } + status => panic!("expected Status::Loaded, got {status:?}",), + }; + + // A subsequent activation should see the same sitrep. + let status = task.activate(&opctx).await; + assert_eq!( + false, + sitrep_rx.has_changed().unwrap(), + "sitrep watch should not change if the same sitrep was loaded" + ); + let snapshot = sitrep_rx + .borrow_and_update() + .clone() + .expect("the same should have been loaded"); + let (ref loaded_version2, ref loaded_sitrep) = *snapshot; + assert_eq!(loaded_sitrep.metadata.id, sitrep1.metadata.id); + dbg!(loaded_version1, loaded_version2); + let status = serde_json::from_value::(status).unwrap(); + match status { + Status::Loaded { version, .. } => { + assert_eq!(&version, loaded_version2); + } + status => panic!("expected Status::Loaded, got {status:?}",), + }; + + // Now, create a new sitrep. + let sitrep2_id = SitrepUuid::new_v4(); + let sitrep2 = Sitrep { + metadata: SitrepMetadata { + id: sitrep2_id, + inv_collection_id: CollectionUuid::new_v4(), + parent_sitrep_id: Some(sitrep1_id), + creator_id: OmicronZoneUuid::new_v4(), + comment: "test sitrep 2".to_string(), + time_created: Utc::now(), + }, + }; + datastore + .fm_sitrep_insert(&opctx, &sitrep2) + .await + .expect("sitrep2 should be inserted successfully"); + + // It should be loaded. 
+ let status = task.activate(&opctx).await; + assert_eq!( + true, + sitrep_rx.has_changed().unwrap(), + "loading a new sitrep should update the watch" + ); + let snapshot = sitrep_rx + .borrow_and_update() + .clone() + .expect("the new sitrep should have been loaded"); + let (ref loaded_version3, ref loaded_sitrep) = *snapshot; + assert_eq!(loaded_sitrep.metadata.id, sitrep2.metadata.id); + dbg!(loaded_version3); + assert_ne!(loaded_version3, loaded_version2); + let status = serde_json::from_value::(status).unwrap(); + match status { + Status::Loaded { version, .. } => { + assert_eq!(&version, loaded_version3); + } + status => panic!("expected Status::Loaded, got {status:?}",), + }; + + // XXX(eliza): It would be nice to also be able to test that an orphaned + // sitrep (which has not been linked into the sitrep history chain) is + // *not* loaded even if it exists. However, that would require + // `nexus-db-queries` to expose separate interfaces for creating a + // sitrep and inserting it into the history, which I have intentionally + // chosen *not* to do to make it harder to do it by mistake. 
+ // So, ¯\_(ツ)_/¯ + + // Cleanup + db.terminate().await; + logctx.cleanup_successful(); + } } From dfffd6913dc38b87617b6f125524daeb689ab383 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 4 Nov 2025 10:24:11 -0800 Subject: [PATCH 21/23] check that the current version hasn't gone down --- nexus/src/app/background/tasks/fm_sitrep_load.rs | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/nexus/src/app/background/tasks/fm_sitrep_load.rs b/nexus/src/app/background/tasks/fm_sitrep_load.rs index 3f1545a0be..d7864a6bcc 100644 --- a/nexus/src/app/background/tasks/fm_sitrep_load.rs +++ b/nexus/src/app/background/tasks/fm_sitrep_load.rs @@ -124,6 +124,20 @@ impl SitrepLoader { debug!(log, "current sitrep has not changed"); return Status::Loaded { version, time_loaded }; } + Some(old) if current_version.version < old.version => { + warn!( + log, + "current sitrep version v{} is less than the previously \ + loaded version v{}; ignoring it", + current_version.version, + old.version, + ); + return Status::Error(format!( + "current sitrep version v{} is less than the previously \ + loaded version v{}; ignoring it", + current_version.version, old.version, + )); + } Some(SitrepVersion { version, id, .. 
}) if version == current_version.version && id != current_version.id => From 649a3273588a36770da3ebee6cb3041fa90e4ff8 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 4 Nov 2025 10:41:38 -0800 Subject: [PATCH 22/23] make OMDB's view of the current sitrep more consistent --- dev-tools/omdb/src/bin/omdb/db/sitrep.rs | 32 ++++++++++++------------ 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/dev-tools/omdb/src/bin/omdb/db/sitrep.rs b/dev-tools/omdb/src/bin/omdb/db/sitrep.rs index c65865e385..587df1aba9 100644 --- a/dev-tools/omdb/src/bin/omdb/db/sitrep.rs +++ b/dev-tools/omdb/src/bin/omdb/db/sitrep.rs @@ -210,8 +210,13 @@ async fn cmd_db_sitrep_show( format!("looking up fault management sitrep {id:?}") } }; - let conn = datastore.pool_connection_for_tests().await?; + let current_version = datastore + .fm_current_sitrep_version(&opctx) + .await + .context("failed to look up the current sitrep version")?; + + let conn = datastore.pool_connection_for_tests().await?; let (maybe_version, sitrep) = match sitrep { SitrepIdOrCurrent::Id(id) => { let sitrep = @@ -227,11 +232,14 @@ async fn cmd_db_sitrep_show( (version, sitrep) } SitrepIdOrCurrent::Current => { - let Some((version, sitrep)) = - datastore.fm_sitrep_read_current(opctx).await? - else { + let Some(version) = current_version.clone() else { anyhow::bail!("no current sitrep exists at this time"); }; + + let sitrep = datastore + .fm_sitrep_read(opctx, version.id) + .await + .with_context(ctx)?; (Some(version), sitrep) } }; @@ -289,9 +297,7 @@ async fn cmd_db_sitrep_show( " {STATUS:>WIDTH$}: not committed to the sitrep history" ), Some(fm::SitrepVersion { version, time_made_current, .. 
}) => { - let current_version = - datastore.fm_current_sitrep_version(&opctx).await; - if matches!(current_version, Ok(Some(ref v)) if v.id == id) { + if matches!(current_version, Some(ref v) if v.id == id) { println!(" {STATUS:>WIDTH$}: this is the current sitrep!",); } else { println!(" {STATUS:>WIDTH$}: in the sitrep history"); @@ -299,26 +305,20 @@ async fn cmd_db_sitrep_show( println!(" {VERSION:>WIDTH$}: v{version}"); println!(" {MADE_CURRENT_AT:>WIDTH$}: {time_made_current}"); match current_version { - Ok(Some(v)) if v.id == id => {} - Ok(Some(fm::SitrepVersion { version, id, .. })) => { + Some(v) if v.id == id => {} + Some(fm::SitrepVersion { version, id, .. }) => { println!( "(i) note: the current sitrep is {id:?} \ (at v{version})", ); } - Ok(None) => { + None => { eprintln!( "/!\\ WEIRD: this sitrep is in the sitrep history, \ but there is no current sitrep. this should not \ happen!" ); } - Err(err) => { - eprintln!( - "/!\\ failed to determine the current sitrep \ - version: {err}" - ); - } }; } } From cb1813f4035516b7c20bf5133e557e596f4a152c Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 4 Nov 2025 10:57:02 -0800 Subject: [PATCH 23/23] rename model module --- nexus/db-model/src/{fm_sitrep.rs => fm.rs} | 0 nexus/db-model/src/lib.rs | 4 ++-- 2 files changed, 2 insertions(+), 2 deletions(-) rename nexus/db-model/src/{fm_sitrep.rs => fm.rs} (100%) diff --git a/nexus/db-model/src/fm_sitrep.rs b/nexus/db-model/src/fm.rs similarity index 100% rename from nexus/db-model/src/fm_sitrep.rs rename to nexus/db-model/src/fm.rs diff --git a/nexus/db-model/src/lib.rs b/nexus/db-model/src/lib.rs index 31803623c8..bd963619fb 100644 --- a/nexus/db-model/src/lib.rs +++ b/nexus/db-model/src/lib.rs @@ -89,7 +89,7 @@ mod webhook_rx; // for join-based marker trait generation. 
mod deployment; mod ereport; -mod fm_sitrep; +pub mod fm; pub mod nat_entry; mod omicron_zone_config; mod quota; @@ -182,7 +182,7 @@ pub use dns::*; pub use downstairs::*; pub use ereport::*; pub use external_ip::*; -pub use fm_sitrep::*; +pub use fm::*; pub use generation::*; pub use identity_provider::*; pub use image::*;