diff --git a/dev-tools/omdb/src/bin/omdb/db.rs b/dev-tools/omdb/src/bin/omdb/db.rs index fe9d81cfce..782a497c51 100644 --- a/dev-tools/omdb/src/bin/omdb/db.rs +++ b/dev-tools/omdb/src/bin/omdb/db.rs @@ -180,6 +180,7 @@ mod blueprints; mod db_metadata; mod ereport; mod saga; +mod sitrep; mod user_data_export; const NO_ACTIVE_PROPOLIS_MSG: &str = ""; @@ -378,6 +379,13 @@ enum DbCommands { RegionSnapshotReplacement(RegionSnapshotReplacementArgs), /// Commands for querying and interacting with sagas Saga(saga::SagaArgs), + /// Commands for querying and interacting with fault management situation + /// reports. + Sitrep(sitrep::SitrepArgs), + /// Show the current history of fault management situation reports. + /// + /// This is an alias for `omdb db sitrep history`. + Sitreps(sitrep::SitrepHistoryArgs), /// Print information about sleds Sleds(SledsArgs), /// Print information about customer instances. @@ -1297,6 +1305,12 @@ impl DbArgs { DbCommands::Saga(args) => { args.exec(&omdb, &opctx, &datastore).await } + DbCommands::Sitrep(args) => { + sitrep::cmd_db_sitrep(&opctx, &datastore, &fetch_opts, args).await + } + DbCommands::Sitreps(args) => { + sitrep::cmd_db_sitrep_history(&datastore, &fetch_opts, args).await + } DbCommands::Sleds(args) => { cmd_db_sleds(&opctx, &datastore, &fetch_opts, args).await } diff --git a/dev-tools/omdb/src/bin/omdb/db/sitrep.rs b/dev-tools/omdb/src/bin/omdb/db/sitrep.rs new file mode 100644 index 0000000000..587df1aba9 --- /dev/null +++ b/dev-tools/omdb/src/bin/omdb/db/sitrep.rs @@ -0,0 +1,355 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! `omdb db sitrep` subcommands + +use crate::db::DbFetchOptions; +use crate::db::check_limit; +use crate::helpers::const_max_len; +use crate::helpers::datetime_rfc3339_concise; +use anyhow::Context; +use async_bb8_diesel::AsyncRunQueryDsl; +use chrono::{DateTime, Utc}; +use clap::Args; +use clap::Subcommand; +use diesel::prelude::*; +use nexus_db_queries::context::OpContext; +use nexus_db_queries::db::DataStore; +use nexus_db_queries::db::model; +use nexus_db_queries::db::pagination::paginated; +use nexus_types::fm; +use omicron_common::api::external::DataPageParams; +use omicron_common::api::external::PaginationOrder; +use omicron_uuid_kinds::GenericUuid; +use omicron_uuid_kinds::SitrepUuid; +use tabled::Tabled; +use uuid::Uuid; + +use nexus_db_schema::schema::fm_sitrep::dsl as sitrep_dsl; +use nexus_db_schema::schema::fm_sitrep_history::dsl as history_dsl; +use nexus_db_schema::schema::inv_collection::dsl as inv_collection_dsl; + +#[derive(Debug, Args, Clone)] +pub(super) struct SitrepArgs { + #[command(subcommand)] + command: Commands, +} + +#[derive(Debug, Subcommand, Clone)] +enum Commands { + /// List the current situation report history. + History(SitrepHistoryArgs), + + /// Show the current situation report. + /// + /// This is an alias for `omdb db sitrep info current`. + Current(ShowArgs), + + /// Show details on a situation report. + #[clap(alias = "show")] + Info { + /// The UUID of the sitrep to show, or "current" to show the current + /// sitrep. + sitrep: SitrepIdOrCurrent, + + #[clap(flatten)] + args: ShowArgs, + }, +} + +#[derive(Debug, Args, Clone)] +pub(super) struct SitrepHistoryArgs { + /// If present, start at this sitrep version. + /// + /// If this is not set, the list will start with the current sitrep. 
This
+    /// option is useful when the number of sitreps exceeds the database fetch
+    /// limit.
+    #[arg(long, short, alias = "starting_at")]
+    from: Option<u32>,
+}
+
+#[derive(Debug, Args, Clone)]
+struct ShowArgs {}
+
+#[derive(Debug, Clone, Copy)]
+enum SitrepIdOrCurrent {
+    Current,
+    Id(SitrepUuid),
+}
+
+impl std::str::FromStr for SitrepIdOrCurrent {
+    type Err = omicron_uuid_kinds::ParseError;
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        let s = s.trim();
+        if s.eq_ignore_ascii_case("current") {
+            Ok(Self::Current)
+        } else {
+            let id = s.parse()?;
+            Ok(Self::Id(id))
+        }
+    }
+}
+
+pub(super) async fn cmd_db_sitrep(
+    opctx: &OpContext,
+    datastore: &DataStore,
+    fetch_opts: &DbFetchOptions,
+    args: &SitrepArgs,
+) -> anyhow::Result<()> {
+    match args.command {
+        Commands::History(ref args) => {
+            cmd_db_sitrep_history(datastore, fetch_opts, args).await
+        }
+        Commands::Info { sitrep, ref args } => {
+            cmd_db_sitrep_show(opctx, datastore, fetch_opts, args, sitrep).await
+        }
+        Commands::Current(ref args) => {
+            cmd_db_sitrep_show(
+                opctx,
+                datastore,
+                fetch_opts,
+                args,
+                SitrepIdOrCurrent::Current,
+            )
+            .await
+        }
+    }
+}
+
+pub(super) async fn cmd_db_sitrep_history(
+    datastore: &DataStore,
+    fetch_opts: &DbFetchOptions,
+    args: &SitrepHistoryArgs,
+) -> anyhow::Result<()> {
+    let ctx = || {
+        if let Some(from) = args.from {
+            format!(
+                "listing fault management sitrep history (starting at {from})"
+            )
+        } else {
+            "listing fault management sitrep history".to_string()
+        }
+    };
+
+    #[derive(Tabled)]
+    #[tabled(rename_all = "SCREAMING_SNAKE_CASE")]
+    struct SitrepRow {
+        v: u32,
+        id: Uuid,
+        #[tabled(display_with = "datetime_rfc3339_concise")]
+        created_at: DateTime<Utc>,
+        comment: String,
+    }
+
+    let conn = datastore.pool_connection_for_tests().await?;
+    let marker = args.from.map(model::SqlU32::new);
+    let pagparams = DataPageParams {
+        marker: marker.as_ref(),
+        direction: PaginationOrder::Descending,
+        limit: fetch_opts.fetch_limit,
+    };
+    let sitreps: Vec<(model::SitrepVersion, model::SitrepMetadata)> =
+        paginated(
+            history_dsl::fm_sitrep_history,
+            history_dsl::version,
+            &pagparams,
+        )
+        .inner_join(
+            sitrep_dsl::fm_sitrep.on(history_dsl::sitrep_id.eq(sitrep_dsl::id)),
+        )
+        .select((
+            model::SitrepVersion::as_select(),
+            model::SitrepMetadata::as_select(),
+        ))
+        .load_async(&*conn)
+        .await
+        .with_context(ctx)?;
+
+    check_limit(&sitreps, fetch_opts.fetch_limit, ctx);
+
+    let rows = sitreps.into_iter().map(|(version, metadata)| {
+        let model::SitrepMetadata {
+            id,
+            time_created,
+            comment,
+            creator_id: _,
+            parent_sitrep_id: _,
+            inv_collection_id: _,
+        } = metadata;
+        SitrepRow {
+            v: version.version.into(),
+            id: id.into_untyped_uuid(),
+            created_at: time_created,
+            comment,
+        }
+    });
+
+    let table = tabled::Table::new(rows)
+        .with(tabled::settings::Style::empty())
+        .with(tabled::settings::Padding::new(0, 1, 0, 0))
+        .to_string();
+    println!("{table}");
+
+    Ok(())
+}
+
+async fn cmd_db_sitrep_show(
+    opctx: &OpContext,
+    datastore: &DataStore,
+    _fetch_opts: &DbFetchOptions,
+    _args: &ShowArgs,
+    sitrep: SitrepIdOrCurrent,
+) -> anyhow::Result<()> {
+    let ctx = || match sitrep {
+        SitrepIdOrCurrent::Current => {
+            "looking up the current fault management sitrep".to_string()
+        }
+        SitrepIdOrCurrent::Id(id) => {
+            format!("looking up fault management sitrep {id:?}")
+        }
+    };
+
+    let current_version = datastore
+        .fm_current_sitrep_version(&opctx)
+        .await
+        .context("failed to look up the current sitrep version")?;
+
+    let conn = datastore.pool_connection_for_tests().await?;
let (maybe_version, sitrep) = match sitrep { + SitrepIdOrCurrent::Id(id) => { + let sitrep = + datastore.fm_sitrep_read(opctx, id).await.with_context(ctx)?; + let version = history_dsl::fm_sitrep_history + .filter(history_dsl::sitrep_id.eq(id.into_untyped_uuid())) + .select(model::SitrepVersion::as_select()) + .first_async(&*conn) + .await + .optional() + .with_context(ctx)? + .map(Into::into); + (version, sitrep) + } + SitrepIdOrCurrent::Current => { + let Some(version) = current_version.clone() else { + anyhow::bail!("no current sitrep exists at this time"); + }; + + let sitrep = datastore + .fm_sitrep_read(opctx, version.id) + .await + .with_context(ctx)?; + (Some(version), sitrep) + } + }; + + let fm::Sitrep { metadata } = sitrep; + let fm::SitrepMetadata { + id, + creator_id, + time_created, + parent_sitrep_id, + inv_collection_id, + comment, + } = metadata; + + const ID: &'static str = "ID"; + const PARENT_SITREP_ID: &'static str = "parent sitrep ID"; + const CREATED_BY: &'static str = "created by"; + const CREATED_AT: &'static str = "created at"; + const COMMENT: &'static str = "comment"; + const STATUS: &'static str = "status"; + const VERSION: &'static str = " version"; + const MADE_CURRENT_AT: &'static str = " made current at"; + const INV_COLLECTION_ID: &'static str = "inventory collection ID"; + const INV_STARTED_AT: &'static str = " started at"; + const INV_FINISHED_AT: &'static str = " finished at"; + + const WIDTH: usize = const_max_len(&[ + ID, + PARENT_SITREP_ID, + CREATED_AT, + CREATED_BY, + COMMENT, + STATUS, + VERSION, + MADE_CURRENT_AT, + INV_COLLECTION_ID, + INV_STARTED_AT, + INV_FINISHED_AT, + ]); + + println!("\n{:=<80}", "== FAULT MANAGEMENT SITUATION REPORT "); + println!(" {ID:>WIDTH$}: {id:?}"); + println!(" {PARENT_SITREP_ID:>WIDTH$}: {parent_sitrep_id:?}"); + println!(" {CREATED_BY:>WIDTH$}: {creator_id}"); + println!(" {CREATED_AT:>WIDTH$}: {time_created}"); + if comment.is_empty() { + println!(" {COMMENT:>WIDTH$}: N/A\n"); + } else { + println!(" {COMMENT:>WIDTH$}:"); + println!("{}\n", textwrap::indent(&comment, " ")); + } + + match maybe_version { + None => println!( + " {STATUS:>WIDTH$}: not committed to the sitrep history" + ), + Some(fm::SitrepVersion { version, time_made_current, .. }) => { + if matches!(current_version, Some(ref v) if v.id == id) { + println!(" {STATUS:>WIDTH$}: this is the current sitrep!",); + } else { + println!(" {STATUS:>WIDTH$}: in the sitrep history"); + } + println!(" {VERSION:>WIDTH$}: v{version}"); + println!(" {MADE_CURRENT_AT:>WIDTH$}: {time_made_current}"); + match current_version { + Some(v) if v.id == id => {} + Some(fm::SitrepVersion { version, id, .. }) => { + println!( + "(i) note: the current sitrep is {id:?} \ + (at v{version})", + ); + } + None => { + eprintln!( + "/!\\ WEIRD: this sitrep is in the sitrep history, \ + but there is no current sitrep. this should not \ + happen!" + ); + } + }; + } + } + + println!("\n{:-<80}", "== DIAGNOSIS INPUTS "); + println!(" {INV_COLLECTION_ID:>WIDTH$}: {inv_collection_id:?}"); + let inv_collection = inv_collection_dsl::inv_collection + .filter( + inv_collection_dsl::id.eq(inv_collection_id.into_untyped_uuid()), + ) + .select(model::InvCollection::as_select()) + .first_async(&*conn) + .await + .optional(); + match inv_collection { + Err(err) => { + eprintln!( + "/!\\ failed to fetch inventory collection details: {err}" + ); + } + Ok(Some(model::InvCollection { time_started, time_done, .. 
})) => { + println!(" {INV_STARTED_AT:>WIDTH$}: {time_started}"); + println!(" {INV_FINISHED_AT:>WIDTH$}: {time_done}"); + } + Ok(None) => { + println!( + " note: this collection no longer exists (perhaps it has \ + been pruned?)" + ) + } + } + + Ok(()) +} diff --git a/dev-tools/omdb/src/bin/omdb/nexus.rs b/dev-tools/omdb/src/bin/omdb/nexus.rs index 23b0cf0701..b46c0b7429 100644 --- a/dev-tools/omdb/src/bin/omdb/nexus.rs +++ b/dev-tools/omdb/src/bin/omdb/nexus.rs @@ -50,6 +50,7 @@ use nexus_types::deployment::ClickhouseMode; use nexus_types::deployment::ClickhousePolicy; use nexus_types::deployment::OximeterReadMode; use nexus_types::deployment::OximeterReadPolicy; +use nexus_types::fm; use nexus_types::internal_api::background::AbandonedVmmReaperStatus; use nexus_types::internal_api::background::BlueprintPlannerStatus; use nexus_types::internal_api::background::BlueprintRendezvousStatus; @@ -65,6 +66,7 @@ use nexus_types::internal_api::background::RegionSnapshotReplacementFinishStatus use nexus_types::internal_api::background::RegionSnapshotReplacementGarbageCollectStatus; use nexus_types::internal_api::background::RegionSnapshotReplacementStartStatus; use nexus_types::internal_api::background::RegionSnapshotReplacementStepStatus; +use nexus_types::internal_api::background::SitrepLoadStatus; use nexus_types::internal_api::background::SupportBundleCleanupReport; use nexus_types::internal_api::background::SupportBundleCollectionReport; use nexus_types::internal_api::background::SupportBundleEreportStatus; @@ -1234,6 +1236,9 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) { "webhook_deliverator" => { print_task_webhook_deliverator(details); } + "fm_sitrep_loader" => { + print_task_fm_sitrep_loader(details); + } _ => { println!( "warning: unknown background task: {:?} \ @@ -3098,6 +3103,33 @@ mod ereporter_status_fields { pub const NUM_WIDTH: usize = 4; } +fn print_task_fm_sitrep_loader(details: &serde_json::Value) { + match serde_json::from_value::(details.clone()) { + Err(error) => eprintln!( + "warning: failed to interpret task details: {:?}: {:?}", + error, details + ), + Ok(SitrepLoadStatus::Error(error)) => { + println!(" task did not complete successfully: {error}"); + } + Ok(SitrepLoadStatus::NoSitrep) => { + println!(" no FM situation report available to load"); + } + Ok(SitrepLoadStatus::Loaded { version, time_loaded }) => { + println!( + " loaded latest FM situation report as of {}:", + humantime::format_rfc3339_millis(time_loaded.into()) + ); + let fm::SitrepVersion { id, version, time_made_current } = version; + println!(" sitrep {id:?} (v{version})"); + println!( + " made current at: {}", + humantime::format_rfc3339_millis(time_made_current.into()), + ); + } + }; +} + const ERRICON: &str = "/!\\"; fn warn_if_nonzero(n: usize) -> &'static str { diff --git a/dev-tools/omdb/tests/env.out b/dev-tools/omdb/tests/env.out index 5a67324313..576b628312 100644 --- a/dev-tools/omdb/tests/env.out +++ b/dev-tools/omdb/tests/env.out @@ -99,6 +99,10 @@ task: "external_endpoints" on each one +task: "fm_sitrep_loader" + loads the current fault management situation report from the database + + task: "instance_reincarnation" schedules start sagas for failed instances that can be automatically restarted @@ -315,6 +319,10 @@ task: "external_endpoints" on each one +task: "fm_sitrep_loader" + loads the current fault management situation report from the database + + task: "instance_reincarnation" schedules start sagas for failed instances that can be automatically 
restarted @@ -518,6 +526,10 @@ task: "external_endpoints" on each one +task: "fm_sitrep_loader" + loads the current fault management situation report from the database + + task: "instance_reincarnation" schedules start sagas for failed instances that can be automatically restarted diff --git a/dev-tools/omdb/tests/successes.out b/dev-tools/omdb/tests/successes.out index 62a1578d0f..9a0f173bca 100644 --- a/dev-tools/omdb/tests/successes.out +++ b/dev-tools/omdb/tests/successes.out @@ -334,6 +334,10 @@ task: "external_endpoints" on each one +task: "fm_sitrep_loader" + loads the current fault management situation report from the database + + task: "instance_reincarnation" schedules start sagas for failed instances that can be automatically restarted @@ -609,6 +613,12 @@ task: "external_endpoints" TLS certificates: 0 +task: "fm_sitrep_loader" + configured period: every s + last completed activation: , triggered by + started at (s ago) and ran for ms + no FM situation report available to load + task: "instance_reincarnation" configured period: every m last completed activation: , triggered by @@ -1144,6 +1154,12 @@ task: "external_endpoints" TLS certificates: 0 +task: "fm_sitrep_loader" + configured period: every s + last completed activation: , triggered by + started at (s ago) and ran for ms + no FM situation report available to load + task: "instance_reincarnation" configured period: every m last completed activation: , triggered by diff --git a/dev-tools/omdb/tests/test_all_output.rs b/dev-tools/omdb/tests/test_all_output.rs index 61208f00a9..bd10d1d86d 100644 --- a/dev-tools/omdb/tests/test_all_output.rs +++ b/dev-tools/omdb/tests/test_all_output.rs @@ -92,6 +92,7 @@ async fn test_omdb_usage_errors() { &["db", "ereport", "reporters", "--help"], &["db", "ereport", "info", "--help"], &["db", "sleds", "--help"], + &["db", "sitrep", "--help"], &["db", "saga"], &["db", "snapshots"], &["db", "network"], diff --git a/dev-tools/omdb/tests/usage_errors.out b/dev-tools/omdb/tests/usage_errors.out index d920ff0521..b8f1f4d15f 100644 --- a/dev-tools/omdb/tests/usage_errors.out +++ b/dev-tools/omdb/tests/usage_errors.out @@ -132,6 +132,9 @@ Commands: region-snapshot-replacement Query for information about region snapshot replacements, optionally manually triggering one saga Commands for querying and interacting with sagas + sitrep Commands for querying and interacting with fault management situation + reports + sitreps Show the current history of fault management situation reports sleds Print information about sleds instance Print information about customer instances instances Alias to `omdb instance list` @@ -193,6 +196,9 @@ Commands: region-snapshot-replacement Query for information about region snapshot replacements, optionally manually triggering one saga Commands for querying and interacting with sagas + sitrep Commands for querying and interacting with fault management situation + reports + sitreps Show the current history of fault management situation reports sleds Print information about sleds instance Print information about customer instances instances Alias to `omdb instance list` @@ -707,6 +713,40 @@ Safety Options: --------------------------------------------- stderr: ============================================= +EXECUTING COMMAND: omdb ["db", "sitrep", "--help"] +termination: Exited(0) +--------------------------------------------- +stdout: +Commands for querying and interacting with fault management situation reports + +Usage: omdb db sitrep [OPTIONS] + +Commands: + history List the 
current situation report history + current Show the current situation report + info Show details on a situation report + help Print this message or the help of the given subcommand(s) + +Options: + --log-level log level filter [env: LOG_LEVEL=] [default: warn] + --color Color output [default: auto] [possible values: auto, always, never] + -h, --help Print help + +Connection Options: + --db-url URL of the database SQL interface [env: OMDB_DB_URL=] + --dns-server [env: OMDB_DNS_SERVER=] + +Database Options: + --fetch-limit limit to apply to queries that fetch rows [env: + OMDB_FETCH_LIMIT=] [default: 500] + --include-deleted whether to include soft-deleted records when enumerating objects + that can be soft-deleted + +Safety Options: + -w, --destructive Allow potentially-destructive subcommands +--------------------------------------------- +stderr: +============================================= EXECUTING COMMAND: omdb ["db", "saga"] termination: Exited(2) --------------------------------------------- diff --git a/nexus-config/src/nexus_config.rs b/nexus-config/src/nexus_config.rs index 588374407b..58ef5b6ce7 100644 --- a/nexus-config/src/nexus_config.rs +++ b/nexus-config/src/nexus_config.rs @@ -441,6 +441,8 @@ pub struct BackgroundTaskConfig { pub webhook_deliverator: WebhookDeliveratorConfig, /// configuration for SP ereport ingester task pub sp_ereport_ingester: SpEreportIngesterConfig, + /// configuration for fault management background tasks + pub fm: FmTasksConfig, } #[serde_as] @@ -870,6 +872,21 @@ impl Default for SpEreportIngesterConfig { } } +#[serde_as] +#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)] +pub struct FmTasksConfig { + /// period (in seconds) for periodic activations of the background task that + /// reads the latest fault management sitrep from the database. + #[serde_as(as = "DurationSeconds")] + pub sitrep_load_period_secs: Duration, +} + +impl Default for FmTasksConfig { + fn default() -> Self { + Self { sitrep_load_period_secs: Duration::from_secs(15) } + } +} + /// Configuration for a nexus server #[derive(Clone, Debug, Deserialize, PartialEq, Serialize)] pub struct PackageConfig { @@ -1172,6 +1189,7 @@ mod test { webhook_deliverator.first_retry_backoff_secs = 45 webhook_deliverator.second_retry_backoff_secs = 46 sp_ereport_ingester.period_secs = 47 + fm.sitrep_load_period_secs = 48 [default_region_allocation_strategy] type = "random" seed = 0 @@ -1416,6 +1434,9 @@ mod test { period_secs: Duration::from_secs(47), disable: false, }, + fm: FmTasksConfig { + sitrep_load_period_secs: Duration::from_secs(48), + } }, default_region_allocation_strategy: crate::nexus_config::RegionAllocationStrategy::Random { @@ -1514,6 +1535,7 @@ mod test { alert_dispatcher.period_secs = 42 webhook_deliverator.period_secs = 43 sp_ereport_ingester.period_secs = 44 + fm.sitrep_load_period_secs = 45 [default_region_allocation_strategy] type = "random" diff --git a/nexus/background-task-interface/src/init.rs b/nexus/background-task-interface/src/init.rs index bc71033bf3..8ea150eaef 100644 --- a/nexus/background-task-interface/src/init.rs +++ b/nexus/background-task-interface/src/init.rs @@ -51,6 +51,7 @@ pub struct BackgroundTasks { pub task_webhook_deliverator: Activator, pub task_sp_ereport_ingester: Activator, pub task_reconfigurator_config_loader: Activator, + pub task_fm_sitrep_loader: Activator, // Handles to activate background tasks that do not get used by Nexus // at-large. 
These background tasks are implementation details as far as
diff --git a/nexus/db-model/src/fm.rs b/nexus/db-model/src/fm.rs
new file mode 100644
index 0000000000..d9d7ac3c2d
--- /dev/null
+++ b/nexus/db-model/src/fm.rs
@@ -0,0 +1,93 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at https://mozilla.org/MPL/2.0/.
+
+//! Types for representing fault management situation reports (sitreps) in the
+//! database.
+//!
+//! The fault management sitrep, and the ways in which it is represented in
+//! CRDB, is described in detail in [RFD
+//! 603](https://rfd.shared.oxide.computer/rfd/0603).
+//!
+//! These types are used when inserting and reading sitreps in CRDB; when in
+//! use, the sitrep is represented as a [`nexus_types::fm::Sitrep`]. See the
+//! documentation in [`nexus_types::fm`] for more information.
+
+use crate::SqlU32;
+use crate::typed_uuid::DbTypedUuid;
+use chrono::{DateTime, Utc};
+use nexus_db_schema::schema::{fm_sitrep, fm_sitrep_history};
+use omicron_uuid_kinds::{CollectionKind, OmicronZoneKind, SitrepKind};
+
+#[derive(Queryable, Insertable, Clone, Debug, Selectable)]
+#[diesel(table_name = fm_sitrep)]
+pub struct SitrepMetadata {
+    pub id: DbTypedUuid<SitrepKind>,
+    pub parent_sitrep_id: Option<DbTypedUuid<SitrepKind>>,
+    pub inv_collection_id: DbTypedUuid<CollectionKind>,
+    pub time_created: DateTime<Utc>,
+    pub creator_id: DbTypedUuid<OmicronZoneKind>,
+    pub comment: String,
+}
+
+impl From<SitrepMetadata> for nexus_types::fm::SitrepMetadata {
+    fn from(db_meta: SitrepMetadata) -> Self {
+        let SitrepMetadata {
+            id,
+            parent_sitrep_id,
+            inv_collection_id,
+            creator_id,
+            comment,
+            time_created,
+        } = db_meta;
+        Self {
+            id: id.into(),
+            parent_sitrep_id: parent_sitrep_id.map(Into::into),
+            inv_collection_id: inv_collection_id.into(),
+            creator_id: creator_id.into(),
+            comment,
+            time_created,
+        }
+    }
+}
+
+impl From<nexus_types::fm::SitrepMetadata> for SitrepMetadata {
+    fn from(db_meta: nexus_types::fm::SitrepMetadata) -> Self {
+        let nexus_types::fm::SitrepMetadata {
+            id,
+            parent_sitrep_id,
+            inv_collection_id,
+            creator_id,
+            comment,
+            time_created,
+        } = db_meta;
+        Self {
+            id: id.into(),
+            parent_sitrep_id: parent_sitrep_id.map(Into::into),
+            inv_collection_id: inv_collection_id.into(),
+            creator_id: creator_id.into(),
+            comment,
+            time_created,
+        }
+    }
+}
+
+#[derive(Queryable, Clone, Debug, Selectable, Insertable)]
+#[diesel(table_name = fm_sitrep_history)]
+pub struct SitrepVersion {
+    pub version: SqlU32,
+    pub sitrep_id: DbTypedUuid<SitrepKind>,
+    pub time_made_current: DateTime<Utc>,
+}
+
+impl From<SitrepVersion> for nexus_types::fm::SitrepVersion {
+    fn from(db_version: SitrepVersion) -> Self {
+        let SitrepVersion { sitrep_id, version, time_made_current } =
+            db_version;
+        Self {
+            id: sitrep_id.into(),
+            version: version.into(),
+            time_made_current,
+        }
+    }
+}
diff --git a/nexus/db-model/src/lib.rs b/nexus/db-model/src/lib.rs
index 32e4d4747d..bd963619fb 100644
--- a/nexus/db-model/src/lib.rs
+++ b/nexus/db-model/src/lib.rs
@@ -89,6 +89,7 @@ mod webhook_rx;
 // for join-based marker trait generation.
 mod deployment;
 mod ereport;
+pub mod fm;
 pub mod nat_entry;
 mod omicron_zone_config;
 mod quota;
@@ -181,6 +182,7 @@ pub use dns::*;
 pub use downstairs::*;
 pub use ereport::*;
 pub use external_ip::*;
+pub use fm::*;
 pub use generation::*;
 pub use identity_provider::*;
 pub use image::*;
diff --git a/nexus/db-model/src/schema_versions.rs b/nexus/db-model/src/schema_versions.rs
index be8d77012f..63517caf05 100644
--- a/nexus/db-model/src/schema_versions.rs
+++ b/nexus/db-model/src/schema_versions.rs
@@ -16,7 +16,7 @@ use std::{collections::BTreeMap, sync::LazyLock};
 ///
 /// This must be updated when you change the database schema. Refer to
 /// schema/crdb/README.adoc in the root of this repository for details.
-pub const SCHEMA_VERSION: Version = Version::new(201, 0, 0);
+pub const SCHEMA_VERSION: Version = Version::new(202, 0, 0);
 
 /// List of all past database schema versions, in *reverse* order
 ///
@@ -28,6 +28,7 @@ static KNOWN_VERSIONS: LazyLock<Vec<KnownVersion>> = LazyLock::new(|| {
         // |  leaving the first copy as an example for the next person.
         // v
         // KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"),
+        KnownVersion::new(202, "fm-sitrep"),
         KnownVersion::new(201, "scim-client-bearer-token"),
         KnownVersion::new(200, "dual-stack-network-interfaces"),
         KnownVersion::new(199, "multicast-pool-support"),
diff --git a/nexus/db-queries/src/db/datastore/fm.rs b/nexus/db-queries/src/db/datastore/fm.rs
new file mode 100644
index 0000000000..47600c9bdc
--- /dev/null
+++ b/nexus/db-queries/src/db/datastore/fm.rs
@@ -0,0 +1,757 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at https://mozilla.org/MPL/2.0/.
+
+//! [`DataStore`] methods on fault management internal data, such as situation
+//! reports (sitreps).
+//!
+//! See [RFD 603](https://rfd.shared.oxide.computer/rfd/0603) for details on the
+//! fault management sitrep.
+
+use super::DataStore;
+use crate::authz;
+use crate::context::OpContext;
+use crate::db::model;
+use async_bb8_diesel::AsyncRunQueryDsl;
+use diesel::pg::Pg;
+use diesel::prelude::*;
+use diesel::query_builder::AstPass;
+use diesel::query_builder::QueryFragment;
+use diesel::query_builder::QueryId;
+use diesel::result::DatabaseErrorKind;
+use diesel::result::Error as DieselError;
+use diesel::sql_types;
+use nexus_db_errors::ErrorHandler;
+use nexus_db_errors::public_error_from_diesel;
+use nexus_db_lookup::DbConnection;
+use nexus_db_schema::schema::fm_sitrep::dsl as sitrep_dsl;
+use nexus_db_schema::schema::fm_sitrep_history::dsl as history_dsl;
+use nexus_types::fm;
+use nexus_types::fm::Sitrep;
+use omicron_common::api::external::Error;
+use omicron_uuid_kinds::GenericUuid;
+use omicron_uuid_kinds::SitrepUuid;
+use uuid::Uuid;
+
+impl DataStore {
+    /// Reads the current [sitrep version](fm::SitrepVersion) from CRDB.
+    ///
+    /// If no sitreps have been generated, this returns `None`.
+    pub async fn fm_current_sitrep_version(
+        &self,
+        opctx: &OpContext,
+    ) -> Result<Option<fm::SitrepVersion>, Error> {
+        opctx.authorize(authz::Action::ListChildren, &authz::FLEET).await?;
+        let conn = self.pool_connection_authorized(opctx).await?;
+        let version = self
+            .fm_current_sitrep_version_on_conn(&conn)
+            .await?
+            .map(Into::into);
+        Ok(version)
+    }
+
+    async fn fm_current_sitrep_version_on_conn(
+        &self,
+        conn: &async_bb8_diesel::Connection<DbConnection>,
+    ) -> Result<Option<model::SitrepVersion>, Error> {
+        history_dsl::fm_sitrep_history
+            .order_by(history_dsl::version.desc())
+            .select(model::SitrepVersion::as_select())
+            .first_async(conn)
+            .await
+            .optional()
+            .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))
+    }
+
+    /// Reads the [`fm::SitrepMetadata`] describing the sitrep with the given
+    /// ID, if one exists.
+    pub async fn fm_sitrep_metadata_read(
+        &self,
+        opctx: &OpContext,
+        id: SitrepUuid,
+    ) -> Result<fm::SitrepMetadata, Error> {
+        opctx.authorize(authz::Action::ListChildren, &authz::FLEET).await?;
+        let conn = self.pool_connection_authorized(opctx).await?;
+        let meta =
+            self.fm_sitrep_metadata_read_on_conn(id, &conn).await?.into();
+        Ok(meta)
+    }
+
+    async fn fm_sitrep_metadata_read_on_conn(
+        &self,
+        id: SitrepUuid,
+        conn: &async_bb8_diesel::Connection<DbConnection>,
+    ) -> Result<model::SitrepMetadata, Error> {
+        sitrep_dsl::fm_sitrep
+            .filter(sitrep_dsl::id.eq(id.into_untyped_uuid()))
+            .select(model::SitrepMetadata::as_select())
+            .first_async(conn)
+            .await
+            .optional()
+            .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?
+            .ok_or_else(|| {
+                Error::non_resourcetype_not_found(format!("sitrep {id:?}"))
+            })
+    }
+
+    /// Reads the *entire* current sitrep, along with its version.
+    ///
+    /// This is equivalent to reading the current sitrep version using
+    /// [`DataStore::fm_current_sitrep_version`], and then reading the sitrep
+    /// itself using [`DataStore::fm_sitrep_read`].
+    ///
+    /// If this method returns `None`, there is no current sitrep, meaning that
+    /// no sitreps have been created.
+    pub async fn fm_sitrep_read_current(
+        &self,
+        opctx: &OpContext,
+    ) -> Result<Option<(fm::SitrepVersion, Sitrep)>, Error> {
+        let conn = self.pool_connection_authorized(opctx).await?;
+        let version: fm::SitrepVersion =
+            match self.fm_current_sitrep_version_on_conn(&conn).await? {
+                Some(version) => version.into(),
+                None => return Ok(None),
+            };
+        let sitrep = self.fm_sitrep_read_on_conn(version.id, &conn).await?;
+        Ok(Some((version, sitrep)))
+    }
+
+    /// Reads the entire content of the sitrep with the provided ID, if one exists.
+    pub async fn fm_sitrep_read(
+        &self,
+        opctx: &OpContext,
+        id: SitrepUuid,
+    ) -> Result<Sitrep, Error> {
+        opctx.authorize(authz::Action::ListChildren, &authz::FLEET).await?;
+        let conn = self.pool_connection_authorized(opctx).await?;
+        self.fm_sitrep_read_on_conn(id, &conn).await
+    }
+
+    async fn fm_sitrep_read_on_conn(
+        &self,
+        id: SitrepUuid,
+        conn: &async_bb8_diesel::Connection<DbConnection>,
+    ) -> Result<Sitrep, Error> {
+        let metadata =
+            self.fm_sitrep_metadata_read_on_conn(id, &conn).await?.into();
+
+        // TODO(eliza): this is where we would read all the other sitrep data,
+        // if there was any.
+
+        Ok(Sitrep { metadata })
+    }
+
+    /// Insert the provided [`Sitrep`] into the database, and attempt to mark it
+    /// as the current sitrep.
+    ///
+    /// If the sitrep's parent is not the current sitrep, the new sitrep is not
+    /// added to the sitrep history, and an error is returned. See [this
+    /// section](https://rfd.shared.oxide.computer/rfd/0603#_creating_sitreps)
+    /// in RFD 603 for details.
+    ///
+    /// # Returns
+    ///
+    /// - `Ok(())` if the new sitrep was both successfully inserted *and* added
+    ///   to the sitrep history as the current sitrep.
+    ///
+    /// - `Err(`[`InsertSitrepError::ParentNotCurrent`]`)` if the sitrep's
+    ///   `parent_sitrep_id` is not the current sitrep, indicating that it was
+    ///   generated based on out of date inputs.
+ /// + /// This error indicates that the sitrep is orphaned and should be + /// deleted. It is out of date, and another sitrep has already been + /// generated based on the same inputs. + /// + /// - `Err(`[`InsertSitrepError::Other`]`)` if another error occurred while + /// inserting the sitrep. + pub async fn fm_sitrep_insert( + &self, + opctx: &OpContext, + sitrep: &Sitrep, + ) -> Result<(), InsertSitrepError> { + let conn = self.pool_connection_authorized(opctx).await?; + + // TODO(eliza): there should probably be an authz object for the fm sitrep? + opctx.authorize(authz::Action::Modify, &authz::FLEET).await?; + + // Create the sitrep metadata record. + diesel::insert_into(sitrep_dsl::fm_sitrep) + .values(model::SitrepMetadata::from(sitrep.metadata.clone())) + .execute_async(&*conn) + .await + .map_err(|e| { + public_error_from_diesel(e, ErrorHandler::Server) + .internal_context("failed to insert sitrep metadata record") + })?; + + // TODO(eliza): other sitrep records would be inserted here... + + // Now, try to make the sitrep current. + let query = InsertSitrepVersionQuery { sitrep_id: sitrep.id() }; + query + .execute_async(&*conn) + .await + .map_err(|e| query.decode_error(e)) + .map(|_| ()) + } +} + +/// Errors returned by [`DataStore::fm_sitrep_insert`]. +#[derive(Debug, thiserror::Error)] +pub enum InsertSitrepError { + #[error(transparent)] + Other(#[from] Error), + /// The parent sitrep ID is no longer the current sitrep. + #[error("sitrep {0}'s parent is not the current sitrep")] + ParentNotCurrent(SitrepUuid), +} + +/// Query to insert a new sitrep version into the `fm_sitrep_history` table, +/// making it the current sitrep. +/// +/// This implements the "compare-and-swap" operation [described in RFD +/// 603](https://rfd.shared.oxide.computer/rfd/0603#_creating_sitreps). In +/// particular, this query will insert a new sitrep version into the +/// `fm_sitrep_history` table IF AND ONLY IF one of the following conditions +/// are true: +/// +/// 1. The new sitrep's parent sitrep ID is the current sitrep (i.e. the sitrep +/// with the highest version number in `fm_sitrep_history`) +/// 2. The new sitrep's parent sitrep ID is `NULL`, AND there are no other +/// sitreps in `fm_sitrep_history` (i.e., we are inserting the first-ever +/// sitrep) +/// +/// Upholding these invariants ensures that sitreps are sequentially consistent, +/// and `fm_sitrep_history` always contains a linear history of sitreps which +/// were generated based on the previous current sitrep. +/// +/// The CTE used to perform this operation is based on the one used in the +/// `deployment` module to insert blueprints into the `bp_target` table. It +/// differs in that it does not perform an existence check on the sitrep to be +/// made current. This is because the `db::datastore::deployment` module's +/// public API treats inserting a new blueprint and setting it as the current +/// target as separate operations, so it is possible for a consumer of the API +/// to try and set a blueprint as the target without first having created it. +/// Here, however, we only ever set a sitrep as the current sitrep in the +/// `Datastore::fm_sitrep_insert` method, which also creates the sitrep. So, it +/// is impossible for a consumer of this API to attempt to make a sitrep current +/// without having first created it. +/// +/// The SQL generated for this CTE looks like this: +/// +/// ```sql +/// WITH +/// -- Subquery to fetch the current sitrep (i.e., the row with the max +/// -- version). 
+/// current_sitrep AS ( +/// SELECT +/// "version" AS version, +/// "sitrep_id" AS sitrep_id, +/// FROM "fm_sitrep_history" +/// ORDER BY "version" DESC +/// LIMIT 1 +/// ), +/// +/// -- Error checking subquery: This uses similar tricks as elsewhere in +/// -- this crate to `CAST(... AS UUID)` with non-UUID values that result +/// -- in runtime errors in specific cases, allowing us to give accurate +/// -- error messages. +/// -- +/// -- This checks that the sitrep descends directly from the current +/// -- sitrep, and will fail the query if it does not. +/// check_validity AS MATERIALIZED ( +/// SELECT CAST(IF( +/// -- Check for whether our new sitrep's parent matches our current +/// -- sitrep. There are two cases here: The first is the common case +/// -- (i.e., the new sitrep has a parent: does it match the current +/// -- sitrep ID?). The second is the bootstrapping check: if we're +/// -- trying to insert a new sitrep that does not have a parent, +/// -- we should not have a sitrep target at all. +/// -- +/// -- If either of these cases fails, we return `parent-not-current`. +/// ( +/// SELECT "parent_sitrep_id" FROM "sitrep", current_sitrep +/// WHERE +/// "id" = +/// AND current_sitrep.sitrep_id = "parent_sitrep_id" +/// ) IS NOT NULL +/// OR +/// ( +/// SELECT 1 FROM "sitrep" +/// WHERE +/// "id" = +/// AND "parent_sitrep_id" IS NULL +/// AND NOT EXISTS (SELECT version FROM current_sitrep) +/// ) = 1, +/// -- Sometime between v22.1.9 and v22.2.19, Cockroach's type checker +/// -- became too smart for our `CAST(... as UUID)` error checking +/// -- gadget: it can infer that `` must be a UUID, so +/// -- then tries to parse 'parent-not-target' and 'no-such-blueprint' +/// -- as UUIDs _during typechecking_, which causes the query to always +/// -- fail. We can defeat this by casting the UUID to text here, which +/// -- will allow the 'parent-not-target' and 'no-such-blueprint' +/// -- sentinels to survive type checking, making it to query execution +/// -- where they will only be cast to UUIDs at runtime in the failure +/// -- cases they're supposed to catch. +/// CAST( AS text), +/// 'parent-not-current' +/// ) AS UUID) +/// ), +/// +/// -- Determine the new version number to use: either 1 if this is the +/// -- first sitrep being made the current sitrep, or 1 higher than +/// -- the previous sitrep's version. +/// -- +/// -- The final clauses of each of these WHERE clauses repeat the +/// -- checks performed above in `check_validity`, and will cause this +/// -- subquery to return no rows if we should not allow the new +/// -- target to be set. +/// new_sitrep AS ( +/// SELECT 1 AS new_version FROM "sitrep" +/// WHERE +/// "id" = +/// AND "parent_sitrep_id" IS NULL +/// AND NOT EXISTS (SELECT version FROM current_sitrep) +/// UNION +/// SELECT current_sitrep.version + 1 FROM current_sitrep, "sitrep" +/// WHERE +/// "id" = +/// AND "parent_sitrep_id" IS NOT NULL +/// AND "parent_sitrep_id" = current_sitrep.sitrep_id +/// ) +/// +/// -- Perform the actual insertion. +/// INSERT INTO "sitrep_history"( +/// "version","sitrep_id","time_made_current" +/// ) +/// SELECT +/// new_sitrep.new_version, +/// , +/// NOW() +/// FROM new_sitrep +/// ``` +#[derive(Debug, Clone, Copy)] +struct InsertSitrepVersionQuery { + sitrep_id: SitrepUuid, +} + +// Uncastable sentinel used to detect we attempt to make a sitrep current when +// its parent sitrep ID is no longer the current sitrep. 
+const PARENT_NOT_CURRENT: &str = "parent-not-current";
+
+// Error messages generated from the above sentinel values.
+const PARENT_NOT_CURRENT_ERROR_MESSAGE: &str = "could not parse \
+    \"parent-not-current\" as type uuid: \
+    uuid: incorrect UUID length: parent-not-current";
+
+impl InsertSitrepVersionQuery {
+    fn decode_error(&self, err: DieselError) -> InsertSitrepError {
+        match err {
+            DieselError::DatabaseError(DatabaseErrorKind::Unknown, info)
+                if info.message() == PARENT_NOT_CURRENT_ERROR_MESSAGE =>
+            {
+                InsertSitrepError::ParentNotCurrent(self.sitrep_id)
+            }
+            err => {
+                let err = public_error_from_diesel(err, ErrorHandler::Server)
+                    .internal_context("failed to insert new sitrep version");
+                InsertSitrepError::Other(err)
+            }
+        }
+    }
+}
+
+impl QueryId for InsertSitrepVersionQuery {
+    type QueryId = ();
+    const HAS_STATIC_QUERY_ID: bool = false;
+}
+
+impl QueryFragment<Pg> for InsertSitrepVersionQuery {
+    fn walk_ast<'a>(
+        &'a self,
+        mut out: AstPass<'_, 'a, Pg>,
+    ) -> diesel::QueryResult<()> {
+        use nexus_db_schema::schema;
+        const CURRENT_SITREP: &'static str = "current_sitrep";
+        type FromClause<T> =
+            diesel::internal::table_macro::StaticQueryFragmentInstance<T>;
+        const SITREP_FROM_CLAUSE: FromClause<schema::fm_sitrep::table> =
+            FromClause::new();
+        const SITREP_HISTORY_FROM_CLAUSE: FromClause<
+            schema::fm_sitrep_history::table,
+        > = FromClause::new();
+
+        out.push_sql("WITH ");
+
+        out.push_identifier(CURRENT_SITREP)?;
+        out.push_sql(" AS (SELECT ");
+        out.push_identifier(history_dsl::version::NAME)?;
+        out.push_sql(" AS version,");
+        out.push_identifier(history_dsl::sitrep_id::NAME)?;
+        out.push_sql(" AS sitrep_id");
+        out.push_sql(" FROM ");
+        SITREP_HISTORY_FROM_CLAUSE.walk_ast(out.reborrow())?;
+        out.push_sql(" ORDER BY ");
+        out.push_identifier(history_dsl::version::NAME)?;
+        out.push_sql(" DESC LIMIT 1),");
+
+        out.push_sql(
+            "check_validity AS MATERIALIZED ( \
+                SELECT \
+                    CAST( \
+                        IF(",
+        );
+        out.push_sql("(SELECT ");
+        out.push_identifier(sitrep_dsl::parent_sitrep_id::NAME)?;
+        out.push_sql(" FROM ");
+        SITREP_FROM_CLAUSE.walk_ast(out.reborrow())?;
+        out.push_sql(", ");
+        out.push_identifier(CURRENT_SITREP)?;
+        out.push_sql(" WHERE ");
+        out.push_identifier(sitrep_dsl::id::NAME)?;
+        out.push_sql(" = ");
+        out.push_bind_param::<sql_types::Uuid, Uuid>(
+            self.sitrep_id.as_untyped_uuid(),
+        )?;
+        out.push_sql(" AND ");
+        out.push_identifier(CURRENT_SITREP)?;
+        out.push_sql(".sitrep_id = ");
+        out.push_identifier(sitrep_dsl::parent_sitrep_id::NAME)?;
+        out.push_sql(
+            ") IS NOT NULL \
+            OR \
+            (SELECT 1 FROM ",
+        );
+        SITREP_FROM_CLAUSE.walk_ast(out.reborrow())?;
+        out.push_sql(" WHERE ");
+        out.push_identifier(sitrep_dsl::id::NAME)?;
+        out.push_sql(" = ");
+        out.push_bind_param::<sql_types::Uuid, Uuid>(
+            self.sitrep_id.as_untyped_uuid(),
+        )?;
+        out.push_sql(" AND ");
+        out.push_identifier(sitrep_dsl::parent_sitrep_id::NAME)?;
+        out.push_sql(
+            "IS NULL \
+            AND NOT EXISTS ( \
+            SELECT version FROM current_sitrep) \
+            ) = 1, ",
+        );
+        out.push_sql(" CAST(");
+        out.push_bind_param::<sql_types::Uuid, Uuid>(
+            self.sitrep_id.as_untyped_uuid(),
+        )?;
+        out.push_sql(" AS text), ");
+        out.push_bind_param::<sql_types::Text, &str>(
+            &PARENT_NOT_CURRENT,
+        )?;
+        out.push_sql(
+            ") \
+            AS UUID) \
+            ), ",
+        );
+
+        out.push_sql("new_sitrep AS (SELECT 1 AS new_version FROM ");
+        SITREP_FROM_CLAUSE.walk_ast(out.reborrow())?;
+        out.push_sql(" WHERE ");
+        out.push_identifier(sitrep_dsl::id::NAME)?;
+        out.push_sql(" = ");
+        out.push_bind_param::<sql_types::Uuid, Uuid>(
+            self.sitrep_id.as_untyped_uuid(),
+        )?;
+        out.push_sql(" AND ");
+        out.push_identifier(sitrep_dsl::parent_sitrep_id::NAME)?;
+        out.push_sql(
+            " IS NULL \
+            AND NOT EXISTS \
+            (SELECT version FROM current_sitrep) \
+            UNION \
+            SELECT current_sitrep.version + 1 FROM \
+            current_sitrep, ",
+        );
+        SITREP_FROM_CLAUSE.walk_ast(out.reborrow())?;
+        out.push_sql(" WHERE ");
+        out.push_identifier(sitrep_dsl::id::NAME)?;
+        out.push_sql(" = ");
+        out.push_bind_param::<sql_types::Uuid, Uuid>(
+            self.sitrep_id.as_untyped_uuid(),
+        )?;
+        out.push_sql(" AND ");
+        out.push_identifier(sitrep_dsl::parent_sitrep_id::NAME)?;
+        out.push_sql(" IS NOT NULL AND ");
+        out.push_identifier(sitrep_dsl::parent_sitrep_id::NAME)?;
+        out.push_sql(" = current_sitrep.sitrep_id) ");
+
+        out.push_sql("INSERT INTO ");
+        SITREP_HISTORY_FROM_CLAUSE.walk_ast(out.reborrow())?;
+        out.push_sql("(");
+        out.push_identifier(history_dsl::version::NAME)?;
+        out.push_sql(",");
+        out.push_identifier(history_dsl::sitrep_id::NAME)?;
+        out.push_sql(",");
+        out.push_identifier(history_dsl::time_made_current::NAME)?;
+        out.push_sql(") SELECT new_sitrep.new_version, ");
+        out.push_bind_param::<sql_types::Uuid, Uuid>(
+            self.sitrep_id.as_untyped_uuid(),
+        )?;
+        out.push_sql(", NOW()");
+        out.push_sql(" FROM new_sitrep");
+
+        Ok(())
+    }
+}
+
+impl RunQueryDsl<DbConnection> for InsertSitrepVersionQuery {}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::db::explain::ExplainableAsync;
+    use crate::db::pub_test_utils::TestDatabase;
+    use chrono::Utc;
+    use omicron_test_utils::dev;
+    use omicron_uuid_kinds::CollectionUuid;
+    use omicron_uuid_kinds::OmicronZoneUuid;
+
+    #[tokio::test]
+    async fn explain_insert_sitrep_version_query() {
+        let logctx = dev::test_setup_log("explain_insert_sitrep_version_query");
+        let db = TestDatabase::new_with_pool(&logctx.log).await;
+        let pool = db.pool();
+        let conn = pool.claim().await.unwrap();
+
+        let query = InsertSitrepVersionQuery { sitrep_id: SitrepUuid::nil() };
+        let explanation = query
+            .explain_async(&conn)
+            .await
+            .expect("Failed to explain query - is it valid SQL?");
+        eprintln!("{explanation}");
+        assert!(
+            !explanation.contains("FULL SCAN"),
+            "Found an unexpected FULL SCAN: {}",
+            explanation
+        );
+
+        db.terminate().await;
+        logctx.cleanup_successful();
+    }
+
+    #[tokio::test]
+    async fn test_insert_sitrep_without_parent() {
+        // Setup
+        let logctx = dev::test_setup_log("test_insert_sitrep_without_parent");
+        let db = TestDatabase::new_with_datastore(&logctx.log).await;
+        let (opctx, datastore) = (db.opctx(), db.datastore());
+
+        // Base case: there should be no current sitrep.
+        let current = datastore.fm_sitrep_read_current(&opctx).await.unwrap();
+        assert!(current.is_none());
+
+        // Okay, let's create a new sitrep.
+        let sitrep = nexus_types::fm::Sitrep {
+            metadata: nexus_types::fm::SitrepMetadata {
+                id: SitrepUuid::new_v4(),
+                inv_collection_id: CollectionUuid::new_v4(),
+                creator_id: OmicronZoneUuid::new_v4(),
+                comment: "TEST SITREP PLEASE IGNORE".to_string(),
+                time_created: Utc::now(),
+                parent_sitrep_id: None,
+            },
+        };
+
+        datastore.fm_sitrep_insert(&opctx, &sitrep).await.unwrap();
+
+        let current = datastore
+            .fm_sitrep_read_current(&opctx)
+            .await
+            .expect("should successfully read current sitrep");
+        let (version, current_sitrep) = current.expect("sitrep should be Some");
+        assert_eq!(version.id, sitrep.metadata.id);
+        assert_eq!(version.version, 1);
+        assert_eq!(sitrep.id(), current_sitrep.id());
+        assert_eq!(sitrep.parent_id(), current_sitrep.parent_id());
+        assert_eq!(
+            sitrep.metadata.creator_id,
+            current_sitrep.metadata.creator_id
+        );
+        assert_eq!(sitrep.metadata.comment, current_sitrep.metadata.comment);
+
+        // Trying to insert the same sitrep again should fail.
+ let err = + datastore.fm_sitrep_insert(&opctx, &sitrep).await.unwrap_err(); + assert!(err.to_string().contains("duplicate key")); + + // Clean up. + db.terminate().await; + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn test_insert_sitrep_with_current_parent() { + let logctx = + dev::test_setup_log("test_insert_sitrep_with_current_parent"); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + let creator_id = OmicronZoneUuid::new_v4(); + // Create an initial sitrep (no parent) + let sitrep1 = nexus_types::fm::Sitrep { + metadata: nexus_types::fm::SitrepMetadata { + id: SitrepUuid::new_v4(), + inv_collection_id: CollectionUuid::new_v4(), + creator_id, + comment: "TEST SITREP 1".to_string(), + time_created: Utc::now(), + parent_sitrep_id: None, + }, + }; + datastore.fm_sitrep_insert(&opctx, &sitrep1).await.unwrap(); + + // Create a second sitrep with the first as parent + let sitrep2 = nexus_types::fm::Sitrep { + metadata: nexus_types::fm::SitrepMetadata { + id: SitrepUuid::new_v4(), + inv_collection_id: CollectionUuid::new_v4(), + creator_id, + comment: "TEST SITREP 2".to_string(), + time_created: Utc::now(), + parent_sitrep_id: Some(sitrep1.id()), + }, + }; + datastore.fm_sitrep_insert(&opctx, &sitrep2).await.expect( + "inserting a sitrep whose parent is current should succeed", + ); + + // Verify the second sitrep is now current + let (version, current_sitrep) = datastore + .fm_sitrep_read_current(&opctx) + .await + .unwrap() + .expect("current sitrep should be Some"); + assert_eq!(version.id, sitrep2.id()); + assert_eq!(version.version, 2); + assert_eq!(sitrep2.id(), current_sitrep.id()); + assert_eq!(sitrep2.parent_id(), current_sitrep.parent_id()); + + db.terminate().await; + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn test_insert_sitrep_with_nonexistent_parent_fails() { + let logctx = dev::test_setup_log( + "test_insert_sitrep_with_nonexistent_parent_fails", + ); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + + let creator_id = OmicronZoneUuid::new_v4(); + + // Create an initial sitrep (no parent) + let sitrep1 = nexus_types::fm::Sitrep { + metadata: nexus_types::fm::SitrepMetadata { + id: SitrepUuid::new_v4(), + inv_collection_id: CollectionUuid::new_v4(), + creator_id, + comment: "TEST SITREP 1".to_string(), + time_created: Utc::now(), + parent_sitrep_id: None, + }, + }; + datastore.fm_sitrep_insert(&opctx, &sitrep1).await.unwrap(); + + // Try to insert a sitrep with a non-existent parent ID + let nonexistent_id = SitrepUuid::new_v4(); + let sitrep2 = nexus_types::fm::Sitrep { + metadata: nexus_types::fm::SitrepMetadata { + id: SitrepUuid::new_v4(), + inv_collection_id: CollectionUuid::new_v4(), + creator_id, + comment: "TEST SITREP WITH BAD PARENT".to_string(), + time_created: Utc::now(), + parent_sitrep_id: Some(nonexistent_id), + }, + }; + + let result = datastore.fm_sitrep_insert(&opctx, &sitrep2).await; + + // Should fail with ParentNotCurrent error + match result { + Err(super::InsertSitrepError::ParentNotCurrent(_)) => {} + _ => panic!("expected ParentNotCurrent error, got {result:?}"), + } + + db.terminate().await; + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn test_insert_sitrep_with_outdated_parent_fails() { + let logctx = dev::test_setup_log( + "test_insert_sitrep_with_outdated_parent_fails", + ); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, 
datastore) = (db.opctx(), db.datastore()); + + let creator_id = OmicronZoneUuid::new_v4(); + + // Create an initial sitrep (no parent) + let sitrep1 = nexus_types::fm::Sitrep { + metadata: nexus_types::fm::SitrepMetadata { + id: SitrepUuid::new_v4(), + inv_collection_id: CollectionUuid::new_v4(), + creator_id, + comment: "TEST SITREP 1".to_string(), + time_created: Utc::now(), + parent_sitrep_id: None, + }, + }; + datastore.fm_sitrep_insert(&opctx, &sitrep1).await.unwrap(); + + // Create a second sitrep with the first as parent + let sitrep2 = nexus_types::fm::Sitrep { + metadata: nexus_types::fm::SitrepMetadata { + id: SitrepUuid::new_v4(), + inv_collection_id: CollectionUuid::new_v4(), + creator_id, + comment: "TEST SITREP 2".to_string(), + time_created: Utc::now(), + parent_sitrep_id: Some(sitrep1.id()), + }, + }; + datastore.fm_sitrep_insert(&opctx, &sitrep2).await.unwrap(); + + // Try to create a third sitrep with sitrep1 (outdated) as parent. + // This should fail, as sitrep2 is now the current sitrep. + let sitrep3 = nexus_types::fm::Sitrep { + metadata: nexus_types::fm::SitrepMetadata { + id: SitrepUuid::new_v4(), + inv_collection_id: CollectionUuid::new_v4(), + creator_id: OmicronZoneUuid::new_v4(), + comment: "TEST SITREP 3 WITH OUTDATED PARENT".to_string(), + time_created: Utc::now(), + parent_sitrep_id: Some(sitrep1.id()), + }, + }; + let result = datastore.fm_sitrep_insert(&opctx, &sitrep3).await; + + // Should fail with ParentNotCurrent error + match result { + Err(InsertSitrepError::ParentNotCurrent(_)) => {} + _ => panic!("expected ParentNotCurrent error, got {result:?}"), + } + + // Verify sitrep2 is still current + let (version, current_sitrep) = datastore + .fm_sitrep_read_current(&opctx) + .await + .unwrap() + .expect("current sitrep should be Some"); + assert_eq!(version.id, sitrep2.id()); + assert_eq!(version.version, 2); + assert_eq!(sitrep2.id(), current_sitrep.id()); + assert_eq!(sitrep2.parent_id(), current_sitrep.parent_id()); + + db.terminate().await; + logctx.cleanup_successful(); + } +} diff --git a/nexus/db-queries/src/db/datastore/mod.rs b/nexus/db-queries/src/db/datastore/mod.rs index 3667323d49..24a58fb39a 100644 --- a/nexus/db-queries/src/db/datastore/mod.rs +++ b/nexus/db-queries/src/db/datastore/mod.rs @@ -71,6 +71,7 @@ mod disk; mod dns; mod ereport; mod external_ip; +pub mod fm; mod identity_provider; mod image; pub mod instance; diff --git a/nexus/db-schema/src/schema.rs b/nexus/db-schema/src/schema.rs index ae62b1a327..1e114683c6 100644 --- a/nexus/db-schema/src/schema.rs +++ b/nexus/db-schema/src/schema.rs @@ -2812,3 +2812,26 @@ table! { bearer_token -> Text, } } + +table! { + fm_sitrep (id) { + id -> Uuid, + parent_sitrep_id -> Nullable, + inv_collection_id -> Uuid, + time_created -> Timestamptz, + creator_id -> Uuid, + comment -> Text, + } +} + +allow_tables_to_appear_in_same_query!(fm_sitrep, inv_collection); + +table! 
{ + fm_sitrep_history (version) { + version -> Int8, + sitrep_id -> Uuid, + time_made_current -> Timestamptz, + } +} + +allow_tables_to_appear_in_same_query!(fm_sitrep, fm_sitrep_history); diff --git a/nexus/examples/config-second.toml b/nexus/examples/config-second.toml index 2de7bb187c..c70b1c8eaf 100644 --- a/nexus/examples/config-second.toml +++ b/nexus/examples/config-second.toml @@ -170,6 +170,11 @@ alert_dispatcher.period_secs = 60 webhook_deliverator.period_secs = 60 read_only_region_replacement_start.period_secs = 30 sp_ereport_ingester.period_secs = 30 +# How frequently to check for a new fault management sitrep (made by any +# Nexus). +# This is cheap, so we should check frequently. +fm.sitrep_load_period_secs = 15 + [default_region_allocation_strategy] # allocate region on 3 random distinct zpools, on 3 random distinct sleds. diff --git a/nexus/examples/config.toml b/nexus/examples/config.toml index e95e40496d..f2aa7f6d84 100644 --- a/nexus/examples/config.toml +++ b/nexus/examples/config.toml @@ -154,6 +154,11 @@ alert_dispatcher.period_secs = 60 webhook_deliverator.period_secs = 60 read_only_region_replacement_start.period_secs = 30 sp_ereport_ingester.period_secs = 30 +# How frequently to check for a new fault management sitrep (made by any +# Nexus). +# This is cheap, so we should check frequently. +fm.sitrep_load_period_secs = 15 + [default_region_allocation_strategy] # allocate region on 3 random distinct zpools, on 3 random distinct sleds. diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs index c4a788f78f..de240a5056 100644 --- a/nexus/src/app/background/init.rs +++ b/nexus/src/app/background/init.rs @@ -103,6 +103,7 @@ use super::tasks::dns_propagation; use super::tasks::dns_servers; use super::tasks::ereport_ingester; use super::tasks::external_endpoints; +use super::tasks::fm_sitrep_load; use super::tasks::instance_reincarnation; use super::tasks::instance_updater; use super::tasks::instance_watcher; @@ -145,6 +146,7 @@ use nexus_db_queries::db::DataStore; use nexus_types::deployment::Blueprint; use nexus_types::deployment::BlueprintTarget; use nexus_types::deployment::PendingMgsUpdates; +use nexus_types::fm; use nexus_types::inventory::Collection; use omicron_uuid_kinds::OmicronZoneUuid; use oximeter::types::ProducerRegistry; @@ -254,6 +256,7 @@ impl BackgroundTasksInitializer { task_webhook_deliverator: Activator::new(), task_sp_ereport_ingester: Activator::new(), task_reconfigurator_config_loader: Activator::new(), + task_fm_sitrep_loader: Activator::new(), task_internal_dns_propagation: Activator::new(), task_external_dns_propagation: Activator::new(), @@ -334,6 +337,7 @@ impl BackgroundTasksInitializer { task_webhook_deliverator, task_sp_ereport_ingester, task_reconfigurator_config_loader, + task_fm_sitrep_loader, // Add new background tasks here. Be sure to use this binding in a // call to `Driver::register()` below. That's what actually wires // up the Activator to the corresponding background task. 
@@ -1045,7 +1049,7 @@ impl BackgroundTasksInitializer {
             description: "collects error reports from service processors",
             period: config.sp_ereport_ingester.period_secs,
             task_impl: Box::new(ereport_ingester::SpEreportIngester::new(
-                datastore,
+                datastore.clone(),
                 resolver,
                 nexus_id,
                 config.sp_ereport_ingester.disable,
@@ -1055,6 +1059,21 @@ impl BackgroundTasksInitializer {
             activator: task_sp_ereport_ingester,
         });
+
+        driver.register(TaskDefinition {
+            name: "fm_sitrep_loader",
+            description:
+                "loads the current fault management situation report from \
+                 the database",
+            period: config.fm.sitrep_load_period_secs,
+            task_impl: Box::new(fm_sitrep_load::SitrepLoader::new(
+                datastore,
+                args.sitrep_load_tx,
+            )),
+            opctx: opctx.child(BTreeMap::new()),
+            watchers: vec![],
+            activator: task_fm_sitrep_loader,
+        });
 
         driver
     }
 }
@@ -1093,6 +1112,9 @@ pub struct BackgroundTasksData {
     pub mgs_updates_tx: watch::Sender<PendingMgsUpdates>,
     /// handle for controlling Nexus quiesce
     pub nexus_quiesce: NexusQuiesceHandle,
+    /// Channel for exposing the latest loaded fault-management sitrep.
+    pub sitrep_load_tx:
+        watch::Sender<Option<Arc<(fm::SitrepVersion, fm::Sitrep)>>>,
 }
 
 /// Starts the three DNS-propagation-related background tasks for either
diff --git a/nexus/src/app/background/tasks/fm_sitrep_load.rs b/nexus/src/app/background/tasks/fm_sitrep_load.rs
new file mode 100644
index 0000000000..d7864a6bcc
--- /dev/null
+++ b/nexus/src/app/background/tasks/fm_sitrep_load.rs
@@ -0,0 +1,332 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at https://mozilla.org/MPL/2.0/.
+
+//! Background task for loading the current fault management sitrep
+//! from the DB
+
+use crate::app::background::BackgroundTask;
+use chrono::Utc;
+use futures::future::BoxFuture;
+use nexus_db_queries::context::OpContext;
+use nexus_db_queries::db::DataStore;
+use nexus_types::fm::Sitrep;
+use nexus_types::fm::SitrepVersion;
+use nexus_types::internal_api::background::SitrepLoadStatus as Status;
+use serde_json::json;
+use slog_error_chain::InlineErrorChain;
+use std::sync::Arc;
+use tokio::sync::watch;
+
+pub struct SitrepLoader {
+    datastore: Arc<DataStore>,
+    tx: watch::Sender<CurrentSitrep>,
+}
+
+type CurrentSitrep = Option<Arc<(SitrepVersion, Sitrep)>>;
+
+impl BackgroundTask for SitrepLoader {
+    fn activate<'a>(
+        &'a mut self,
+        opctx: &'a OpContext,
+    ) -> BoxFuture<'a, serde_json::Value> {
+        Box::pin(async {
+            let status = self.load_if_needed(opctx).await;
+            match serde_json::to_value(status) {
+                Ok(val) => val,
+                Err(err) => {
+                    let err = format!(
+                        "could not serialize task status: {}",
+                        InlineErrorChain::new(&err)
+                    );
+                    json!({ "error": err })
+                }
+            }
+        })
+    }
+}
+
+impl SitrepLoader {
+    pub fn new(
+        datastore: Arc<DataStore>,
+        tx: watch::Sender<CurrentSitrep>,
+    ) -> Self {
+        Self { datastore, tx }
+    }
+
+    #[allow(dead_code)] // subsequent PRs will consume this
+    pub fn watcher(&self) -> watch::Receiver<CurrentSitrep> {
+        self.tx.subscribe()
+    }
+
+    async fn load_if_needed(&self, opctx: &OpContext) -> Status {
+        // Set up a logger for this activation that includes metadata about
+        // the current sitrep.
+ let (old, log) = match &*self.tx.borrow() { + None => (None, opctx.log.clone()), + Some(old) => { + let (ref old_version, _) = **old; + let log = opctx.log.new(slog::o!( + // since this is a TypedUuid, use `Debug` to avoid + // including () + "original_id" => format!("{:?}", old_version.id), + "original_made_current" => old_version.time_made_current.to_string(), + "original_version" => old_version.version, + )); + (Some(old_version.clone()), log) + } + }; + + // Get the ID of the current sitrep. + let time_loaded = Utc::now(); + let current_version: SitrepVersion = match self + .datastore + .fm_current_sitrep_version(opctx) + .await + { + Ok(Some(version)) => version, + Ok(None) => match old { + Some(SitrepVersion { version, id, .. }) => { + // We should never go from "some sitrep" to "no sitrep"; + // pruning should always keep a small number of old sitreps + // around until we have new ones to replace them. + // + // In this case we won't replace our channel contents with + // `None`; we'll keep around whatever old collection we had + // loaded. + warn!(log, "previously had a sitrep, but now none exist"); + return Status::Error(format!( + "previously loaded sitrep {id:?} (v{version}), \ + but now no sitreps exist", + )); + } + None => { + // Had no sitrep; still have no sitrep. + return Status::NoSitrep; + } + }, + Err(err) => { + let err = InlineErrorChain::new(&err); + warn!( + log, + "failed to read current sitrep version"; + &err + ); + return Status::Error(format!( + "failed to read current sitrep version: {err}" + )); + } + }; + + // Have we already loaded this sitrep? + match old { + Some(version) if version.id == current_version.id => { + debug!(log, "current sitrep has not changed"); + return Status::Loaded { version, time_loaded }; + } + Some(old) if current_version.version < old.version => { + warn!( + log, + "current sitrep version v{} is less than the previously \ + loaded version v{}; ignoring it", + current_version.version, + old.version, + ); + return Status::Error(format!( + "current sitrep version v{} is less than the previously \ + loaded version v{}; ignoring it", + current_version.version, old.version, + )); + } + Some(SitrepVersion { version, id, .. }) + if version == current_version.version + && id != current_version.id => + { + // Well, this is weird! Entries in the `sitrep_version` table + // should not change IDs once they are created, that seems like + // a bug. Nonetheless, we will load the new UUID, but we should + // say something about this, as it's a bit odd. 
+                warn!(
+                    log,
+                    "sitrep ID associated with the current version in the \
+                     database has changed; this is not supposed to happen!";
+                    "current_id" => ?current_version.id,
+                );
+            }
+            _ => (),
+        }
+
+        let sitrep = match self
+            .datastore
+            .fm_sitrep_read(opctx, current_version.id)
+            .await
+        {
+            Ok(sitrep) => sitrep,
+            Err(err) => {
+                let err = InlineErrorChain::new(&err);
+                error!(
+                    log,
+                    "failed to load current sitrep";
+                    "current_id" => ?current_version.id,
+                    "current_version" => ?current_version.version,
+                    &err
+                );
+                return Status::Error(format!(
+                    "failed to read current sitrep {:?} (v{}): {err}",
+                    current_version.id, current_version.version
+                ));
+            }
+        };
+
+        let sitrep = Arc::new((current_version.clone(), sitrep));
+        self.tx.send_modify(|s| {
+            *s = Some(sitrep);
+        });
+
+        Status::Loaded { version: current_version, time_loaded }
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+    use crate::app::background::BackgroundTask;
+    use nexus_db_queries::db::pub_test_utils::TestDatabase;
+    use nexus_types::fm::SitrepMetadata;
+    use omicron_test_utils::dev;
+    use omicron_uuid_kinds::CollectionUuid;
+    use omicron_uuid_kinds::OmicronZoneUuid;
+    use omicron_uuid_kinds::SitrepUuid;
+
+    #[tokio::test]
+    async fn test_load_sitreps() {
+        let logctx = dev::test_setup_log("test_inventory_loader");
+        let db = TestDatabase::new_with_datastore(&logctx.log).await;
+        let (opctx, datastore) = (db.opctx(), db.datastore());
+
+        let (tx, mut sitrep_rx) = watch::channel(None);
+        let mut task = SitrepLoader::new(datastore.clone(), tx);
+
+        // Initially, there should be no sitrep.
+        let status = task.activate(&opctx).await;
+        assert_eq!(*sitrep_rx.borrow_and_update(), None);
+        let status = serde_json::from_value::<Status>(status).unwrap();
+        assert_eq!(status, Status::NoSitrep);
+
+        // Now, create an initial sitrep.
+        let sitrep1_id = SitrepUuid::new_v4();
+        let sitrep1 = Sitrep {
+            metadata: SitrepMetadata {
+                id: sitrep1_id,
+                inv_collection_id: CollectionUuid::new_v4(),
+                parent_sitrep_id: None,
+                creator_id: OmicronZoneUuid::new_v4(),
+                comment: "test sitrep 1".to_string(),
+                time_created: Utc::now(),
+            },
+        };
+        datastore
+            .fm_sitrep_insert(&opctx, &sitrep1)
+            .await
+            .expect("sitrep should be inserted successfully");
+
+        // It should be loaded.
+        let status = task.activate(&opctx).await;
+        assert_eq!(
+            true,
+            sitrep_rx.has_changed().unwrap(),
+            "sitrep watch should have changed when a sitrep was loaded"
+        );
+        let snapshot = sitrep_rx
+            .borrow_and_update()
+            .clone()
+            .expect("the new sitrep should have been loaded");
+        let (ref loaded_version1, ref loaded_sitrep) = *snapshot;
+        // N.B.: we just compare the IDs here as comparing the whole struct may
+        // not be equal, since the `time_created` field may have been rounded in
+        // CRDB. Which is a shame, but whatever. :/
+        assert_eq!(loaded_sitrep.metadata.id, sitrep1.metadata.id);
+        dbg!(loaded_version1);
+        let status = serde_json::from_value::<Status>(status).unwrap();
+        match status {
+            Status::Loaded { version, .. } => {
+                assert_eq!(&version, loaded_version1);
+            }
+            status => panic!("expected Status::Loaded, got {status:?}",),
+        };
+
+        // A subsequent activation should see the same sitrep.
+        let status = task.activate(&opctx).await;
+        assert_eq!(
+            false,
+            sitrep_rx.has_changed().unwrap(),
+            "sitrep watch should not change if the same sitrep was loaded"
+        );
+        let snapshot = sitrep_rx
+            .borrow_and_update()
+            .clone()
+            .expect("the same should have been loaded");
+        let (ref loaded_version2, ref loaded_sitrep) = *snapshot;
+        assert_eq!(loaded_sitrep.metadata.id, sitrep1.metadata.id);
+        dbg!(loaded_version1, loaded_version2);
+        let status = serde_json::from_value::<Status>(status).unwrap();
+        match status {
+            Status::Loaded { version, .. } => {
+                assert_eq!(&version, loaded_version2);
+            }
+            status => panic!("expected Status::Loaded, got {status:?}",),
+        };
+
+        // Now, create a new sitrep.
+        let sitrep2_id = SitrepUuid::new_v4();
+        let sitrep2 = Sitrep {
+            metadata: SitrepMetadata {
+                id: sitrep2_id,
+                inv_collection_id: CollectionUuid::new_v4(),
+                parent_sitrep_id: Some(sitrep1_id),
+                creator_id: OmicronZoneUuid::new_v4(),
+                comment: "test sitrep 2".to_string(),
+                time_created: Utc::now(),
+            },
+        };
+        datastore
+            .fm_sitrep_insert(&opctx, &sitrep2)
+            .await
+            .expect("sitrep2 should be inserted successfully");
+
+        // It should be loaded.
+        let status = task.activate(&opctx).await;
+        assert_eq!(
+            true,
+            sitrep_rx.has_changed().unwrap(),
+            "loading a new sitrep should update the watch"
+        );
+        let snapshot = sitrep_rx
+            .borrow_and_update()
+            .clone()
+            .expect("the new sitrep should have been loaded");
+        let (ref loaded_version3, ref loaded_sitrep) = *snapshot;
+        assert_eq!(loaded_sitrep.metadata.id, sitrep2.metadata.id);
+        dbg!(loaded_version3);
+        assert_ne!(loaded_version3, loaded_version2);
+        let status = serde_json::from_value::<Status>(status).unwrap();
+        match status {
+            Status::Loaded { version, .. } => {
+                assert_eq!(&version, loaded_version3);
+            }
+            status => panic!("expected Status::Loaded, got {status:?}",),
+        };
+
+        // XXX(eliza): It would be nice to also be able to test that an orphaned
+        // sitrep (which has not been linked into the sitrep history chain) is
+        // *not* loaded even if it exists. However, that would require
+        // `nexus-db-queries` to expose separate interfaces for creating a
+        // sitrep and inserting it into the history, which I have intentionally
+        // chosen *not* to do to make it harder to do it by mistake.
+        // So, ¯\_(ツ)_/¯
+
+        // Cleanup
+        db.terminate().await;
+        logctx.cleanup_successful();
+    }
+}
diff --git a/nexus/src/app/background/tasks/mod.rs b/nexus/src/app/background/tasks/mod.rs
index 6ec34c5b2b..ca9e431463 100644
--- a/nexus/src/app/background/tasks/mod.rs
+++ b/nexus/src/app/background/tasks/mod.rs
@@ -18,6 +18,7 @@ pub mod dns_propagation;
 pub mod dns_servers;
 pub mod ereport_ingester;
 pub mod external_endpoints;
+pub mod fm_sitrep_load;
 pub mod instance_reincarnation;
 pub mod instance_updater;
 pub mod instance_watcher;
diff --git a/nexus/src/app/mod.rs b/nexus/src/app/mod.rs
index f4cf081305..b977690e8f 100644
--- a/nexus/src/app/mod.rs
+++ b/nexus/src/app/mod.rs
@@ -30,6 +30,7 @@ use nexus_mgs_updates::ArtifactCache;
 use nexus_mgs_updates::MgsUpdateDriver;
 use nexus_types::deployment::PendingMgsUpdates;
 use nexus_types::deployment::ReconfiguratorConfigParam;
+use nexus_types::fm;
 use omicron_common::address::MGD_PORT;
 use omicron_common::address::MGS_PORT;
 use omicron_common::api::external::ByteCount;
@@ -286,6 +287,11 @@ pub struct Nexus {
     #[allow(dead_code)]
     repo_depot_resolver: Box,
+    /// Watch channel containing the currently-loaded fault management sitrep.
+    #[allow(dead_code)]
+    sitrep_load_rx:
+        watch::Receiver<Option<Arc<(fm::SitrepVersion, fm::Sitrep)>>>,
+
     /// handle to pull update status data
     update_status: UpdateStatusHandle,
@@ -485,6 +491,8 @@ impl Nexus {
         let mgs_update_status_rx = mgs_update_driver.status_rx();
         let _mgs_driver_task = tokio::spawn(mgs_update_driver.run());

+        let (sitrep_load_tx, sitrep_load_rx) = watch::channel(None);
+
         let nexus = Nexus {
             id: config.deployment.id,
             rack_id,
@@ -540,6 +548,7 @@ impl Nexus {
             repo_depot_resolver,
             update_status: UpdateStatusHandle::new(blueprint_load_rx),
             quiesce,
+            sitrep_load_rx,
         };

         // TODO-cleanup all the extra Arcs here seems wrong
@@ -624,6 +633,7 @@ impl Nexus {
                 tuf_artifact_replication_rx,
                 mgs_updates_tx,
                 blueprint_load_tx,
+                sitrep_load_tx,
             },
         );
diff --git a/nexus/tests/config.test.toml b/nexus/tests/config.test.toml
index e629fd435c..f9abc06cfd 100644
--- a/nexus/tests/config.test.toml
+++ b/nexus/tests/config.test.toml
@@ -192,6 +192,10 @@ webhook_deliverator.first_retry_backoff_secs = 10
 webhook_deliverator.second_retry_backoff_secs = 20
 read_only_region_replacement_start.period_secs = 999999
 sp_ereport_ingester.period_secs = 30
+# How frequently to check for a new fault management sitrep (made by any Nexus).
+# This is cheap, so we should check frequently.
+fm.sitrep_load_period_secs = 15
+
 [default_region_allocation_strategy]
 # we only have one sled in the test environment, so we need to use the
diff --git a/nexus/types/src/fm.rs b/nexus/types/src/fm.rs
new file mode 100644
index 0000000000..466a277edc
--- /dev/null
+++ b/nexus/types/src/fm.rs
@@ -0,0 +1,96 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at https://mozilla.org/MPL/2.0/.
+
+//! Fault management types.
+//!
+//! Of particular importance is the [`Sitrep`], which is the top-level data
+//! structure containing fault management state.
+
+use chrono::{DateTime, Utc};
+use omicron_uuid_kinds::{CollectionUuid, OmicronZoneUuid, SitrepUuid};
+use schemars::JsonSchema;
+use serde::{Deserialize, Serialize};
+
+/// A fault management situation report, or _sitrep_.
+///
+/// The sitrep is a data structure that represents a snapshot of the state of
+/// the system as understood by the control plane's fault management subsystem.
+/// At any point in time, a single sitrep is considered the "current" sitrep.
+/// Each sitrep records a _parent sitrep ID_, which indicates the sitrep that
+/// was current at the time that the sitrep was created.
+/// A sitrep may only be made current if its parent is the current sitrep.
+/// This ensures that there is a sequentially consistent history of sitreps.
+/// The fault management subsystem only considers data from the current sitrep
+/// when making decisions and diagnoses.
+///
+/// The sitrep, how it is represented in the database, and how the fault
+/// management subsystem creates and interacts with sitreps, is described in
+/// detail in [RFD 603](https://rfd.shared.oxide.computer/rfd/0603).
+#[derive(Clone, Debug, Eq, PartialEq, JsonSchema, Deserialize, Serialize)]
+pub struct Sitrep {
+    /// Metadata describing this sitrep, when it was created, its parent sitrep
+    /// ID, and which Nexus produced it.
+    pub metadata: SitrepMetadata,
+    // TODO(eliza): draw the rest of the sitrep
+}
+
+impl Sitrep {
+    pub fn id(&self) -> SitrepUuid {
+        self.metadata.id
+    }
+
+    pub fn parent_id(&self) -> Option<SitrepUuid> {
+        self.metadata.parent_sitrep_id
+    }
+}
+
+/// Metadata describing a sitrep.
+///
+/// This corresponds to the records stored in the `fm_sitrep` database table.
+#[derive(Clone, Debug, Eq, PartialEq, JsonSchema, Deserialize, Serialize)]
+pub struct SitrepMetadata {
+    /// The ID of this sitrep.
+    pub id: SitrepUuid,
+
+    /// The ID of the parent sitrep.
+    ///
+    /// A sitrep's _parent_ is the sitrep that was current when the planning
+    /// phase that produced that sitrep ran. The parent sitrep is a planning
+    /// input that produced this sitrep.
+    ///
+    /// The parent sitrep ID is optional, because this sitrep _may_ be the first
+    /// sitrep ever generated by the system. However, once a current sitrep has
+    /// been set, no subsequent sitrep should be created without a parent.
+    pub parent_sitrep_id: Option<SitrepUuid>,
+
+    /// The ID of the inventory collection that was used as planning input to
+    /// this sitrep.
+    ///
+    /// When generating a new sitrep, the fault manager should ensure that the
+    /// inventory collection it uses as input is at least as new as the parent
+    /// sitrep's inventory collection.
+    pub inv_collection_id: CollectionUuid,
+
+    /// The Omicron zone UUID of the Nexus that generated this sitrep.
+    ///
+    /// This is intended for debugging purposes.
+    pub creator_id: OmicronZoneUuid,
+
+    /// A human-readable (but mechanically generated) string describing the
+    /// reason(s) this sitrep was created.
+    ///
+    /// This is intended for debugging purposes.
+    pub comment: String,
+
+    /// The time at which this sitrep was created.
+    pub time_created: DateTime<Utc>,
+}
+
+/// An entry in the sitrep version history.
+#[derive(Clone, Debug, Eq, PartialEq, JsonSchema, Deserialize, Serialize)]
+pub struct SitrepVersion {
+    pub id: SitrepUuid,
+    pub version: u32,
+    pub time_made_current: DateTime<Utc>,
+}
diff --git a/nexus/types/src/internal_api/background.rs b/nexus/types/src/internal_api/background.rs
index 6fcbed2eb7..5e3ccbe0c8 100644
--- a/nexus/types/src/internal_api/background.rs
+++ b/nexus/types/src/internal_api/background.rs
@@ -757,6 +757,19 @@ pub struct EreporterStatus {
     pub errors: Vec,
 }

+/// The status of a `fm_sitrep_loader` background task activation.
+#[derive(Clone, Debug, Deserialize, Serialize, PartialEq, Eq)]
+pub enum SitrepLoadStatus {
+    /// An error occurred.
+    Error(String),
+
+    /// There is no current sitrep.
+    NoSitrep,
+
+    /// We've loaded the most recent sitrep as of `time_loaded`.
+    Loaded { version: crate::fm::SitrepVersion, time_loaded: DateTime<Utc> },
+}
+
 #[cfg(test)]
 mod test {
     use super::TufRepoInfo;
diff --git a/nexus/types/src/lib.rs b/nexus/types/src/lib.rs
index 3a4056f7f4..fd1f2243bf 100644
--- a/nexus/types/src/lib.rs
+++ b/nexus/types/src/lib.rs
@@ -32,6 +32,7 @@ pub mod authn;
 pub mod deployment;
 pub mod external_api;
+pub mod fm;
 pub mod identity;
 pub mod internal_api;
 pub mod inventory;
diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql
index ca22be8fd9..083e0078c1 100644
--- a/schema/crdb/dbinit.sql
+++ b/schema/crdb/dbinit.sql
@@ -6767,6 +6767,73 @@ ON omicron.public.host_ereport (
 ) WHERE time_deleted IS NULL;

+/*
+ * Fault management situation reports (and accessories)
+ *
+ * See RFD 603 for details:
+ * https://rfd.shared.oxide.computer/rfd/603
+*/
+
+CREATE TABLE IF NOT EXISTS omicron.public.fm_sitrep (
+    -- The ID of this sitrep.
+    id UUID PRIMARY KEY,
+    -- The ID of the parent sitrep.
+    --
+    -- A sitrep's _parent_ is the sitrep that was current when the planning
+    -- phase that produced that sitrep ran. The parent sitrep is a planning
+    -- input that produced this sitrep.
+    --
+    -- This is effectively a foreign key back to this table; however, it is
+    -- allowed to be NULL: the initial sitrep has no parent. Additionally,
+    -- it may be non-NULL but no longer reference a row in this table: once a
+    -- child sitrep has been created from a parent, it's possible for the
+    -- parent to be deleted. We do not NULL out this field on such a deletion,
+    -- so we can always see that there had been a particular parent even if
+    -- it's now gone.
+    parent_sitrep_id UUID,
+    -- The ID of the inventory collection that was used as input to this
+    -- sitrep.
+    --
+    -- This is a foreign key that references a row in the `inv_collection`
+    -- table (and other inventory records associated with that collection).
+    --
+    -- Note that inventory collections are pruned on a separate schedule
+    -- from sitreps, so the inventory collection records may not exist.
+    inv_collection_id UUID NOT NULL,
+
+    -- These fields are not semantically meaningful and are intended for
+    -- debugging purposes.
+
+    -- The time at which this sitrep was created.
+    time_created TIMESTAMPTZ NOT NULL,
+    -- The Omicron zone UUID of the Nexus instance that created this
+    -- sitrep.
+    creator_id UUID NOT NULL,
+    -- A human-readable description of the changes represented by this
+    -- sitrep.
+    comment TEXT NOT NULL
+);
+
+-- The history of current sitreps.
+--
+-- The sitrep with the highest `version` in this table is the current sitrep.
+CREATE TABLE IF NOT EXISTS omicron.public.fm_sitrep_history (
+    -- Monotonically increasing version for all FM sitreps.
+    version INT8 PRIMARY KEY,
+
+    -- Effectively a foreign key into the `fm_sitrep` table, but may
+    -- reference a fm_sitrep that has been deleted (if this sitrep is
+    -- no longer current; the current sitrep must not be deleted).
+    sitrep_id UUID NOT NULL,
+
+    -- Timestamp for when this sitrep was made current.
+    time_made_current TIMESTAMPTZ NOT NULL
+);
+
+CREATE UNIQUE INDEX IF NOT EXISTS
+    lookup_sitrep_version_by_id
+ON omicron.public.fm_sitrep_history (sitrep_id);
+
 -- Metadata for the schema itself.
 --
 -- This table may be read by Nexuses with different notions of "what the schema should be".
@@ -6854,7 +6921,7 @@ INSERT INTO omicron.public.db_metadata (
     version,
     target_version
 ) VALUES
-    (TRUE, NOW(), NOW(), '201.0.0', NULL)
+    (TRUE, NOW(), NOW(), '202.0.0', NULL)
 ON CONFLICT DO NOTHING;

 COMMIT;
diff --git a/schema/crdb/fm-sitrep/up01.sql b/schema/crdb/fm-sitrep/up01.sql
new file mode 100644
index 0000000000..41296e4232
--- /dev/null
+++ b/schema/crdb/fm-sitrep/up01.sql
@@ -0,0 +1,39 @@
+CREATE TABLE IF NOT EXISTS omicron.public.fm_sitrep (
+    -- The ID of this sitrep.
+    id UUID PRIMARY KEY,
+    -- The ID of the parent sitrep.
+    --
+    -- A sitrep's _parent_ is the sitrep that was current when the planning
+    -- phase that produced that sitrep ran. The parent sitrep is a planning
+    -- input that produced this sitrep.
+    --
+    -- This is effectively a foreign key back to this table; however, it is
+    -- allowed to be NULL: the initial sitrep has no parent. Additionally,
+    -- it may be non-NULL but no longer reference a row in this table: once a
+    -- child sitrep has been created from a parent, it's possible for the
+    -- parent to be deleted. We do not NULL out this field on such a deletion,
+    -- so we can always see that there had been a particular parent even if
+    -- it's now gone.
+    parent_sitrep_id UUID,
+    -- The ID of the inventory collection that was used as input to this
+    -- sitrep.
+    --
+    -- This is a foreign key that references a row in the `inv_collection`
+    -- table (and other inventory records associated with that collection).
+    --
+    -- Note that inventory collections are pruned on a separate schedule
+    -- from sitreps, so the inventory collection records may not exist.
+    inv_collection_id UUID NOT NULL,
+
+    -- These fields are not semantically meaningful and are intended for
+    -- debugging purposes.
+
+    -- The time at which this sitrep was created.
+    time_created TIMESTAMPTZ NOT NULL,
+    -- The Omicron zone UUID of the Nexus instance that created this
+    -- sitrep.
+    creator_id UUID NOT NULL,
+    -- A human-readable description of the changes represented by this
+    -- sitrep.
+    comment TEXT NOT NULL
+);
diff --git a/schema/crdb/fm-sitrep/up02.sql b/schema/crdb/fm-sitrep/up02.sql
new file mode 100644
index 0000000000..c78eb85eac
--- /dev/null
+++ b/schema/crdb/fm-sitrep/up02.sql
@@ -0,0 +1,15 @@
+-- The history of current sitreps.
+--
+-- The sitrep with the highest `version` in this table is the current sitrep.
+CREATE TABLE IF NOT EXISTS omicron.public.fm_sitrep_history (
+    -- Monotonically increasing version for all FM sitreps.
+    version INT8 PRIMARY KEY,
+
+    -- Effectively a foreign key into the `fm_sitrep` table, but may
+    -- reference a fm_sitrep that has been deleted (if this sitrep is
+    -- no longer current; the current sitrep must not be deleted).
+    sitrep_id UUID NOT NULL,
+
+    -- Timestamp for when this sitrep was made current.
+    time_made_current TIMESTAMPTZ NOT NULL
+);
diff --git a/schema/crdb/fm-sitrep/up03.sql b/schema/crdb/fm-sitrep/up03.sql
new file mode 100644
index 0000000000..91cd68adf6
--- /dev/null
+++ b/schema/crdb/fm-sitrep/up03.sql
@@ -0,0 +1,3 @@
+CREATE UNIQUE INDEX IF NOT EXISTS
+    lookup_sitrep_version_by_id
+ON omicron.public.fm_sitrep_history (sitrep_id);
diff --git a/smf/nexus/multi-sled/config-partial.toml b/smf/nexus/multi-sled/config-partial.toml
index 14b0281cc2..859539452a 100644
--- a/smf/nexus/multi-sled/config-partial.toml
+++ b/smf/nexus/multi-sled/config-partial.toml
@@ -94,6 +94,10 @@ read_only_region_replacement_start.period_secs = 30
 alert_dispatcher.period_secs = 60
 webhook_deliverator.period_secs = 60
 sp_ereport_ingester.period_secs = 30
+# How frequently to check for a new fault management sitrep (made by any
+# Nexus).
+# This is cheap, so we should check frequently.
+fm.sitrep_load_period_secs = 15

 [default_region_allocation_strategy]
 # by default, allocate across 3 distinct sleds
diff --git a/smf/nexus/single-sled/config-partial.toml b/smf/nexus/single-sled/config-partial.toml
index 32e20ee79f..9aa67515d7 100644
--- a/smf/nexus/single-sled/config-partial.toml
+++ b/smf/nexus/single-sled/config-partial.toml
@@ -94,6 +94,10 @@ read_only_region_replacement_start.period_secs = 30
 alert_dispatcher.period_secs = 60
 webhook_deliverator.period_secs = 60
 sp_ereport_ingester.period_secs = 30
+# How frequently to check for a new fault management sitrep (made by any
+# Nexus).
+# This is cheap, so we should check frequently.
+fm.sitrep_load_period_secs = 15

 [default_region_allocation_strategy]
 # by default, allocate without requirement for distinct sleds.
diff --git a/uuid-kinds/src/lib.rs b/uuid-kinds/src/lib.rs
index 354c677149..54c497a5d2 100644
--- a/uuid-kinds/src/lib.rs
+++ b/uuid-kinds/src/lib.rs
@@ -74,6 +74,7 @@ impl_typed_uuid_kinds! {
     Region = {},
     SiloGroup = {},
     SiloUser = {},
+    Sitrep = {},
     Sled = {},
     SpUpdate = {},
     SupportBundle = {},
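As a rough usage sketch of the watch channel added above: a consumer holding the receiver half (for example, one obtained from `SitrepLoader::watcher()`) can read the most recently loaded sitrep without a database query. The helper below is illustrative only and is not part of this change; the function name and its use outside a background task are assumptions.

// Illustrative only -- not part of this patch. Assumes the `Sitrep` and
// `SitrepVersion` types and the watch-channel shape introduced above.
use std::sync::Arc;

use nexus_types::fm::{Sitrep, SitrepVersion};
use tokio::sync::watch;

/// Summarizes the currently loaded sitrep, if one has been loaded.
fn current_sitrep_summary(
    rx: &watch::Receiver<Option<Arc<(SitrepVersion, Sitrep)>>>,
) -> Option<String> {
    // `borrow()` holds a read lock on the channel contents, so clone the
    // `Arc` out and let the guard drop before formatting.
    let current = rx.borrow().clone()?;
    let (version, sitrep) = &*current;
    Some(format!(
        "sitrep {:?} (v{}), made current at {}",
        sitrep.id(),
        version.version,
        version.time_made_current
    ))
}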