diff --git a/Cargo.lock b/Cargo.lock index af4763cdfc0..c98870575c3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6883,6 +6883,28 @@ dependencies = [ "uuid", ] +[[package]] +name = "nexus-fm" +version = "0.1.0" +dependencies = [ + "anyhow", + "chrono", + "ereport-types", + "iddqd", + "nexus-reconfigurator-planning", + "nexus-types", + "omicron-test-utils", + "omicron-uuid-kinds", + "omicron-workspace-hack", + "rand 0.9.2", + "schemars 0.8.22", + "serde", + "serde_json", + "slog", + "slog-error-chain", + "typed-rng", +] + [[package]] name = "nexus-internal-api" version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml index 4c38098a7b6..04daf4c2aa4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -86,6 +86,7 @@ members = [ "nexus/db-schema", "nexus/defaults", "nexus/external-api", + "nexus/fm", "nexus/internal-api", "nexus/inventory", "nexus/lockstep-api", diff --git a/dev-tools/omdb/src/bin/omdb/db/sitrep.rs b/dev-tools/omdb/src/bin/omdb/db/sitrep.rs index 439798d577b..4fe302465de 100644 --- a/dev-tools/omdb/src/bin/omdb/db/sitrep.rs +++ b/dev-tools/omdb/src/bin/omdb/db/sitrep.rs @@ -238,7 +238,7 @@ async fn cmd_db_sitrep_show( } }; - let fm::Sitrep { metadata } = sitrep; + let fm::Sitrep { metadata, cases } = sitrep; let fm::SitrepMetadata { id, creator_id, @@ -345,5 +345,12 @@ async fn cmd_db_sitrep_show( } } + if !cases.is_empty() { + println!("\n{:-<80}\n", "== CASES"); + for case in cases { + println!("{}", case.display_indented(4, Some(id))); + } + } + Ok(()) } diff --git a/ereport/types/src/lib.rs b/ereport/types/src/lib.rs index af77d9297b5..440a3dd78e5 100644 --- a/ereport/types/src/lib.rs +++ b/ereport/types/src/lib.rs @@ -32,6 +32,7 @@ pub struct Ereport { Serialize, Deserialize, JsonSchema, + Hash, )] #[repr(transparent)] #[serde(from = "u64", into = "u64")] @@ -102,7 +103,18 @@ impl TryFrom<u64> for Ena { } /// Unique identifier for an ereport. -#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[derive( + Debug, + Clone, + Copy, + PartialEq, + Eq, + Serialize, + Deserialize, + PartialOrd, + Ord, + Hash, +)] pub struct EreportId { pub restart_id: EreporterRestartUuid, pub ena: Ena, diff --git a/nexus-config/src/nexus_config.rs b/nexus-config/src/nexus_config.rs index 4885362ed36..bc43209d59c 100644 --- a/nexus-config/src/nexus_config.rs +++ b/nexus-config/src/nexus_config.rs @@ -930,6 +930,11 @@ pub struct FmTasksConfig { /// garbage collects unneeded fault management sitreps in the database. #[serde_as(as = "DurationSeconds<u64>")] pub sitrep_gc_period_secs: Duration, + /// period (in seconds) for periodic activations of the background task that + /// updates externally-visible database tables to match the current situation + /// report. + #[serde_as(as = "DurationSeconds<u64>")] + pub rendezvous_period_secs: Duration, } impl Default for FmTasksConfig { @@ -940,6 +945,9 @@ impl Default for FmTasksConfig { // time the current sitrep changes, and activating it more // frequently won't make things more responsive. sitrep_gc_period_secs: Duration::from_secs(600), + // This, too, is activated whenever a new sitrep is loaded, so we + // need not set the periodic activation interval too low.
+ rendezvous_period_secs: Duration::from_secs(300), } } } @@ -1281,6 +1289,7 @@ mod test { fm.sitrep_gc_period_secs = 49 probe_distributor.period_secs = 50 multicast_reconciler.period_secs = 60 + fm.rendezvous_period_secs = 51 [default_region_allocation_strategy] type = "random" seed = 0 @@ -1529,6 +1538,7 @@ mod test { fm: FmTasksConfig { sitrep_load_period_secs: Duration::from_secs(48), sitrep_gc_period_secs: Duration::from_secs(49), + rendezvous_period_secs: Duration::from_secs(51), }, probe_distributor: ProbeDistributorConfig { period_secs: Duration::from_secs(50), @@ -1640,6 +1650,7 @@ mod test { fm.sitrep_load_period_secs = 45 fm.sitrep_gc_period_secs = 46 probe_distributor.period_secs = 47 + fm.rendezvous_period_secs = 48 multicast_reconciler.period_secs = 60 [default_region_allocation_strategy] diff --git a/nexus/background-task-interface/src/init.rs b/nexus/background-task-interface/src/init.rs index bcbd2443b2a..114be16053f 100644 --- a/nexus/background-task-interface/src/init.rs +++ b/nexus/background-task-interface/src/init.rs @@ -51,6 +51,7 @@ pub struct BackgroundTasks { pub task_webhook_deliverator: Activator, pub task_sp_ereport_ingester: Activator, pub task_reconfigurator_config_loader: Activator, + pub task_fm_rendezvous: Activator, pub task_fm_sitrep_loader: Activator, pub task_fm_sitrep_gc: Activator, pub task_probe_distributor: Activator, diff --git a/nexus/db-model/src/alert_class.rs b/nexus/db-model/src/alert_class.rs index 5f0b2129707..39004961b9b 100644 --- a/nexus/db-model/src/alert_class.rs +++ b/nexus/db-model/src/alert_class.rs @@ -4,6 +4,7 @@ use super::impl_enum_type; use nexus_types::external_api::views; +use omicron_common::api::external::Error; use serde::de::{self, Deserialize, Deserializer}; use serde::ser::{Serialize, Serializer}; use std::fmt; @@ -30,6 +31,8 @@ impl_enum_type!( TestFooBaz => b"test.foo.baz" TestQuuxBar => b"test.quux.bar" TestQuuxBarBaz => b"test.quux.bar.baz" + PsuInserted => b"hw.insert.power.power_shelf.psu" + PsuRemoved => b"hw.remove.power.power_shelf.psu" ); impl AlertClass { @@ -44,6 +47,8 @@ impl AlertClass { Self::TestFooBaz => "test.foo.baz", Self::TestQuuxBar => "test.quux.bar", Self::TestQuuxBarBaz => "test.quux.bar.baz", + Self::PsuInserted => "hw.insert.power.power_shelf.psu", + Self::PsuRemoved => "hw.remove.power.power_shelf.psu", } } @@ -76,6 +81,12 @@ impl AlertClass { | Self::TestQuuxBarBaz => { "This is a test of the emergency alert system" } + Self::PsuInserted => { + "A power supply unit (PSU) has been inserted into the power shelf" + } + Self::PsuRemoved => { + "A power supply unit (PSU) has been removed from the power shelf" + } } } @@ -84,6 +95,32 @@ impl AlertClass { <Self as strum::VariantArray>::VARIANTS; } +impl From<nexus_types::fm::AlertClass> for AlertClass { + fn from(input: nexus_types::fm::AlertClass) -> Self { + use nexus_types::fm::AlertClass as In; + match input { + In::PsuRemoved => Self::PsuRemoved, + In::PsuInserted => Self::PsuInserted, + } + } +} + +impl TryFrom<AlertClass> for nexus_types::fm::AlertClass { + type Error = Error; + + fn try_from(input: AlertClass) -> Result<Self, Self::Error> { + use nexus_types::fm::AlertClass as Out; + match input { + AlertClass::PsuRemoved => Ok(Out::PsuRemoved), + AlertClass::PsuInserted => Ok(Out::PsuInserted), + class => Err(Error::invalid_value( + "alert_class", + format!("'{class}' is not an FM alert class"), + )), + } + } +} + impl fmt::Display for AlertClass { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { write!(f, "{}", self.as_str()) diff --git a/nexus/db-model/src/fm.rs b/nexus/db-model/src/fm.rs index d9d7ac3c2dc..353a2174aac 
100644 --- a/nexus/db-model/src/fm.rs +++ b/nexus/db-model/src/fm.rs @@ -19,6 +19,13 @@ use chrono::{DateTime, Utc}; use nexus_db_schema::schema::{fm_sitrep, fm_sitrep_history}; use omicron_uuid_kinds::{CollectionKind, OmicronZoneKind, SitrepKind}; +mod alert_request; +pub use alert_request::*; +mod case; +pub use case::*; +mod diagnosis_engine; +pub use diagnosis_engine::*; + #[derive(Queryable, Insertable, Clone, Debug, Selectable)] #[diesel(table_name = fm_sitrep)] pub struct SitrepMetadata { diff --git a/nexus/db-model/src/fm/alert_request.rs b/nexus/db-model/src/fm/alert_request.rs new file mode 100644 index 00000000000..551085aa065 --- /dev/null +++ b/nexus/db-model/src/fm/alert_request.rs @@ -0,0 +1,55 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Fault management alert requests. + +use crate::AlertClass; +use crate::DbTypedUuid; +use nexus_db_schema::schema::fm_alert_request; +use nexus_types::fm; +use omicron_uuid_kinds::{ + AlertKind, CaseKind, CaseUuid, SitrepKind, SitrepUuid, +}; + +#[derive(Queryable, Insertable, Clone, Debug, Selectable)] +#[diesel(table_name = fm_alert_request)] +pub struct AlertRequest { + pub id: DbTypedUuid<AlertKind>, + pub sitrep_id: DbTypedUuid<SitrepKind>, + pub requested_sitrep_id: DbTypedUuid<SitrepKind>, + pub case_id: DbTypedUuid<CaseKind>, + #[diesel(column_name = "class")] + pub class: AlertClass, + pub payload: serde_json::Value, +} + +impl AlertRequest { + pub fn new( + current_sitrep_id: SitrepUuid, + case_id: CaseUuid, + req: fm::AlertRequest, + ) -> Self { + let fm::AlertRequest { id, requested_sitrep_id, payload, class } = req; + AlertRequest { + id: id.into(), + sitrep_id: current_sitrep_id.into(), + requested_sitrep_id: requested_sitrep_id.into(), + case_id: case_id.into(), + class: class.into(), + payload, + } + } +} + +impl TryFrom<AlertRequest> for fm::AlertRequest { + type Error = <fm::AlertClass as TryFrom<AlertClass>>::Error; + fn try_from(req: AlertRequest) -> Result<Self, Self::Error> { + Ok(fm::AlertRequest { + id: req.id.into(), + requested_sitrep_id: req.requested_sitrep_id.into(), + payload: req.payload, + class: req.class.try_into()?, + }) + } +} diff --git a/nexus/db-model/src/fm/case.rs b/nexus/db-model/src/fm/case.rs new file mode 100644 index 00000000000..706f3ddfa84 --- /dev/null +++ b/nexus/db-model/src/fm/case.rs @@ -0,0 +1,168 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Fault management cases. 
+ +use super::AlertRequest; +use super::DiagnosisEngine; +use crate::DbTypedUuid; +use crate::SpMgsSlot; +use crate::SpType; +use crate::ereport; +use chrono::{DateTime, Utc}; +use nexus_db_schema::schema::{ + fm_case, fm_case_impacts_location, fm_ereport_in_case, +}; +use nexus_types::fm; +use omicron_uuid_kinds::{ + CaseKind, EreporterRestartKind, SitrepKind, SitrepUuid, +}; + +#[derive(Queryable, Insertable, Clone, Debug, Selectable)] +#[diesel(table_name = fm_case)] +pub struct CaseMetadata { + pub id: DbTypedUuid<CaseKind>, + pub sitrep_id: DbTypedUuid<SitrepKind>, + pub de: DiagnosisEngine, + + pub created_sitrep_id: DbTypedUuid<SitrepKind>, + pub time_created: DateTime<Utc>, + + pub time_closed: Option<DateTime<Utc>>, + pub closed_sitrep_id: Option<DbTypedUuid<SitrepKind>>, + + pub comment: String, +} + +#[derive(Queryable, Insertable, Clone, Debug, Selectable)] +#[diesel(table_name = fm_ereport_in_case)] +pub struct CaseEreport { + pub restart_id: DbTypedUuid<EreporterRestartKind>, + pub ena: ereport::DbEna, + pub case_id: DbTypedUuid<CaseKind>, + pub sitrep_id: DbTypedUuid<SitrepKind>, + pub assigned_sitrep_id: DbTypedUuid<SitrepKind>, + pub comment: String, +} + +#[derive(Queryable, Insertable, Clone, Debug, Selectable)] +#[diesel(table_name = fm_case_impacts_location)] +pub struct CaseImpactsLocation { + pub sitrep_id: DbTypedUuid<SitrepKind>, + pub case_id: DbTypedUuid<CaseKind>, + pub sp_type: SpType, + pub sp_slot: SpMgsSlot, + pub created_sitrep_id: DbTypedUuid<SitrepKind>, + pub comment: String, +} + +#[derive(Clone, Debug)] +pub struct Case { + pub metadata: CaseMetadata, + pub ereports: Vec<CaseEreport>, + pub impacted_locations: Vec<CaseImpactsLocation>, + pub alerts_requested: Vec<AlertRequest>, +} + +impl Case { + pub fn from_sitrep(sitrep_id: SitrepUuid, case: fm::Case) -> Self { + let sitrep_id = sitrep_id.into(); + let case_id = case.id.into(); + let ereports = case + .ereports + .into_iter() + .map( + |fm::case::CaseEreport { + ereport, + assigned_sitrep_id, + comment, + }| { + let restart_id = ereport.id().restart_id.into(); + let ena = ereport.id().ena.into(); + CaseEreport { + case_id, + restart_id, + ena, + comment, + sitrep_id, + assigned_sitrep_id: assigned_sitrep_id.into(), + } + }, + ) + .collect(); + let impacted_locations = case + .impacted_locations + .into_iter() + .map( + |fm::case::ImpactedLocation { + sp_type, + slot, + comment, + created_sitrep_id, + }| CaseImpactsLocation { + sitrep_id, + case_id, + sp_type: sp_type.into(), + sp_slot: SpMgsSlot::from(slot as u16), + created_sitrep_id: created_sitrep_id.into(), + comment, + }, + ) + .collect(); + let alerts_requested = case + .alerts_requested + .into_iter() + .map( + |fm::AlertRequest { + id, + class, + payload, + requested_sitrep_id, + }| AlertRequest { + sitrep_id, + case_id, + class: class.into(), + id: id.into(), + payload, + requested_sitrep_id: requested_sitrep_id.into(), + }, + ) + .collect(); + + Self { + metadata: CaseMetadata { + id: case_id, + sitrep_id, + de: case.de.into(), + created_sitrep_id: case.created_sitrep_id.into(), + time_created: case.time_created.into(), + time_closed: case.time_closed.map(Into::into), + closed_sitrep_id: case.closed_sitrep_id.map(Into::into), + comment: case.comment, + }, + ereports, + impacted_locations, + alerts_requested, + } + } } + +impl From<CaseImpactsLocation> for fm::case::ImpactedLocation { + fn from(loc: CaseImpactsLocation) -> Self { + let CaseImpactsLocation { + sitrep_id: _, + case_id: _, + sp_type, + sp_slot, + created_sitrep_id, + comment, + } = loc; + fm::case::ImpactedLocation { + sp_type: sp_type.into(), + slot: sp_slot.0.into(), + created_sitrep_id: created_sitrep_id.into(), + comment, + } + } +} diff --git a/nexus/db-model/src/fm/diagnosis_engine.rs 
b/nexus/db-model/src/fm/diagnosis_engine.rs new file mode 100644 index 00000000000..7d354142bbb --- /dev/null +++ b/nexus/db-model/src/fm/diagnosis_engine.rs @@ -0,0 +1,50 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +use crate::impl_enum_type; +use nexus_types::fm; +use serde::{Deserialize, Serialize}; +use std::fmt; + +impl_enum_type!( + DiagnosisEngineEnum: + + #[derive( + Copy, + Clone, + Debug, + PartialEq, + Serialize, + Deserialize, + AsExpression, + FromSqlRow, + )] + #[serde(rename_all = "snake_case")] + pub enum DiagnosisEngine; + + PowerShelf => b"power_shelf" + +); + +impl From<DiagnosisEngine> for fm::DiagnosisEngineKind { + fn from(de: DiagnosisEngine) -> Self { + match de { + DiagnosisEngine::PowerShelf => fm::DiagnosisEngineKind::PowerShelf, + } + } +} + +impl From<fm::DiagnosisEngineKind> for DiagnosisEngine { + fn from(fm_de: fm::DiagnosisEngineKind) -> Self { + match fm_de { + fm::DiagnosisEngineKind::PowerShelf => DiagnosisEngine::PowerShelf, + } + } +} + +impl fmt::Display for DiagnosisEngine { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fm::DiagnosisEngineKind::from(*self).fmt(f) + } +} diff --git a/nexus/db-model/src/lib.rs b/nexus/db-model/src/lib.rs index 95e14165dd6..692cf62b839 100644 --- a/nexus/db-model/src/lib.rs +++ b/nexus/db-model/src/lib.rs @@ -42,7 +42,7 @@ mod downstairs; pub mod ereport; mod ereporter_type; mod external_ip; -mod fm; +pub mod fm; mod generation; mod identity_provider; mod image; @@ -191,7 +191,7 @@ pub use downstairs::*; pub use ereport::Ereport; pub use ereporter_type::*; pub use external_ip::*; -pub use fm::*; +pub use fm::{SitrepMetadata, SitrepVersion}; pub use generation::*; pub use identity_provider::*; pub use image::*; diff --git a/nexus/db-queries/src/db/datastore/alert.rs b/nexus/db-queries/src/db/datastore/alert.rs index d0b06c33943..7f339046a98 100644 --- a/nexus/db-queries/src/db/datastore/alert.rs +++ b/nexus/db-queries/src/db/datastore/alert.rs @@ -11,6 +11,8 @@ use crate::db::model::AlertClass; use crate::db::model::AlertIdentity; use async_bb8_diesel::AsyncRunQueryDsl; use diesel::prelude::*; +use diesel::result::DatabaseErrorKind; +use diesel::result::Error as DieselError; use diesel::result::OptionalExtension; use nexus_db_errors::ErrorHandler; use nexus_db_errors::public_error_from_diesel; @@ -29,7 +31,7 @@ impl DataStore { payload: serde_json::Value, ) -> CreateResult<Alert> { let conn = self.pool_connection_authorized(&opctx).await?; - diesel::insert_into(alert_dsl::alert) + let alert = diesel::insert_into(alert_dsl::alert) .values(Alert { identity: AlertIdentity::new(id), time_dispatched: None, @@ -40,7 +42,23 @@ impl DataStore { .returning(Alert::as_returning()) .get_result_async(&*conn) .await - .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + .map_err(|e| match e { + DieselError::DatabaseError( + DatabaseErrorKind::UniqueViolation, + _, + ) => Error::conflict(format!("alert ID {id} already exists")), + e => public_error_from_diesel(e, ErrorHandler::Server), + })?; + + slog::debug!( + &opctx.log, + "published alert"; + "alert_id" => ?id, + "alert_class" => %class, + "time_created" => ?alert.identity.time_created, + ); + + Ok(alert) } pub async fn alert_select_next_for_dispatch( diff --git a/nexus/db-queries/src/db/datastore/ereport.rs b/nexus/db-queries/src/db/datastore/ereport.rs index 79fbe44a828..4f995447c35 100644 --- 
a/nexus/db-queries/src/db/datastore/ereport.rs +++ b/nexus/db-queries/src/db/datastore/ereport.rs @@ -98,6 +98,14 @@ impl DataStore { ) -> LookupResult<Ereport> { opctx.authorize(authz::Action::ListChildren, &authz::FLEET).await?; let conn = self.pool_connection_authorized(opctx).await?; + self.ereport_fetch_on_conn(&conn, id).await + } + + pub(crate) async fn ereport_fetch_on_conn( + &self, + conn: &async_bb8_diesel::Connection<DbConnection>, + id: fm::EreportId, + ) -> LookupResult<Ereport> { let restart_id = id.restart_id.into_untyped_uuid(); let ena = DbEna::from(id.ena); @@ -106,7 +114,7 @@ .filter(dsl::ena.eq(ena)) .filter(dsl::time_deleted.is_null()) .select(Ereport::as_select()) - .first_async(&*conn) + .first_async(conn) .await .optional() .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))? diff --git a/nexus/db-queries/src/db/datastore/fm.rs b/nexus/db-queries/src/db/datastore/fm.rs index f86351e3081..aa6d7e3f817 100.644 --- a/nexus/db-queries/src/db/datastore/fm.rs +++ b/nexus/db-queries/src/db/datastore/fm.rs @@ -12,9 +12,14 @@ use super::DataStore; use crate::authz; use crate::context::OpContext; use crate::db::datastore::RunnableQuery; +use crate::db::datastore::SQL_BATCH_SIZE; use crate::db::model; +use crate::db::model::DbTypedUuid; use crate::db::model::SqlU32; +use crate::db::model::ereport::DbEna; +use crate::db::pagination::Paginator; use crate::db::pagination::paginated; +use crate::db::pagination::paginated_multicolumn; use crate::db::raw_query_builder::QueryBuilder; use crate::db::raw_query_builder::TypedSqlQuery; use async_bb8_diesel::AsyncRunQueryDsl; @@ -26,6 +31,10 @@ use dropshot::PaginationOrder; use nexus_db_errors::ErrorHandler; use nexus_db_errors::public_error_from_diesel; use nexus_db_lookup::DbConnection; +use nexus_db_schema::schema::fm_alert_request::dsl as alert_req_dsl; +use nexus_db_schema::schema::fm_case::dsl as case_dsl; +use nexus_db_schema::schema::fm_case_impacts_location::dsl as impacted_location_dsl; +use nexus_db_schema::schema::fm_ereport_in_case::dsl as case_ereport_dsl; use nexus_db_schema::schema::fm_sitrep::dsl as sitrep_dsl; use nexus_db_schema::schema::fm_sitrep_history::dsl as history_dsl; use nexus_types::fm; @@ -33,8 +42,10 @@ use nexus_types::fm::Sitrep; use omicron_common::api::external::DataPageParams; use omicron_common::api::external::Error; use omicron_common::api::external::ListResultVec; +use omicron_uuid_kinds::CaseKind; use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::SitrepUuid; +use std::sync::Arc; use uuid::Uuid; impl DataStore { @@ -120,7 +131,8 @@ Ok(Some((version, sitrep))) } - /// Reads the entire content of the sitrep with the provided ID, if one exists. + /// Reads the entire content of the sitrep with the provided ID, if one + /// exists. pub async fn fm_sitrep_read( &self, opctx: &OpContext, @@ -139,10 +151,244 @@ let metadata = self.fm_sitrep_metadata_read_on_conn(id, &conn).await?.into(); - // TODO(eliza): this is where we would read all the other sitrep data, - // if there was any. 
+ let mut all_ereports = iddqd::IdOrdMap::<Arc<fm::Ereport>>::new(); + let cases = { + let mut cases = iddqd::IdOrdMap::new(); + let mut paginator = + Paginator::new(SQL_BATCH_SIZE, PaginationOrder::Descending); + while let Some(p) = paginator.next() { + let batch = self + .fm_sitrep_cases_list_on_conn( + id, + &p.current_pagparams(), + &conn, + ) + .await + .map_err(|e| { + e.internal_context("failed to list sitrep cases") + })?; + paginator = p.found_batch(&batch, &|case| case.id); + + for case in batch { + // TODO(eliza): consider using a `ParallelTaskSet` to fetch the + // cases in parallel here. + let case = self.fm_case_read_on_conn(case, &conn).await?; + + // Fetch ereports assigned to this case. + let mut ereports = + iddqd::IdOrdMap::with_capacity(case.ereports.len()); + for model::fm::CaseEreport { + restart_id, + ena: DbEna(ena), + comment, + assigned_sitrep_id, + .. + } in case.ereports + { + let ereport_id = fm::EreportId { + restart_id: restart_id.into(), + ena, + }; + let ereport = match all_ereports.entry(&ereport_id) { + iddqd::id_ord_map::Entry::Occupied(entry) => { + entry.get().clone() + } + iddqd::id_ord_map::Entry::Vacant(entry) => { + let ereport: fm::Ereport = self.ereport_fetch_on_conn(&conn, ereport_id) + .await + .map_err(|e| e.internal_context(format!( + "failed to fetch ereport {ereport_id} for case {}", + case.metadata.id, + )))? + .try_into()?; + entry.insert(Arc::new(ereport)).clone() + } + }; + ereports + .insert_unique(fm::case::CaseEreport { + ereport, + assigned_sitrep_id: assigned_sitrep_id.into(), + comment, + }) + .unwrap(); + } + + let alerts_requested = case + .alerts_requested + .into_iter() + .map(|alert| { + let id = alert.id.into_untyped_uuid(); // Grab this so it can be added to an error. + fm::AlertRequest::try_from(alert).map_err(|e| { + e.internal_context(format!( + "invalid alert {id} for case {}", + case.metadata.id + )) + }) + }) + .collect::<Result<Vec<_>, _>>()?; + + let impacted_locations = case + .impacted_locations + .into_iter() + .map(Into::into) + .collect(); + + let model::fm::CaseMetadata { + id, + sitrep_id: _, + created_sitrep_id, + time_created, + time_closed, + closed_sitrep_id, + comment, + de, + } = case.metadata; + cases + .insert_unique(fm::Case { + id: id.into(), + created_sitrep_id: created_sitrep_id.into(), + time_created: time_created.into(), + time_closed: time_closed.map(Into::into), + closed_sitrep_id: closed_sitrep_id.map(Into::into), + de: de.into(), + comment, + ereports, + alerts_requested, + impacted_locations, + }) + .expect("case UUIDs should be unique"); + } + } + + cases + }; + + Ok(Sitrep { metadata, cases }) + } + + async fn fm_sitrep_cases_list_on_conn( + &self, + sitrep_id: SitrepUuid, + pagparams: &DataPageParams<'_, DbTypedUuid<CaseKind>>, + conn: &async_bb8_diesel::Connection<DbConnection>, + ) -> ListResultVec<model::fm::CaseMetadata> { + paginated(case_dsl::fm_case, case_dsl::id, &pagparams) + .filter(case_dsl::sitrep_id.eq(sitrep_id.into_untyped_uuid())) + .select(model::fm::CaseMetadata::as_select()) + .load_async::<model::fm::CaseMetadata>(&*conn) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + } + + async fn fm_case_read_on_conn( + &self, + case: model::fm::CaseMetadata, + conn: &async_bb8_diesel::Connection<DbConnection>, + ) -> Result<model::fm::Case, Error> { + // Read ereports assigned to this case. 
+ let ereports = { + let mut ereports = Vec::new(); + let mut paginator = + Paginator::new(SQL_BATCH_SIZE, PaginationOrder::Descending); + while let Some(p) = paginator.next() { + let batch = paginated_multicolumn( + case_ereport_dsl::fm_ereport_in_case, + (case_ereport_dsl::restart_id, case_ereport_dsl::ena), + &p.current_pagparams(), + ) + .filter(case_ereport_dsl::case_id.eq(case.id)) + .filter(case_ereport_dsl::sitrep_id.eq(case.sitrep_id)) + .select(model::fm::CaseEreport::as_select()) + .load_async(&*conn) + .await + .map_err(|e| { + public_error_from_diesel(e, ErrorHandler::Server) + .internal_context(format!( + "failed to list ereports assigned to case {}", + case.id + )) + })?; + + paginator = p.found_batch(&batch, &|ereport| { + (ereport.restart_id, ereport.ena) + }); + ereports.extend(batch); + } + ereports + }; + + // Read alerts requested for this case. + let alerts_requested = { + let mut alerts = Vec::new(); + let mut paginator = + Paginator::new(SQL_BATCH_SIZE, PaginationOrder::Descending); + while let Some(p) = paginator.next() { + let batch = paginated( + alert_req_dsl::fm_alert_request, + alert_req_dsl::id, + &p.current_pagparams(), + ) + .filter(alert_req_dsl::case_id.eq(case.id)) + .filter(alert_req_dsl::sitrep_id.eq(case.sitrep_id)) + .select(model::fm::AlertRequest::as_select()) + .load_async(&*conn) + .await + .map_err(|e| { + public_error_from_diesel(e, ErrorHandler::Server) + .internal_context(format!( + "failed to list alerts requested for case {}", + case.id + )) + })?; + + paginator = p.found_batch(&batch, &|req| req.id); + alerts.extend(batch); + } + + alerts + }; + + // Read impacted locations + let impacted_locations = { + let mut locations = Vec::new(); + let mut paginator = + Paginator::new(SQL_BATCH_SIZE, PaginationOrder::Descending); + while let Some(p) = paginator.next() { + let batch = paginated_multicolumn( + impacted_location_dsl::fm_case_impacts_location, + ( + impacted_location_dsl::sp_type, + impacted_location_dsl::sp_slot, + ), + &p.current_pagparams(), + ) + .filter(impacted_location_dsl::case_id.eq(case.id)) + .filter(impacted_location_dsl::sitrep_id.eq(case.sitrep_id)) + .select(model::fm::CaseImpactsLocation::as_select()) + .load_async(&*conn) + .await + .map_err(|e| { + public_error_from_diesel(e, ErrorHandler::Server) + .internal_context(format!( + "failed to list impacted locations for case {}", + case.id + )) + })?; + + paginator = + p.found_batch(&batch, &|loc| (loc.sp_type, loc.sp_slot)); + locations.extend(batch); + } - Ok(Sitrep { metadata }) + locations + }; + + Ok(model::fm::Case { + metadata: case, + alerts_requested, + ereports, + impacted_locations, + }) } /// Insert the provided [`Sitrep`] into the database, and attempt to mark it @@ -171,16 +417,27 @@ impl DataStore { pub async fn fm_sitrep_insert( &self, opctx: &OpContext, - sitrep: &Sitrep, + sitrep: Sitrep, ) -> Result<(), InsertSitrepError> { let conn = self.pool_connection_authorized(opctx).await?; // TODO(eliza): there should probably be an authz object for the fm sitrep? opctx.authorize(authz::Action::Modify, &authz::FLEET).await?; + let sitrep_id = sitrep.id(); + // Create the sitrep metadata record. + // + // NOTE: we must insert this record before anything else, because it's + // how orphaned sitreps are found when performing garbage collection. + // Were we to first insert some other records and insert the metadata + // record *last*, we could die when we have inserted some sitrep data + // but have yet to create the metadata record. 
If this occurs, those + // records could not be easily found by the garbage collection task. + // Those (unused) records would then be permanently leaked without + // manual human intervention to delete them. diesel::insert_into(sitrep_dsl::fm_sitrep) - .values(model::SitrepMetadata::from(sitrep.metadata.clone())) + .values(model::SitrepMetadata::from(sitrep.metadata)) .execute_async(&*conn) .await .map_err(|e| { @@ -188,10 +445,79 @@ .internal_context("failed to insert sitrep metadata record") })?; - // TODO(eliza): other sitrep records would be inserted here... + // Create case records. + let mut cases = Vec::with_capacity(sitrep.cases.len()); + for case in sitrep.cases { + // TODO(eliza): some of this could be done in parallel using a + // `ParallelTaskSet`, if the time it takes to insert a sitrep were + // to become important? + let model::fm::Case { + metadata, + ereports, + alerts_requested, + impacted_locations, + } = model::fm::Case::from_sitrep(sitrep_id, case); + + if !ereports.is_empty() { + diesel::insert_into(case_ereport_dsl::fm_ereport_in_case) + .values(ereports) + .execute_async(&*conn) + .await + .map_err(|e| { + public_error_from_diesel(e, ErrorHandler::Server) + .internal_context(format!( + "failed to insert ereport records for case {}", + metadata.id + )) + })?; + } + + if !alerts_requested.is_empty() { + diesel::insert_into(alert_req_dsl::fm_alert_request) + .values(alerts_requested) + .execute_async(&*conn) + .await + .map_err(|e| { + public_error_from_diesel(e, ErrorHandler::Server) + .internal_context(format!( + "failed to insert alert requests for case {}", + metadata.id + )) + })?; + } + + if !impacted_locations.is_empty() { + diesel::insert_into( + impacted_location_dsl::fm_case_impacts_location, + ) + .values(impacted_locations) + .execute_async(&*conn) + .await + .map_err(|e| { + public_error_from_diesel(e, ErrorHandler::Server) + .internal_context(format!( + "failed to insert impacted locations for case {}", + metadata.id + )) + })?; + } + + cases.push(metadata); + } + + if !cases.is_empty() { + diesel::insert_into(case_dsl::fm_case) + .values(cases) + .execute_async(&*conn) + .await + .map_err(|e| { + public_error_from_diesel(e, ErrorHandler::Server) + .internal_context("failed to insert case records") + })?; + } // Now, try to make the sitrep current. - let query = Self::insert_sitrep_version_query(sitrep.id()); + let query = Self::insert_sitrep_version_query(sitrep_id); query .execute_async(&*conn) .await @@ -202,7 +528,7 @@ ) if info.message() == Self::PARENT_NOT_CURRENT_ERROR_MESSAGE => { - InsertSitrepError::ParentNotCurrent(sitrep.id()) + InsertSitrepError::ParentNotCurrent(sitrep_id) } err => { let err = @@ -530,9 +856,52 @@ .map(|id| id.into_untyped_uuid()) .collect::<Vec<_>>(); - // TODO(eliza): when other tables are added to store data that is part - // of the sitrep, we'll need to delete any records with matching IDs in - // those tables, too! + // Delete case ereport assignments. + let case_ereports_deleted = diesel::delete( + case_ereport_dsl::fm_ereport_in_case + .filter(case_ereport_dsl::sitrep_id.eq_any(ids.clone())), + ) + .execute_async(&*conn) + .await + .map_err(|e| { + public_error_from_diesel(e, ErrorHandler::Server) + .internal_context("failed to delete case ereport assignments") + })?; + + // Delete alert requests. 
+ let alert_requests_deleted = diesel::delete( + alert_req_dsl::fm_alert_request + .filter(alert_req_dsl::sitrep_id.eq_any(ids.clone())), + ) + .execute_async(&*conn) + .await + .map_err(|e| { + public_error_from_diesel(e, ErrorHandler::Server) + .internal_context("failed to delete alert requests") + })?; + + // Delete case impacts. + let case_impacted_locations_deleted = diesel::delete( + impacted_location_dsl::fm_case_impacts_location + .filter(impacted_location_dsl::sitrep_id.eq_any(ids.clone())), + ) + .execute_async(&*conn) + .await + .map_err(|e| { + public_error_from_diesel(e, ErrorHandler::Server) + .internal_context("failed to delete case location impact lists") + })?; + + // Delete case metadata records. + let cases_deleted = diesel::delete( + case_dsl::fm_case.filter(case_dsl::sitrep_id.eq_any(ids.clone())), + ) + .execute_async(&*conn) + .await + .map_err(|e| { + public_error_from_diesel(e, ErrorHandler::Server) + .internal_context("failed to delete case metadata") + })?; // Delete the sitrep metadata entries *last*. This is necessary because // the rest of the delete operation is unsynchronized, and it is // the one that is used to determine whether a sitrep "exists" so that // the sitrep GC task can determine if it needs to be deleted, so don't // touch it until all the other records are gone. - diesel::delete(sitrep_dsl::fm_sitrep.filter(sitrep_dsl::id.eq_any(ids))) - .execute_async(&*conn) - .await - .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + let sitreps_deleted = diesel::delete( + sitrep_dsl::fm_sitrep.filter(sitrep_dsl::id.eq_any(ids.clone())), + ) + .execute_async(&*conn) + .await + .map_err(|e| { + public_error_from_diesel(e, ErrorHandler::Server) + .internal_context("failed to delete sitrep metadata") + })?; + + slog::debug!( + &opctx.log, + "deleted {sitreps_deleted} of {} sitreps", ids.len(); + "ids" => ?ids, + "sitreps_deleted" => sitreps_deleted, + "cases_deleted" => cases_deleted, + "alert_requests_deleted" => alert_requests_deleted, + "case_ereports_deleted" => case_ereports_deleted, + "case_impacted_locations_deleted" => case_impacted_locations_deleted, + ); + + Ok(sitreps_deleted) } pub async fn fm_sitrep_version_list( @@ -755,9 +1142,10 @@ mod tests { time_created: Utc::now(), parent_sitrep_id: None, }, + cases: Default::default(), }; - datastore.fm_sitrep_insert(&opctx, &sitrep).await.unwrap(); + datastore.fm_sitrep_insert(&opctx, sitrep.clone()).await.unwrap(); let current = datastore .fm_sitrep_read_current(&opctx) .await .unwrap(); assert_eq!(sitrep.metadata.comment, current_sitrep.metadata.comment); // Trying to insert the same sitrep again should fail. - let err = - datastore.fm_sitrep_insert(&opctx, &sitrep).await.unwrap_err(); + let err = datastore + .fm_sitrep_insert(&opctx, sitrep.clone()) + .await + .unwrap_err(); assert!(err.to_string().contains("duplicate key")); // Clean up. 
@@ -801,8 +1191,9 @@ mod tests { time_created: Utc::now(), parent_sitrep_id: None, }, + cases: Default::default(), }; - datastore.fm_sitrep_insert(&opctx, &sitrep1).await.unwrap(); + datastore.fm_sitrep_insert(&opctx, sitrep1.clone()).await.unwrap(); // Create a second sitrep with the first as parent let sitrep2 = nexus_types::fm::Sitrep { @@ -814,8 +1205,9 @@ mod tests { time_created: Utc::now(), parent_sitrep_id: Some(sitrep1.id()), }, + cases: Default::default(), }; - datastore.fm_sitrep_insert(&opctx, &sitrep2).await.expect( + datastore.fm_sitrep_insert(&opctx, sitrep2.clone()).await.expect( "inserting a sitrep whose parent is current should succeed", ); @@ -854,8 +1246,9 @@ mod tests { time_created: Utc::now(), parent_sitrep_id: None, }, + cases: Default::default(), }; - datastore.fm_sitrep_insert(&opctx, &sitrep1).await.unwrap(); + datastore.fm_sitrep_insert(&opctx, sitrep1.clone()).await.unwrap(); // Try to insert a sitrep with a non-existent parent ID let nonexistent_id = SitrepUuid::new_v4(); @@ -868,9 +1261,10 @@ mod tests { time_created: Utc::now(), parent_sitrep_id: Some(nonexistent_id), }, + cases: Default::default(), }; - let result = datastore.fm_sitrep_insert(&opctx, &sitrep2).await; + let result = datastore.fm_sitrep_insert(&opctx, sitrep2).await; // Should fail with ParentNotCurrent error match result { @@ -902,8 +1296,9 @@ mod tests { time_created: Utc::now(), parent_sitrep_id: None, }, + cases: Default::default(), }; - datastore.fm_sitrep_insert(&opctx, &sitrep1).await.unwrap(); + datastore.fm_sitrep_insert(&opctx, sitrep1.clone()).await.unwrap(); // Create a second sitrep with the first as parent let sitrep2 = nexus_types::fm::Sitrep { @@ -915,8 +1310,9 @@ mod tests { time_created: Utc::now(), parent_sitrep_id: Some(sitrep1.id()), }, + cases: Default::default(), }; - datastore.fm_sitrep_insert(&opctx, &sitrep2).await.unwrap(); + datastore.fm_sitrep_insert(&opctx, sitrep2.clone()).await.unwrap(); // Try to create a third sitrep with sitrep1 (outdated) as parent. // This should fail, as sitrep2 is now the current sitrep. 
@@ -929,8 +1325,9 @@ time_created: Utc::now(), parent_sitrep_id: Some(sitrep1.id()), }, + cases: Default::default(), }; - let result = datastore.fm_sitrep_insert(&opctx, &sitrep3).await; + let result = datastore.fm_sitrep_insert(&opctx, sitrep3.clone()).await; // Should fail with ParentNotCurrent error match result { @@ -969,9 +1366,10 @@ time_created: Utc::now(), parent_sitrep_id: None, }, + cases: Default::default(), }; datastore - .fm_sitrep_insert(&opctx, &sitrep1) + .fm_sitrep_insert(&opctx, sitrep1.clone()) .await .expect("inserting initial sitrep should succeed"); @@ -1009,9 +1407,10 @@ time_created: Utc::now(), parent_sitrep_id: Some(sitrep1.metadata.id), }, + cases: Default::default(), }; datastore - .fm_sitrep_insert(&opctx, &sitrep2) + .fm_sitrep_insert(&opctx, sitrep2.clone()) .await .expect("inserting child sitrep should succeed"); @@ -1042,7 +1441,7 @@ ) -> Result<BTreeSet<SitrepUuid>, Error> { let mut listed_orphans = BTreeSet::new(); let mut paginator = Paginator::new( - crate::db::datastore::SQL_BATCH_SIZE, + SQL_BATCH_SIZE, dropshot::PaginationOrder::Descending, ); while let Some(p) = paginator.next() { @@ -1072,8 +1471,9 @@ time_created: Utc::now(), parent_sitrep_id, }, + cases: Default::default(), }; - match datastore.fm_sitrep_insert(&opctx, &sitrep).await { + match datastore.fm_sitrep_insert(&opctx, sitrep).await { Ok(_) => { panic!("inserting sitrep v{v} orphan {i} should not succeed") } diff --git a/nexus/db-schema/src/enums.rs b/nexus/db-schema/src/enums.rs index 2d74db5ab9c..2f71eb7ac50 100644 --- a/nexus/db-schema/src/enums.rs +++ b/nexus/db-schema/src/enums.rs @@ -39,6 +39,7 @@ define_enums! { ClickhouseModeEnum => "clickhouse_mode", DatasetKindEnum => "dataset_kind", DbMetadataNexusStateEnum => "db_metadata_nexus_state", + DiagnosisEngineEnum => "diagnosis_engine", DiskTypeEnum => "disk_type", DnsGroupEnum => "dns_group", DownstairsClientStopRequestReasonEnum => "downstairs_client_stop_request_reason_type", diff --git a/nexus/db-schema/src/schema.rs b/nexus/db-schema/src/schema.rs index 8be6bb768c2..943027df9af 100644 --- a/nexus/db-schema/src/schema.rs +++ b/nexus/db-schema/src/schema.rs @@ -2954,3 +2954,57 @@ allow_tables_to_appear_in_same_query!( rendezvous_local_storage_dataset, local_storage_dataset_allocation ); + +table! { + fm_case (sitrep_id, id) { + id -> Uuid, + sitrep_id -> Uuid, + de -> crate::enums::DiagnosisEngineEnum, + + time_created -> Timestamptz, + created_sitrep_id -> Uuid, + + time_closed -> Nullable<Timestamptz>, + closed_sitrep_id -> Nullable<Uuid>, + + comment -> Text, + } +} + +table! { + fm_ereport_in_case (sitrep_id, restart_id, ena) { + restart_id -> Uuid, + ena -> Int8, + case_id -> Uuid, + sitrep_id -> Uuid, + assigned_sitrep_id -> Uuid, + + comment -> Text, + } +} + +allow_tables_to_appear_in_same_query!(fm_sitrep, fm_case); + +table! { + fm_case_impacts_location (sitrep_id, case_id, sp_type, sp_slot) { + sitrep_id -> Uuid, + case_id -> Uuid, + sp_type -> crate::enums::SpTypeEnum, + sp_slot -> Int4, + created_sitrep_id -> Uuid, + comment -> Text, + } +} + +table! 
{ + fm_alert_request (sitrep_id, id) { + id -> Uuid, + sitrep_id -> Uuid, + requested_sitrep_id -> Uuid, + case_id -> Uuid, + class -> crate::enums::AlertClassEnum, + payload -> Jsonb, + } +} + +allow_tables_to_appear_in_same_query!(fm_sitrep, fm_alert_request); diff --git a/nexus/examples/config-second.toml b/nexus/examples/config-second.toml index 5a7586e29e1..dba9044b0ef 100644 --- a/nexus/examples/config-second.toml +++ b/nexus/examples/config-second.toml @@ -179,6 +179,9 @@ fm.sitrep_load_period_secs = 15 # is activated every time the current sitrep changes. Periodic activations are # only necessary to ensure that it always happens eventually. fm.sitrep_gc_period_secs = 600 +# Updating database state to match the current sitrep is triggered whenever a +# new sitrep is loaded, so we need not set the periodic activation interval +# too low. +fm.rendezvous_period_secs = 300 probe_distributor.period_secs = 60 multicast_reconciler.period_secs = 60 # TTL for sled-to-backplane-port mapping cache diff --git a/nexus/examples/config.toml b/nexus/examples/config.toml index 86eaee939ae..3fc31547cf5 100644 --- a/nexus/examples/config.toml +++ b/nexus/examples/config.toml @@ -163,6 +163,10 @@ fm.sitrep_load_period_secs = 15 # is activated every time the current sitrep changes. Periodic activations are # only necessary to ensure that it always happens eventually. fm.sitrep_gc_period_secs = 600 +# Updating database state to match the current sitrep is triggered whenever a +# new sitrep is loaded, so we need not set the periodic activation interval +# too low. +fm.rendezvous_period_secs = 300 probe_distributor.period_secs = 60 multicast_reconciler.period_secs = 60 # TTL for sled-to-backplane-port mapping cache diff --git a/nexus/fm/Cargo.toml b/nexus/fm/Cargo.toml new file mode 100644 index 00000000000..37fd0c13684 --- /dev/null +++ b/nexus/fm/Cargo.toml @@ -0,0 +1,28 @@ +[package] +name = "nexus-fm" +version = "0.1.0" +edition = "2021" + +[lints] +workspace = true + +[dependencies] +anyhow.workspace = true +chrono.workspace = true +iddqd.workspace = true +nexus-types.workspace = true +omicron-uuid-kinds.workspace = true +rand.workspace = true +schemars.workspace = true +serde.workspace = true +serde_json.workspace = true +slog.workspace = true +slog-error-chain.workspace = true +typed-rng.workspace = true + +omicron-workspace-hack.workspace = true + +[dev-dependencies] +omicron-test-utils.workspace = true +nexus-reconfigurator-planning.workspace = true +ereport-types.workspace = true diff --git a/nexus/fm/src/alert.rs b/nexus/fm/src/alert.rs new file mode 100644 index 00000000000..b89aed3d326 --- /dev/null +++ b/nexus/fm/src/alert.rs @@ -0,0 +1,30 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Alert messages. 
+ +use crate::ereport_analysis::Baseboard; +use nexus_types::fm::AlertClass; +use schemars::JsonSchema; +use serde::{Deserialize, Serialize}; + +pub mod power_shelf; + +pub trait Alert: Serialize + JsonSchema + std::fmt::Debug { + const CLASS: AlertClass; +} + +#[derive(Debug, Serialize, Deserialize, JsonSchema)] +pub struct VpdIdentity { + pub part_number: String, + pub revision: String, + pub serial_number: String, +} + +impl From<Baseboard> for VpdIdentity { + fn from(baseboard: Baseboard) -> Self { + let Baseboard { part_number, serial_number, rev } = baseboard; + Self { part_number, revision: rev.to_string(), serial_number } + } +} diff --git a/nexus/fm/src/alert/power_shelf.rs b/nexus/fm/src/alert/power_shelf.rs new file mode 100644 index 00000000000..e801d724401 --- /dev/null +++ b/nexus/fm/src/alert/power_shelf.rs @@ -0,0 +1,55 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Power shelf alerts. + +use super::{Alert, VpdIdentity}; +use chrono::{DateTime, Utc}; +use nexus_types::fm::AlertClass; +use schemars::JsonSchema; +use serde::{Deserialize, Serialize}; + +#[derive(Debug, Serialize, Deserialize, JsonSchema)] +#[serde(tag = "version", rename_all = "snake_case")] +pub enum PsuInserted { + V0 { + #[serde(flatten)] + psc_psu: PscPsu, + time: DateTime<Utc>, + }, +} + +impl Alert for PsuInserted { + const CLASS: AlertClass = AlertClass::PsuInserted; +} + +#[derive(Debug, Serialize, Deserialize, JsonSchema)] +#[serde(tag = "version", rename_all = "snake_case")] +pub enum PsuRemoved { + V0 { + #[serde(flatten)] + psc_psu: PscPsu, + time: DateTime<Utc>, + }, +} + +impl Alert for PsuRemoved { + const CLASS: AlertClass = AlertClass::PsuRemoved; +} + +#[derive(Debug, Serialize, Deserialize, JsonSchema)] +pub struct PscPsu { + pub psc_id: Option<VpdIdentity>, + pub psc_slot: u16, + pub psu_id: PsuIdentity, + pub psu_slot: Option<u16>, +} + +#[derive(Debug, Serialize, Deserialize, JsonSchema)] +pub struct PsuIdentity { + pub manufacturer: Option<String>, + pub part_number: Option<String>, + pub firmware_revision: Option<String>, + pub serial_number: Option<String>, +} diff --git a/nexus/fm/src/builder.rs b/nexus/fm/src/builder.rs new file mode 100644 index 00000000000..8373c64a370 --- /dev/null +++ b/nexus/fm/src/builder.rs @@ -0,0 +1,96 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! 
Sitrep builder + +use nexus_types::fm; +use nexus_types::inventory; +use omicron_uuid_kinds::OmicronZoneUuid; +use omicron_uuid_kinds::SitrepUuid; +use slog::Logger; + +mod case; +pub use case::{AllCases, CaseBuilder}; +pub(crate) mod rng; +pub use rng::SitrepBuilderRng; + +#[derive(Debug)] +pub struct SitrepBuilder<'a> { + pub log: Logger, + pub inventory: &'a inventory::Collection, + pub parent_sitrep: Option<&'a fm::Sitrep>, + pub sitrep_id: SitrepUuid, + pub cases: case::AllCases, + pub impact_lists: case::ImpactLists, + comment: String, +} + +impl<'a> SitrepBuilder<'a> { + pub fn new( + log: &Logger, + inventory: &'a inventory::Collection, + parent_sitrep: Option<&'a fm::Sitrep>, + ) -> Self { + Self::new_with_rng( + log, + inventory, + parent_sitrep, + SitrepBuilderRng::from_entropy(), + ) + } + + pub fn new_with_rng( + log: &Logger, + inventory: &'a inventory::Collection, + parent_sitrep: Option<&'a fm::Sitrep>, + mut rng: SitrepBuilderRng, + ) -> Self { + // TODO(eliza): should the RNG also be seeded with the parent sitrep + // UUID and/or the Omicron zone UUID? Hmm. + let sitrep_id = rng.sitrep_id(); + let log = log.new(slog::o!( + "sitrep_id" => format!("{sitrep_id:?}"), + "parent_sitrep_id" => format!("{:?}", parent_sitrep.as_ref().map(|s| s.id())), + "inv_collection_id" => format!("{:?}", inventory.id), + )); + + let (cases, impact_lists) = + case::AllCases::new(log.clone(), sitrep_id, parent_sitrep, rng); + + slog::info!( + &log, + "preparing sitrep {sitrep_id:?}"; + "existing_open_cases" => cases.cases.len(), + ); + + SitrepBuilder { + log, + sitrep_id, + inventory, + parent_sitrep, + comment: String::new(), + cases, + impact_lists, + } + } + + pub fn build(self, creator_id: OmicronZoneUuid) -> fm::Sitrep { + fm::Sitrep { + metadata: fm::SitrepMetadata { + id: self.sitrep_id, + parent_sitrep_id: self.parent_sitrep.map(|s| s.metadata.id), + inv_collection_id: self.inventory.id, + creator_id, + comment: self.comment, + time_created: chrono::Utc::now(), + }, + cases: self + .cases + .cases + .into_iter() + .map(|builder| fm::Case::from(builder)) + .collect(), + } + } +} diff --git a/nexus/fm/src/builder/case.rs b/nexus/fm/src/builder/case.rs new file mode 100644 index 00000000000..65406825700 --- /dev/null +++ b/nexus/fm/src/builder/case.rs @@ -0,0 +1,298 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +use super::rng; +use crate::alert; +use anyhow::Context; +use chrono::Utc; +use iddqd::id_ord_map::{self, IdOrdMap}; +use nexus_types::fm; +use nexus_types::inventory::SpType; +use omicron_uuid_kinds::CaseUuid; +use omicron_uuid_kinds::SitrepUuid; +use std::collections::{HashMap, HashSet}; +use std::sync::Arc; + +#[derive(Debug)] +pub struct CaseBuilder { + pub log: slog::Logger, + pub case: fm::Case, + pub sitrep_id: SitrepUuid, + rng: rng::CaseBuilderRng, +} + +#[derive(Debug)] +pub struct AllCases { + log: slog::Logger, + sitrep_id: SitrepUuid, + pub cases: IdOrdMap<CaseBuilder>, + rng: rng::SitrepBuilderRng, +} + +impl AllCases { + pub(super) fn new( + log: slog::Logger, + sitrep_id: SitrepUuid, + parent_sitrep: Option<&fm::Sitrep>, + mut rng: rng::SitrepBuilderRng, + ) -> (Self, ImpactLists) { + // Copy forward any open cases from the parent sitrep. + // If a case was closed in the parent sitrep, skip it. 
+ let mut cases_by_location: HashMap<_, HashSet<CaseUuid>> = + HashMap::new(); + let cases: IdOrdMap<_> = parent_sitrep + .iter() + .flat_map(|s| s.open_cases()) + .map(|case| { + for location in &case.impacted_locations { + cases_by_location + .entry((location.sp_type, location.slot)) + .or_default() + .insert(case.id); + } + let rng = rng::CaseBuilderRng::new(case.id, &mut rng); + CaseBuilder::new(&log, sitrep_id, case.clone(), rng) + }) + .collect(); + + let cases = Self { log, sitrep_id, cases, rng }; + let impact_lists = ImpactLists { cases_by_sp: cases_by_location }; + (cases, impact_lists) + } + + pub fn open_case( + &mut self, + de: fm::DiagnosisEngineKind, + ) -> anyhow::Result<id_ord_map::RefMut<'_, CaseBuilder>> { + let (id, case_rng) = self.rng.next_case(); + let sitrep_id = self.sitrep_id; + let case = match self.cases.entry(&id) { + iddqd::id_ord_map::Entry::Occupied(_) => { + panic!("generated a colliding UUID!") + } + iddqd::id_ord_map::Entry::Vacant(entry) => { + let case = fm::Case { + id, + created_sitrep_id: self.sitrep_id, + time_created: chrono::Utc::now(), + closed_sitrep_id: None, + time_closed: None, + de, + comment: String::new(), + ereports: Default::default(), + alerts_requested: Default::default(), + impacted_locations: Default::default(), + }; + entry.insert(CaseBuilder::new( + &self.log, sitrep_id, case, case_rng, + )) + } + }; + + slog::info!( + self.log, + "opened case {id:?}"; + "case_id" => ?id, + "de" => %de + ); + + Ok(case) + } + + pub fn case(&self, id: &CaseUuid) -> Option<&CaseBuilder> { + self.cases.get(id) + } + + pub fn case_mut( + &mut self, + id: &CaseUuid, + ) -> Option<id_ord_map::RefMut<'_, CaseBuilder>> { + self.cases.get_mut(id) + } +} + +impl CaseBuilder { + fn new( + log: &slog::Logger, + sitrep_id: SitrepUuid, + case: fm::Case, + rng: rng::CaseBuilderRng, + ) -> Self { + let log = log.new(slog::o!( + "case_id" => case.id.to_string(), + "de" => case.de.to_string(), + "created_sitrep_id" => case.created_sitrep_id.to_string(), + )); + Self { log, case, sitrep_id, rng } + } + + pub fn request_alert<A: alert::Alert>( + &mut self, + alert: &A, + ) -> anyhow::Result<()> { + let id = self.rng.next_alert(); + let class = A::CLASS; + let req = fm::AlertRequest { + id, + class, + requested_sitrep_id: self.sitrep_id, + payload: serde_json::to_value(&alert).with_context(|| { + format!( + "failed to serialize payload for {class:?} alert {alert:?}" + ) + })?, + }; + self.case.alerts_requested.insert_unique(req).map_err(|_| { + anyhow::anyhow!("an alert with ID {id:?} already exists") + })?; + + slog::info!( + &self.log, + "requested an alert"; + "alert_id" => %id, + "alert_class" => ?class, + ); + + Ok(()) + } + + pub fn close(&mut self) { + self.case.time_closed = Some(Utc::now()); + self.case.closed_sitrep_id = Some(self.sitrep_id); + + slog::info!(&self.log, "case closed"); + } + + pub fn add_ereport( + &mut self, + report: &Arc<fm::Ereport>, + comment: impl std::fmt::Display, + ) { + match self.case.ereports.insert_unique(fm::case::CaseEreport { + ereport: report.clone(), + assigned_sitrep_id: self.sitrep_id, + comment: comment.to_string(), + }) { + Ok(_) => { + slog::info!( + self.log, + "assigned ereport {} to case", report.id(); + "ereport_id" => %report.id(), + "ereport_class" => ?report.class, + ); + } + Err(_) => { + slog::warn!( + self.log, + "ereport {} already assigned to case", report.id(); + "ereport_id" => %report.id(), + "ereport_class" => ?report.class, + ); + } + } + } + + pub fn impacts_location( + &mut self, + impact_lists: &mut ImpactLists, + sp_type: SpType, + slot: u16, + comment: impl ToString, + ) -> anyhow::Result<()> { + if 
self.impacted_locations.contains_key(&(sp_type, slot)) { + return Err(anyhow::anyhow!( + "case already impacts this location ({sp_type} {slot})" + )); + } + + impact_lists + .cases_by_sp + .entry((sp_type, slot)) + .or_default() + .insert(self.id); + + let comment = comment.to_string(); + slog::info!( + &self.log, + "case impacts location"; + "sp_type" => %sp_type, + "slot" => %slot, + "comment" => %comment, + ); + let created_sitrep_id = self.sitrep_id; + self.impacted_locations + .insert_unique(fm::case::ImpactedLocation { + sp_type, + slot, + created_sitrep_id, + comment, + }) + .expect( + "we just checked that there wasn't already an entry for this \ + location", + ); + + Ok(()) + } + + /// Returns an iterator over all ereports that were assigned to this case in + /// the current sitrep. + pub fn new_ereports( + &self, + ) -> impl Iterator<Item = &Arc<fm::Ereport>> + '_ { + self.ereports.iter().filter_map(|ereport| { + if ereport.assigned_sitrep_id == self.sitrep_id { + Some(&ereport.ereport) + } else { + None + } + }) + } +} + +impl From<CaseBuilder> for fm::Case { + fn from(CaseBuilder { case, .. }: CaseBuilder) -> Self { + case + } +} + +impl core::ops::Deref for CaseBuilder { + type Target = fm::Case; + fn deref(&self) -> &Self::Target { + &self.case + } +} + +impl core::ops::DerefMut for CaseBuilder { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.case + } +} + +impl iddqd::IdOrdItem for CaseBuilder { + type Key<'a> = &'a CaseUuid; + fn key(&self) -> Self::Key<'_> { + &self.case.id + } + + iddqd::id_upcast!(); +} + +#[derive(Debug)] +pub struct ImpactLists { + cases_by_sp: HashMap<(SpType, u16), HashSet<CaseUuid>>, +} + +impl ImpactLists { + pub fn cases_impacting_sp( + &self, + sp_type: SpType, + slot: u16, + ) -> impl Iterator<Item = CaseUuid> + '_ { + self.cases_by_sp + .get(&(sp_type, slot)) + .into_iter() + .flat_map(|ids| ids.iter().copied()) + } +} diff --git a/nexus/fm/src/builder/rng.rs b/nexus/fm/src/builder/rng.rs new file mode 100644 index 00000000000..8217e6dfbcc --- /dev/null +++ b/nexus/fm/src/builder/rng.rs @@ -0,0 +1,78 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! RNGs for sitrep generation to allow reproducible UUID generation +//! (particularly for tests). +//! +//! This is similar to the `nexus_reconfigurator_planning::planner::rng` +//! module. + +use omicron_uuid_kinds::AlertKind; +use omicron_uuid_kinds::AlertUuid; +use omicron_uuid_kinds::CaseKind; +use omicron_uuid_kinds::CaseUuid; +use omicron_uuid_kinds::SitrepUuid; +use rand::SeedableRng as _; +use rand::rngs::StdRng; +use std::hash::Hash; +use typed_rng::TypedUuidRng; + +#[derive(Clone, Debug)] +pub struct SitrepBuilderRng { + parent: StdRng, + case_rng: TypedUuidRng<CaseKind>, +} + +impl SitrepBuilderRng { + pub fn from_entropy() -> Self { + Self::new_from_parent(StdRng::from_os_rng()) + } + + pub fn from_seed<H: Hash>(seed: H) -> Self { + // Important to add some more bytes here, so that builders with the + // same seed but different purposes don't end up with the same UUIDs. 
+ const SEED_EXTRA: &str = "sitrep-builder"; + Self::new_from_parent(typed_rng::from_seed(seed, SEED_EXTRA)) + } + + pub fn new_from_parent(mut parent: StdRng) -> Self { + let case_rng = TypedUuidRng::from_parent_rng(&mut parent, "case"); + + Self { parent, case_rng } + } + + pub(super) fn sitrep_id(&mut self) -> SitrepUuid { + // we only need a single sitrep UUID, so no sense storing a whole RNG + // for it in the builder RNGs... + TypedUuidRng::from_parent_rng(&mut self.parent, "sitrep").next() + } + + pub(super) fn next_case(&mut self) -> (CaseUuid, CaseBuilderRng) { + let case_id = self.case_rng.next(); + let rng = CaseBuilderRng::new(case_id, self); + (case_id, rng) + } +} + +#[derive(Clone, Debug)] +pub(super) struct CaseBuilderRng { + alert_rng: TypedUuidRng<AlertKind>, +} + +impl CaseBuilderRng { + pub(super) fn new( + case_id: CaseUuid, + sitrep: &mut SitrepBuilderRng, + ) -> Self { + let alert_rng = TypedUuidRng::from_parent_rng( + &mut sitrep.parent, + (case_id, "alert"), + ); + Self { alert_rng } + } + + pub(super) fn next_alert(&mut self) -> AlertUuid { + self.alert_rng.next() + } +} diff --git a/nexus/fm/src/diagnosis.rs b/nexus/fm/src/diagnosis.rs new file mode 100644 index 00000000000..feb89f760b4 --- /dev/null +++ b/nexus/fm/src/diagnosis.rs @@ -0,0 +1,35 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Diagnosis engines + +use crate::CaseBuilder; +use crate::SitrepBuilder; +use nexus_types::fm; +use std::sync::Arc; + +pub mod power_shelf; + +pub trait DiagnosisEngine { + fn kind(&self) -> fm::DiagnosisEngineKind; + + /// Called for each new ereport received since the parent sitrep. + fn analyze_ereport( + &mut self, + sitrep: &mut SitrepBuilder<'_>, + ereport: &Arc<fm::Ereport>, + ) -> anyhow::Result<()>; + + /// Called for each case belonging to this diagnosis engine that was open + /// in the parent sitrep. + fn analyze_open_case( + &mut self, + sitrep: &mut SitrepBuilder<'_>, + case: &mut CaseBuilder, + ) -> anyhow::Result<()>; + + /// Complete this diagnosis engine's analysis, making any necessary changes + /// to the sitrep being constructed. + fn finish(&mut self, sitrep: &mut SitrepBuilder<'_>) -> anyhow::Result<()>; +} diff --git a/nexus/fm/src/diagnosis/power_shelf.rs b/nexus/fm/src/diagnosis/power_shelf.rs new file mode 100644 index 00000000000..c0961a014f0 --- /dev/null +++ b/nexus/fm/src/diagnosis/power_shelf.rs @@ -0,0 +1,794 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! 
Power shelf diagnosis + +use super::DiagnosisEngine; +use crate::CaseBuilder; +use crate::SitrepBuilder; +use crate::alert; +use crate::ereport_analysis; +use crate::ereport_analysis::ParsedEreport; +use nexus_types::fm::DiagnosisEngineKind; +use nexus_types::fm::Ereport; +use nexus_types::fm::case::CaseEreport; +use nexus_types::fm::case::ImpactedLocation; +use nexus_types::fm::ereport; +use nexus_types::inventory::SpType; +use omicron_uuid_kinds::CaseUuid; +use serde::de::DeserializeOwned; +use serde_json::Value; +use slog_error_chain::InlineErrorChain; +use std::collections::HashMap; +use std::sync::Arc; + +pub struct PowerShelfDiagnosis { + log: slog::Logger, + cases_by_shelf: [HashMap<CaseUuid, PscCase>; 2], +} + +#[derive(Default)] +struct PscCase { + psus_impacted: PsuSet, +} + +type PsuSet = [bool; N_PSUS]; +const N_PSUS: usize = 6; + +const PSU_REMOVE_CLASS: &str = "hw.remove.psu"; +const PSU_INSERT_CLASS: &str = "hw.insert.psu"; +const PSU_PWR_GOOD_CLASS: &str = "hw.pwr.pwr_good.good"; +const PSU_PWR_BAD_CLASS: &str = "hw.pwr.pwr_good.bad"; +const PSU_EREPORT_CLASSES: &[&str] = &[ + PSU_REMOVE_CLASS, + PSU_INSERT_CLASS, + PSU_PWR_GOOD_CLASS, + PSU_PWR_BAD_CLASS, +]; + +impl PowerShelfDiagnosis { + pub fn new(log: &slog::Logger) -> Self { + Self { + log: log.new(slog::o!("de" => "power_shelf")), + cases_by_shelf: [HashMap::new(), HashMap::new()], + } + } + + fn cases_for_shelf_and_psu( + &self, + shelf: u16, + psu: usize, + ) -> impl Iterator<Item = (&CaseUuid, &PscCase)> + '_ { + self.cases_by_shelf[shelf as usize] + .iter() + .filter(move |(_, case)| case.psus_impacted[psu]) + } +} + +impl DiagnosisEngine for PowerShelfDiagnosis { + fn kind(&self) -> DiagnosisEngineKind { + DiagnosisEngineKind::PowerShelf + } + + fn analyze_open_case( + &mut self, + _sitrep: &mut SitrepBuilder<'_>, + case: &mut CaseBuilder, + ) -> anyhow::Result<()> { + slog::debug!( + self.log, + "analyzing open case from parent sitrep..."; + "case_id" => %case.id + ); + + // ooh, a case we already opened! let's figure out what its deal is... + for &ImpactedLocation { sp_type, slot, ref comment, .. } in + &case.impacted_locations + { + // skip non-PSC impacts + if sp_type != SpType::Power { + continue; + } + + if matches!(slot, 0 | 1) { + slog::debug!( + &self.log, + "open case impacts power shelf {slot}"; + "case_id" => %case.id, + "power_shelf" => slot, + "comment" => %comment, + ); + // make sure it's tracked. 
+                self.cases_by_shelf[slot as usize].entry(case.id).or_default();
+            } else {
+                slog::warn!(
+                    &self.log,
+                    "this is weird: I only know about power shelves numbered \
+                     0 and 1, but found a case that claims to impact power \
+                     shelf {slot}";
+                    "case_id" => %case.id,
+                    "power_shelf" => slot,
+                    "comment" => %comment,
+                );
+            }
+        }
+
+        for CaseEreport { ereport, comment, assigned_sitrep_id } in
+            &case.ereports
+        {
+            let class = match &ereport.class {
+                // This is one we care about
+                Some(ref class)
+                    if PSU_EREPORT_CLASSES.contains(&class.as_ref()) =>
+                {
+                    slog::debug!(
+                        self.log,
+                        "analyzing ereport assigned to open case...";
+                        "case_id" => %case.id,
+                        "ereport_id" => %ereport.id,
+                        "ereport_class" => %class,
+                        "comment" => %comment,
+                        "assigned_sitrep_id" => %assigned_sitrep_id,
+                    );
+                    class
+                }
+                class => {
+                    slog::debug!(
+                        &self.log,
+                        "an ereport with an unknown or missing class was \
+                         assigned to this case (presumably by another DE); \
+                         skipping it...";
+                        "case_id" => %case.id,
+                        "ereport_id" => %ereport.id,
+                        "ereport_class" => ?class,
+                        "comment" => %comment,
+                    );
+                    continue;
+                }
+            };
+
+            let ereport::Reporter::Sp { sp_type: SpType::Power, slot: shelf } =
+                ereport.reporter
+            else {
+                slog::debug!(
+                    self.log,
+                    "skipping ereport that was not reported by a power shelf";
+                    "case_id" => %case.id,
+                    "ereport_id" => %ereport.id,
+                    "ereport_class" => %class,
+                    "reporter" => %ereport.reporter,
+                );
+                continue;
+            };
+
+            let tracked_case =
+                self.cases_by_shelf[shelf as usize].entry(case.id).or_default();
+
+            // Does the ereport include a PSU slot?
+            if let Some(slot) = ereport_psu_slot(&ereport, &self.log) {
+                slog::debug!(
+                    &self.log,
+                    "found an ereport associated with PSU slot {slot}";
+                    "case_id" => %case.id,
+                    "ereport_id" => %ereport.id,
+                    "ereport_class" => %class,
+                    "shelf" => shelf,
+                    "slot" => slot,
+                );
+                tracked_case.psus_impacted[slot] = true;
+            }
+
+            // TODO: can we stuff a nice parsed representation in there?
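+            // (an ereport without a PSU slot still stays assigned to the
+            // case; it just doesn't mark any specific PSU as impacted.)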
+        }
+
+        Ok(())
+    }
+
+    fn analyze_ereport(
+        &mut self,
+        sitrep: &mut SitrepBuilder<'_>,
+        ereport: &Arc<Ereport>,
+    ) -> anyhow::Result<()> {
+        // Skip non-power shelf reports
+        let ereport::Reporter::Sp { sp_type: SpType::Power, slot } =
+            ereport.reporter
+        else {
+            slog::trace!(
+                self.log,
+                "skipping ereport that was not reported by a power shelf";
+                "ereport_id" => %ereport.id,
+                "reporter" => %ereport.reporter,
+            );
+            return Ok(());
+        };
+        let shelf = slot;
+
+        let Some(class) = ereport.data.class.as_deref() else {
+            slog::warn!(
+                &self.log,
+                "ignoring PSC ereport with no class";
+                "ereport" => %ereport.id,
+                "shelf" => shelf,
+            );
+            return Ok(());
+        };
+        let comment = match class {
+            c if c == PSU_REMOVE_CLASS => "was removed",
+            c if c == PSU_INSERT_CLASS => "was inserted",
+            c if c == PSU_PWR_GOOD_CLASS => "asserted PWR_GOOD",
+            c if c == PSU_PWR_BAD_CLASS => "deasserted PWR_GOOD",
+            unknown => {
+                slog::warn!(
+                    &self.log,
+                    "ignoring unhandled PSC ereport class";
+                    "ereport_class" => %unknown,
+                    "ereport" => %ereport.id,
+                    "shelf" => shelf,
+                );
+                return Ok(());
+            }
+        };
+
+        // PSU-specific ereports: inserted, removed, faulted, or un-faulted
+        let Some(psu_slot) = ereport_psu_slot(&ereport, &self.log) else {
+            const MSG: &str =
+                "ereports for PSU events should include a PSU slot";
+            slog::warn!(
+                self.log,
+                "{MSG}; {} has class {class} but did not include one",
+                ereport.id;
+                "ereport_id" => %ereport.id,
+                "ereport_class" => ?ereport.class,
+                "shelf" => shelf,
+            );
+            anyhow::bail!(
+                "{MSG}; {} has class {class} but did not include one",
+                ereport.id
+            );
+        };
+
+        let mut tracked = false;
+        for (case_id, _case) in self.cases_for_shelf_and_psu(shelf, psu_slot) {
+            tracked = true;
+            let mut case = sitrep.cases.case_mut(case_id).ok_or_else(|| {
+                anyhow::anyhow!(
+                    "we have tracked case {case_id} but it no longer exists \
+                     in the sitrep builder (this is a bug)",
+                )
+            })?;
+            case.add_ereport(ereport, format!("PSU {psu_slot} {comment}"));
+
+            // TODO: can we stuff a nice parsed representation in there?
+        }
+
+        // we did not find existing case(s) involving this PSU; open a new one.
+        //
+        // TODO(eliza): this logic will need to change eventually as we get
+        // smarter about analyzing faults affecting multiple PSUs in a shelf.
+        if !tracked {
+            let mut case =
+                sitrep.cases.open_case(DiagnosisEngineKind::PowerShelf)?;
+            case.add_ereport(ereport, format!("PSU {psu_slot} {comment}"));
+
+            // TODO: can we stuff a nice parsed representation in there?
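+            // Record why this case was opened; the case's `Display` output
+            // includes these comments.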
+            case.comment = format!(
+                "opened when power shelf {shelf} PSU {psu_slot} {comment}"
+            );
+            case.impacts_location(
+                &mut sitrep.impact_lists,
+                SpType::Power,
+                shelf,
+                "this is the power shelf where the PSU event occurred",
+            )?;
+            self.cases_by_shelf[shelf as usize].insert(case.id, {
+                let mut case = PscCase { psus_impacted: [false; N_PSUS] };
+                // PSU slots are 0-indexed, so index the impact set directly.
+                case.psus_impacted[psu_slot] = true;
+                case
+            });
+        }
+
+        Ok(())
+    }
+
+    fn finish(&mut self, sitrep: &mut SitrepBuilder<'_>) -> anyhow::Result<()> {
+        // TODO:
+        //
+        // - debouncing
+        // - determine whether undiagnosed cases can now be diagnosed
+        // - determine whether those cases have been resolved (looking at
+        //   any newly-added ereports, and inventory data/health endpoint
+        //   observations)
+        // - determine what Active Problems should be requested, updated,
+        //   and closed
+        // - determine what alerts should be requested
+        let tracked_cases = self.cases_by_shelf.iter().enumerate().flat_map(
+            |(shelf, cases)| cases.iter().map(move |(k, v)| (shelf, k, v)),
+        );
+        for (shelf, case_id, _slots) in tracked_cases {
+            let mut case = sitrep.cases.case_mut(case_id).ok_or_else(|| {
+                anyhow::anyhow!(
+                    "we are tracking case {case_id} but it no longer exists \
+                     in the sitrep builder (this is a bug)"
+                )
+            })?;
+            slog::debug!(
+                &self.log,
+                "analyzing tracked case...";
+                "case_id" => %case_id,
+                "shelf" => shelf,
+            );
+
+            // These will be used for diagnosing problems eventually...
+            // TODO(eliza): these currently always assume that we expect 6
+            // PSUs/shelf...
+            let mut psus_ok = [true; N_PSUS];
+            let mut psus_present = [true; N_PSUS];
+
+            // TODO(eliza): this iterates over ereports ordered by
+            // (restart_id, ENA)...but the restart IDs are not temporally
+            // ordered. we should figure that out...
+            let mut case_ereports =
+                case.ereports.iter().map(|e| e.clone()).collect::<Vec<_>>();
+            case_ereports
+                .sort_by_key(|e| (e.ereport.time_collected, e.ereport.id));
+            for CaseEreport { ereport, assigned_sitrep_id, comment } in
+                case_ereports
+            {
+                let ereport::Reporter::Sp {
+                    sp_type: SpType::Power,
+                    slot: psc_slot,
+                } = ereport.reporter
+                else {
+                    continue;
+                };
+                let Some(class) = &ereport.class else {
+                    slog::warn!(
+                        self.log,
+                        "skipping ereport with no class";
+                        "case_id" => %case_id,
+                        "ereport_id" => %ereport.id(),
+                        "ereport_reporter" => %ereport.reporter,
+                        "assigned_in_sitrep" => ?assigned_sitrep_id,
+                        "comment" => %comment,
+                    );
+                    continue;
+                };
+
+                let parsed_ereport = match ParsedEreport::from_raw(&ereport) {
+                    Ok(e) => e,
+                    Err(err) => {
+                        slog::warn!(
+                            self.log,
+                            "could not interpret ereport!";
+                            "case_id" => %case_id,
+                            "ereport_id" => %ereport.id(),
+                            "ereport_reporter" => %ereport.reporter,
+                            "ereport_class" => %class,
+                            "assigned_in_sitrep" => ?assigned_sitrep_id,
+                            "comment" => %comment,
+                            "error" => %InlineErrorChain::new(&err),
+                        );
+                        continue;
+                    }
+                };
+                let PscEreport { psu, class } = parsed_ereport.report;
+                if psu.slot as usize >= N_PSUS {
+                    slog::warn!(
+                        self.log,
+                        "this is weird: I only know about {N_PSUS} PSU slots, \
+                         but this ereport claims to involve slot {}",
+                        psu.slot;
+                        "case_id" => %case_id,
+                        "ereport_id" => %ereport.id(),
+                        "ereport_reporter" => %ereport.reporter,
+                        "assigned_in_sitrep" => ?assigned_sitrep_id,
+                        "comment" => %comment,
+                    );
+                    continue;
+                }
+                match class {
+                    EreportClass::PsuInserted => {
+                        psus_present[psu.slot as usize] = true;
+                        // assume that if the ereport was assigned in a previous
+                        // sitrep, any alerts needed were already requested.
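+                        // (`assigned_sitrep_id` equals the sitrep currently
+                        // being built only when the ereport was newly assigned
+                        // during this build.)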
+                        //
+                        // XXX(eliza): is this actually a good heuristic in the
+                        // face of software updates potentially introducing new
+                        // alerts or new DE logic for generating them? i dunno.
+                        // on the other hand, we wouldn't want a new Nexus
+                        // version that introduces alerting for new events to
+                        // suddenly pop a bunch of alerts into existence for
+                        // something that happened ages ago. but on the other
+                        // hand, if a case was closed, we wouldn't even be
+                        // analyzing it here...i dunno. figure this out.
+                        //
+                        // TODO(eliza): debounce; check if we currently think
+                        // this PSU is in that slot already before alerting
+                        // again...
+                        if assigned_sitrep_id == sitrep.sitrep_id {
+                            case.request_alert(
+                                &alert::power_shelf::PsuInserted::V0 {
+                                    psc_psu: alert::power_shelf::PscPsu {
+                                        psc_id: parsed_ereport
+                                            .baseboard
+                                            .map(alert::VpdIdentity::from),
+                                        psc_slot,
+                                        psu_slot: Some(psu.slot as u16),
+                                        psu_id: psu.fruid.into(),
+                                    },
+                                    time: ereport.time_collected,
+                                },
+                            )?;
+                        }
+                    }
+                    EreportClass::PsuRemoved => {
+                        psus_present[psu.slot as usize] = false;
+                        // assume that if the ereport was assigned in a previous
+                        // sitrep, any alerts needed were already requested.
+                        //
+                        // XXX(eliza): is this actually a good heuristic in the
+                        // face of software updates potentially introducing new
+                        // alerts or new DE logic for generating them? i dunno.
+                        // on the other hand, we wouldn't want a new Nexus
+                        // version that introduces alerting for new events to
+                        // suddenly pop a bunch of alerts into existence for
+                        // something that happened ages ago. but on the other
+                        // hand, if a case was closed, we wouldn't even be
+                        // analyzing it here...i dunno. figure this out.
+                        if assigned_sitrep_id == sitrep.sitrep_id {
+                            case.request_alert(
+                                &alert::power_shelf::PsuRemoved::V0 {
+                                    psc_psu: alert::power_shelf::PscPsu {
+                                        psc_id: parsed_ereport
+                                            .baseboard
+                                            .map(alert::VpdIdentity::from),
+                                        psc_slot,
+                                        psu_slot: Some(psu.slot as u16),
+                                        psu_id: psu.fruid.into(),
+                                    },
+                                    time: ereport.time_collected,
+                                },
+                            )?;
+                        }
+                    }
+                    _ => {
+                        // TODO(eliza): PWR_GOOD transitions are not yet
+                        // handled here.
+                        slog::debug!(
+                            &self.log,
+                            "TODO(eliza): unhandled PSC ereport class {class:?}";
+                            "case_id" => %case_id,
+                        );
+                    }
+                }
+            }
+
+            slog::info!(&self.log,
+                "analyzed all ereports for case {case_id}";
+                "case_id" => %case_id,
+                "shelf" => shelf,
+                "psus_present" => ?psus_present,
+                "psus_ok" => ?psus_ok,
+            );
+            // TODO(eliza): check this against inventory, expected number of
+            // PSUs...
+            // TODO(eliza): this is where we would open/update/resolve Active
+            // Problems...
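+            // For now, a tracked case is simply closed once every PSU slot is
+            // populated and asserting PWR_GOOD: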
+            if psus_present == [true; N_PSUS] && psus_ok == [true; N_PSUS] {
+                case.close();
+            }
+        }
+
+        Ok(())
+    }
+}
+
+#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
+#[repr(u8)]
+enum Shelf {
+    Power0 = 0,
+    Power1 = 1,
+}
+
+#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
+enum ShelfComponent {
+    Unknown,
+    Psc,
+    MultiplePsus,
+    SpecificPsu(Psu),
+}
+
+#[derive(
+    Copy,
+    Clone,
+    Debug,
+    PartialEq,
+    Eq,
+    Hash,
+    serde::Serialize,
+    serde::Deserialize,
+)]
+#[serde(rename_all = "SCREAMING_SNAKE_CASE")]
+#[repr(u8)]
+enum Psu {
+    Psu0 = 0,
+    Psu1 = 1,
+    Psu2 = 2,
+    Psu3 = 3,
+    Psu4 = 4,
+    Psu5 = 5,
+}
+
+fn ereport_psu_slot(ereport: &Ereport, log: &slog::Logger) -> Option<usize> {
+    let slot =
+        grab_json_value::<usize>(&ereport, "slot", &ereport.report, log)?;
+    if slot >= N_PSUS {
+        slog::warn!(
+            &log,
+            "this is weird: I only know about power shelves with \
+             {N_PSUS} PSU slots, but this ereport claims to \
+             involve slot {slot}";
+            "ereport_id" => %ereport.id,
+            "ereport_class" => ?ereport.class,
+            "slot" => slot,
+        );
+        None
+    } else {
+        Some(slot)
+    }
+}
+
+fn grab_json_value<T: DeserializeOwned>(
+    ereport: &Ereport,
+    key: &str,
+    obj: &Value,
+    log: &slog::Logger,
+) -> Option<T> {
+    let v = match obj.get(key) {
+        Some(v) => v,
+        None => {
+            slog::warn!(
+                log,
+                "expected ereport to contain a '{key}' field";
+                "ereport_id" => %ereport.id,
+                "ereport_class" => ?ereport.class,
+            );
+            return None;
+        }
+    };
+    match serde_json::from_value(v.clone()) {
+        Ok(v) => Some(v),
+        Err(e) => {
+            slog::warn!(
+                log,
+                "expected ereport '{key}' field to deserialize as a {}",
+                std::any::type_name::<T>();
+                "ereport_id" => %ereport.id,
+                "ereport_class" => ?ereport.class,
+                "error" => %e,
+            );
+            None
+        }
+    }
+}
+
+impl ParsedEreport<PscEreport> {
+    fn psc_location(&self) -> anyhow::Result<(Shelf, ShelfComponent)> {
+        let shelf = match self.ereport.reporter {
+            ereport::Reporter::Sp { sp_type: SpType::Power, slot: 0 } => {
+                Shelf::Power0
+            }
+            ereport::Reporter::Sp { sp_type: SpType::Power, slot: 1 } => {
+                Shelf::Power1
+            }
+            ereport::Reporter::Sp { sp_type: SpType::Power, slot } => {
+                anyhow::bail!(
+                    "I only know about power shelves 0 and 1, but ereport {} \
+                     was (allegedly) reported by a power shelf in slot {slot}",
+                    self.ereport.id
+                )
+            }
+            other_thing => {
+                anyhow::bail!(
+                    "weird: ereport {} has a PSC-related ereport class, but \
+                     was reported by {other_thing}",
+                    self.ereport.id
+                )
+            }
+        };
+
+        Ok((shelf, ShelfComponent::SpecificPsu(self.report.psu.refdes)))
+    }
+}
+
+#[derive(Debug, Eq, PartialEq, serde::Deserialize)]
+struct PscEreport {
+    #[serde(flatten)]
+    psu: PsuId,
+    #[serde(flatten)]
+    class: EreportClass,
+}
+
+#[derive(Debug, Eq, PartialEq, serde::Deserialize)]
+#[serde(tag = "k")]
+enum EreportClass {
+    #[serde(rename = "hw.insert.psu")]
+    PsuInserted,
+    #[serde(rename = "hw.remove.psu")]
+    PsuRemoved,
+    #[serde(rename = "hw.pwr.pwr_good.bad")]
+    PwrBad { pmbus_status: PmbusStatus },
+    #[serde(rename = "hw.pwr.pwr_good.good")]
+    PwrGood { pmbus_status: PmbusStatus },
+}
+
+#[derive(Debug, Eq, PartialEq, serde::Deserialize)]
+struct PsuId {
+    refdes: Psu,
+    rail: String,
+    slot: u8,
+    fruid: PsuFruid,
+}
+
+// These are the same field names that Hubris uses in the ereport. See:
+// https://github.com/oxidecomputer/hubris/blob/ec18e4f11aaa14600c61f67335c32b250ef38269/drv/psc-seq-server/src/main.rs#L1107-L1117
+#[derive(serde::Deserialize, Debug, PartialEq, Eq, Default)]
+struct PsuFruid {
+    mfr: Option<String>,
+    mpn: Option<String>,
+    serial: Option<String>,
+    fw_rev: Option<String>,
+}
+
+impl From<PsuFruid> for alert::power_shelf::PsuIdentity {
+    fn from(fruid: PsuFruid) -> Self {
+        let PsuFruid { mfr, mpn, serial, fw_rev } = fruid;
+        Self {
+            manufacturer: mfr,
+            part_number: mpn,
+            serial_number: serial,
+            firmware_revision: fw_rev,
+        }
+    }
+}
+
+#[derive(Copy, Clone, Debug, Eq, PartialEq, serde::Deserialize)]
+// TODO(eliza): bitflags types for these?
+// (field widths assumed here: PMBus STATUS_WORD is 16 bits, and the other
+// status registers are 8 bits.)
+struct PmbusStatus {
+    word: Option<u16>,
+    input: Option<u8>,
+    iout: Option<u8>,
+    vout: Option<u8>,
+    temp: Option<u8>,
+    cml: Option<u8>,
+    mfr: Option<u8>,
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+    use crate::ereport_analysis::test as ereport_test;
+    use crate::test_util::FmTest;
+    use chrono::{DateTime, Utc};
+    use nexus_types::fm::{AlertClass, ereport::Reporter};
+    use omicron_uuid_kinds::OmicronZoneUuid;
+    use std::time::Duration;
+
+    #[test]
+    fn test_pwr_bad_ereport() {
+        let FmTest { mut reporters, .. } = FmTest::new("test_pwr_bad_ereport");
+
+        let mut reporter = reporters
+            .reporter(Reporter::Sp { sp_type: SpType::Power, slot: 0 });
+
+        let ereport = Arc::new(reporter.parse_ereport(
+            DateTime::<Utc>::MIN_UTC,
+            ereport_test::PSU_REMOVE_JSON,
+        ));
+        let ereport =
+            match ParsedEreport::<PscEreport>::from_raw(dbg!(&ereport)) {
+                Ok(ereport) => ereport,
+                Err(e) => {
+                    panic!("ereport could not be parsed: {e}")
+                }
+            };
+        dbg!(&ereport);
+        assert_eq!(
+            dbg!(ereport.psc_location()).unwrap(),
+            (Shelf::Power0, ShelfComponent::SpecificPsu(Psu::Psu4))
+        )
+    }
+
+    #[test]
+    fn test_remove_insert_pwr_good() {
+        let FmTest { logctx, mut reporters, system_builder, sitrep_rng } =
+            FmTest::new("test_remove_insert_pwr_good");
+
+        let mut reporter = reporters
+            .reporter(Reporter::Sp { sp_type: SpType::Power, slot: 0 });
+        let (example_system, _) = system_builder.nsleds(2).build();
+        let mut sitrep = SitrepBuilder::new_with_rng(
+            &logctx.log,
+            &example_system.collection,
+            None,
+            sitrep_rng,
+        );
+        // It's the beginning of time!
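+        // (a fixed timestamp keeps the simulated ereports, and therefore the
+        // resulting sitrep, deterministic.)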
+        let t0 = DateTime::<Utc>::MIN_UTC;
+
+        let mut de = PowerShelfDiagnosis::new(&logctx.log);
+        de.analyze_ereport(
+            &mut sitrep,
+            &Arc::new(
+                reporter.parse_ereport(t0, ereport_test::PSU_REMOVE_JSON),
+            ),
+        )
+        .expect("analyzing ereport 1 should succeed");
+
+        de.analyze_ereport(
+            &mut sitrep,
+            &Arc::new(reporter.parse_ereport(
+                t0 + Duration::from_secs(1),
+                ereport_test::PSU_INSERT_JSON,
+            )),
+        )
+        .expect("analyzing ereport 2 should succeed");
+
+        de.analyze_ereport(
+            &mut sitrep,
+            &Arc::new(reporter.parse_ereport(
+                t0 + Duration::from_secs(2),
+                ereport_test::PSU_PWR_GOOD_JSON,
+            )),
+        )
+        .expect("analyzing ereport 3 should succeed");
+
+        de.finish(&mut sitrep).expect("finish should return Ok");
+
+        let sitrep = sitrep.build(OmicronZoneUuid::nil());
+
+        eprintln!("--- SITREP ---\n\n{sitrep:#?}");
+
+        let case0 = {
+            let mut cases = sitrep.cases.iter();
+            let case0 = cases.next().expect("sitrep should have a case");
+            assert_eq!(
+                cases.next(),
+                None,
+                "sitrep should have exactly one case"
+            );
+            case0
+        };
+
+        eprintln!("\n--- CASE ---\n\n{case0}");
+
+        let mut insert_alert = None;
+        let mut remove_alert = None;
+        for alert in &case0.alerts_requested {
+            match alert.class {
+                AlertClass::PsuInserted if insert_alert.is_none() => {
+                    insert_alert = Some(alert);
+                }
+                AlertClass::PsuInserted => {
+                    panic!(
+                        "expected only one PSU inserted alert, saw multiple:\n\
+                         1: {insert_alert:#?}\n\n2: {alert:#?}"
+                    );
+                }
+                AlertClass::PsuRemoved if remove_alert.is_none() => {
+                    remove_alert = Some(alert);
+                }
+                AlertClass::PsuRemoved => {
+                    panic!(
+                        "expected only one PSU removed alert, saw multiple:\n\
+                         1: {remove_alert:#?}\n\n2: {alert:#?}"
+                    );
+                }
+            }
+        }
+
+        assert!(insert_alert.is_some(), "no PSU inserted alert was requested!");
+        assert!(remove_alert.is_some(), "no PSU removed alert was requested!");
+        assert!(
+            !case0.is_open(),
+            "case should have been closed since everything is okay"
+        );
+
+        logctx.cleanup_successful();
+    }
+}
diff --git a/nexus/fm/src/ereport_analysis.rs b/nexus/fm/src/ereport_analysis.rs
new file mode 100644
index 00000000000..98e29c20dd5
--- /dev/null
+++ b/nexus/fm/src/ereport_analysis.rs
@@ -0,0 +1,213 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at https://mozilla.org/MPL/2.0/.
+
+//! Ereport analysis tools.
+
+use nexus_types::fm as types;
+use serde::Deserialize;
+use serde::de::DeserializeOwned;
+use std::sync::Arc;
+
+/// Metadata that should be present in *all* Hubris ereports.
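+/// (See the sample ereports in the test module below for what these fields
+/// look like in practice.)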
+#[derive(Clone, Debug, Eq, PartialEq, Hash, Deserialize)]
+pub(crate) struct HubrisMetadata {
+    pub hubris_archive_id: String,
+    pub hubris_task_gen: u16,
+    pub hubris_task_name: String,
+    pub hubris_uptime_ms: u64,
+    // Added by MGS
+    pub ereport_message_version: u8,
+
+    #[serde(rename = "v")]
+    pub version: Option<u8>,
+}
+
+#[derive(Clone, Debug, Eq, PartialEq, Hash, Deserialize)]
+pub(crate) struct Baseboard {
+    #[serde(rename = "baseboard_part_number")]
+    pub part_number: String,
+    #[serde(rename = "baseboard_rev")]
+    pub rev: u32,
+    #[serde(rename = "baseboard_serial_number")]
+    pub serial_number: String,
+}
+
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub(crate) struct ParsedEreport<D> {
+    pub ereport: Arc<types::Ereport>,
+    pub hubris_metadata: Option<HubrisMetadata>,
+    pub baseboard: Option<Baseboard>,
+    pub report: D,
+}
+
+impl<D: DeserializeOwned> ParsedEreport<D> {
+    pub(crate) fn from_raw(
+        ereport: &Arc<types::Ereport>,
+    ) -> Result<Self, serde_json::Error> {
+        let fields: EreportFields<D> =
+            serde_json::from_value(ereport.data.report.clone())?;
+        let EreportFields { hubris_metadata, baseboard, report } = fields;
+        Ok(Self {
+            ereport: ereport.clone(),
+            hubris_metadata,
+            baseboard,
+            report,
+        })
+    }
+}
+
+#[derive(Debug, serde::Deserialize)]
+struct EreportFields<D> {
+    #[serde(flatten)]
+    hubris_metadata: Option<HubrisMetadata>,
+    #[serde(flatten)]
+    baseboard: Option<Baseboard>,
+    #[serde(flatten)]
+    report: D,
+}
+
+#[cfg(test)]
+pub(crate) mod test {
+    use super::*;
+
+    // These are real-life ereports I copied from the dogfood rack.
+    pub(crate) const PSU_REMOVE_JSON: &str = r#"{
+        "baseboard_part_number": "913-0000003",
+        "baseboard_rev": 8,
+        "baseboard_serial_number": "BRM45220004",
+        "ereport_message_version": 0,
+        "fruid": {
+            "fw_rev": "0701",
+            "mfr": "Murata-PS",
+            "mpn": "MWOCP68-3600-D-RM",
+            "serial": "LL2216RB003Z"
+        },
+        "hubris_archive_id": "qSm4IUtvQe0",
+        "hubris_task_gen": 0,
+        "hubris_task_name": "sequencer",
+        "hubris_uptime_ms": 1197337481,
+        "k": "hw.remove.psu",
+        "rail": "V54_PSU4",
+        "refdes": "PSU4",
+        "slot": 4,
+        "v": 0
+    }"#;
+
+    pub(crate) const PSU_INSERT_JSON: &str = r#"{
+        "baseboard_part_number": "913-0000003",
+        "baseboard_rev": 8,
+        "baseboard_serial_number": "BRM45220004",
+        "ereport_message_version": 0,
+        "fruid": {
+            "fw_rev": "0701",
+            "mfr": "Murata-PS",
+            "mpn": "MWOCP68-3600-D-RM",
+            "serial": "LL2216RB003Z"
+        },
+        "hubris_archive_id": "qSm4IUtvQe0",
+        "hubris_task_gen": 0,
+        "hubris_task_name": "sequencer",
+        "hubris_uptime_ms": 1197337481,
+        "k": "hw.insert.psu",
+        "rail": "V54_PSU4",
+        "refdes": "PSU4",
+        "slot": 4,
+        "v": 0
+    }"#;
+
+    pub(crate) const PSU_PWR_BAD_JSON: &str = r#"{
+        "baseboard_part_number": "913-0000003",
+        "baseboard_rev": 8,
+        "baseboard_serial_number": "BRM45220004",
+        "ereport_message_version": 0,
+        "fruid": {
+            "fw_rev": "0701",
+            "mfr": "Murata-PS",
+            "mpn": "MWOCP68-3600-D-RM",
+            "serial": "LL2216RB003Z"
+        },
+        "hubris_archive_id": "qSm4IUtvQe0",
+        "hubris_task_gen": 0,
+        "hubris_task_name": "sequencer",
+        "hubris_uptime_ms": 1197408566,
+        "k": "hw.pwr.pwr_good.bad",
+        "pmbus_status": {
+            "cml": 0,
+            "input": 48,
+            "iout": 0,
+            "mfr": 0,
+            "temp": 0,
+            "vout": 0,
+            "word": 10312
+        },
+        "rail": "V54_PSU4",
+        "refdes": "PSU4",
+        "slot": 4,
+        "v": 0
+    }"#;
+
+    pub(crate) const PSU_PWR_GOOD_JSON: &str = r#"{
+        "baseboard_part_number": "913-0000003",
+        "baseboard_rev": 8,
+        "baseboard_serial_number": "BRM45220004",
+        "ereport_message_version": 0,
+        "fruid": {
+            "fw_rev": "0701",
+            "mfr": "Murata-PS",
+            "mpn": "MWOCP68-3600-D-RM",
+            "serial": "LL2216RB003Z"
+        },
+        "hubris_archive_id": "qSm4IUtvQe0",
+        "hubris_task_gen": 0,
+        "hubris_task_name": "sequencer",
+ "hubris_uptime_ms": 1197408580, + "k": "hw.pwr.pwr_good.good", + "pmbus_status": { + "cml": 0, + "input": 0, + "iout": 0, + "mfr": 0, + "temp": 0, + "vout": 0, + "word": 0 + }, + "rail": "V54_PSU4", + "refdes": "PSU4", + "slot": 4, + "v": 0 + }"#; + + #[test] + fn test_hubris_metadata() { + let expected_metadata = HubrisMetadata { + hubris_archive_id: "qSm4IUtvQe0".to_string(), + hubris_task_gen: 0, + hubris_task_name: "sequencer".to_string(), + hubris_uptime_ms: 0, + ereport_message_version: 0, + version: Some(0), + }; + let ereports = [ + (PSU_REMOVE_JSON, 1197337481), + (PSU_INSERT_JSON, 1197337481), + (PSU_PWR_BAD_JSON, 1197408566), + (PSU_PWR_GOOD_JSON, 1197408580), + ]; + + for (json, hubris_uptime_ms) in ereports { + let json_value: serde_json::Value = + serde_json::from_str(json).expect("JSON should parse"); + let metadata: HubrisMetadata = + serde_json::from_value(dbg!(json_value)) + .expect("value should contain a HubrisMetadata"); + assert_eq!( + metadata, + HubrisMetadata { + hubris_uptime_ms, + ..expected_metadata.clone() + } + ); + } + } +} diff --git a/nexus/fm/src/lib.rs b/nexus/fm/src/lib.rs new file mode 100644 index 00000000000..57fc0943427 --- /dev/null +++ b/nexus/fm/src/lib.rs @@ -0,0 +1,14 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Fault management + +pub mod alert; +pub mod builder; +pub mod diagnosis; +pub mod ereport_analysis; +pub use builder::{CaseBuilder, SitrepBuilder}; + +#[cfg(test)] +pub mod test_util; diff --git a/nexus/fm/src/test_util.rs b/nexus/fm/src/test_util.rs new file mode 100644 index 00000000000..70b640c528d --- /dev/null +++ b/nexus/fm/src/test_util.rs @@ -0,0 +1,165 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +use crate::builder::SitrepBuilderRng; +use chrono::Utc; +use nexus_reconfigurator_planning::example; +use nexus_types::fm::ereport::{ + Ena, Ereport, EreportData, EreportId, Reporter, +}; +use omicron_test_utils::dev; +use omicron_uuid_kinds::EreporterRestartKind; +use omicron_uuid_kinds::EreporterRestartUuid; +use omicron_uuid_kinds::OmicronZoneKind; +use omicron_uuid_kinds::OmicronZoneUuid; +use rand::rngs::StdRng; +use slog::Logger; +use typed_rng::TypedUuidRng; + +pub struct FmTest { + pub logctx: dev::LogContext, + pub reporters: SimReporters, + pub sitrep_rng: SitrepBuilderRng, + pub system_builder: example::ExampleSystemBuilder, +} + +impl FmTest { + pub fn new(test_name: &str) -> Self { + let logctx = dev::test_setup_log(test_name); + let example_system_builder = + example::ExampleSystemBuilder::new(&logctx.log, test_name); + let reporters = SimReporters::new( + test_name, + logctx.log.new(slog::o!("component" => "sim-reporters")), + ); + Self { + logctx, + reporters, + sitrep_rng: SitrepBuilderRng::from_seed(test_name), + system_builder: example_system_builder, + } + } +} + +pub struct SimReporters { + log: slog::Logger, + parent: StdRng, + collector_id_rng: TypedUuidRng, +} + +impl SimReporters { + fn new(test_name: &str, log: slog::Logger) -> Self { + let mut parent = typed_rng::from_seed(test_name, "sim-reporters"); + // TODO(eliza): would be more realistic to pick something from the + // example system's omicron zones, but these UUIDs are only used for + // debugging purposes... 
+ let collector_id_rng = + TypedUuidRng::from_parent_rng(&mut parent, "collector-ids"); + Self { parent, collector_id_rng, log } + } + + pub fn reporter(&mut self, reporter: Reporter) -> SimReporter { + let collector_id = self.collector_id_rng.next(); + let mut restart_id_rng = TypedUuidRng::from_parent_rng( + &mut self.parent, + ("restart_id", reporter), + ); + let restart_id = restart_id_rng.next(); + SimReporter { + reporter, + restart_id, + ena: Ena(0x1), + restart_id_rng, + collector_id, + log: self.log.new(slog::o!("reporter" => reporter.to_string())), + } + } +} + +pub struct SimReporter { + reporter: Reporter, + restart_id: EreporterRestartUuid, + ena: Ena, + restart_id_rng: TypedUuidRng, + + // TODO(eliza): this is not super realistic, as it will give a new "nexus" + // to each reporter...but the DEs don't actually care who collected the + // ereport, and we just need something to put in there. + collector_id: OmicronZoneUuid, + + log: slog::Logger, +} + +impl SimReporter { + #[track_caller] + pub fn parse_ereport( + &mut self, + now: chrono::DateTime, + json: &str, + ) -> Ereport { + self.mk_ereport( + now, + json.parse().expect("must be called with valid ereport JSON"), + ) + } + + pub fn mk_ereport( + &mut self, + now: chrono::DateTime, + json: serde_json::Map, + ) -> Ereport { + self.ena.0 += 1; + mk_ereport( + &self.log, + self.reporter, + EreportId { ena: self.ena, restart_id: self.restart_id }, + self.collector_id, + now, + json, + ) + } + + pub fn restart(&mut self) { + self.ena = Ena(0x1); + self.restart_id = self.restart_id_rng.next(); + } +} + +pub fn mk_ereport( + log: &slog::Logger, + reporter: Reporter, + id: EreportId, + collector_id: OmicronZoneUuid, + time_collected: chrono::DateTime, + json: serde_json::Map, +) -> Ereport { + let data = match reporter { + Reporter::Sp { .. } => { + let raw = ereport_types::Ereport { ena: id.ena, data: json }; + EreportData::from_sp_ereport( + log, + id.restart_id, + raw, + time_collected, + collector_id, + ) + } + Reporter::HostOs { .. } => { + todo!( + "eliza: when we get around to actually ingesting host ereport \ + JSON, figure out what the field names for serial and part \ + numbers would be!", + ); + } + }; + slog::info!( + &log, + "simulating an ereport: {}", data.id; + "ereport_id" => %data.id, + "ereport_class" => ?data.class, + "serial_number" => ?data.serial_number, + "part_number" => ?data.part_number, + ); + Ereport { reporter, data } +} diff --git a/nexus/src/app/alert.rs b/nexus/src/app/alert.rs index 91f21571723..4725e396cfa 100644 --- a/nexus/src/app/alert.rs +++ b/nexus/src/app/alert.rs @@ -193,14 +193,6 @@ impl Nexus { ) -> Result { let alert = self.datastore().alert_create(opctx, id, class, event).await?; - slog::debug!( - &opctx.log, - "published alert"; - "alert_id" => ?id, - "alert_class" => %alert.class, - "time_created" => ?alert.identity.time_created, - ); - // Once the alert has been inserted, activate the dispatcher task to // ensure its propagated to receivers. 
self.background_tasks.task_alert_dispatcher.activate(); diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs index 206223ff5b0..a1a8e9e2d69 100644 --- a/nexus/src/app/background/init.rs +++ b/nexus/src/app/background/init.rs @@ -103,6 +103,7 @@ use super::tasks::dns_propagation; use super::tasks::dns_servers; use super::tasks::ereport_ingester; use super::tasks::external_endpoints; +use super::tasks::fm_rendezvous; use super::tasks::fm_sitrep_gc; use super::tasks::fm_sitrep_load; use super::tasks::instance_reincarnation; @@ -268,6 +269,7 @@ impl BackgroundTasksInitializer { task_webhook_deliverator: Activator::new(), task_sp_ereport_ingester: Activator::new(), task_reconfigurator_config_loader: Activator::new(), + task_fm_rendezvous: Activator::new(), task_fm_sitrep_loader: Activator::new(), task_fm_sitrep_gc: Activator::new(), task_probe_distributor: Activator::new(), @@ -357,6 +359,7 @@ impl BackgroundTasksInitializer { task_webhook_deliverator, task_sp_ereport_ingester, task_reconfigurator_config_loader, + task_fm_rendezvous, task_fm_sitrep_loader, task_fm_sitrep_gc, task_probe_distributor, @@ -1127,6 +1130,18 @@ impl BackgroundTasksInitializer { activator: task_fm_sitrep_loader, }); + driver.register(TaskDefinition { + name: "fm_rendezvous", + description: + "updates externally visible database tables to match the \ + current fault management sitrep", + period: config.fm.rendezvouz_period_secs, + task_impl: Box::new(fm_rendezvous::FmRendezvous::new(datastore.clone(), sitrep_watcher.clone(), task_alert_dispatcher.clone())), + opctx: opctx.child(BTreeMap::new()), + watchers: vec![Box::new(sitrep_watcher.clone())], + activator: task_fm_rendezvous, + }); + driver.register(TaskDefinition { name: "fm_sitrep_gc", description: "garbage collects fault management situation reports", diff --git a/nexus/src/app/background/tasks/fm_rendezvous.rs b/nexus/src/app/background/tasks/fm_rendezvous.rs new file mode 100644 index 00000000000..db5321ddb1e --- /dev/null +++ b/nexus/src/app/background/tasks/fm_rendezvous.rs @@ -0,0 +1,163 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Background task for executing requested state changes in the fault +//! management blueprint. 
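+//!
+//! For now this means creating any alerts requested by the current sitrep;
+//! executing other requested changes (e.g. support bundles, problems) is
+//! still TODO.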
+
+use crate::app::background::BackgroundTask;
+use crate::app::background::tasks::fm_sitrep_load::CurrentSitrep;
+use futures::future::BoxFuture;
+use nexus_background_task_interface::Activator;
+use nexus_db_queries::context::OpContext;
+use nexus_db_queries::db::DataStore;
+use nexus_types::fm::AlertRequest;
+use nexus_types::fm::Sitrep;
+use nexus_types::fm::SitrepVersion;
+use nexus_types::internal_api::background::SitrepAlertRequestStatus as AlertStatus;
+use nexus_types::internal_api::background::SitrepExecutionStatus as Status;
+use omicron_common::api::external::Error;
+use serde_json::json;
+use slog_error_chain::InlineErrorChain;
+use std::sync::Arc;
+use tokio::sync::watch;
+
+pub struct FmRendezvous {
+    datastore: Arc<DataStore>,
+    sitrep_watcher: watch::Receiver<CurrentSitrep>,
+    alert_dispatcher: Activator,
+}
+
+impl BackgroundTask for FmRendezvous {
+    fn activate<'a>(
+        &'a mut self,
+        opctx: &'a OpContext,
+    ) -> BoxFuture<'a, serde_json::Value> {
+        Box::pin(async {
+            let status = self.actually_activate(opctx).await;
+            match serde_json::to_value(status) {
+                Ok(val) => val,
+                Err(err) => {
+                    let err = format!(
+                        "could not serialize task status: {}",
+                        InlineErrorChain::new(&err)
+                    );
+                    json!({ "error": err })
+                }
+            }
+        })
+    }
+}
+
+impl FmRendezvous {
+    pub fn new(
+        datastore: Arc<DataStore>,
+        rx: watch::Receiver<CurrentSitrep>,
+        alert_dispatcher: Activator,
+    ) -> Self {
+        Self { datastore, sitrep_watcher: rx, alert_dispatcher }
+    }
+
+    async fn actually_activate(&mut self, opctx: &OpContext) -> Status {
+        let Some(sitrep) = self.sitrep_watcher.borrow_and_update().clone()
+        else {
+            return Status::NoSitrep;
+        };
+
+        // TODO(eliza): as we start doing other things (e.g. requesting support
+        // bundles, updating problems), consider spawning these in their own
+        // tasks...
+        let alerts = self.create_requested_alerts(&sitrep, opctx).await;
+
+        Status::Executed { sitrep_id: sitrep.1.id(), alerts }
+    }
+
+    async fn create_requested_alerts(
+        &self,
+        sitrep: &Arc<(SitrepVersion, Sitrep)>,
+        opctx: &OpContext,
+    ) -> AlertStatus {
+        let (_, ref sitrep) = **sitrep;
+        let mut status = AlertStatus::default();
+
+        // XXX(eliza): is it better to allocate all of these into a big array
+        // and do a single `INSERT INTO` query, or iterate over them one by one
+        // (not allocating) but insert one at a time?
+        for (case_id, req) in sitrep.alerts_requested() {
+            let &AlertRequest { id, requested_sitrep_id, class, ref payload } =
+                req;
+            status.total_alerts_requested += 1;
+            if requested_sitrep_id == sitrep.id() {
+                status.current_sitrep_alerts_requested += 1;
+            }
+            let class = class.into();
+            match self
+                .datastore
+                .alert_create(&opctx, id, class, payload.clone())
+                .await
+            {
+                // Alert already exists, that's fine.
+                Err(Error::Conflict { ..
}) => {} + Err(e) => { + slog::warn!( + opctx.log, + "failed to create requested alert"; + "case_id" => %case_id, + "alert_id" => %id, + "alert_class" => %class, + "error" => %e, + ); + status + .errors + .push(format!("alert {id} (class: {class}): {e}")); + } + Ok(_) => status.alerts_created += 1, + } + } + + let n_errors = status.errors.len(); + if n_errors > 0 { + slog::warn!( + opctx.log, + "created {} alerts requested by the current sitrep, but \ + {n_errors} alerts could not be created!", + status.alerts_created; + "sitrep_id" => %sitrep.id(), + "total_alerts_requested" => status.total_alerts_requested, + "alerts_created" => status.alerts_created, + "errors" => n_errors, + ); + } else if status.alerts_created > 0 { + slog::info!( + opctx.log, + "created {} alerts requested by the current sitrep", + status.alerts_created; + "sitrep_id" => %sitrep.id(), + "total_alerts_requested" => status.total_alerts_requested, + "alerts_created" => status.alerts_created, + ); + } else if status.total_alerts_requested > 0 { + slog::debug!( + opctx.log, + "all alerts requested by the current sitrep already exist"; + "sitrep_id" => %sitrep.id(), + "total_alerts_requested" => status.total_alerts_requested, + "alerts_created" => status.alerts_created, + ); + } else { + slog::debug!( + opctx.log, + "current sitrep requests no alerts"; + "sitrep_id" => %sitrep.id(), + "total_alerts_requested" => status.total_alerts_requested, + "alerts_created" => status.alerts_created, + ); + } + + // We created some alerts, so let the alert dispatcher know. + if status.alerts_created > 0 { + self.alert_dispatcher.activate(); + } + + status + } +} diff --git a/nexus/src/app/background/tasks/fm_sitrep_gc.rs b/nexus/src/app/background/tasks/fm_sitrep_gc.rs index 92214faef4b..372ae80c6a7 100644 --- a/nexus/src/app/background/tasks/fm_sitrep_gc.rs +++ b/nexus/src/app/background/tasks/fm_sitrep_gc.rs @@ -152,9 +152,10 @@ mod tests { time_created: Utc::now(), parent_sitrep_id: None, }, + cases: Default::default(), }; datastore - .fm_sitrep_insert(&opctx, &sitrep1) + .fm_sitrep_insert(&opctx, sitrep1.clone()) .await .expect("inserting initial sitrep should succeed"); @@ -174,9 +175,10 @@ mod tests { time_created: Utc::now(), parent_sitrep_id: Some(sitrep1.metadata.id), }, + cases: Default::default(), }; datastore - .fm_sitrep_insert(&opctx, &sitrep2) + .fm_sitrep_insert(&opctx, sitrep2.clone()) .await .expect("inserting child sitrep should succeed"); @@ -264,9 +266,12 @@ mod tests { comment: format!("test sitrep v{i}; orphan {i}"), time_created: Utc::now(), parent_sitrep_id, + // TODO(eliza): we should populate cases and assert they get + // cleaned up... 
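+                    // (for now, these sitreps are inserted with empty case
+                    // sets; populating them is the TODO above.)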
                },
+                cases: Default::default(),
             };
-            match datastore.fm_sitrep_insert(&opctx, &sitrep).await {
+            match datastore.fm_sitrep_insert(&opctx, sitrep).await {
                 Ok(_) => {
                     panic!("inserting sitrep v{v} orphan {i} should not succeed")
                 }
diff --git a/nexus/src/app/background/tasks/fm_sitrep_load.rs b/nexus/src/app/background/tasks/fm_sitrep_load.rs
index 0a2c52f95b1..0dfaf2f7b0f 100644
--- a/nexus/src/app/background/tasks/fm_sitrep_load.rs
+++ b/nexus/src/app/background/tasks/fm_sitrep_load.rs
@@ -23,7 +23,7 @@ pub struct SitrepLoader {
     tx: watch::Sender<CurrentSitrep>,
 }
 
-type CurrentSitrep = Option<Arc<(SitrepVersion, Sitrep)>>;
+pub type CurrentSitrep = Option<Arc<(SitrepVersion, Sitrep)>>;
 
 impl BackgroundTask for SitrepLoader {
     fn activate<'a>(
@@ -224,9 +224,10 @@ mod test {
                 comment: "test sitrep 1".to_string(),
                 time_created: Utc::now(),
             },
+            cases: Default::default(),
         };
         datastore
-            .fm_sitrep_insert(&opctx, &sitrep1)
+            .fm_sitrep_insert(&opctx, sitrep1.clone())
             .await
             .expect("sitrep should be inserted successfully");
@@ -288,9 +289,10 @@ mod test {
                 comment: "test sitrep 2".to_string(),
                 time_created: Utc::now(),
             },
+            cases: Default::default(),
         };
         datastore
-            .fm_sitrep_insert(&opctx, &sitrep2)
+            .fm_sitrep_insert(&opctx, sitrep2.clone())
             .await
             .expect("sitrep2 should be inserted successfully");
diff --git a/nexus/src/app/background/tasks/mod.rs b/nexus/src/app/background/tasks/mod.rs
index 64df7770da1..36fbf1b9ca0 100644
--- a/nexus/src/app/background/tasks/mod.rs
+++ b/nexus/src/app/background/tasks/mod.rs
@@ -18,6 +18,7 @@ pub mod dns_propagation;
 pub mod dns_servers;
 pub mod ereport_ingester;
 pub mod external_endpoints;
+pub mod fm_rendezvous;
 pub mod fm_sitrep_gc;
 pub mod fm_sitrep_load;
 pub mod instance_reincarnation;
diff --git a/nexus/types/src/fm.rs b/nexus/types/src/fm.rs
index 3f90379388c..bedc71b301a 100644
--- a/nexus/types/src/fm.rs
+++ b/nexus/types/src/fm.rs
@@ -8,11 +8,19 @@
 //! structure containing fault management state.
 
 pub mod ereport;
-pub use ereport::Ereport;
+pub use ereport::{Ereport, EreportId};
+
+mod alert;
+pub use alert::*;
+
+pub mod case;
+pub use case::Case;
 
 use chrono::{DateTime, Utc};
-use omicron_uuid_kinds::{CollectionUuid, OmicronZoneUuid, SitrepUuid};
-use schemars::JsonSchema;
+use iddqd::IdOrdMap;
+use omicron_uuid_kinds::{
+    CaseUuid, CollectionUuid, OmicronZoneUuid, SitrepUuid,
+};
 use serde::{Deserialize, Serialize};
 
 /// A fault management situation report, or _sitrep_.
@@ -30,12 +38,13 @@ use serde::{Deserialize, Serialize};
 /// The sitrep, how it is represented in the database, and how the fault
 /// management subsystem creates and interacts with sitreps, is described in
 /// detail in [RFD 603](https://rfd.shared.oxide.computer/rfd/0603).
-#[derive(Clone, Debug, Eq, PartialEq, JsonSchema, Deserialize, Serialize)]
+#[derive(Clone, Debug, Eq, PartialEq, Deserialize, Serialize)]
 pub struct Sitrep {
     /// Metadata describing this sitrep, when it was created, its parent sitrep
     /// ID, and which Nexus produced it.
     pub metadata: SitrepMetadata,
-    // TODO(eliza): draw the rest of the sitrep
+    pub cases: IdOrdMap<Case>,
+    // pub cases_by_sp:
 }
 
 impl Sitrep {
@@ -46,12 +55,26 @@ impl Sitrep {
     pub fn parent_id(&self) -> Option<SitrepUuid> {
         self.metadata.parent_sitrep_id
     }
+
+    pub fn open_cases(&self) -> impl Iterator<Item = &Case> + '_ {
+        self.cases.iter().filter(|c| c.is_open())
+    }
+
+    /// Iterate over all alerts requested by cases in this sitrep.
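+    ///
+    /// Yields each request paired with the ID of the case that requested it.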
+    pub fn alerts_requested(
+        &self,
+    ) -> impl Iterator<Item = (CaseUuid, &AlertRequest)> + '_ {
+        self.cases.iter().flat_map(|case| {
+            let case_id = case.id;
+            case.alerts_requested.iter().map(move |alert| (case_id, alert))
+        })
+    }
 }
 
 /// Metadata describing a sitrep.
 ///
 /// This corresponds to the records stored in the `fm_sitrep` database table.
-#[derive(Clone, Debug, Eq, PartialEq, JsonSchema, Deserialize, Serialize)]
+#[derive(Clone, Debug, Eq, PartialEq, Deserialize, Serialize)]
 pub struct SitrepMetadata {
     /// The ID of this sitrep.
     pub id: SitrepUuid,
@@ -91,9 +114,26 @@ pub struct SitrepMetadata {
 }
 
 /// An entry in the sitrep version history.
-#[derive(Clone, Debug, Eq, PartialEq, JsonSchema, Deserialize, Serialize)]
+#[derive(Clone, Debug, Eq, PartialEq, Deserialize, Serialize)]
 pub struct SitrepVersion {
     pub id: SitrepUuid,
     pub version: u32,
     pub time_made_current: DateTime<Utc>,
 }
+
+#[derive(
+    Copy,
+    Clone,
+    Debug,
+    PartialEq,
+    Eq,
+    Hash,
+    serde::Serialize,
+    serde::Deserialize,
+    strum::Display,
+)]
+#[serde(rename_all = "snake_case")]
+#[strum(serialize_all = "snake_case")]
+pub enum DiagnosisEngineKind {
+    PowerShelf,
+}
diff --git a/nexus/types/src/fm/alert.rs b/nexus/types/src/fm/alert.rs
new file mode 100644
index 00000000000..05ad85073ee
--- /dev/null
+++ b/nexus/types/src/fm/alert.rs
@@ -0,0 +1,30 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at https://mozilla.org/MPL/2.0/.
+
+use omicron_uuid_kinds::AlertUuid;
+use omicron_uuid_kinds::SitrepUuid;
+use serde::{Deserialize, Serialize};
+
+#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq)]
+pub struct AlertRequest {
+    pub id: AlertUuid,
+    pub class: AlertClass,
+    pub payload: serde_json::Value,
+    pub requested_sitrep_id: SitrepUuid,
+}
+
+impl iddqd::IdOrdItem for AlertRequest {
+    type Key<'a> = &'a AlertUuid;
+    fn key(&self) -> Self::Key<'_> {
+        &self.id
+    }
+
+    iddqd::id_upcast!();
+}
+
+#[derive(Copy, Clone, Debug, Serialize, Deserialize, PartialEq, Eq)]
+pub enum AlertClass {
+    PsuInserted,
+    PsuRemoved,
+}
diff --git a/nexus/types/src/fm/case.rs b/nexus/types/src/fm/case.rs
new file mode 100644
index 00000000000..f961089bc82
--- /dev/null
+++ b/nexus/types/src/fm/case.rs
@@ -0,0 +1,361 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at https://mozilla.org/MPL/2.0/.
+ +use crate::fm::AlertRequest; +use crate::fm::DiagnosisEngineKind; +use crate::fm::Ereport; +use crate::inventory::SpType; +use chrono::{DateTime, Utc}; +use iddqd::{IdOrdItem, IdOrdMap}; +use omicron_uuid_kinds::{CaseUuid, SitrepUuid}; +use serde::{Deserialize, Serialize}; +use std::fmt; +use std::sync::Arc; + +#[derive(Clone, Debug, Eq, PartialEq, Deserialize, Serialize)] +pub struct Case { + pub id: CaseUuid, + pub created_sitrep_id: SitrepUuid, + pub time_created: DateTime, + + pub closed_sitrep_id: Option, + pub time_closed: Option>, + + pub de: DiagnosisEngineKind, + + pub ereports: IdOrdMap, + pub alerts_requested: IdOrdMap, + pub impacted_locations: IdOrdMap, + + pub comment: String, +} + +impl Case { + pub fn is_open(&self) -> bool { + self.time_closed.is_none() + } + + pub fn display_indented( + &self, + indent: usize, + sitrep_id: Option, + ) -> impl fmt::Display + '_ { + DisplayCase { case: self, indent, sitrep_id } + } +} + +impl fmt::Display for Case { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + self.display_indented(0, None).fmt(f) + } +} + +impl IdOrdItem for Case { + type Key<'a> = &'a CaseUuid; + fn key(&self) -> Self::Key<'_> { + &self.id + } + + iddqd::id_upcast!(); +} + +#[derive(Clone, Debug, Eq, PartialEq, Deserialize, Serialize)] +pub struct CaseEreport { + pub ereport: Arc, + pub assigned_sitrep_id: SitrepUuid, + pub comment: String, +} + +impl IdOrdItem for CaseEreport { + type Key<'a> = as IdOrdItem>::Key<'a>; + fn key(&self) -> Self::Key<'_> { + self.ereport.key() + } + + iddqd::id_upcast!(); +} + +#[derive(Clone, Debug, Eq, PartialEq, Deserialize, Serialize)] +pub struct ImpactedLocation { + pub sp_type: SpType, + pub slot: u16, + pub created_sitrep_id: SitrepUuid, + pub comment: String, +} + +impl IdOrdItem for ImpactedLocation { + type Key<'a> = (SpType, u16); + fn key(&self) -> Self::Key<'_> { + (self.sp_type, self.slot) + } + + iddqd::id_upcast!(); +} + +struct DisplayCase<'a> { + case: &'a Case, + indent: usize, + sitrep_id: Option, +} + +impl fmt::Display for DisplayCase<'_> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + const BULLET: &str = "* "; + const LIST_INDENT: usize = 4; + + let &Self { + case: + Case { + id, + created_sitrep_id, + time_created, + closed_sitrep_id, + time_closed, + de, + ereports, + alerts_requested, + impacted_locations, + comment, + }, + indent, + sitrep_id, + } = self; + + let this_sitrep = move |s| { + if Some(s) == sitrep_id { " <-- this sitrep" } else { "" } + }; + + writeln!( + f, + "{:>indent$}case {id}", + if indent > 0 { BULLET } else { "" } + )?; + writeln!( + f, + "{:>indent$}-----------------------------------------", + "" + )?; + writeln!(f, "{:>indent$}diagnosis engine: {de}", "")?; + writeln!( + f, + "{:>indent$}created in sitrep: {created_sitrep_id}{}", + "", + this_sitrep(*created_sitrep_id) + )?; + writeln!(f, "{:>indent$} at: {time_created}", "")?; + if let Some(closed_id) = closed_sitrep_id { + writeln!( + f, + "{:>indent$}closed in sitrep: {closed_id}{}", + "", + this_sitrep(*closed_id) + )?; + if let Some(time_closed) = time_closed { + writeln!(f, "{:>indent$} at: {time_closed}", "")?; + } else { + writeln!(f, "{:>indent$} at: ", "")?; + } + } + + writeln!(f, "\n{:>indent$}comment: {comment}", "")?; + + if !ereports.is_empty() { + writeln!(f, "\n{:>indent$}ereports:\n", "")?; + let indent = indent + LIST_INDENT; + for CaseEreport { ereport, assigned_sitrep_id, comment } in ereports + { + let pn = + ereport.part_number.as_deref().unwrap_or(""); + let sn = ereport + 
.serial_number + .as_deref() + .unwrap_or(""); + writeln!(f, "{BULLET:>indent$}ereport {}", ereport.id())?; + writeln!( + f, + "{:>indent$}class: {}", + "", + ereport.class.as_deref().unwrap_or("") + )?; + writeln!(f, "{:>indent$}reported by:", "")?; + + writeln!(f, "{:>indent$} location: {}", "", ereport.reporter)?; + writeln!(f, "{:>indent$} identity: {pn}:{sn}", "")?; + writeln!( + f, + "{:>indent$}added in sitrep: {assigned_sitrep_id}{}", + "", + this_sitrep(*assigned_sitrep_id) + )?; + writeln!(f, "{:>indent$}comment: {comment}\n", "")?; + } + } + + if !impacted_locations.is_empty() { + writeln!(f, "\n{:>indent$}locations impacted:\n", "")?; + let indent = indent + LIST_INDENT; + for ImpactedLocation { + sp_type, + slot, + created_sitrep_id, + comment, + } in impacted_locations + { + writeln!(f, "{BULLET:>indent$}{sp_type:<6} {slot}")?; + writeln!( + f, + "{:>indent$}added in sitrep: {created_sitrep_id}{}", + "", + this_sitrep(*created_sitrep_id) + )?; + writeln!(f, "{:>indent$}comment: {comment}\n", "")?; + } + } + + if !alerts_requested.is_empty() { + writeln!(f, "{:>indent$}alerts requested:\n", "")?; + let indent = indent + LIST_INDENT; + for AlertRequest { id, class, requested_sitrep_id, .. } in + alerts_requested + { + writeln!(f, "{BULLET:>indent$}alert {id}")?; + writeln!(f, "{:>indent$}class: {class:?}", "")?; + writeln!( + f, + "{:>indent$}requested in sitrep: {requested_sitrep_id}{}\n", + "", + this_sitrep(*requested_sitrep_id) + )?; + } + } + + writeln!(f)?; + + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::fm::{AlertClass, AlertRequest, DiagnosisEngineKind}; + use chrono::Utc; + use ereport_types::{Ena, EreportId}; + use omicron_uuid_kinds::{ + AlertUuid, CaseUuid, EreporterRestartUuid, OmicronZoneUuid, SitrepUuid, + }; + use std::sync::Arc; + + #[test] + fn test_case_display() { + // Create UUIDs for the case + let case_id = CaseUuid::new_v4(); + let created_sitrep_id = SitrepUuid::new_v4(); + let closed_sitrep_id = SitrepUuid::new_v4(); + let time_created = Utc::now(); + let time_closed = Utc::now(); + + // Create some ereports + let mut ereports = IdOrdMap::new(); + + let ereport1 = CaseEreport { + ereport: Arc::new(Ereport { + data: crate::fm::ereport::EreportData { + id: EreportId { + restart_id: EreporterRestartUuid::new_v4(), + ena: Ena::from(2u64), + }, + time_collected: time_created, + collector_id: OmicronZoneUuid::new_v4(), + serial_number: Some("BRM6900420".to_string()), + part_number: Some("913-0000037".to_string()), + class: Some("hw.pwr.remove.psu".to_string()), + report: serde_json::json!({}), + }, + reporter: crate::fm::ereport::Reporter::Sp { + sp_type: SpType::Power, + slot: 0, + }, + }), + assigned_sitrep_id: created_sitrep_id, + comment: "PSU removed".to_string(), + }; + ereports.insert_unique(ereport1).unwrap(); + + let ereport2 = CaseEreport { + ereport: Arc::new(Ereport { + data: crate::fm::ereport::EreportData { + id: EreportId { + restart_id: EreporterRestartUuid::new_v4(), + ena: Ena::from(3u64), + }, + time_collected: time_created, + collector_id: OmicronZoneUuid::new_v4(), + serial_number: Some("BRM6900420".to_string()), + part_number: Some("913-0000037".to_string()), + class: Some("hw.pwr.insert.psu".to_string()), + report: serde_json::json!({"link": "eth0", "status": "down"}), + }, + reporter: crate::fm::ereport::Reporter::Sp { + sp_type: SpType::Power, + slot: 0, + }, + }), + assigned_sitrep_id: closed_sitrep_id, + comment: "PSU inserted, closing this case".to_string(), + }; + 
ereports.insert_unique(ereport2).unwrap(); + + // Create some alerts + let mut alerts_requested = IdOrdMap::new(); + + let alert1 = AlertRequest { + id: AlertUuid::new_v4(), + class: AlertClass::PsuRemoved, + payload: serde_json::json!({}), + requested_sitrep_id: created_sitrep_id, + }; + alerts_requested.insert_unique(alert1).unwrap(); + + let alert2 = AlertRequest { + id: AlertUuid::new_v4(), + class: AlertClass::PsuInserted, + payload: serde_json::json!({}), + requested_sitrep_id: closed_sitrep_id, + }; + alerts_requested.insert_unique(alert2).unwrap(); + + let mut impacted_sp_slots = IdOrdMap::new(); + let slot2 = ImpactedLocation { + sp_type: SpType::Power, + slot: 0, + created_sitrep_id, + comment: "Power shelf 0 reduced redundancy".to_string(), + }; + impacted_sp_slots.insert_unique(slot2).unwrap(); + + // Create the case + let case = Case { + id: case_id, + created_sitrep_id, + time_created, + closed_sitrep_id: Some(closed_sitrep_id), + time_closed: Some(time_closed), + de: DiagnosisEngineKind::PowerShelf, + ereports, + alerts_requested, + impacted_locations: impacted_sp_slots, + comment: "Power shelf rectifier added and removed here :-)" + .to_string(), + }; + + eprintln!("example case display:"); + eprintln!("=====================\n"); + eprintln!("{case}"); + + eprintln!("example case display (indented by 4):"); + eprintln!("======================================\n"); + eprintln!("{}", case.display_indented(4, Some(closed_sitrep_id))); + } +} diff --git a/nexus/types/src/fm/ereport.rs b/nexus/types/src/fm/ereport.rs index dd840550c3d..1ad263af15f 100644 --- a/nexus/types/src/fm/ereport.rs +++ b/nexus/types/src/fm/ereport.rs @@ -23,6 +23,28 @@ pub struct Ereport { pub reporter: Reporter, } +impl Ereport { + pub fn id(&self) -> &EreportId { + &self.data.id + } +} + +impl core::ops::Deref for Ereport { + type Target = EreportData; + fn deref(&self) -> &Self::Target { + &self.data + } +} + +impl iddqd::IdOrdItem for Ereport { + type Key<'a> = &'a EreportId; + fn key(&self) -> Self::Key<'_> { + self.id() + } + + iddqd::id_upcast!(); +} + #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] pub struct EreportData { #[serde(flatten)] @@ -123,7 +145,16 @@ impl EreportData { /// Describes the source of an ereport. 
#[derive( - Copy, Clone, Debug, Eq, PartialEq, Ord, PartialOrd, Serialize, Deserialize, + Copy, + Clone, + Debug, + Eq, + PartialEq, + Ord, + PartialOrd, + Serialize, + Deserialize, + Hash, )] #[serde(tag = "reporter")] pub enum Reporter { @@ -133,18 +164,17 @@ pub enum Reporter { impl fmt::Display for Reporter { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + // Display format based on: + // https://rfd.shared.oxide.computer/rfd/200#_labeling match self { - Self::Sp { sp_type: SpType::Sled, slot } => { - write!(f, "Sled (SP) {slot:02}") - } - Self::Sp { sp_type: SpType::Switch, slot } => { - write!(f, "Switch {slot}") - } - Self::Sp { sp_type: SpType::Power, slot } => { - write!(f, "PSC {slot}") + Self::Sp { sp_type: sp_type @ SpType::Sled, slot } => { + write!(f, "{sp_type} {slot:<2} (SP)") } Self::HostOs { sled } => { - write!(f, "Sled (OS) {sled:?}") + write!(f, "{} {sled:?} (OS)", SpType::Sled) + } + Self::Sp { sp_type, slot } => { + write!(f, "{sp_type} {slot}") } } } diff --git a/nexus/types/src/internal_api/background.rs b/nexus/types/src/internal_api/background.rs index 42264d0411b..daaf00eeb03 100644 --- a/nexus/types/src/internal_api/background.rs +++ b/nexus/types/src/internal_api/background.rs @@ -15,6 +15,7 @@ use omicron_uuid_kinds::AlertReceiverUuid; use omicron_uuid_kinds::AlertUuid; use omicron_uuid_kinds::BlueprintUuid; use omicron_uuid_kinds::CollectionUuid; +use omicron_uuid_kinds::SitrepUuid; use omicron_uuid_kinds::SledUuid; use omicron_uuid_kinds::SupportBundleUuid; use omicron_uuid_kinds::TufRepoUuid; @@ -886,6 +887,25 @@ pub struct SitrepGcStatus { pub errors: Vec, } +/// The status of a `fm_sitrep_execution` background task activation. +#[derive(Clone, Debug, Deserialize, Serialize, PartialEq, Eq)] +pub enum SitrepExecutionStatus { + NoSitrep, + Executed { sitrep_id: SitrepUuid, alerts: SitrepAlertRequestStatus }, +} + +#[derive(Clone, Debug, Default, Deserialize, Serialize, PartialEq, Eq)] +pub struct SitrepAlertRequestStatus { + /// The total number of alerts requested by the current sitrep. + pub total_alerts_requested: usize, + /// The total number of alerts which were *first* requested in the current sitrep. + pub current_sitrep_alerts_requested: usize, + /// The number of alerts created by this activation. + pub alerts_created: usize, + /// Errors that occurred during this activation. + pub errors: Vec, +} + #[derive(Debug, Deserialize, Serialize)] pub struct ProbeError { /// ID of the sled we failed to send a probe to. diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 69d43b7e3be..61e4e656e9c 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -6842,6 +6842,100 @@ CREATE UNIQUE INDEX IF NOT EXISTS lookup_sitrep_version_by_id ON omicron.public.fm_sitrep_history (sitrep_id); +CREATE TYPE IF NOT EXISTS omicron.public.diagnosis_engine AS ENUM ( + 'power_shelf' +); + +CREATE TABLE IF NOT EXISTS omicron.public.fm_case ( + -- Case UUID + id UUID NOT NULL, + -- UUID of the sitrep in which the case had this state. + sitrep_id UUID NOT NULL, + + de omicron.public.diagnosis_engine NOT NULL, + + time_created TIMESTAMPTZ NOT NULL, + -- UUID of the sitrep in which the case was created. + created_sitrep_id UUID NOT NULL, + + -- Time when the case was closed (if not null). + time_closed TIMESTAMPTZ, + -- UUID of the sitrep in which the case was closed. 
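+    -- (NULL while the case is still open; see the closed_case_validity
+    -- constraint below.)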
+ closed_sitrep_id UUID, + + comment TEXT NOT NULL, + + CONSTRAINT closed_case_validity CHECK ( + (closed_sitrep_id IS NULL AND time_closed IS NULL) OR + (closed_sitrep_id IS NOT NULL AND time_closed IS NOT NULL) + ), + + PRIMARY KEY (sitrep_id, id) +); + +CREATE INDEX IF NOT EXISTS + lookup_fm_cases_for_sitrep +ON omicron.public.fm_case (sitrep_id); + +CREATE TABLE IF NOT EXISTS omicron.public.fm_ereport_in_case ( + -- The ereport's identity. + restart_id UUID NOT NULL, + ena INT8 NOT NULL, + + -- UUID of the case the ereport is assigned to. + case_id UUID NOT NULL, + + -- UUID of the sitrep in which this assignment exists. + sitrep_id UUID NOT NULL, + -- UUID of the sitrep in which the ereport was initially assigned to this + -- case. + assigned_sitrep_id UUID NOT NULL, + + comment TEXT NOT NULL, + + PRIMARY KEY (sitrep_id, restart_id, ena) +); + +CREATE INDEX IF NOT EXISTS + lookup_ereports_assigned_to_fm_case +ON omicron.public.fm_ereport_in_case (sitrep_id, case_id); + +CREATE TABLE IF NOT EXISTS omicron.public.fm_case_impacts_location ( + sitrep_id UUID NOT NULL, + case_id UUID NOT NULL, + -- location of this device according to MGS + sp_type omicron.public.sp_type NOT NULL, + sp_slot INT4 NOT NULL, + + -- ID of the sitrep in which this SP was added to the case. + created_sitrep_id UUID NOT NULL, + comment TEXT NOT NULL, + + PRIMARY KEY (sitrep_id, case_id, sp_type, sp_slot) +); + +CREATE TABLE IF NOT EXISTS omicron.public.fm_alert_request ( + -- Requested alert UUID + id UUID NOT NULL, + -- UUID of the sitrep in which the alert is requested. + sitrep_id UUID NOT NULL, + -- UUID of the sitrep in which the alert request was created. + requested_sitrep_id UUID NOT NULL, + -- UUID of the case to which this alert request belongs. + case_id UUID NOT NULL, + + -- The class of alert that was requested + alert_class omicron.public.alert_class NOT NULL, + -- Actual alert data. The structure of this depends on the alert class. + payload JSONB NOT NULL, + + PRIMARY KEY (sitrep_id, id) +); + +CREATE INDEX IF NOT EXISTS + lookup_fm_alert_requests_for_case +ON omicron.public.fm_alert_request (sitrep_id, case_id); + /* * List of datasets available to be sliced up and passed to VMMs for instance * local storage. diff --git a/smf/nexus/multi-sled/config-partial.toml b/smf/nexus/multi-sled/config-partial.toml index b9b843fbcbd..17bce85e24a 100644 --- a/smf/nexus/multi-sled/config-partial.toml +++ b/smf/nexus/multi-sled/config-partial.toml @@ -107,6 +107,10 @@ fm.sitrep_load_period_secs = 15 # is activated every time the current sitrep changes. Periodic activations are # only necessary to ensure that it always happens eventually. fm.sitrep_gc_period_secs = 600 +# Updating database state to match the current sitrep is triggered whenever a +# new sitrep is loaded, so we need not set the periodic activation interval +# too high. +fm.rendezvous_period_secs = 300 probe_distributor.period_secs = 60 multicast_reconciler.period_secs = 60 # TTL for sled-to-backplane-port mapping cache diff --git a/smf/nexus/single-sled/config-partial.toml b/smf/nexus/single-sled/config-partial.toml index c7e9e5c4317..2ead38929d6 100644 --- a/smf/nexus/single-sled/config-partial.toml +++ b/smf/nexus/single-sled/config-partial.toml @@ -107,6 +107,10 @@ fm.sitrep_load_period_secs = 15 # is activated every time the current sitrep changes. Periodic activations are # only necessary to ensure that it always happens eventually. 
fm.sitrep_gc_period_secs = 600 +# Updating database state to match the current sitrep is triggered whenever a +# new sitrep is loaded, so we need not set the periodic activation interval +# too high. +fm.rendezvous_period_secs = 300 probe_distributor.period_secs = 60 multicast_reconciler.period_secs = 60 # TTL for sled-to-backplane-port mapping cache diff --git a/uuid-kinds/src/lib.rs b/uuid-kinds/src/lib.rs index d0bc04ffe8e..b45e7f71758 100644 --- a/uuid-kinds/src/lib.rs +++ b/uuid-kinds/src/lib.rs @@ -45,6 +45,7 @@ impl_typed_uuid_kinds! { AntiAffinityGroup = {}, Blueprint = {}, BuiltInUser = {}, + Case = {}, Collection = {}, ConsoleSession = {}, Dataset = {},