From 094c381fa6289879ce6e3bf91c9e7d30663febe2 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 9 Jun 2025 10:22:20 -0700 Subject: [PATCH 01/61] start ereport ingestion task --- Cargo.lock | 2 + clients/gateway-client/Cargo.toml | 1 + clients/gateway-client/src/lib.rs | 2 + nexus/Cargo.toml | 1 + .../app/background/tasks/ereport_ingester.rs | 105 ++++++++++++++++++ nexus/src/app/background/tasks/mod.rs | 1 + 6 files changed, 112 insertions(+) create mode 100644 nexus/src/app/background/tasks/ereport_ingester.rs diff --git a/Cargo.lock b/Cargo.lock index 648d96ff290..92982e6f0de 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3556,6 +3556,7 @@ dependencies = [ "base64 0.22.1", "chrono", "daft", + "ereport-types", "gateway-messages", "gateway-types", "omicron-uuid-kinds", @@ -7436,6 +7437,7 @@ dependencies = [ "dns-service-client", "dpd-client", "dropshot", + "ereport-types", "expectorate", "fatfs", "futures", diff --git a/clients/gateway-client/Cargo.toml b/clients/gateway-client/Cargo.toml index 6e74b903a57..d50d2a2be21 100644 --- a/clients/gateway-client/Cargo.toml +++ b/clients/gateway-client/Cargo.toml @@ -11,6 +11,7 @@ workspace = true base64.workspace = true chrono.workspace = true daft.workspace = true +ereport-types.workspace = true gateway-messages.workspace = true gateway-types.workspace = true omicron-uuid-kinds.workspace = true diff --git a/clients/gateway-client/src/lib.rs b/clients/gateway-client/src/lib.rs index df7b88c13b4..f9888c2c204 100644 --- a/clients/gateway-client/src/lib.rs +++ b/clients/gateway-client/src/lib.rs @@ -76,6 +76,8 @@ progenitor::generate_api!( }, replace = { RotSlot = gateway_types::rot::RotSlot, + Ena = ereport_types::Ena, + TypedUuidForEreporterRestartKind = omicron_uuid_kinds::EreporterRestartUuid, TypedUuidForMupdateKind = omicron_uuid_kinds::MupdateUuid, }, ); diff --git a/nexus/Cargo.toml b/nexus/Cargo.toml index 14a01e6e54d..967bc3cedbb 100644 --- a/nexus/Cargo.toml +++ b/nexus/Cargo.toml @@ -31,6 +31,7 @@ 
crucible-pantry-client.workspace = true crucible-common.workspace = true dns-service-client.workspace = true dpd-client.workspace = true +ereport-types.workspace = true mg-admin-client.workspace = true dropshot.workspace = true fatfs.workspace = true diff --git a/nexus/src/app/background/tasks/ereport_ingester.rs b/nexus/src/app/background/tasks/ereport_ingester.rs new file mode 100644 index 00000000000..5f670fdc9d2 --- /dev/null +++ b/nexus/src/app/background/tasks/ereport_ingester.rs @@ -0,0 +1,105 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Background tasks for ereport ingestion + +use crate::app::background::BackgroundTask; +use anyhow::Context; +use futures::future::BoxFuture; +use internal_dns_types::names::ServiceName; +use nexus_db_queries::context::OpContext; +use nexus_db_queries::db::DataStore; +use omicron_common::backoff; +use omicron_uuid_kinds::EreporterRestartUuid; +use omicron_uuid_kinds::OmicronZoneUuid; +use std::sync::Arc; +use uuid::Uuid; + +pub struct SpEreportIngester { + pub datastore: Arc, + pub resolver: internal_dns_resolver::Resolver, + pub nexus_id: OmicronZoneUuid, + pub rack_id: Uuid, +} + +impl BackgroundTask for SpEreportIngester { + fn activate<'a>( + &'a mut self, + opctx: &'a OpContext, + ) -> BoxFuture<'a, serde_json::Value> { + todo!() + } +} + +impl SpEreportIngester { + async fn actually_activate( + &mut self, + opctx: &OpContext, + ) -> anyhow::Result<()> { + // Find MGS clients. + // TODO(eliza): reuse the same client across activations... + let mgs_clients = self + .resolver + .lookup_all_socket_v6(ServiceName::ManagementGatewayService) + .await + .context("looking up MGS addresses")? 
+ .into_iter() + .map(|sockaddr| { + let url = format!("http://{}", sockaddr); + let log = opctx.log.new(o!("gateway_url" => url.clone())); + gateway_client::Client::new(&url, log) + }) + .collect::>(); + + Ok(()) + } +} + +struct Sp { + sp_type: gateway_client::types::SpType, + slot: u32, +} + +struct SpState { + committed_ena: ereport_types::Ena, + restart_id: EreporterRestartUuid, +} + +const LIMIT: std::num::NonZeroU32 = match std::num::NonZeroU32::new(255) { + None => unreachable!(), + Some(l) => l, +}; + +async fn mgs_request( + opctx: &OpContext, + clients: &[gateway_client::Client], + Sp { sp_type, slot }: Sp, + sp_state: Option, +) -> anyhow::Result<()> { + let mut idx = 0; + let restart_id = sp_state + .as_ref() + .map(|state| state.restart_id.clone()) + .unwrap_or(EreporterRestartUuid::nil()); + backoff::retry_notify_ext( + backoff::retry_policy_internal_service(), + || async { + let client = &clients[idx]; + let res = client + .sp_ereports_ingest( + sp_type, + slot, + sp_state.as_ref().map(|state| &state.committed_ena), + LIMIT, + &restart_id, + sp_state.as_ref().map(|state| &state.committed_ena), + ) + .await; + + Ok(()) + }, + |err, count, duration| todo!(), + ) + .await +} diff --git a/nexus/src/app/background/tasks/mod.rs b/nexus/src/app/background/tasks/mod.rs index 48d29f18776..33ad6e0d01a 100644 --- a/nexus/src/app/background/tasks/mod.rs +++ b/nexus/src/app/background/tasks/mod.rs @@ -15,6 +15,7 @@ pub mod decommissioned_disk_cleaner; pub mod dns_config; pub mod dns_propagation; pub mod dns_servers; +pub mod ereport_ingester; pub mod external_endpoints; pub mod instance_reincarnation; pub mod instance_updater; From b824f3831a120f5efbf3778e4c787893a46a37d3 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 2 Jun 2025 11:26:05 -0700 Subject: [PATCH 02/61] start ereport db tables --- Cargo.lock | 2 + ereport/types/Cargo.toml | 1 + ereport/types/src/lib.rs | 12 +++ nexus/db-model/Cargo.toml | 1 + nexus/db-model/src/ereport.rs | 95 
+++++++++++++++++++ nexus/db-model/src/lib.rs | 2 + nexus/db-schema/src/schema.rs | 28 ++++++ .../app/background/tasks/ereport_ingester.rs | 79 ++++++++++----- schema/crdb/dbinit.sql | 32 ++++++- 9 files changed, 228 insertions(+), 24 deletions(-) create mode 100644 nexus/db-model/src/ereport.rs diff --git a/Cargo.lock b/Cargo.lock index 92982e6f0de..d0c9e09548a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3096,6 +3096,7 @@ dependencies = [ "schemars", "serde", "serde_json", + "thiserror 1.0.69", ] [[package]] @@ -6123,6 +6124,7 @@ dependencies = [ "db-macros", "derive-where", "diesel", + "ereport-types", "expectorate", "hex", "ipnetwork", diff --git a/ereport/types/Cargo.toml b/ereport/types/Cargo.toml index 9f6137ab010..94d1f1b92de 100644 --- a/ereport/types/Cargo.toml +++ b/ereport/types/Cargo.toml @@ -5,6 +5,7 @@ edition = "2024" [dependencies] dropshot.workspace = true +thiserror.workspace = true schemars.workspace = true serde.workspace = true serde_json.workspace = true diff --git a/ereport/types/src/lib.rs b/ereport/types/src/lib.rs index ee5d1f93e7f..120a2a8c5b4 100644 --- a/ereport/types/src/lib.rs +++ b/ereport/types/src/lib.rs @@ -59,6 +59,18 @@ impl fmt::LowerHex for Ena { } } +#[derive(Clone, Debug, thiserror::Error)] +#[error("ENA value ({0}) is negative")] +pub struct EnaNegativeError(i64); + +impl TryFrom for Ena { + type Error = EnaNegativeError; + + fn try_from(value: i64) -> Result { + u64::try_from(value).map(Ena).map_err(|_| EnaNegativeError(value)) + } +} + /// Query parameters to request a tranche of ereports from a reporter. 
#[derive(Clone, Debug, Serialize, Deserialize, JsonSchema)] pub struct EreportQuery { diff --git a/nexus/db-model/Cargo.toml b/nexus/db-model/Cargo.toml index 62c42713b37..489c5ceec89 100644 --- a/nexus/db-model/Cargo.toml +++ b/nexus/db-model/Cargo.toml @@ -41,6 +41,7 @@ tokio.workspace = true uuid.workspace = true db-macros.workspace = true +ereport-types.workspace = true omicron-certificates.workspace = true omicron-common.workspace = true nexus-config.workspace = true diff --git a/nexus/db-model/src/ereport.rs b/nexus/db-model/src/ereport.rs new file mode 100644 index 00000000000..4baa52929db --- /dev/null +++ b/nexus/db-model/src/ereport.rs @@ -0,0 +1,95 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +use crate::SpMgsSlot; +use crate::SpType; +use crate::typed_uuid::DbTypedUuid; +use chrono::{DateTime, Utc}; +use diesel::backend::Backend; +use diesel::deserialize::{self, FromSql}; +use diesel::pg::Pg; +use diesel::prelude::*; +use diesel::serialize::{self, ToSql}; +use diesel::sql_types; +use ereport_types::Ena; +use nexus_db_schema::schema::{host_ereport, sp_ereport}; +use omicron_uuid_kinds::{EreporterRestartKind, OmicronZoneKind, SledKind}; +use serde::{Deserialize, Serialize}; +use std::convert::TryFrom; + +#[derive( + Copy, + Clone, + Debug, + Eq, + Ord, + PartialEq, + PartialOrd, + AsExpression, + FromSqlRow, + Serialize, + Deserialize, +)] +#[diesel(sql_type = sql_types::BigInt)] +#[repr(transparent)] +pub struct DbEna(pub Ena); + +NewtypeFrom! { () pub struct DbEna(Ena); } +NewtypeDeref! 
{ () pub struct DbEna(Ena); } + +impl ToSql for DbEna { + fn to_sql<'a>( + &'a self, + out: &mut serialize::Output<'a, '_, Pg>, + ) -> serialize::Result { + >::to_sql( + &i64::try_from(self.0.0)?, + &mut out.reborrow(), + ) + } +} + +impl FromSql for DbEna +where + DB: Backend, + i64: FromSql, +{ + fn from_sql(bytes: DB::RawValue<'_>) -> deserialize::Result { + Ena::try_from(i64::from_sql(bytes)?).map(DbEna).map_err(|e| e.into()) + } +} + +#[derive(Clone, Debug, Queryable, Selectable)] +#[diesel(table_name = sp_ereport)] +pub struct EreportMetadata { + pub restart_id: DbTypedUuid, + pub ena: DbEna, + + pub time_collected: DateTime, + pub collector_id: DbTypedUuid, +} + +#[derive(Clone, Debug, Queryable, Selectable)] +#[diesel(table_name = sp_ereport)] +pub struct SpEreport { + #[diesel(embed)] + pub metadata: EreportMetadata, + + pub sp_type: SpType, + pub sp_slot: SpMgsSlot, + + pub report: serde_json::Value, +} + +#[derive(Clone, Debug, Queryable, Selectable)] +#[diesel(table_name = host_ereport)] +pub struct HostEreport { + #[diesel(embed)] + pub metadata: EreportMetadata, + + pub sled_id: DbTypedUuid, + pub sled_serial: String, + + pub report: serde_json::Value, +} diff --git a/nexus/db-model/src/lib.rs b/nexus/db-model/src/lib.rs index e0a0910f2aa..71557dae6e1 100644 --- a/nexus/db-model/src/lib.rs +++ b/nexus/db-model/src/lib.rs @@ -82,6 +82,7 @@ mod webhook_rx; // However, they must be defined in the same crate as our tables // for join-based marker trait generation. 
mod deployment; +mod ereport; mod ipv4_nat_entry; mod omicron_zone_config; mod quota; @@ -169,6 +170,7 @@ pub use disk::*; pub use disk_state::*; pub use dns::*; pub use downstairs::*; +pub use ereport::*; pub use external_ip::*; pub use generation::*; pub use identity_provider::*; diff --git a/nexus/db-schema/src/schema.rs b/nexus/db-schema/src/schema.rs index d570bfbb2ef..4faf62b48ef 100644 --- a/nexus/db-schema/src/schema.rs +++ b/nexus/db-schema/src/schema.rs @@ -2442,3 +2442,31 @@ allow_tables_to_appear_in_same_query!( webhook_delivery_attempt ); joinable!(webhook_delivery_attempt -> webhook_delivery (delivery_id)); + +table! { + sp_ereport (restart_id, ena) { + restart_id -> Uuid, + ena -> Uuid, + time_collected -> Timestamptz, + collector_id -> Uuid, + + sp_type -> crate::enums::SpTypeEnum, + sp_slot -> Int4, + + report -> Jsonb, + } +} + +table! { + host_ereport (restart_id, ena) { + restart_id -> Uuid, + ena -> Uuid, + time_collected -> Timestamptz, + collector_id -> Uuid, + + sled_id -> Uuid, + sled_serial -> Text, + + report -> Jsonb, + } +} diff --git a/nexus/src/app/background/tasks/ereport_ingester.rs b/nexus/src/app/background/tasks/ereport_ingester.rs index 5f670fdc9d2..a92d789f4d7 100644 --- a/nexus/src/app/background/tasks/ereport_ingester.rs +++ b/nexus/src/app/background/tasks/ereport_ingester.rs @@ -11,8 +11,10 @@ use internal_dns_types::names::ServiceName; use nexus_db_queries::context::OpContext; use nexus_db_queries::db::DataStore; use omicron_common::backoff; +use omicron_common::backoff::BackoffError; use omicron_uuid_kinds::EreporterRestartUuid; use omicron_uuid_kinds::OmicronZoneUuid; +use parallel_task_set::ParallelTaskSet; use std::sync::Arc; use uuid::Uuid; @@ -38,7 +40,7 @@ impl SpEreportIngester { opctx: &OpContext, ) -> anyhow::Result<()> { // Find MGS clients. - // TODO(eliza): reuse the same client across activations... + // TODO(eliza): reuse the same client across activations; qorb, etc. 
let mgs_clients = self .resolver .lookup_all_socket_v6(ServiceName::ManagementGatewayService) @@ -52,6 +54,9 @@ impl SpEreportIngester { }) .collect::>(); + // let mut tasks = ParallelTaskSet::new(); + // for sled in 0..32 {} + Ok(()) } } @@ -71,35 +76,63 @@ const LIMIT: std::num::NonZeroU32 = match std::num::NonZeroU32::new(255) { Some(l) => l, }; +type GatewayClientError = gateway_client::Error; + async fn mgs_request( opctx: &OpContext, - clients: &[gateway_client::Client], + clients: Arc>, Sp { sp_type, slot }: Sp, sp_state: Option, -) -> anyhow::Result<()> { - let mut idx = 0; - let restart_id = sp_state - .as_ref() - .map(|state| state.restart_id.clone()) - .unwrap_or(EreporterRestartUuid::nil()); +) -> Result { + let mut next_idx = 0; + + let (committed_ena, restart_id) = match sp_state { + Some(state) => { + (Some(state.committed_ena.clone()), state.restart_id.clone()) + } + None => (None, EreporterRestartUuid::nil()), + }; + backoff::retry_notify_ext( backoff::retry_policy_internal_service(), - || async { - let client = &clients[idx]; - let res = client - .sp_ereports_ingest( - sp_type, - slot, - sp_state.as_ref().map(|state| &state.committed_ena), - LIMIT, - &restart_id, - sp_state.as_ref().map(|state| &state.committed_ena), - ) - .await; - - Ok(()) + || { + let clients = clients.clone(); + let idx = next_idx; + // Next time, we'll try the other gateway, if this one is sad. 
+ next_idx = (next_idx + 1) % clients.len(); + async move { + let client = &clients[idx]; + let res = client + .sp_ereports_ingest( + sp_type, + slot, + committed_ena.as_ref(), + LIMIT, + &restart_id, + committed_ena.as_ref(), + ) + .await; + match res { + Ok(ereports) => Ok(ereports.into_inner()), + Err(e) + if e.status() + .map(|status| status.is_client_error()) + .unwrap_or(false) => + { + Err(BackoffError::permanent(e)) + } + Err( + e @ gateway_client::Error::InvalidRequest(_) + | e @ gateway_client::Error::PreHookError(_) + | e @ gateway_client::Error::PostHookError(_), + ) => Err(BackoffError::permanent(e)), + Err(e) => return Err(BackoffError::transient(e)), + } + } + }, + |err, count, duration| { + // TODO(eliza): log the error 'n' stuff }, - |err, count, duration| todo!(), ) .await } diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 0195bfd06cc..fe61819d923 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -5636,7 +5636,7 @@ CREATE TABLE IF NOT EXISTS omicron.public.webhook_delivery ( deliverator_id IS NOT NULL AND time_leased IS NOT NULL ) ), - CONSTRAINT time_completed_iff_not_pending CHECK ( + CONSTRAINT time_completed_iff_not_pending CHECK (9 (state = 'pending' AND time_completed IS NULL) OR (state != 'pending' AND time_completed IS NOT NULL) ) @@ -5749,6 +5749,36 @@ ON omicron.public.webhook_delivery_attempt ( rx_id ); +/* + * Ereports + */ + +-- Ereports from service processors +CREATE TABLE IF NOT EXISTS omicron.public.sp_ereport ( + restart_id UUID NOT NULL, + ena INT8 NOT NULL, + + -- time at which the ereport was collected + time_collected TIMESTAMPTZ NOT NULL, + -- UUID of the Nexus instance that collected the ereport + collector_id UUID NOT NULL, + + -- identity of the reporting SP + sp_type omicron.public.sp_type NOT NULL, + sp_slot INT4 NOT NULL, + + -- JSON representation of the ereport + report JSONB NOT NULL, + + PRIMARY KEY (restart_id, ena), +); + +CREATE INDEX IF NOT EXISTS lookup_sp_ereports_by_slot 
ON omicron.public.sp_ereport ( + sp_type, + sp_slot, + time_collected, +); + /* * Keep this at the end of file so that the database does not contain a version * until it is fully populated. From 42fa98de0022868c7c369797af64143b5053b9c7 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 2 Jun 2025 13:57:08 -0700 Subject: [PATCH 03/61] queries --- Cargo.lock | 1 + ereport/types/src/lib.rs | 15 +- nexus/db-model/src/ereport.rs | 40 +++- nexus/db-queries/Cargo.toml | 1 + nexus/db-queries/src/db/datastore/ereport.rs | 177 ++++++++++++++++++ nexus/db-queries/src/db/datastore/mod.rs | 1 + .../tests/output/host_latest_ereport_id.sql | 10 + .../tests/output/sp_latest_ereport_id.sql | 10 + nexus/db-schema/src/schema.rs | 4 +- schema/crdb/dbinit.sql | 31 ++- 10 files changed, 277 insertions(+), 13 deletions(-) create mode 100644 nexus/db-queries/src/db/datastore/ereport.rs create mode 100644 nexus/db-queries/tests/output/host_latest_ereport_id.sql create mode 100644 nexus/db-queries/tests/output/sp_latest_ereport_id.sql diff --git a/Cargo.lock b/Cargo.lock index d0c9e09548a..6cd09f35d40 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6180,6 +6180,7 @@ dependencies = [ "diesel", "diesel-dtrace", "dropshot", + "ereport-types", "expectorate", "futures", "gateway-client", diff --git a/ereport/types/src/lib.rs b/ereport/types/src/lib.rs index 120a2a8c5b4..8b244a9063c 100644 --- a/ereport/types/src/lib.rs +++ b/ereport/types/src/lib.rs @@ -37,7 +37,7 @@ pub struct Ena(pub u64); impl fmt::Display for Ena { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "{:#x}", self.0) + write!(f, "{:x}", self.0) } } @@ -71,6 +71,19 @@ impl TryFrom for Ena { } } +/// Unique identifier for an ereport. 
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +pub struct EreportId { + pub restart_id: EreporterRestartUuid, + pub ena: Ena, +} + +impl fmt::Display for EreportId { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{:?}:{:x}", self.restart_id, self.ena) + } +} + /// Query parameters to request a tranche of ereports from a reporter. #[derive(Clone, Debug, Serialize, Deserialize, JsonSchema)] pub struct EreportQuery { diff --git a/nexus/db-model/src/ereport.rs b/nexus/db-model/src/ereport.rs index 4baa52929db..2a677dbf6db 100644 --- a/nexus/db-model/src/ereport.rs +++ b/nexus/db-model/src/ereport.rs @@ -12,7 +12,7 @@ use diesel::pg::Pg; use diesel::prelude::*; use diesel::serialize::{self, ToSql}; use diesel::sql_types; -use ereport_types::Ena; +use ereport_types::{Ena, EreportId}; use nexus_db_schema::schema::{host_ereport, sp_ereport}; use omicron_uuid_kinds::{EreporterRestartKind, OmicronZoneKind, SledKind}; use serde::{Deserialize, Serialize}; @@ -60,8 +60,7 @@ where } } -#[derive(Clone, Debug, Queryable, Selectable)] -#[diesel(table_name = sp_ereport)] +#[derive(Clone, Debug)] pub struct EreportMetadata { pub restart_id: DbTypedUuid, pub ena: DbEna, @@ -70,11 +69,35 @@ pub struct EreportMetadata { pub collector_id: DbTypedUuid, } +pub type EreportMetadataTuple = ( + DbTypedUuid, + DbEna, + DateTime, + DbTypedUuid, +); + +impl EreportMetadata { + pub fn id(&self) -> EreportId { + EreportId { restart_id: self.restart_id.into(), ena: self.ena.into() } + } +} + +impl From for EreportMetadata { + fn from( + (restart_id, ena, time_collected, collector_id): EreportMetadataTuple, + ) -> Self { + EreportMetadata { restart_id, ena, time_collected, collector_id } + } +} + #[derive(Clone, Debug, Queryable, Selectable)] #[diesel(table_name = sp_ereport)] pub struct SpEreport { - #[diesel(embed)] - pub metadata: EreportMetadata, + pub restart_id: DbTypedUuid, + pub ena: DbEna, + + pub time_collected: DateTime, + pub 
collector_id: DbTypedUuid, pub sp_type: SpType, pub sp_slot: SpMgsSlot, @@ -85,8 +108,11 @@ pub struct SpEreport { #[derive(Clone, Debug, Queryable, Selectable)] #[diesel(table_name = host_ereport)] pub struct HostEreport { - #[diesel(embed)] - pub metadata: EreportMetadata, + pub restart_id: DbTypedUuid, + pub ena: DbEna, + + pub time_collected: DateTime, + pub collector_id: DbTypedUuid, pub sled_id: DbTypedUuid, pub sled_serial: String, diff --git a/nexus/db-queries/Cargo.toml b/nexus/db-queries/Cargo.toml index 151dbc71f78..50b8b1789c3 100644 --- a/nexus/db-queries/Cargo.toml +++ b/nexus/db-queries/Cargo.toml @@ -56,6 +56,7 @@ url.workspace = true usdt.workspace = true uuid.workspace = true +ereport-types.workspace = true db-macros.workspace = true nexus-auth.workspace = true nexus-config.workspace = true diff --git a/nexus/db-queries/src/db/datastore/ereport.rs b/nexus/db-queries/src/db/datastore/ereport.rs new file mode 100644 index 00000000000..44f1b0476db --- /dev/null +++ b/nexus/db-queries/src/db/datastore/ereport.rs @@ -0,0 +1,177 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! [`DataStore`] methods for ereports. 
+ +use super::DataStore; +use crate::context::OpContext; +use crate::db::datastore::RunnableQuery; +use crate::db::model::DbEna; +use crate::db::model::EreportMetadata; +use crate::db::model::SpMgsSlot; +use crate::db::model::SpType; +use crate::db::model::SqlU16; +use async_bb8_diesel::AsyncRunQueryDsl; +use diesel::prelude::*; +use nexus_db_errors::ErrorHandler; +use nexus_db_errors::public_error_from_diesel; +use nexus_db_schema::schema::host_ereport::dsl as host_dsl; +use nexus_db_schema::schema::sp_ereport::dsl as sp_dsl; +use omicron_common::api::external::Error; +use omicron_uuid_kinds::EreporterRestartUuid; +use omicron_uuid_kinds::GenericUuid; +use omicron_uuid_kinds::SledUuid; +use uuid::Uuid; + +type EreportIdTuple = (Uuid, DbEna); + +impl DataStore { + pub async fn sp_latest_ereport_id( + &self, + opctx: &OpContext, + sp_type: impl Into, + slot: u16, + ) -> Result, Error> { + let sp_type = sp_type.into(); + let slot = SpMgsSlot::from(SqlU16::new(slot)); + let id = Self::sp_latest_ereport_id_query(sp_type, slot) + .get_result_async(&*self.pool_connection_authorized(opctx).await?) + .await + .optional() + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))? + .map(id_from_tuple); + Ok(id) + } + + fn sp_latest_ereport_id_query( + sp_type: SpType, + slot: SpMgsSlot, + ) -> impl RunnableQuery { + sp_dsl::sp_ereport + .filter(sp_dsl::sp_type.eq(sp_type).and(sp_dsl::sp_slot.eq(slot))) + .order_by(sp_dsl::time_collected.desc()) + .limit(1) + .select((sp_dsl::restart_id, sp_dsl::ena)) + } + + pub async fn host_latest_ereport_id( + &self, + opctx: &OpContext, + sled_id: SledUuid, + ) -> Result, Error> { + let id = Self::host_latest_ereport_id_query(sled_id) + .get_result_async(&*self.pool_connection_authorized(opctx).await?) + .await + .optional() + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))? 
+ .map(id_from_tuple); + Ok(id) + } + + fn host_latest_ereport_id_query( + sled_id: SledUuid, + ) -> impl RunnableQuery { + host_dsl::host_ereport + .filter(host_dsl::sled_id.eq(sled_id.into_untyped_uuid())) + .order_by(host_dsl::time_collected.desc()) + .limit(1) + .select((host_dsl::restart_id, host_dsl::ena)) + } +} + +fn id_from_tuple( + (restart_id, DbEna(ena)): EreportIdTuple, +) -> ereport_types::EreportId { + ereport_types::EreportId { + restart_id: EreporterRestartUuid::from_untyped_uuid(restart_id), + ena, + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::db::explain::ExplainableAsync; + use crate::db::pub_test_utils::TestDatabase; + use crate::db::raw_query_builder::expectorate_query_contents; + use omicron_test_utils::dev; + + #[tokio::test] + async fn expectorate_sp_latest_ereport_id() { + let query = DataStore::sp_latest_ereport_id_query( + SpType::Sled, + SpMgsSlot::from(SqlU16::new(1)), + ); + + expectorate_query_contents( + &query, + "tests/output/sp_latest_ereport_id.sql", + ) + .await; + } + + #[tokio::test] + async fn explain_sp_latest_ereport_id() { + let logctx = dev::test_setup_log("explain_sp_latest_ereport_id"); + let db = TestDatabase::new_with_pool(&logctx.log).await; + let pool = db.pool(); + let conn = pool.claim().await.unwrap(); + + let query = DataStore::sp_latest_ereport_id_query( + SpType::Sled, + SpMgsSlot::from(SqlU16::new(1)), + ); + let explanation = query + .explain_async(&conn) + .await + .expect("Failed to explain query - is it valid SQL?"); + + eprintln!("{explanation}"); + + assert!( + !explanation.contains("FULL SCAN"), + "Found an unexpected FULL SCAN: {}", + explanation + ); + + db.terminate().await; + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn expectorate_host_latest_ereport_id() { + let query = DataStore::host_latest_ereport_id_query(SledUuid::nil()); + + expectorate_query_contents( + &query, + "tests/output/host_latest_ereport_id.sql", + ) + .await; + } + + #[tokio::test] + async fn 
explain_host_latest_ereport_id() { + let logctx = dev::test_setup_log("explain_host_latest_ereport_id"); + let db = TestDatabase::new_with_pool(&logctx.log).await; + let pool = db.pool(); + let conn = pool.claim().await.unwrap(); + + let query = DataStore::host_latest_ereport_id_query(SledUuid::nil()); + let explanation = query + .explain_async(&conn) + .await + .expect("Failed to explain query - is it valid SQL?"); + + eprintln!("{explanation}"); + + assert!( + !explanation.contains("FULL SCAN"), + "Found an unexpected FULL SCAN: {}", + explanation + ); + + db.terminate().await; + logctx.cleanup_successful(); + } +} diff --git a/nexus/db-queries/src/db/datastore/mod.rs b/nexus/db-queries/src/db/datastore/mod.rs index 7aa4ae6a00c..33d27e791f3 100644 --- a/nexus/db-queries/src/db/datastore/mod.rs +++ b/nexus/db-queries/src/db/datastore/mod.rs @@ -64,6 +64,7 @@ mod deployment; mod device_auth; mod disk; mod dns; +mod ereport; mod external_ip; mod identity_provider; mod image; diff --git a/nexus/db-queries/tests/output/host_latest_ereport_id.sql b/nexus/db-queries/tests/output/host_latest_ereport_id.sql new file mode 100644 index 00000000000..6cd39952557 --- /dev/null +++ b/nexus/db-queries/tests/output/host_latest_ereport_id.sql @@ -0,0 +1,10 @@ +SELECT + host_ereport.restart_id, host_ereport.ena +FROM + host_ereport +WHERE + host_ereport.sled_id = $1 +ORDER BY + host_ereport.time_collected DESC +LIMIT + $2 diff --git a/nexus/db-queries/tests/output/sp_latest_ereport_id.sql b/nexus/db-queries/tests/output/sp_latest_ereport_id.sql new file mode 100644 index 00000000000..14cf96cc93c --- /dev/null +++ b/nexus/db-queries/tests/output/sp_latest_ereport_id.sql @@ -0,0 +1,10 @@ +SELECT + sp_ereport.restart_id, sp_ereport.ena +FROM + sp_ereport +WHERE + sp_ereport.sp_type = $1 AND sp_ereport.sp_slot = $2 +ORDER BY + sp_ereport.time_collected DESC +LIMIT + $3 diff --git a/nexus/db-schema/src/schema.rs b/nexus/db-schema/src/schema.rs index 4faf62b48ef..dc49bb992f6 100644 --- 
a/nexus/db-schema/src/schema.rs +++ b/nexus/db-schema/src/schema.rs @@ -2446,7 +2446,7 @@ joinable!(webhook_delivery_attempt -> webhook_delivery (delivery_id)); table! { sp_ereport (restart_id, ena) { restart_id -> Uuid, - ena -> Uuid, + ena -> Int8, time_collected -> Timestamptz, collector_id -> Uuid, @@ -2460,7 +2460,7 @@ table! { table! { host_ereport (restart_id, ena) { restart_id -> Uuid, - ena -> Uuid, + ena -> Int8, time_collected -> Timestamptz, collector_id -> Uuid, diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index fe61819d923..8d7aab9a1a5 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -5636,7 +5636,7 @@ CREATE TABLE IF NOT EXISTS omicron.public.webhook_delivery ( deliverator_id IS NOT NULL AND time_leased IS NOT NULL ) ), - CONSTRAINT time_completed_iff_not_pending CHECK (9 + CONSTRAINT time_completed_iff_not_pending CHECK ( (state = 'pending' AND time_completed IS NULL) OR (state != 'pending' AND time_completed IS NOT NULL) ) @@ -5770,13 +5770,38 @@ CREATE TABLE IF NOT EXISTS omicron.public.sp_ereport ( -- JSON representation of the ereport report JSONB NOT NULL, - PRIMARY KEY (restart_id, ena), + PRIMARY KEY (restart_id, ena) ); CREATE INDEX IF NOT EXISTS lookup_sp_ereports_by_slot ON omicron.public.sp_ereport ( sp_type, sp_slot, - time_collected, + time_collected +); + +-- Ereports from the host operating system +CREATE TABLE IF NOT EXISTS omicron.public.host_ereport ( + restart_id UUID NOT NULL, + ena INT8 NOT NULL, + + -- time at which the ereport was collected + time_collected TIMESTAMPTZ NOT NULL, + -- UUID of the Nexus instance that collected the ereport + collector_id UUID NOT NULL, + + -- identity of the reporting sled + sled_id UUID NOT NULL, + sled_serial TEXT NOT NULL, + + -- JSON representation of the ereport + report JSONB NOT NULL, + + PRIMARY KEY (restart_id, ena) +); + +CREATE INDEX IF NOT EXISTS lookup_host_ereports_by_sled ON omicron.public.host_ereport ( + sled_id, + time_collected ); /* From 
cd78fc32e29ad69a5bc1c4f746c8f21cd1dfcde7 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 3 Jun 2025 15:32:14 -0700 Subject: [PATCH 04/61] draw the rest of the owl well, most of it, anyway --- clients/gateway-client/src/lib.rs | 3 +- nexus/background-task-interface/src/init.rs | 1 + nexus/db-model/src/ereport.rs | 4 +- nexus/db-model/src/inventory.rs | 6 + nexus/db-queries/src/db/datastore/ereport.rs | 58 ++- nexus/src/app/background/init.rs | 3 + .../app/background/tasks/ereport_ingester.rs | 355 ++++++++++++++---- nexus/types/src/internal_api/background.rs | 25 ++ 8 files changed, 370 insertions(+), 85 deletions(-) diff --git a/clients/gateway-client/src/lib.rs b/clients/gateway-client/src/lib.rs index f9888c2c204..c980ff0dfe6 100644 --- a/clients/gateway-client/src/lib.rs +++ b/clients/gateway-client/src/lib.rs @@ -70,13 +70,14 @@ progenitor::generate_api!( SpIgnition = { derives = [PartialEq, Eq, PartialOrd, Ord] }, SpIgnitionSystemType = { derives = [Copy, PartialEq, Eq, PartialOrd, Ord] }, SpState = { derives = [PartialEq, Eq, PartialOrd, Ord] }, - SpType = { derives = [daft::Diffable] }, + SpType = { derives = [daft::Diffable, PartialEq, Eq, PartialOrd, Ord] }, SpUpdateStatus = { derives = [PartialEq, Hash, Eq] }, UpdatePreparationProgress = { derives = [PartialEq, Hash, Eq] }, }, replace = { RotSlot = gateway_types::rot::RotSlot, Ena = ereport_types::Ena, + Ereport = ereport_types::Ereport, TypedUuidForEreporterRestartKind = omicron_uuid_kinds::EreporterRestartUuid, TypedUuidForMupdateKind = omicron_uuid_kinds::MupdateUuid, }, diff --git a/nexus/background-task-interface/src/init.rs b/nexus/background-task-interface/src/init.rs index 82099f679ad..7312364247d 100644 --- a/nexus/background-task-interface/src/init.rs +++ b/nexus/background-task-interface/src/init.rs @@ -46,6 +46,7 @@ pub struct BackgroundTasks { pub task_read_only_region_replacement_start: Activator, pub task_alert_dispatcher: Activator, pub task_webhook_deliverator: Activator, + pub 
task_sp_ereport_ingester: Activator, // Handles to activate background tasks that do not get used by Nexus // at-large. These background tasks are implementation details as far as diff --git a/nexus/db-model/src/ereport.rs b/nexus/db-model/src/ereport.rs index 2a677dbf6db..94ed1d1c97d 100644 --- a/nexus/db-model/src/ereport.rs +++ b/nexus/db-model/src/ereport.rs @@ -90,7 +90,7 @@ impl From for EreportMetadata { } } -#[derive(Clone, Debug, Queryable, Selectable)] +#[derive(Clone, Debug, Insertable, Queryable, Selectable)] #[diesel(table_name = sp_ereport)] pub struct SpEreport { pub restart_id: DbTypedUuid, @@ -105,7 +105,7 @@ pub struct SpEreport { pub report: serde_json::Value, } -#[derive(Clone, Debug, Queryable, Selectable)] +#[derive(Clone, Debug, Insertable, Queryable, Selectable)] #[diesel(table_name = host_ereport)] pub struct HostEreport { pub restart_id: DbTypedUuid, diff --git a/nexus/db-model/src/inventory.rs b/nexus/db-model/src/inventory.rs index 6461af1e2cf..cccadf05c18 100644 --- a/nexus/db-model/src/inventory.rs +++ b/nexus/db-model/src/inventory.rs @@ -631,6 +631,12 @@ where } } +impl From for SpMgsSlot { + fn from(slot: u16) -> Self { + Self(SqlU16::new(slot)) + } +} + /// Newtype wrapping the revision number for a particular baseboard /// /// MGS reports this as a u32 and we represent it the same way, though that diff --git a/nexus/db-queries/src/db/datastore/ereport.rs b/nexus/db-queries/src/db/datastore/ereport.rs index 44f1b0476db..13606878afa 100644 --- a/nexus/db-queries/src/db/datastore/ereport.rs +++ b/nexus/db-queries/src/db/datastore/ereport.rs @@ -5,10 +5,11 @@ //! [`DataStore`] methods for ereports. 
use super::DataStore; +use crate::authz; use crate::context::OpContext; use crate::db::datastore::RunnableQuery; use crate::db::model::DbEna; -use crate::db::model::EreportMetadata; +use crate::db::model::SpEreport; use crate::db::model::SpMgsSlot; use crate::db::model::SpType; use crate::db::model::SqlU16; @@ -16,8 +17,10 @@ use async_bb8_diesel::AsyncRunQueryDsl; use diesel::prelude::*; use nexus_db_errors::ErrorHandler; use nexus_db_errors::public_error_from_diesel; +use nexus_db_lookup::DbConnection; use nexus_db_schema::schema::host_ereport::dsl as host_dsl; use nexus_db_schema::schema::sp_ereport::dsl as sp_dsl; +use omicron_common::api::external::CreateResult; use omicron_common::api::external::Error; use omicron_uuid_kinds::EreporterRestartUuid; use omicron_uuid_kinds::GenericUuid; @@ -32,11 +35,26 @@ impl DataStore { opctx: &OpContext, sp_type: impl Into, slot: u16, + ) -> Result, Error> { + opctx.authorize(authz::Action::ListChildren, &authz::FLEET).await?; + self.sp_latest_ereport_id_on_conn( + &*self.pool_connection_authorized(opctx).await?, + sp_type, + slot, + ) + .await + } + + async fn sp_latest_ereport_id_on_conn( + &self, + conn: &async_bb8_diesel::Connection, + sp_type: impl Into, + slot: u16, ) -> Result, Error> { let sp_type = sp_type.into(); let slot = SpMgsSlot::from(SqlU16::new(slot)); let id = Self::sp_latest_ereport_id_query(sp_type, slot) - .get_result_async(&*self.pool_connection_authorized(opctx).await?) + .get_result_async(conn) .await .optional() .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))? 
@@ -50,7 +68,7 @@ impl DataStore { ) -> impl RunnableQuery { sp_dsl::sp_ereport .filter(sp_dsl::sp_type.eq(sp_type).and(sp_dsl::sp_slot.eq(slot))) - .order_by(sp_dsl::time_collected.desc()) + .order_by((sp_dsl::time_collected.desc(), sp_dsl::ena.desc())) .limit(1) .select((sp_dsl::restart_id, sp_dsl::ena)) } @@ -60,6 +78,7 @@ impl DataStore { opctx: &OpContext, sled_id: SledUuid, ) -> Result, Error> { + opctx.authorize(authz::Action::ListChildren, &authz::FLEET).await?; let id = Self::host_latest_ereport_id_query(sled_id) .get_result_async(&*self.pool_connection_authorized(opctx).await?) .await @@ -74,10 +93,41 @@ impl DataStore { ) -> impl RunnableQuery { host_dsl::host_ereport .filter(host_dsl::sled_id.eq(sled_id.into_untyped_uuid())) - .order_by(host_dsl::time_collected.desc()) + .order_by((host_dsl::time_collected.desc(), host_dsl::ena.desc())) .limit(1) .select((host_dsl::restart_id, host_dsl::ena)) } + + pub async fn sp_ereports_insert( + &self, + opctx: &OpContext, + sp_type: impl Into, + slot: u16, + ereports: Vec, + ) -> CreateResult<(usize, Option)> { + opctx.authorize(authz::Action::CreateChild, &authz::FLEET).await?; + let conn = self.pool_connection_authorized(opctx).await?; + let created = diesel::insert_into(sp_dsl::sp_ereport) + .values(ereports) + .on_conflict((sp_dsl::restart_id, sp_dsl::ena)) + .do_nothing() + .execute_async(&*conn) + .await + .map_err(|e| { + public_error_from_diesel(e, ErrorHandler::Server) + .internal_context("failed to insert ereports") + })?; + let sp_type = sp_type.into(); + let latest = self + .sp_latest_ereport_id_on_conn(&*conn, sp_type, slot) + .await + .map_err(|e| { + e.internal_context(format!( + "failed to refresh latest ereport ID for {sp_type:?} {slot}" + )) + })?; + Ok((created, latest)) + } } fn id_from_tuple( diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs index a6bb986957a..91eb73ceb3a 100644 --- a/nexus/src/app/background/init.rs +++ b/nexus/src/app/background/init.rs @@ 
-100,6 +100,7 @@ use super::tasks::decommissioned_disk_cleaner; use super::tasks::dns_config; use super::tasks::dns_propagation; use super::tasks::dns_servers; +use super::tasks::ereport_ingester; use super::tasks::external_endpoints; use super::tasks::instance_reincarnation; use super::tasks::instance_updater; @@ -226,6 +227,7 @@ impl BackgroundTasksInitializer { task_read_only_region_replacement_start: Activator::new(), task_alert_dispatcher: Activator::new(), task_webhook_deliverator: Activator::new(), + task_sp_ereport_ingester: Activator::new(), task_internal_dns_propagation: Activator::new(), task_external_dns_propagation: Activator::new(), @@ -300,6 +302,7 @@ impl BackgroundTasksInitializer { task_read_only_region_replacement_start, task_alert_dispatcher, task_webhook_deliverator, + task_sp_ereport_ingester, // Add new background tasks here. Be sure to use this binding in a // call to `Driver::register()` below. That's what actually wires // up the Activator to the corresponding background task. 
diff --git a/nexus/src/app/background/tasks/ereport_ingester.rs b/nexus/src/app/background/tasks/ereport_ingester.rs index a92d789f4d7..5ee99d15a6c 100644 --- a/nexus/src/app/background/tasks/ereport_ingester.rs +++ b/nexus/src/app/background/tasks/ereport_ingester.rs @@ -6,23 +6,34 @@ use crate::app::background::BackgroundTask; use anyhow::Context; +use chrono::Utc; +use ereport_types::Ena; +use ereport_types::Ereport; +use ereport_types::EreportId; use futures::future::BoxFuture; use internal_dns_types::names::ServiceName; use nexus_db_queries::context::OpContext; +use nexus_db_queries::db; use nexus_db_queries::db::DataStore; -use omicron_common::backoff; -use omicron_common::backoff::BackoffError; +use nexus_types::internal_api::background::EreporterStatus; +use nexus_types::internal_api::background::Sp; +use nexus_types::internal_api::background::SpEreportIngesterStatus; use omicron_uuid_kinds::EreporterRestartUuid; use omicron_uuid_kinds::OmicronZoneUuid; use parallel_task_set::ParallelTaskSet; +use std::collections::BTreeMap; +use std::net::SocketAddrV6; use std::sync::Arc; -use uuid::Uuid; pub struct SpEreportIngester { - pub datastore: Arc, - pub resolver: internal_dns_resolver::Resolver, - pub nexus_id: OmicronZoneUuid, - pub rack_id: Uuid, + resolver: internal_dns_resolver::Resolver, + inner: Ingester, +} + +#[derive(Clone)] +struct Ingester { + datastore: Arc, + nexus_id: OmicronZoneUuid, } impl BackgroundTask for SpEreportIngester { @@ -30,15 +41,33 @@ impl BackgroundTask for SpEreportIngester { &'a mut self, opctx: &'a OpContext, ) -> BoxFuture<'a, serde_json::Value> { - todo!() + Box::pin(async move { + let status = + self.actually_activate(opctx).await.unwrap_or_else(|error| { + SpEreportIngesterStatus { + error: Some(error.to_string()), + ..Default::default() + } + }); + serde_json::json!(status) + }) } } impl SpEreportIngester { + #[must_use] + pub fn new( + datastore: Arc, + resolver: internal_dns_resolver::Resolver, + nexus_id: OmicronZoneUuid, 
+ ) -> Self { + Self { resolver, inner: Ingester { datastore, nexus_id } } + } + async fn actually_activate( &mut self, opctx: &OpContext, - ) -> anyhow::Result<()> { + ) -> anyhow::Result { // Find MGS clients. // TODO(eliza): reuse the same client across activations; qorb, etc. let mgs_clients = self @@ -47,28 +76,63 @@ impl SpEreportIngester { .await .context("looking up MGS addresses")? .into_iter() - .map(|sockaddr| { - let url = format!("http://{}", sockaddr); + .map(|addr| { + let url = format!("http://{addr}"); let log = opctx.log.new(o!("gateway_url" => url.clone())); - gateway_client::Client::new(&url, log) + let client = gateway_client::Client::new(&url, log); + GatewayClient { addr, client } }) - .collect::>(); + .collect::>(); - // let mut tasks = ParallelTaskSet::new(); - // for sled in 0..32 {} + // TODO(eliza): what seems like an appropriate parallelism? should we + // just do 16? + let mut tasks = ParallelTaskSet::new(); + let mut status = SpEreportIngesterStatus::default(); - Ok(()) - } -} + let sleds = + (0..32u16).map(|slot| (nexus_types::inventory::SpType::Sled, slot)); + let switches = (0..2u16) + .map(|slot| (nexus_types::inventory::SpType::Switch, slot)); + let pscs = + (0..2u16).map(|slot| (nexus_types::inventory::SpType::Power, slot)); + + for (sp_type, slot) in switches.chain(pscs).chain(sleds) { + let sp_result = tasks + .spawn({ + let opctx = opctx.child(BTreeMap::from([ + // XXX(eliza): that's so many little strings... 
:( + ("sp_type".to_string(), sp_type.to_string()), + ("slot".to_string(), slot.to_string()), + ])); + let clients = mgs_clients.clone(); + let ingester = self.inner.clone(); + async move { + let status = ingester + .ingest_sp_ereports(opctx, &clients, sp_type, slot) + .await?; + Some((Sp { sp_type, slot }, status)) + } + }) + .await; + if let Some(Some((sp, sp_status))) = sp_result { + status.sps.insert(sp, sp_status); + } + } -struct Sp { - sp_type: gateway_client::types::SpType, - slot: u32, + // Wait for remaining ingestion tasks to come back. + while let Some(sp_result) = tasks.join_next().await { + if let Some((sp, sp_status)) = sp_result { + status.sps.insert(sp, sp_status); + } + } + + Ok(status) + } } -struct SpState { - committed_ena: ereport_types::Ena, - restart_id: EreporterRestartUuid, +struct GatewayClient { + addr: SocketAddrV6, + client: gateway_client::Client, } const LIMIT: std::num::NonZeroU32 = match std::num::NonZeroU32::new(255) { @@ -78,61 +142,196 @@ const LIMIT: std::num::NonZeroU32 = match std::num::NonZeroU32::new(255) { type GatewayClientError = gateway_client::Error; -async fn mgs_request( - opctx: &OpContext, - clients: Arc>, - Sp { sp_type, slot }: Sp, - sp_state: Option, -) -> Result { - let mut next_idx = 0; - - let (committed_ena, restart_id) = match sp_state { - Some(state) => { - (Some(state.committed_ena.clone()), state.restart_id.clone()) +impl Ingester { + async fn ingest_sp_ereports( + &self, + opctx: OpContext, + clients: &[GatewayClient], + sp_type: nexus_types::inventory::SpType, + slot: u16, + ) -> Option { + // Fetch the latest ereport from this SP. 
+ let latest = match self + .datastore + .sp_latest_ereport_id(&opctx, sp_type, slot) + .await + { + Ok(latest) => latest, + Err(error) => { + return Some(EreporterStatus { + errors: vec![format!( + "failed to query for latest ereport: {error}" + )], + ..Default::default() + }); + } + }; + + let mut params = EreportQueryParams::from_latest(latest); + let mut status = None; + while let Some(gateway_client::types::Ereports { + restart_id, + items, + next_page: _, + }) = self + .mgs_requests(&opctx, clients, ¶ms, sp_type, slot, &mut status) + .await + { + if items.is_empty() { + slog::trace!( + &opctx.log, + "no ereports returned by SP"; + "committed_ena" => ?params.committed_ena, + "start_ena" => ?params.start_ena, + "restart_id" => ?params.restart_id, + "total_ereports_received" => status + .as_ref() + .map(|s| s.ereports_received), + "total_new_ereports" => status + .as_ref() + .map(|s| s.new_ereports), + ); + break; + } + let db_ereports = items + .into_iter() + .map(|Ereport { ena, data }| db::model::SpEreport { + restart_id: restart_id.into(), + ena: ena.into(), + time_collected: Utc::now(), + collector_id: self.nexus_id.into(), + sp_type: sp_type.into(), + sp_slot: slot.into(), + report: serde_json::Value::Object(data), + }) + .collect::>(); + let received = db_ereports.len(); + let status = status.get_or_insert_default(); + status.ereports_received += received; + let created = match self + .datastore + .sp_ereports_insert(&opctx, sp_type, slot, db_ereports) + .await + { + Ok((created, latest)) => { + params = EreportQueryParams::from_latest(latest); + created + } + Err(e) => { + slog::error!( + &opctx.log, + "error inserting {received} ereports!"; + "committed_ena" => ?params.committed_ena, + "start_ena" => ?params.start_ena, + "restart_id" => ?params.restart_id, + "ereports_received" => received, + "error" => %e, + ); + status.errors.push(format!( + "failed to insert {received} ereports: {e}" + )); + break; + } + }; + status.new_ereports += created; + 
slog::info!( + &opctx.log, + "collected ereports from MGS"; + "requested_committed_ena" => ?params.committed_ena, + "requested_start_ena" => ?params.start_ena, + "requested_restart_id" => ?params.restart_id, + "received_restart_id" => ?restart_id, + "ereports_received" => received, + "new_ereports" => created, + "total_ereports_collected" => status.ereports_received, + "total_new_ereports" => status.new_ereports, + "total_requests" => status.requests, + ); } - None => (None, EreporterRestartUuid::nil()), - }; - - backoff::retry_notify_ext( - backoff::retry_policy_internal_service(), - || { - let clients = clients.clone(); - let idx = next_idx; - // Next time, we'll try the other gateway, if this one is sad. - next_idx = (next_idx + 1) % clients.len(); - async move { - let client = &clients[idx]; - let res = client - .sp_ereports_ingest( - sp_type, - slot, - committed_ena.as_ref(), - LIMIT, - &restart_id, - committed_ena.as_ref(), - ) - .await; - match res { - Ok(ereports) => Ok(ereports.into_inner()), - Err(e) - if e.status() - .map(|status| status.is_client_error()) - .unwrap_or(false) => - { - Err(BackoffError::permanent(e)) - } - Err( - e @ gateway_client::Error::InvalidRequest(_) - | e @ gateway_client::Error::PreHookError(_) - | e @ gateway_client::Error::PostHookError(_), - ) => Err(BackoffError::permanent(e)), - Err(e) => return Err(BackoffError::transient(e)), + + status + } + + async fn mgs_requests( + &self, + opctx: &OpContext, + clients: &[GatewayClient], + EreportQueryParams { ref committed_ena,ref start_ena, restart_id }: &EreportQueryParams, + sp_type: nexus_types::inventory::SpType, + slot: u16, + status: &mut Option, + ) -> Option { + // If an attempt to collect ereports from one gateway fails, we will try + // any other discovered gateways. 
+ for GatewayClient { addr, client } in clients.iter() { + slog::debug!( + &opctx.log, + "attempting ereport collection from MGS"; + "sp_type" => ?sp_type, + "slot" => slot, + "committed_ena" => ?committed_ena, + "start_ena" => ?start_ena, + "restart_id" => ?restart_id, + "gateway_addr" => %addr, + ); + let res = client + .sp_ereports_ingest( + sp_type, + slot as u32, + committed_ena.as_ref(), + LIMIT, + &restart_id, + start_ena.as_ref(), + ) + .await; + match res { + Ok(ereports) => { + status.get_or_insert_default().requests += 1; + return Some(ereports.into_inner()); + } + Err(e) if e.status() == Some(http::StatusCode::NOT_FOUND) => { + slog::debug!( + &opctx.log, + "ereport collection: MGS claims there is no SP in this slot"; + "sp_type" => ?sp_type, + "slot" => slot, + "committed_ena" => ?committed_ena, + "start_ena" => ?start_ena, + "restart_id" => ?restart_id, + "gateway_addr" => %addr, + ); + } + Err(e) => { + let stats = status.get_or_insert_default(); + stats.requests += 1; + stats.errors.push(format!("MGS {addr}: {e}")); } } - }, - |err, count, duration| { - // TODO(eliza): log the error 'n' stuff - }, - ) - .await + } + + None + } +} + +struct EreportQueryParams { + committed_ena: Option, + start_ena: Option, + restart_id: EreporterRestartUuid, +} + +impl EreportQueryParams { + fn from_latest(latest: Option) -> Self { + match latest { + Some(ereport_types::EreportId { ena, restart_id }) => Self { + committed_ena: Some(ena), + start_ena: Some(Ena(ena.0 + 1)), + restart_id, + }, + None => Self { + committed_ena: None, + start_ena: None, + restart_id: EreporterRestartUuid::nil(), + }, + } + } } diff --git a/nexus/types/src/internal_api/background.rs b/nexus/types/src/internal_api/background.rs index e4f441e8ea3..f4f0ce61702 100644 --- a/nexus/types/src/internal_api/background.rs +++ b/nexus/types/src/internal_api/background.rs @@ -529,3 +529,28 @@ pub struct ReadOnlyRegionReplacementStartStatus { pub requests_created_ok: Vec, pub errors: Vec, } + 
+#[derive(Serialize, Deserialize, Default, Debug, PartialEq, Eq)] +pub struct SpEreportIngesterStatus { + pub sps: BTreeMap, + pub error: Option, +} + +#[derive(Serialize, Deserialize, Debug, PartialEq, Eq, PartialOrd, Ord)] +pub struct Sp { + pub sp_type: crate::inventory::SpType, + pub slot: u16, +} + +#[derive(Serialize, Deserialize, Default, Debug, PartialEq, Eq)] +pub struct EreporterStatus { + /// total number of ereports received from this reporter + pub ereports_received: usize, + /// number of new ereports ingested from this reporter (this may be less + /// than `ereports_received` if some ereports were collected by another + /// Nexus) + pub new_ereports: usize, + /// total number of HTTP requests sent. + pub requests: usize, + pub errors: Vec, +} From cbdb6693e0a4f4fccb7a4c3bb304dd0da55dd272 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 3 Jun 2025 15:57:24 -0700 Subject: [PATCH 05/61] actually attach the owl to the rest of the software --- nexus-config/src/nexus_config.rs | 21 +++++++++++++++++++++ nexus/examples/config-second.toml | 1 + nexus/examples/config.toml | 1 + nexus/src/app/background/init.rs | 16 ++++++++++++++++ nexus/tests/config.test.toml | 1 + smf/nexus/multi-sled/config-partial.toml | 1 + smf/nexus/single-sled/config-partial.toml | 1 + 7 files changed, 42 insertions(+) diff --git a/nexus-config/src/nexus_config.rs b/nexus-config/src/nexus_config.rs index 793a24c41d4..151d163f380 100644 --- a/nexus-config/src/nexus_config.rs +++ b/nexus-config/src/nexus_config.rs @@ -439,6 +439,8 @@ pub struct BackgroundTaskConfig { pub alert_dispatcher: AlertDispatcherConfig, /// configuration for webhook deliverator task pub webhook_deliverator: WebhookDeliveratorConfig, + /// configuration for SP ereport ingester task + pub sp_ereport_ingester: SpEreportIngesterConfig, } #[serde_as] @@ -803,6 +805,22 @@ pub struct WebhookDeliveratorConfig { pub second_retry_backoff_secs: u64, } +#[serde_as] +#[derive(Clone, Debug, Deserialize, Eq, PartialEq, 
Serialize)] +pub struct SpEreportIngesterConfig { + /// period (in seconds) for periodic activations of this background task + #[serde_as(as = "DurationSeconds")] + pub period_secs: Duration, +} + +impl Default for SpEreportIngesterConfig { + fn default() -> Self { + Self { + period_secs: Duration::from_secs(30), + } + } +} + /// Configuration for a nexus server #[derive(Clone, Debug, Deserialize, PartialEq, Serialize)] pub struct PackageConfig { @@ -1301,6 +1319,9 @@ mod test { first_retry_backoff_secs: 45, second_retry_backoff_secs: 46, }, + sp_ereport_ingester: SpEreportIngesterConfig { + period_secs: Duration::from_secs(47), + }, }, default_region_allocation_strategy: crate::nexus_config::RegionAllocationStrategy::Random { diff --git a/nexus/examples/config-second.toml b/nexus/examples/config-second.toml index 8f64d5f0859..6e1099ba5b5 100644 --- a/nexus/examples/config-second.toml +++ b/nexus/examples/config-second.toml @@ -148,6 +148,7 @@ tuf_artifact_replication.min_sled_replication = 1 alert_dispatcher.period_secs = 60 webhook_deliverator.period_secs = 60 read_only_region_replacement_start.period_secs = 30 +sp_ereport_ingester.period_secs = 30 [default_region_allocation_strategy] # allocate region on 3 random distinct zpools, on 3 random distinct sleds. diff --git a/nexus/examples/config.toml b/nexus/examples/config.toml index d2684608add..7103292e570 100644 --- a/nexus/examples/config.toml +++ b/nexus/examples/config.toml @@ -134,6 +134,7 @@ tuf_artifact_replication.min_sled_replication = 1 alert_dispatcher.period_secs = 60 webhook_deliverator.period_secs = 60 read_only_region_replacement_start.period_secs = 30 +sp_ereport_ingester.period_secs = 30 [default_region_allocation_strategy] # allocate region on 3 random distinct zpools, on 3 random distinct sleds. 
diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs index 91eb73ceb3a..f40039a0d7b 100644 --- a/nexus/src/app/background/init.rs +++ b/nexus/src/app/background/init.rs @@ -930,6 +930,22 @@ impl BackgroundTasksInitializer { } }); + driver.register(TaskDefinition { + name: "sp_ereport_ingester", + description: "collects error reports from service processors", + period: config.sp_ereport_ingester.period_secs, + task_impl: Box::new( + ereport_ingester::SpEreportIngester::new( + datastore, + resolver, + nexus_id, + ), + ), + opctx: opctx.child(BTreeMap::new()), + watchers: vec![], + activator: task_sp_ereport_ingester, + }); + driver } } diff --git a/nexus/tests/config.test.toml b/nexus/tests/config.test.toml index 929ec3e57cc..03fff464f1e 100644 --- a/nexus/tests/config.test.toml +++ b/nexus/tests/config.test.toml @@ -171,6 +171,7 @@ webhook_deliverator.period_secs = 60 webhook_deliverator.first_retry_backoff_secs = 10 webhook_deliverator.second_retry_backoff_secs = 20 read_only_region_replacement_start.period_secs = 60 +sp_ereport_ingester.period_secs = 30 [default_region_allocation_strategy] # we only have one sled in the test environment, so we need to use the diff --git a/smf/nexus/multi-sled/config-partial.toml b/smf/nexus/multi-sled/config-partial.toml index 5cf5491fe91..20e9c6c9a93 100644 --- a/smf/nexus/multi-sled/config-partial.toml +++ b/smf/nexus/multi-sled/config-partial.toml @@ -80,6 +80,7 @@ read_only_region_replacement_start.period_secs = 30 # so we don't need to periodically activate it *that* frequently. 
alert_dispatcher.period_secs = 60 webhook_deliverator.period_secs = 60 +sp_ereport_ingester.period_secs = 30 [default_region_allocation_strategy] # by default, allocate across 3 distinct sleds diff --git a/smf/nexus/single-sled/config-partial.toml b/smf/nexus/single-sled/config-partial.toml index 52a8b642283..c357a690102 100644 --- a/smf/nexus/single-sled/config-partial.toml +++ b/smf/nexus/single-sled/config-partial.toml @@ -80,6 +80,7 @@ read_only_region_replacement_start.period_secs = 30 # so we don't need to periodically activate it *that* frequently. alert_dispatcher.period_secs = 60 webhook_deliverator.period_secs = 60 +sp_ereport_ingester.period_secs = 30 [default_region_allocation_strategy] # by default, allocate without requirement for distinct sleds. From 2cc22c1ca28124eec50e9c60be30772055b416f0 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 3 Jun 2025 16:32:53 -0700 Subject: [PATCH 06/61] claude, you forgot to clone the `Arc` it's because you just predict tokens and don't actually understand ownership btw --- nexus/src/app/background/init.rs | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs index f40039a0d7b..9a8f637a608 100644 --- a/nexus/src/app/background/init.rs +++ b/nexus/src/app/background/init.rs @@ -918,7 +918,7 @@ impl BackgroundTasksInitializer { period: period_secs, task_impl: Box::new( webhook_deliverator::WebhookDeliverator::new( - datastore, + datastore.clone(), cfg, nexus_id, args.webhook_delivery_client, @@ -934,13 +934,9 @@ impl BackgroundTasksInitializer { name: "sp_ereport_ingester", description: "collects error reports from service processors", period: config.sp_ereport_ingester.period_secs, - task_impl: Box::new( - ereport_ingester::SpEreportIngester::new( - datastore, - resolver, - nexus_id, - ), - ), + task_impl: Box::new(ereport_ingester::SpEreportIngester::new( + datastore, resolver, nexus_id, + )), opctx: 
opctx.child(BTreeMap::new()), watchers: vec![], activator: task_sp_ereport_ingester, From cd2ce89f1012033530428c50ccbc0107cb229605 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 3 Jun 2025 16:44:10 -0700 Subject: [PATCH 07/61] rm unused type alias --- nexus/src/app/background/tasks/ereport_ingester.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/nexus/src/app/background/tasks/ereport_ingester.rs b/nexus/src/app/background/tasks/ereport_ingester.rs index 5ee99d15a6c..26c4cb76e8e 100644 --- a/nexus/src/app/background/tasks/ereport_ingester.rs +++ b/nexus/src/app/background/tasks/ereport_ingester.rs @@ -140,8 +140,6 @@ const LIMIT: std::num::NonZeroU32 = match std::num::NonZeroU32::new(255) { Some(l) => l, }; -type GatewayClientError = gateway_client::Error; - impl Ingester { async fn ingest_sp_ereports( &self, From 5434f7741f83219c2b1e32ed02fb424de44b2205 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 4 Jun 2025 09:50:22 -0700 Subject: [PATCH 08/61] claude forgot that --- nexus-config/src/nexus_config.rs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/nexus-config/src/nexus_config.rs b/nexus-config/src/nexus_config.rs index 151d163f380..d971ef10d48 100644 --- a/nexus-config/src/nexus_config.rs +++ b/nexus-config/src/nexus_config.rs @@ -815,9 +815,7 @@ pub struct SpEreportIngesterConfig { impl Default for SpEreportIngesterConfig { fn default() -> Self { - Self { - period_secs: Duration::from_secs(30), - } + Self { period_secs: Duration::from_secs(30) } } } @@ -1102,6 +1100,7 @@ mod test { webhook_deliverator.lease_timeout_secs = 44 webhook_deliverator.first_retry_backoff_secs = 45 webhook_deliverator.second_retry_backoff_secs = 46 + sp_ereport_ingester.period_secs = 47 [default_region_allocation_strategy] type = "random" seed = 0 From 0af128dc6ea569d8825a350679ae8e502a8b1d7a Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 4 Jun 2025 12:13:46 -0700 Subject: [PATCH 09/61] store part/serial numbers in CRDB if present 
--- gateway/tests/integration_tests/ereports.rs | 24 +++---- nexus/db-model/src/ereport.rs | 26 +++++++ nexus/db-schema/src/schema.rs | 3 + .../app/background/tasks/ereport_ingester.rs | 67 ++++++++++++++++--- schema/crdb/dbinit.sql | 14 +++- sp-sim/src/gimlet.rs | 4 +- sp-sim/src/sidecar.rs | 4 +- 7 files changed, 117 insertions(+), 25 deletions(-) diff --git a/gateway/tests/integration_tests/ereports.rs b/gateway/tests/integration_tests/ereports.rs index 562ecd08b46..18fe59ed8f1 100644 --- a/gateway/tests/integration_tests/ereports.rs +++ b/gateway/tests/integration_tests/ereports.rs @@ -74,8 +74,8 @@ mod sled0 { def_ereport! { EREPORT_1: { - "chassis_model": "SimGimletSp", - "chassis_serial": "SimGimlet00", + "baseboard_part_number": "SimGimletSp", + "baseboard_serial_number": "SimGimlet00", "hubris_archive_id": "ffffffff", "hubris_version": "0.0.2", "hubris_task_name": "task_apollo_server", @@ -88,8 +88,8 @@ mod sled0 { } def_ereport! { EREPORT_2: { - "chassis_model": "SimGimletSp", - "chassis_serial": "SimGimlet00", + "baseboard_part_number": "SimGimletSp", + "baseboard_serial_number": "SimGimlet00", "hubris_archive_id": "ffffffff", "hubris_version": "0.0.2", "hubris_task_name": "drv_ae35_server", @@ -111,8 +111,8 @@ mod sled0 { } def_ereport! { EREPORT_3: { - "chassis_model": "SimGimletSp", - "chassis_serial": "SimGimlet00", + "baseboard_part_number": "SimGimletSp", + "baseboard_serial_number": "SimGimlet00", "hubris_archive_id": "ffffffff", "hubris_version": "0.0.2", "hubris_task_name": "task_apollo_server", @@ -131,8 +131,8 @@ mod sled0 { def_ereport! { EREPORT_4: { - "chassis_model": "SimGimletSp", - "chassis_serial": "SimGimlet00", + "baseboard_part_number": "SimGimletSp", + "baseboard_serial_number": "SimGimlet00", "hubris_archive_id": "ffffffff", "hubris_version": "0.0.2", "hubris_task_name": "drv_thingy_server", @@ -146,8 +146,8 @@ mod sled0 { def_ereport! 
{ EREPORT_5: { - "chassis_model": "SimGimletSp", - "chassis_serial": "SimGimlet00", + "baseboard_part_number": "SimGimletSp", + "baseboard_serial_number": "SimGimlet00", "hubris_archive_id": "ffffffff", "hubris_version": "0.0.2", "hubris_task_name": "task_latex_server", @@ -168,8 +168,8 @@ mod sled1 { def_ereport! { EREPORT_1: { - "chassis_model": "SimGimletSp", - "chassis_serial": "SimGimlet01", + "baseboard_part_number": "SimGimletSp", + "baseboard_serial_number": "SimGimlet01", "hubris_archive_id": "ffffffff", "hubris_version": "0.0.2", "hubris_task_name": "task_thermal_server", diff --git a/nexus/db-model/src/ereport.rs b/nexus/db-model/src/ereport.rs index 94ed1d1c97d..b679df531a2 100644 --- a/nexus/db-model/src/ereport.rs +++ b/nexus/db-model/src/ereport.rs @@ -99,9 +99,35 @@ pub struct SpEreport { pub time_collected: DateTime, pub collector_id: DbTypedUuid, + // + // The physical location of the reporting SP. + // + /// SP location: the type of SP slot (sled, switch, power shelf). + /// + /// This is always known, as SPs are indexed by physical location when + /// collecting ereports from MGS. pub sp_type: SpType, + /// SP location: the slot number. + /// + /// This is always known, as SPs are indexed by physical location when + /// collecting ereports from MGS. pub sp_slot: SpMgsSlot, + /// SP VPD identity: the baseboard part number of the reporting SP. + /// + /// This is nullable, as the ereport may have been generated in a condition + /// where the SP was unable to determine its own part number. Consider that + /// "I don't know what I am!" is an error condition for which we might want + /// to generate an ereport! + pub part_number: Option, + /// SP VPD identity: the baseboard serial number of the reporting SP. + /// + /// This is nullable, as the ereport may have been generated in a condition + /// where the SP was unable to determine its own serial number. Consider that + /// "I don't know who I am!" 
is an error condition for which we might want + /// to generate an ereport! + pub serial_number: Option, + pub report: serde_json::Value, } diff --git a/nexus/db-schema/src/schema.rs b/nexus/db-schema/src/schema.rs index dc49bb992f6..dc17bd60805 100644 --- a/nexus/db-schema/src/schema.rs +++ b/nexus/db-schema/src/schema.rs @@ -2453,6 +2453,9 @@ table! { sp_type -> crate::enums::SpTypeEnum, sp_slot -> Int4, + part_number -> Text, + serial_number -> Text, + report -> Jsonb, } } diff --git a/nexus/src/app/background/tasks/ereport_ingester.rs b/nexus/src/app/background/tasks/ereport_ingester.rs index 26c4cb76e8e..68eaac0bb1a 100644 --- a/nexus/src/app/background/tasks/ereport_ingester.rs +++ b/nexus/src/app/background/tasks/ereport_ingester.rs @@ -193,14 +193,30 @@ impl Ingester { } let db_ereports = items .into_iter() - .map(|Ereport { ena, data }| db::model::SpEreport { - restart_id: restart_id.into(), - ena: ena.into(), - time_collected: Utc::now(), - collector_id: self.nexus_id.into(), - sp_type: sp_type.into(), - sp_slot: slot.into(), - report: serde_json::Value::Object(data), + .map(|ereport| { + let part_number = get_sp_metadata_string( + "baseboard_part_number", + &ereport, + &restart_id, + &opctx.log, + ); + let serial_number = get_sp_metadata_string( + "baseboard_serial_number", + &ereport, + &restart_id, + &opctx.log, + ); + db::model::SpEreport { + restart_id: restart_id.into(), + ena: ereport.ena.into(), + time_collected: Utc::now(), + collector_id: self.nexus_id.into(), + sp_type: sp_type.into(), + sp_slot: slot.into(), + part_number, + serial_number, + report: serde_json::Value::Object(ereport.data), + } }) .collect::>(); let received = db_ereports.len(); @@ -311,6 +327,41 @@ impl Ingester { } } +/// Attempt to extract a VPD metadata from an SP ereport, logging a warning if +/// it's missing. 
We still want to keep such ereports, as the error condition +/// could be that the SP couldn't determine the metadata field, but it's +/// uncomfortable, so we ought to complain about it. +fn get_sp_metadata_string( + key: &str, + ereport: &Ereport, + restart_id: &EreporterRestartUuid, + log: &slog::Logger, +) -> Option { + match ereport.data.get(key) { + Some(serde_json::Value::String(s)) => Some(s.clone()), + Some(v) => { + slog::warn!( + &log, + "malformed ereport: value for '{key}' should be a string, \ + but found: {v:?}"; + "ena" => ?ereport.ena, + "restart_id" => ?restart_id, + ); + None + } + None => { + slog::warn!( + &log, + "ereport missing '{key}'; perhaps the SP doesn't know its own \ + VPD!"; + "ena" => ?ereport.ena, + "restart_id" => ?restart_id, + ); + None + } + } +} + struct EreportQueryParams { committed_ena: Option, start_ena: Option, diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 8d7aab9a1a5..b582f18defd 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -5763,10 +5763,22 @@ CREATE TABLE IF NOT EXISTS omicron.public.sp_ereport ( -- UUID of the Nexus instance that collected the ereport collector_id UUID NOT NULL, - -- identity of the reporting SP + -- physical lcoation of the reporting SP + -- + -- these fields are always present, as they are how requests to collect + -- ereports are indexed by MGS. sp_type omicron.public.sp_type NOT NULL, sp_slot INT4 NOT NULL, + -- VPD identity of the reporting SP. + -- + -- unlike the physical location, these fields are nullable, as an ereport + -- may be generated in a state where the SP doesn't know who or what it is. + -- consider that "i don't know my own identity" is a reasonable condition to + -- might want to generate an ereport about! 
+ serial_number TEXT(11), + part_number TEXT(11), + -- JSON representation of the ereport report JSONB NOT NULL, diff --git a/sp-sim/src/gimlet.rs b/sp-sim/src/gimlet.rs index 6308c41ee2f..c6daa251759 100644 --- a/sp-sim/src/gimlet.rs +++ b/sp-sim/src/gimlet.rs @@ -302,11 +302,11 @@ impl Gimlet { if cfg.restart.metadata.is_empty() { let map = &mut cfg.restart.metadata; map.insert( - "chassis_model".to_string(), + "baseboard_part_number".to_string(), SIM_GIMLET_BOARD.into(), ); map.insert( - "chassis_serial".to_string(), + "baseboard_serial_number".to_string(), gimlet.common.serial_number.clone().into(), ); map.insert("hubris_archive_id".to_string(), hubris_gitc.into()); diff --git a/sp-sim/src/sidecar.rs b/sp-sim/src/sidecar.rs index 548f6f94c17..f72a7b43a84 100644 --- a/sp-sim/src/sidecar.rs +++ b/sp-sim/src/sidecar.rs @@ -235,11 +235,11 @@ impl Sidecar { if cfg.restart.metadata.is_empty() { let map = &mut cfg.restart.metadata; map.insert( - "chassis_model".to_string(), + "baseboard_part_number".to_string(), SIM_SIDECAR_BOARD.into(), ); map.insert( - "chassis_serial".to_string(), + "baseboard_serial_number".to_string(), sidecar.common.serial_number.clone().into(), ); map.insert( From c8b058c44d404b07bb84890db0d154c60d3d793a Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 4 Jun 2025 12:16:03 -0700 Subject: [PATCH 10/61] claude forgot that one too lol --- nexus-config/src/nexus_config.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/nexus-config/src/nexus_config.rs b/nexus-config/src/nexus_config.rs index d971ef10d48..8f906cefb49 100644 --- a/nexus-config/src/nexus_config.rs +++ b/nexus-config/src/nexus_config.rs @@ -1409,6 +1409,7 @@ mod test { read_only_region_replacement_start.period_secs = 30 alert_dispatcher.period_secs = 42 webhook_deliverator.period_secs = 43 + sp_ereport_ingester.period_secs = 44 [default_region_allocation_strategy] type = "random" "##, From 216652c859e5c35f0c036354925e19e2620ff08c Mon Sep 17 00:00:00 2001 From: Eliza Weisman 
Date: Thu, 5 Jun 2025 13:18:23 -0700 Subject: [PATCH 11/61] right, serde-json map keys must be strings... --- .../app/background/tasks/ereport_ingester.rs | 32 ++++++++++++------- nexus/types/src/internal_api/background.rs | 8 +++-- 2 files changed, 26 insertions(+), 14 deletions(-) diff --git a/nexus/src/app/background/tasks/ereport_ingester.rs b/nexus/src/app/background/tasks/ereport_ingester.rs index 68eaac0bb1a..bd5038a3c58 100644 --- a/nexus/src/app/background/tasks/ereport_ingester.rs +++ b/nexus/src/app/background/tasks/ereport_ingester.rs @@ -16,7 +16,7 @@ use nexus_db_queries::context::OpContext; use nexus_db_queries::db; use nexus_db_queries::db::DataStore; use nexus_types::internal_api::background::EreporterStatus; -use nexus_types::internal_api::background::Sp; +use nexus_types::internal_api::background::SpEreporterStatus; use nexus_types::internal_api::background::SpEreportIngesterStatus; use omicron_uuid_kinds::EreporterRestartUuid; use omicron_uuid_kinds::OmicronZoneUuid; @@ -107,25 +107,27 @@ impl SpEreportIngester { let clients = mgs_clients.clone(); let ingester = self.inner.clone(); async move { - let status = ingester + ingester .ingest_sp_ereports(opctx, &clients, sp_type, slot) - .await?; - Some((Sp { sp_type, slot }, status)) + .await } }) .await; - if let Some(Some((sp, sp_status))) = sp_result { - status.sps.insert(sp, sp_status); + if let Some(Some(sp_status)) = sp_result { + status.sps.push(sp_status); } } // Wait for remaining ingestion tasks to come back. while let Some(sp_result) = tasks.join_next().await { - if let Some((sp, sp_status)) = sp_result { - status.sps.insert(sp, sp_status); + if let Some(sp_status) = sp_result { + status.sps.push(sp_status); } } + // Sort statuses for consistent output in OMDB commands. 
+ status.sps.sort_unstable_by_key(|sp| (sp.sp_type, sp.slot)); + Ok(status) } } @@ -147,7 +149,7 @@ impl Ingester { clients: &[GatewayClient], sp_type: nexus_types::inventory::SpType, slot: u16, - ) -> Option { + ) -> Option { // Fetch the latest ereport from this SP. let latest = match self .datastore @@ -156,11 +158,15 @@ impl Ingester { { Ok(latest) => latest, Err(error) => { - return Some(EreporterStatus { + return Some(SpEreporterStatus { + sp_type, + slot, + EreporterStatus { errors: vec![format!( "failed to query for latest ereport: {error}" )], ..Default::default() + } }); } }; @@ -263,7 +269,11 @@ impl Ingester { ); } - status + status.map(|status| SpEreporterStatus { + sp_type, + slot, + status, + }) } async fn mgs_requests( diff --git a/nexus/types/src/internal_api/background.rs b/nexus/types/src/internal_api/background.rs index f4f0ce61702..b164337997f 100644 --- a/nexus/types/src/internal_api/background.rs +++ b/nexus/types/src/internal_api/background.rs @@ -532,14 +532,16 @@ pub struct ReadOnlyRegionReplacementStartStatus { #[derive(Serialize, Deserialize, Default, Debug, PartialEq, Eq)] pub struct SpEreportIngesterStatus { - pub sps: BTreeMap, + pub sps: Vec, pub error: Option, } -#[derive(Serialize, Deserialize, Debug, PartialEq, Eq, PartialOrd, Ord)] -pub struct Sp { +#[derive(Serialize, Deserialize, Debug, PartialEq, Eq)] +pub struct SpEreporterStatus { pub sp_type: crate::inventory::SpType, pub slot: u16, + #[serde(flatten)] + pub status: EreporterStatus, } #[derive(Serialize, Deserialize, Default, Debug, PartialEq, Eq)] From 65cccef1dac04526e67c1019c32eec0c5a767ece Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 5 Jun 2025 13:18:36 -0700 Subject: [PATCH 12/61] oopsie --- schema/crdb/dbinit.sql | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index b582f18defd..b79844414bd 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -5776,8 +5776,8 @@ CREATE TABLE 
IF NOT EXISTS omicron.public.sp_ereport ( -- may be generated in a state where the SP doesn't know who or what it is. -- consider that "i don't know my own identity" is a reasonable condition to -- might want to generate an ereport about! - serial_number TEXT(11), - part_number TEXT(11), + serial_number STRING(11), + part_number STRING(11), -- JSON representation of the ereport report JSONB NOT NULL, From 49fec6a55eba31533eadeac83bd9fe694caf0618 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 5 Jun 2025 13:22:39 -0700 Subject: [PATCH 13/61] argh --- .../app/background/tasks/ereport_ingester.rs | 21 +++++++------------ 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/nexus/src/app/background/tasks/ereport_ingester.rs b/nexus/src/app/background/tasks/ereport_ingester.rs index bd5038a3c58..e1e75c83dc6 100644 --- a/nexus/src/app/background/tasks/ereport_ingester.rs +++ b/nexus/src/app/background/tasks/ereport_ingester.rs @@ -16,8 +16,8 @@ use nexus_db_queries::context::OpContext; use nexus_db_queries::db; use nexus_db_queries::db::DataStore; use nexus_types::internal_api::background::EreporterStatus; -use nexus_types::internal_api::background::SpEreporterStatus; use nexus_types::internal_api::background::SpEreportIngesterStatus; +use nexus_types::internal_api::background::SpEreporterStatus; use omicron_uuid_kinds::EreporterRestartUuid; use omicron_uuid_kinds::OmicronZoneUuid; use parallel_task_set::ParallelTaskSet; @@ -107,9 +107,10 @@ impl SpEreportIngester { let clients = mgs_clients.clone(); let ingester = self.inner.clone(); async move { - ingester + let status = ingester .ingest_sp_ereports(opctx, &clients, sp_type, slot) - .await + .await?; + Some(SpEreporterStatus { sp_type, slot, status }) } }) .await; @@ -149,7 +150,7 @@ impl Ingester { clients: &[GatewayClient], sp_type: nexus_types::inventory::SpType, slot: u16, - ) -> Option { + ) -> Option { // Fetch the latest ereport from this SP. 
let latest = match self .datastore @@ -158,15 +159,11 @@ impl Ingester { { Ok(latest) => latest, Err(error) => { - return Some(SpEreporterStatus { - sp_type, - slot, - EreporterStatus { + return Some(EreporterStatus { errors: vec![format!( "failed to query for latest ereport: {error}" )], ..Default::default() - } }); } }; @@ -269,11 +266,7 @@ impl Ingester { ); } - status.map(|status| SpEreporterStatus { - sp_type, - slot, - status, - }) + status } async fn mgs_requests( From a240529ad702fa2fd56284eb1bbd56b8c7167b70 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 5 Jun 2025 13:31:42 -0700 Subject: [PATCH 14/61] omdb ereport show command --- Cargo.lock | 1 + dev-tools/omdb/Cargo.toml | 1 + dev-tools/omdb/src/bin/omdb/db.rs | 7 + dev-tools/omdb/src/bin/omdb/db/ereport.rs | 129 +++++++++++++++++++ ereport/types/src/lib.rs | 17 +++ nexus/db-model/src/ereport.rs | 90 +++++++++---- nexus/db-queries/src/db/datastore/ereport.rs | 55 ++++++++ nexus/db-schema/src/schema.rs | 4 +- 8 files changed, 280 insertions(+), 24 deletions(-) create mode 100644 dev-tools/omdb/src/bin/omdb/db/ereport.rs diff --git a/Cargo.lock b/Cargo.lock index 6cd09f35d40..2de1ffacbc6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -7587,6 +7587,7 @@ dependencies = [ "diesel", "dropshot", "dyn-clone", + "ereport-types", "expectorate", "futures", "gateway-client", diff --git a/dev-tools/omdb/Cargo.toml b/dev-tools/omdb/Cargo.toml index 22baaa16956..416eea8de83 100644 --- a/dev-tools/omdb/Cargo.toml +++ b/dev-tools/omdb/Cargo.toml @@ -24,6 +24,7 @@ csv.workspace = true diesel.workspace = true dropshot.workspace = true dyn-clone.workspace = true +ereport-types.workspace = true futures.workspace = true gateway-client.workspace = true gateway-messages.workspace = true diff --git a/dev-tools/omdb/src/bin/omdb/db.rs b/dev-tools/omdb/src/bin/omdb/db.rs index 93781b9077f..ba589044ee4 100644 --- a/dev-tools/omdb/src/bin/omdb/db.rs +++ b/dev-tools/omdb/src/bin/omdb/db.rs @@ -19,6 +19,7 @@ use crate::Omdb; use 
crate::check_allow_destructive::DestructiveOperationToken; +use crate::db::ereport::cmd_db_ereport; use crate::helpers::CONNECTION_OPTIONS_HEADING; use crate::helpers::DATABASE_OPTIONS_HEADING; use crate::helpers::const_max_len; @@ -180,6 +181,7 @@ use tabled::Tabled; use uuid::Uuid; mod alert; +mod ereport; mod saga; const NO_ACTIVE_PROPOLIS_MSG: &str = ""; @@ -356,6 +358,8 @@ enum DbCommands { Disks(DiskArgs), /// Print information about internal and external DNS Dns(DnsArgs), + /// Query and display error reports + Ereport(ereport::EreportArgs), /// Print information about collected hardware/software inventory Inventory(InventoryArgs), /// Print information about physical disks @@ -1494,6 +1498,9 @@ impl DbArgs { &args, token, ).await + }, + DbCommands::Ereport(args) => { + cmd_db_ereport(&opctx, &datastore, &fetch_opts, &args).await } } } diff --git a/dev-tools/omdb/src/bin/omdb/db/ereport.rs b/dev-tools/omdb/src/bin/omdb/db/ereport.rs new file mode 100644 index 00000000000..6e368d1aef1 --- /dev/null +++ b/dev-tools/omdb/src/bin/omdb/db/ereport.rs @@ -0,0 +1,129 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! 
`omdb db ereport` subcommands + +use super::DbFetchOptions; +use crate::helpers::const_max_len; + +use anyhow::Context; +use clap::Args; +use clap::Subcommand; +use ereport_types::Ena; +use ereport_types::EreporterRestartUuid; +use nexus_db_queries::context::OpContext; +use nexus_db_queries::db; +use nexus_db_queries::db::DataStore; + +#[derive(Debug, Args, Clone)] +pub(super) struct EreportArgs { + #[command(subcommand)] + command: Commands, +} + +#[derive(Debug, Subcommand, Clone)] +enum Commands { + /// List ereports + #[clap(alias = "ls")] + List(ListArgs), + + /// Show an ereport + #[clap(alias = "show")] + Info(InfoArgs), +} + +#[derive(Debug, Args, Clone)] +struct InfoArgs { + restart_id: EreporterRestartUuid, + ena: Ena, +} + +#[derive(Debug, Args, Clone)] +struct ListArgs {} + +pub(super) async fn cmd_db_ereport( + opctx: &OpContext, + datastore: &DataStore, + fetch_opts: &DbFetchOptions, + args: &EreportArgs, +) -> anyhow::Result<()> { + match args.command { + Commands::List(ref args) => { + cmd_db_ereport_list(opctx, datastore, fetch_opts, args).await + } + Commands::Info(ref args) => { + cmd_db_ereport_info(opctx, datastore, args).await + } + } +} + +async fn cmd_db_ereport_list( + _opctx: &OpContext, + _datastore: &DataStore, + _fetch_opts: &DbFetchOptions, + _args: &ListArgs, +) -> anyhow::Result<()> { + Err(anyhow::anyhow!("not yet implemented")) +} + +async fn cmd_db_ereport_info( + opctx: &OpContext, + datastore: &DataStore, + args: &InfoArgs, +) -> anyhow::Result<()> { + let &InfoArgs { restart_id, ena } = args; + let ereport_id = ereport_types::EreportId { restart_id, ena }; + let db::model::Ereport { id, metadata, reporter, report } = + datastore.ereport_fetch(opctx, ereport_id).await?; + + let db::model::EreportMetadata { + time_collected, + collector_id, + part_number, + serial_number, + } = metadata; + const ENA: &str = "ENA"; + const TIME_COLLECTED: &str = "collected at"; + const COLLECTOR_ID: &str = "collected by"; + const REPORTER: &str = 
"reported by"; + const RESTART_ID: &str = "restart ID"; + const PART_NUMBER: &str = " part number"; + const SERIAL_NUMBER: &str = " serial number"; + const WIDTH: usize = const_max_len(&[ + TIME_COLLECTED, + COLLECTOR_ID, + REPORTER, + PART_NUMBER, + SERIAL_NUMBER, + ]); + println!("\n{:=<80}", "== EREPORT METADATA "); + println!(" {ENA:>WIDTH$}: {}", id.ena); + println!(" {TIME_COLLECTED:>WIDTH$}: {time_collected}"); + println!(" {COLLECTOR_ID:>WIDTH$}: {collector_id}"); + match reporter { + db::model::Reporter::Sp { sp_type, slot } => { + println!( + " {REPORTER:>WIDTH$}: {sp_type:?} {slot} (service processor)" + ) + } + db::model::Reporter::HostOs { sled } => { + println!(" {REPORTER:>WIDTH$}: sled {sled:?} (host OS)"); + } + } + println!(" {RESTART_ID:>WIDTH$}: {restart_id}"); + println!( + " {PART_NUMBER:>WIDTH$}: {}", + part_number.as_deref().unwrap_or("") + ); + println!( + " {SERIAL_NUMBER:>WIDTH$}: {}", + serial_number.as_deref().unwrap_or("") + ); + + println!("\n{:=<80}", "== EREPORT "); + serde_json::to_writer_pretty(std::io::stdout(), &report) + .with_context(|| format!("failed to serialize ereport: {report:?}"))?; + + Ok(()) +} diff --git a/ereport/types/src/lib.rs b/ereport/types/src/lib.rs index 8b244a9063c..bb151cd7995 100644 --- a/ereport/types/src/lib.rs +++ b/ereport/types/src/lib.rs @@ -10,6 +10,7 @@ use schemars::JsonSchema; use serde::Deserialize; use serde::Serialize; use std::num::NonZeroU32; +use std::str::FromStr; /// An ereport message. #[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize, JsonSchema)] @@ -59,6 +60,22 @@ impl fmt::LowerHex for Ena { } } +impl FromStr for Ena { + type Err = std::num::ParseIntError; + + fn from_str(s: &str) -> Result { + let s = s.trim(); + let value = if let Some(hex_str) = + s.strip_prefix("0x").or_else(|| s.strip_prefix("0X")) + { + u64::from_str_radix(hex_str, 16)? + } else { + s.parse::()? 
+ }; + Ok(Self(value)) + } +} + #[derive(Clone, Debug, thiserror::Error)] #[error("ENA value ({0}) is negative")] pub struct EnaNegativeError(i64); diff --git a/nexus/db-model/src/ereport.rs b/nexus/db-model/src/ereport.rs index b679df531a2..dc640c36cef 100644 --- a/nexus/db-model/src/ereport.rs +++ b/nexus/db-model/src/ereport.rs @@ -14,7 +14,9 @@ use diesel::serialize::{self, ToSql}; use diesel::sql_types; use ereport_types::{Ena, EreportId}; use nexus_db_schema::schema::{host_ereport, sp_ereport}; -use omicron_uuid_kinds::{EreporterRestartKind, OmicronZoneKind, SledKind}; +use omicron_uuid_kinds::{ + EreporterRestartKind, OmicronZoneKind, OmicronZoneUuid, SledKind, SledUuid, +}; use serde::{Deserialize, Serialize}; use std::convert::TryFrom; @@ -61,35 +63,79 @@ where } #[derive(Clone, Debug)] -pub struct EreportMetadata { - pub restart_id: DbTypedUuid, - pub ena: DbEna, - - pub time_collected: DateTime, - pub collector_id: DbTypedUuid, +pub struct Ereport { + pub id: EreportId, + pub metadata: EreportMetadata, + pub reporter: Reporter, + pub report: serde_json::Value, } -pub type EreportMetadataTuple = ( - DbTypedUuid, - DbEna, - DateTime, - DbTypedUuid, -); - -impl EreportMetadata { - pub fn id(&self) -> EreportId { - EreportId { restart_id: self.restart_id.into(), ena: self.ena.into() } +impl From for Ereport { + fn from(sp_report: SpEreport) -> Self { + let SpEreport { + restart_id, + ena, + time_collected, + collector_id, + part_number, + serial_number, + sp_type, + sp_slot, + report, + } = sp_report; + Ereport { + id: EreportId { restart_id: restart_id.into(), ena: ena.into() }, + metadata: EreportMetadata { + time_collected, + collector_id: collector_id.into(), + part_number, + serial_number, + }, + reporter: Reporter::Sp { sp_type, slot: sp_slot.0.into() }, + report, + } } } -impl From for EreportMetadata { - fn from( - (restart_id, ena, time_collected, collector_id): EreportMetadataTuple, - ) -> Self { - EreportMetadata { restart_id, ena, time_collected, 
collector_id } +impl From for Ereport { + fn from(host_report: HostEreport) -> Self { + let HostEreport { + restart_id, + ena, + time_collected, + collector_id, + sled_serial, + sled_id, + report, + } = host_report; + Ereport { + id: EreportId { restart_id: restart_id.into(), ena: ena.into() }, + metadata: EreportMetadata { + time_collected, + collector_id: collector_id.into(), + part_number: None, // TODO + serial_number: Some(sled_serial), + }, + reporter: Reporter::HostOs { sled: sled_id.into() }, + report, + } } } +#[derive(Clone, Debug)] +pub struct EreportMetadata { + pub time_collected: DateTime, + pub collector_id: OmicronZoneUuid, + pub part_number: Option, + pub serial_number: Option, +} + +#[derive(Clone, Debug, Eq, PartialEq, Ord, PartialOrd)] +pub enum Reporter { + Sp { sp_type: SpType, slot: u16 }, + HostOs { sled: SledUuid }, +} + #[derive(Clone, Debug, Insertable, Queryable, Selectable)] #[diesel(table_name = sp_ereport)] pub struct SpEreport { diff --git a/nexus/db-queries/src/db/datastore/ereport.rs b/nexus/db-queries/src/db/datastore/ereport.rs index 13606878afa..321cc801ed5 100644 --- a/nexus/db-queries/src/db/datastore/ereport.rs +++ b/nexus/db-queries/src/db/datastore/ereport.rs @@ -9,6 +9,8 @@ use crate::authz; use crate::context::OpContext; use crate::db::datastore::RunnableQuery; use crate::db::model::DbEna; +use crate::db::model::Ereport; +use crate::db::model::HostEreport; use crate::db::model::SpEreport; use crate::db::model::SpMgsSlot; use crate::db::model::SpType; @@ -22,6 +24,7 @@ use nexus_db_schema::schema::host_ereport::dsl as host_dsl; use nexus_db_schema::schema::sp_ereport::dsl as sp_dsl; use omicron_common::api::external::CreateResult; use omicron_common::api::external::Error; +use omicron_common::api::external::LookupResult; use omicron_uuid_kinds::EreporterRestartUuid; use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::SledUuid; @@ -30,6 +33,58 @@ use uuid::Uuid; type EreportIdTuple = (Uuid, DbEna); impl DataStore { 
+ // pub async fn sp_ereport_list_by_serial( + // &self, + // opctx: &OpContext, + // serial: String, + // time_range: impl RangeBounds>, + // pagparams: + // ) -> ListResultVec { + // todo!() + // } + + /// Fetch an ereport by its restart ID and ENA. + /// + /// This function queries both the service-processor and host OS ereport + /// tables, and returns a `NotFound` error if neither table contains an + /// ereport with the requested ID. + pub async fn ereport_fetch( + &self, + opctx: &OpContext, + id: ereport_types::EreportId, + ) -> LookupResult { + opctx.authorize(authz::Action::ListChildren, &authz::FLEET).await?; + let conn = self.pool_connection_authorized(opctx).await?; + let restart_id = id.restart_id.into_untyped_uuid(); + let ena = DbEna::from(id.ena); + + if let Some(report) = sp_dsl::sp_ereport + .filter(sp_dsl::restart_id.eq(restart_id)) + .filter(sp_dsl::ena.eq(ena)) + .select(SpEreport::as_select()) + .first_async(&*conn) + .await + .optional() + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))? + { + return Ok(report.into()); + } + + if let Some(report) = host_dsl::host_ereport + .filter(host_dsl::restart_id.eq(restart_id)) + .filter(host_dsl::ena.eq(ena)) + .select(HostEreport::as_select()) + .first_async(&*conn) + .await + .optional() + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))? + { + return Ok(report.into()); + } + + Err(Error::non_resourcetype_not_found(format!("ereport {id}"))) + } + pub async fn sp_latest_ereport_id( &self, opctx: &OpContext, diff --git a/nexus/db-schema/src/schema.rs b/nexus/db-schema/src/schema.rs index dc17bd60805..6b86ddc357e 100644 --- a/nexus/db-schema/src/schema.rs +++ b/nexus/db-schema/src/schema.rs @@ -2453,8 +2453,8 @@ table! 
{ sp_type -> crate::enums::SpTypeEnum, sp_slot -> Int4, - part_number -> Text, - serial_number -> Text, + part_number -> Nullable, + serial_number -> Nullable, report -> Jsonb, } From 9737b56d8ee0316289b42062834939c8d9268315 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 5 Jun 2025 16:04:17 -0700 Subject: [PATCH 15/61] enough of a list command for demo purposes --- dev-tools/omdb/src/bin/omdb/db/ereport.rs | 157 +++++++++++++++++++++- schema/crdb/dbinit.sql | 15 ++- 2 files changed, 165 insertions(+), 7 deletions(-) diff --git a/dev-tools/omdb/src/bin/omdb/db/ereport.rs b/dev-tools/omdb/src/bin/omdb/db/ereport.rs index 6e368d1aef1..c8f00592b55 100644 --- a/dev-tools/omdb/src/bin/omdb/db/ereport.rs +++ b/dev-tools/omdb/src/bin/omdb/db/ereport.rs @@ -5,16 +5,28 @@ //! `omdb db ereport` subcommands use super::DbFetchOptions; +use super::check_limit; use crate::helpers::const_max_len; +use crate::helpers::datetime_rfc3339_concise; +use crate::helpers::display_option_blank; use anyhow::Context; +use async_bb8_diesel::AsyncRunQueryDsl; +use chrono::DateTime; +use chrono::Utc; use clap::Args; use clap::Subcommand; +use diesel::prelude::*; use ereport_types::Ena; use ereport_types::EreporterRestartUuid; +use nexus_db_model::SpType; use nexus_db_queries::context::OpContext; use nexus_db_queries::db; use nexus_db_queries::db::DataStore; +use nexus_db_schema::schema::sp_ereport::dsl as sp_dsl; +use omicron_uuid_kinds::GenericUuid; +use tabled::Tabled; +use uuid::Uuid; #[derive(Debug, Args, Clone)] pub(super) struct EreportArgs { @@ -40,7 +52,19 @@ struct InfoArgs { } #[derive(Debug, Args, Clone)] -struct ListArgs {} +struct ListArgs { + /// Include only ereports from the system with the provided serial number. 
+ #[clap(long)] + serial: Option, + + /// Include only ereports collected before this timestamp + #[clap(long, short)] + before: Option>, + + /// Include only ereports collected after this timestamp + #[clap(long, short)] + after: Option>, +} pub(super) async fn cmd_db_ereport( opctx: &OpContext, @@ -59,12 +83,133 @@ pub(super) async fn cmd_db_ereport( } async fn cmd_db_ereport_list( - _opctx: &OpContext, - _datastore: &DataStore, - _fetch_opts: &DbFetchOptions, - _args: &ListArgs, + _: &OpContext, + datastore: &DataStore, + fetch_opts: &DbFetchOptions, + args: &ListArgs, ) -> anyhow::Result<()> { - Err(anyhow::anyhow!("not yet implemented")) + #[derive(Tabled)] + #[tabled(rename_all = "SCREAMING_SNAKE_CASE")] + struct EreportRow { + #[tabled(display_with = "datetime_rfc3339_concise")] + time_collected: DateTime, + restart_id: Uuid, + ena: Ena, + src: SrcType, + #[tabled(rename = "#")] + slot: u16, + } + + #[derive(Tabled)] + #[tabled(rename_all = "SCREAMING_SNAKE_CASE")] + struct WithSerial<'a, T: Tabled> { + #[tabled(inline)] + inner: T, + #[tabled(display_with = "display_option_blank", rename = "S/N")] + serial: Option<&'a str>, + #[tabled(display_with = "display_option_blank", rename = "P/N")] + part_number: Option<&'a str>, + } + + impl From<&'_ db::model::SpEreport> for EreportRow { + fn from(ereport: &db::model::SpEreport) -> Self { + let &db::model::SpEreport { + time_collected, + restart_id, + ena, + sp_type, + sp_slot, + .. 
+ } = ereport; + let src = match sp_type { + SpType::Power => SrcType::Power, + SpType::Sled => SrcType::SledSp, + SpType::Switch => SrcType::Switch, + }; + EreportRow { + time_collected, + restart_id: restart_id.into_untyped_uuid(), + ena: ena.into(), + src, + slot: sp_slot.0.into(), + } + } + } + + impl<'report, T> From<&'report db::model::SpEreport> for WithSerial<'report, T> + where + T: Tabled, + T: From<&'report db::model::SpEreport>, + { + fn from(ereport: &'report db::model::SpEreport) -> Self { + let inner = T::from(ereport); + WithSerial { + inner, + serial: ereport.serial_number.as_deref(), + part_number: ereport.part_number.as_deref(), + } + } + } + + if let (Some(before), Some(after)) = (args.before, args.after) { + anyhow::ensure!( + after < before, + "if both `--after` and `--before` are included, after must be + earlier than before" + ); + } + + let conn = datastore.pool_connection_for_tests().await?; + + let ctx = || "loading SP ereports"; + let mut query = sp_dsl::sp_ereport + .select(db::model::SpEreport::as_select()) + .limit(fetch_opts.fetch_limit.get().into()) + .order_by((sp_dsl::time_collected, sp_dsl::restart_id, sp_dsl::ena)) + .into_boxed(); + + if let Some(ref serial) = args.serial { + query = query.filter(sp_dsl::serial_number.eq(serial.clone())); + } + + if let Some(before) = args.before { + query = query.filter(sp_dsl::time_collected.lt(before)); + } + + if let Some(after) = args.after { + query = query.filter(sp_dsl::time_collected.gt(after)); + } + + let sp_ereports = query.load_async(&*conn).await.with_context(ctx)?; + check_limit(&sp_ereports, fetch_opts.fetch_limit, ctx); + + // let host_ereports: Vec = Vec::new(); // TODO + + let mut table = if args.serial.is_some() { + tabled::Table::new(sp_ereports.iter().map(EreportRow::from)) + } else { + tabled::Table::new( + sp_ereports.iter().map(WithSerial::<'_, EreportRow>::from), + ) + }; + + table + .with(tabled::settings::Style::empty()) + .with(tabled::settings::Padding::new(0, 1, 
0, 0)); + + println!("{table}"); + + Ok(()) +} + +#[derive(strum::Display)] +enum SrcType { + Switch, + Power, + #[strum(to_string = "Sled (SP)")] + SledSp, + // #[strum(to_string = "Sled (OS)")] + // SledOS, } async fn cmd_db_ereport_info( diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index b79844414bd..05f2ccc7870 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -5785,12 +5785,19 @@ CREATE TABLE IF NOT EXISTS omicron.public.sp_ereport ( PRIMARY KEY (restart_id, ena) ); -CREATE INDEX IF NOT EXISTS lookup_sp_ereports_by_slot ON omicron.public.sp_ereport ( +CREATE INDEX IF NOT EXISTS lookup_sp_ereports_by_slot +ON omicron.public.sp_ereport ( sp_type, sp_slot, time_collected ); +CREATE INDEX IF NOT EXISTS order_sp_ereports_by_timestamp +ON omicron.public.sp_ereport +USING BTREE ( + time_collected +); + -- Ereports from the host operating system CREATE TABLE IF NOT EXISTS omicron.public.host_ereport ( restart_id UUID NOT NULL, @@ -5816,6 +5823,12 @@ CREATE INDEX IF NOT EXISTS lookup_host_ereports_by_sled ON omicron.public.host_e time_collected ); +CREATE INDEX IF NOT EXISTS order_host_ereports_by_timestamp +ON omicron.public.host_ereport +USING BTREE ( + time_collected +); + /* * Keep this at the end of file so that the database does not contain a version * until it is fully populated. 
From 2fc3ed73f96617f58bdb409c4e360877b6b08f8d Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 5 Jun 2025 16:29:17 -0700 Subject: [PATCH 16/61] give one of fake the switches an ereport so the table is less boring --- gateway-test-utils/configs/sp_sim_config.test.toml | 11 +++++++++++ schema/crdb/dbinit.sql | 4 ++-- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/gateway-test-utils/configs/sp_sim_config.test.toml b/gateway-test-utils/configs/sp_sim_config.test.toml index 069b7602fba..1e8e18a6e00 100644 --- a/gateway-test-utils/configs/sp_sim_config.test.toml +++ b/gateway-test-utils/configs/sp_sim_config.test.toml @@ -48,6 +48,17 @@ sensors = [ { name = "South", kind = "Temperature", last_error.value = "DeviceError", last_error.timestamp = 1234 }, ] +[simulated_sps.sidecar.ereport_config] +restart_id = "0d3e464a-666e-4687-976f-90e31238be8b" + +[[simulated_sps.sidecar.ereport_config.ereports]] +task_name = "task_thermal_server" +task_gen = 1 +uptime = 1235 +class = "oxide.sidecar.thermal.sensor_read_error" +sensor = { id = "dev-1", device = "fake-tmp-sensor", location = "South", presence = "Failed" } +error = "DeviceError" + [[simulated_sps.sidecar]] serial_number = "SimSidecar1" manufacturing_root_cert_seed = "01de01de01de01de01de01de01de01de01de01de01de01de01de01de01de01de" diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 05f2ccc7870..afeaa381f23 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -5776,8 +5776,8 @@ CREATE TABLE IF NOT EXISTS omicron.public.sp_ereport ( -- may be generated in a state where the SP doesn't know who or what it is. -- consider that "i don't know my own identity" is a reasonable condition to -- might want to generate an ereport about! 
- serial_number STRING(11), - part_number STRING(11), + serial_number STRING, + part_number STRING, -- JSON representation of the ereport report JSONB NOT NULL, From a11b98101070755890aeba5f42c2f0f60c32b8aa Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 6 Jun 2025 11:14:33 -0700 Subject: [PATCH 17/61] demo checkpoint --- dev-tools/omdb/src/bin/omdb/db/ereport.rs | 119 ++++++++++++++++++++-- 1 file changed, 113 insertions(+), 6 deletions(-) diff --git a/dev-tools/omdb/src/bin/omdb/db/ereport.rs b/dev-tools/omdb/src/bin/omdb/db/ereport.rs index c8f00592b55..75485318e1b 100644 --- a/dev-tools/omdb/src/bin/omdb/db/ereport.rs +++ b/dev-tools/omdb/src/bin/omdb/db/ereport.rs @@ -43,6 +43,9 @@ enum Commands { /// Show an ereport #[clap(alias = "show")] Info(InfoArgs), + + /// List ereport reporters + Reporters(ReportersArgs), } #[derive(Debug, Args, Clone)] @@ -66,6 +69,17 @@ struct ListArgs { after: Option>, } +#[derive(Debug, Args, Clone)] +struct ReportersArgs { + #[clap(long = "type", short = 't')] + slot_type: Option, + + #[clap(long = "slot", short = 's', requires = "slot_type")] + slot: Option, + + serial: Option, +} + pub(super) async fn cmd_db_ereport( opctx: &OpContext, datastore: &DataStore, @@ -79,6 +93,10 @@ pub(super) async fn cmd_db_ereport( Commands::Info(ref args) => { cmd_db_ereport_info(opctx, datastore, args).await } + + Commands::Reporters(ref args) => { + cmd_db_ereporters(datastore, args).await + } } } @@ -121,16 +139,11 @@ async fn cmd_db_ereport_list( sp_slot, .. 
} = ereport; - let src = match sp_type { - SpType::Power => SrcType::Power, - SpType::Sled => SrcType::SledSp, - SpType::Switch => SrcType::Switch, - }; EreportRow { time_collected, restart_id: restart_id.into_untyped_uuid(), ena: ena.into(), - src, + src: sp_type.into(), slot: sp_slot.0.into(), } } @@ -212,6 +225,16 @@ enum SrcType { // SledOS, } +impl From for SrcType { + fn from(sp_type: SpType) -> Self { + match sp_type { + SpType::Power => SrcType::Power, + SpType::Sled => SrcType::SledSp, + SpType::Switch => SrcType::Switch, + } + } +} + async fn cmd_db_ereport_info( opctx: &OpContext, datastore: &DataStore, @@ -272,3 +295,87 @@ async fn cmd_db_ereport_info( Ok(()) } + +async fn cmd_db_ereporters( + datastore: &DataStore, + args: &ReportersArgs, +) -> anyhow::Result<()> { + let &ReportersArgs { slot, slot_type, ref serial } = args; + + let conn = datastore.pool_connection_for_tests().await?; + let mut sp_query = sp_dsl::sp_ereport + .select(( + sp_dsl::restart_id, + sp_dsl::sp_slot, + sp_dsl::sp_type, + sp_dsl::serial_number, + sp_dsl::part_number, + )) + .order_by(( + sp_dsl::sp_type, + sp_dsl::sp_slot, + sp_dsl::serial_number, + sp_dsl::restart_id, + )) + .distinct() + .into_boxed(); + + if let Some(slot) = slot { + if slot_type.is_some() { + sp_query = sp_query + .filter(sp_dsl::sp_slot.eq(db::model::SqlU16::new(slot))); + } else { + anyhow::bail!( + "cannot filter reporters by slot without a value for `--type`" + ) + } + } + + if let Some(slot_type) = slot_type { + sp_query = sp_query + .filter(sp_dsl::sp_type.eq(SpType::from(slot_type.clone()))); + } + + if let Some(serial) = serial { + sp_query = sp_query.filter(sp_dsl::serial_number.eq(serial.clone())); + } + + #[derive(Tabled)] + #[tabled(rename_all = "SCREAMING_SNAKE_CASE")] + struct ReporterRow { + #[tabled(rename = "TYPE")] + ty: SrcType, + #[tabled(rename = "#")] + slot: u16, + #[tabled(display_with = "display_option_blank", rename = "S/N")] + serial: Option, + #[tabled(display_with = 
"display_option_blank", rename = "P/N")] + part_number: Option, + id: Uuid, + } + + let sp_ereports = sp_query + .load_async::<(Uuid, db::model::SqlU16, SpType, Option, Option)>( + &*conn, + ) + .await + .context("listing SP reporter entries")?; + + let mut table = tabled::Table::new(sp_ereports.into_iter().map( + |(restart_id, slot, sp_type, serial, part_number)| ReporterRow { + ty: sp_type.into(), + slot: slot.into(), + serial, + part_number, + id: restart_id, + }, + )); + + table + .with(tabled::settings::Style::empty()) + .with(tabled::settings::Padding::new(0, 1, 0, 0)); + + println!("{table}"); + + Ok(()) +} From cb70e51a3d1d95bc69463d2bedd6cbdbf55758cd Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 6 Jun 2025 11:25:53 -0700 Subject: [PATCH 18/61] add count to ereport list --- dev-tools/omdb/src/bin/omdb/db/ereport.rs | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/dev-tools/omdb/src/bin/omdb/db/ereport.rs b/dev-tools/omdb/src/bin/omdb/db/ereport.rs index 75485318e1b..d9bade20120 100644 --- a/dev-tools/omdb/src/bin/omdb/db/ereport.rs +++ b/dev-tools/omdb/src/bin/omdb/db/ereport.rs @@ -352,6 +352,7 @@ async fn cmd_db_ereporters( #[tabled(display_with = "display_option_blank", rename = "P/N")] part_number: Option, id: Uuid, + ereports: i64, } let sp_ereports = sp_query @@ -360,16 +361,24 @@ async fn cmd_db_ereporters( ) .await .context("listing SP reporter entries")?; - - let mut table = tabled::Table::new(sp_ereports.into_iter().map( - |(restart_id, slot, sp_type, serial, part_number)| ReporterRow { + let mut rows = Vec::with_capacity(sp_ereports.len()); + + for (restart_id, slot, sp_type, serial, part_number) in sp_ereports { + let ereports = sp_dsl::sp_ereport + .filter(sp_dsl::restart_id.eq(restart_id)) + .count() + .get_result_async(&*conn) + .await?; + rows.push(ReporterRow { ty: sp_type.into(), slot: slot.into(), serial, part_number, id: restart_id, - }, - )); + ereports, + }); + } + let mut table = 
tabled::Table::new(rows.into_iter()); table .with(tabled::settings::Style::empty()) From 2775725e66ead35d442ee4b86dafbcb86b4545b5 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 6 Jun 2025 14:21:53 -0700 Subject: [PATCH 19/61] start omdb status command --- dev-tools/omdb/src/bin/omdb/nexus.rs | 92 +++++++++++++++++++++++++++- 1 file changed, 90 insertions(+), 2 deletions(-) diff --git a/dev-tools/omdb/src/bin/omdb/nexus.rs b/dev-tools/omdb/src/bin/omdb/nexus.rs index 50b42b6392a..5c2a4aa600e 100644 --- a/dev-tools/omdb/src/bin/omdb/nexus.rs +++ b/dev-tools/omdb/src/bin/omdb/nexus.rs @@ -47,6 +47,7 @@ use nexus_types::deployment::OximeterReadMode; use nexus_types::deployment::OximeterReadPolicy; use nexus_types::internal_api::background::AbandonedVmmReaperStatus; use nexus_types::internal_api::background::BlueprintRendezvousStatus; +use nexus_types::internal_api::background::EreporterStatus; use nexus_types::internal_api::background::InstanceReincarnationStatus; use nexus_types::internal_api::background::InstanceUpdaterStatus; use nexus_types::internal_api::background::LookupRegionPortStatus; @@ -57,6 +58,7 @@ use nexus_types::internal_api::background::RegionSnapshotReplacementFinishStatus use nexus_types::internal_api::background::RegionSnapshotReplacementGarbageCollectStatus; use nexus_types::internal_api::background::RegionSnapshotReplacementStartStatus; use nexus_types::internal_api::background::RegionSnapshotReplacementStepStatus; +use nexus_types::internal_api::background::SpEreportIngesterStatus; use nexus_types::internal_api::background::SupportBundleCleanupReport; use nexus_types::internal_api::background::SupportBundleCollectionReport; use nexus_types::internal_api::background::TufArtifactReplicationCounters; @@ -1122,6 +1124,9 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) { "service_firewall_rule_propagation" => { print_task_service_firewall_rule_propagation(details); } + "sp_ereport_ingester" => { + 
print_task_sp_ereport_ingester(details); + } "support_bundle_collector" => { print_task_support_bundle_collector(details); } @@ -2693,7 +2698,7 @@ fn print_task_webhook_deliverator(details: &serde_json::Value) { if n_internal_errors > 0 { total_errors += n_internal_errors; println!( - "/!\\ {ERRORS:NUM_WIDTH$}", + "{ERRICON} {ERRORS:NUM_WIDTH$}", n_internal_errors, ); if let Some(error) = error { @@ -2720,8 +2725,91 @@ fn print_task_webhook_deliverator(details: &serde_json::Value) { ); } +fn print_task_sp_ereport_ingester(details: &serde_json::Value) { + use nexus_types::internal_api::background::SpEreportIngesterStatus; + use nexus_types::internal_api::background::SpEreporterStatus; + + let SpEreportIngesterStatus { sps, error } = + match serde_json::from_value(details.clone()) { + Err(error) => { + eprintln!( + "warning: failed to interpret task details: {:?}: {:?}", + error, details + ); + return; + } + Ok(status) => status, + }; + + if let Some(error) = error { + println!("{ERRICON} task activation failed: {error}"); + } + + print_ereporter_status_totals(sps.iter().map(|sp| &sp.status)); +} + +fn print_ereporter_status_totals<'status>( + statuses: impl Iterator, +) { + let mut total_received = 0; + let mut total_new = 0; + let mut total_reqs = 0; + let mut total_errors = 0; + let mut reporters_with_ereports = 0; + let mut reporters_with_errors = 0; + + for &EreporterStatus { + ereports_received, + new_ereports, + requests, + ref errors, + } in statuses + { + total_received += ereports_received; + total_new += new_ereports; + total_reqs += requests; + total_errors += errors.len(); + if total_received > 0 { + reporters_with_ereports += 1; + } + if total_errors > 0 { + reporters_with_errors += 1; + } + } + + const EREPORTS_RECEIVED: &str = "total ereports received"; + const NEW_EREPORTS: &str = " new ereports ingested"; + const HTTP_REQUESTS: &str = "total HTTP requests sent"; + const ERRORS: &str = " total collection errors"; + const REPORTERS_WITH_EREPORTS: &str 
= "reporters with ereports"; + const REPORTERS_WITH_ERRORS: &str = "reporters with collection errors"; + const WIDTH: usize = const_max_len(&[ + EREPORTS_RECEIVED, + NEW_EREPORTS, + HTTP_REQUESTS, + ERRORS, + REPORTERS_WITH_EREPORTS, + ]) + 1; + const NUM_WIDTH: usize = 4; + + println!(" {EREPORTS_RECEIVED:NUM_WIDTH$}"); + println!(" {NEW_EREPORTS:NUM_WIDTH$}"); + println!(" {HTTP_REQUESTS:NUM_WIDTH$}"); + println!(" {ERRORS:NUM_WIDTH$}"); + println!( + " {REPORTERS_WITH_EREPORTS:NUM_WIDTH$}" + ); + println!( + " {REPORTERS_WITH_ERRORS:NUM_WIDTH$}" + ); +} + +const ERRICON: &str = "/!\\"; + fn warn_if_nonzero(n: usize) -> &'static str { - if n > 0 { "/!\\" } else { " " } + if n > 0 { ERRICON } else { " " } } /// Summarizes an `ActivationReason` From 094c34f64c043208730bd24b0180ad6e454ad7c4 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 6 Jun 2025 16:29:58 -0700 Subject: [PATCH 20/61] finish up status command --- dev-tools/omdb/src/bin/omdb/nexus.rs | 48 +++++++++++++++++++++++---- dev-tools/omdb/tests/env.out | 12 +++++++ dev-tools/omdb/tests/successes.out | 18 ++++++++++ dev-tools/omdb/tests/usage_errors.out | 2 ++ 4 files changed, 73 insertions(+), 7 deletions(-) diff --git a/dev-tools/omdb/src/bin/omdb/nexus.rs b/dev-tools/omdb/src/bin/omdb/nexus.rs index 5c2a4aa600e..dfd4e029b07 100644 --- a/dev-tools/omdb/src/bin/omdb/nexus.rs +++ b/dev-tools/omdb/src/bin/omdb/nexus.rs @@ -58,7 +58,6 @@ use nexus_types::internal_api::background::RegionSnapshotReplacementFinishStatus use nexus_types::internal_api::background::RegionSnapshotReplacementGarbageCollectStatus; use nexus_types::internal_api::background::RegionSnapshotReplacementStartStatus; use nexus_types::internal_api::background::RegionSnapshotReplacementStepStatus; -use nexus_types::internal_api::background::SpEreportIngesterStatus; use nexus_types::internal_api::background::SupportBundleCleanupReport; use nexus_types::internal_api::background::SupportBundleCollectionReport; use 
nexus_types::internal_api::background::TufArtifactReplicationCounters; @@ -2746,6 +2745,41 @@ fn print_task_sp_ereport_ingester(details: &serde_json::Value) { } print_ereporter_status_totals(sps.iter().map(|sp| &sp.status)); + + const NEW_EREPORTS: &str = "new ereports ingested:"; + const HTTP_REQUESTS: &str = "HTTP requests sent:"; + const ERRORS: &str = "errors:"; + const WIDTH: usize = + const_max_len(&[NEW_EREPORTS, HTTP_REQUESTS, ERRORS]) + 1; + const NUM_WIDTH: usize = 3; + + if !sps.is_empty() { + println!("\n Service processors:"); + for SpEreporterStatus { sp_type, slot, status } in &sps { + println!( + " - {sp_type:<6} {slot:02}: {:>NUM_WIDTH$} ereports", + status.ereports_received + ); + println!( + " {NEW_EREPORTS:NUM_WIDTH$}", + status.new_ereports + ); + println!( + " {HTTP_REQUESTS:NUM_WIDTH$}", + status.requests + ); + + if !status.errors.is_empty() { + println!( + "{ERRICON} {ERRORS:NUM_WIDTH$}", + status.errors.len() + ); + for error in &status.errors { + println!(" - {error}"); + } + } + } + } } fn print_ereporter_status_totals<'status>( @@ -2777,12 +2811,12 @@ fn print_ereporter_status_totals<'status>( } } - const EREPORTS_RECEIVED: &str = "total ereports received"; - const NEW_EREPORTS: &str = " new ereports ingested"; - const HTTP_REQUESTS: &str = "total HTTP requests sent"; - const ERRORS: &str = " total collection errors"; - const REPORTERS_WITH_EREPORTS: &str = "reporters with ereports"; - const REPORTERS_WITH_ERRORS: &str = "reporters with collection errors"; + const EREPORTS_RECEIVED: &str = "total ereports received:"; + const NEW_EREPORTS: &str = " new ereports ingested:"; + const HTTP_REQUESTS: &str = "total HTTP requests sent:"; + const ERRORS: &str = " total collection errors:"; + const REPORTERS_WITH_EREPORTS: &str = "reporters with ereports:"; + const REPORTERS_WITH_ERRORS: &str = "reporters with collection errors:"; const WIDTH: usize = const_max_len(&[ EREPORTS_RECEIVED, NEW_EREPORTS, diff --git a/dev-tools/omdb/tests/env.out 
b/dev-tools/omdb/tests/env.out index 81bfa368cb8..bf4cc170120 100644 --- a/dev-tools/omdb/tests/env.out +++ b/dev-tools/omdb/tests/env.out @@ -175,6 +175,10 @@ task: "service_zone_nat_tracker" ensures service zone nat records are recorded in NAT RPW table +task: "sp_ereport_ingester" + collects error reports from service processors + + task: "support_bundle_collector" Manage support bundle collection and cleanup @@ -371,6 +375,10 @@ task: "service_zone_nat_tracker" ensures service zone nat records are recorded in NAT RPW table +task: "sp_ereport_ingester" + collects error reports from service processors + + task: "support_bundle_collector" Manage support bundle collection and cleanup @@ -554,6 +562,10 @@ task: "service_zone_nat_tracker" ensures service zone nat records are recorded in NAT RPW table +task: "sp_ereport_ingester" + collects error reports from service processors + + task: "support_bundle_collector" Manage support bundle collection and cleanup diff --git a/dev-tools/omdb/tests/successes.out b/dev-tools/omdb/tests/successes.out index b1d4851bebb..b41e5733986 100644 --- a/dev-tools/omdb/tests/successes.out +++ b/dev-tools/omdb/tests/successes.out @@ -387,6 +387,10 @@ task: "service_zone_nat_tracker" ensures service zone nat records are recorded in NAT RPW table +task: "sp_ereport_ingester" + collects error reports from service processors + + task: "support_bundle_collector" Manage support bundle collection and cleanup @@ -735,6 +739,13 @@ task: "service_zone_nat_tracker" started at (s ago) and ran for ms last completion reported error: inventory collection is None +task: "sp_ereport_ingester" + configured period: every s + currently executing: no + last completed activation: , triggered by a periodic timer firing + started at (s ago) and ran for ms + last completion reported error: looking up MGS addresses + task: "support_bundle_collector" configured period: every days h m s currently executing: no @@ -1254,6 +1265,13 @@ task: "service_zone_nat_tracker" 
started at (s ago) and ran for ms last completion reported error: inventory collection is None +task: "sp_ereport_ingester" + configured period: every s + currently executing: no + last completed activation: , triggered by a periodic timer firing + started at (s ago) and ran for ms + last completion reported error: looking up MGS addresses + task: "support_bundle_collector" configured period: every days h m s currently executing: no diff --git a/dev-tools/omdb/tests/usage_errors.out b/dev-tools/omdb/tests/usage_errors.out index 97187d11e80..338bffda78c 100644 --- a/dev-tools/omdb/tests/usage_errors.out +++ b/dev-tools/omdb/tests/usage_errors.out @@ -119,6 +119,7 @@ Commands: rack Print information about the rack disks Print information about virtual disks dns Print information about internal and external DNS + ereport Query and display error reports inventory Print information about collected hardware/software inventory physical-disks Print information about physical disks region Print information about regions @@ -176,6 +177,7 @@ Commands: rack Print information about the rack disks Print information about virtual disks dns Print information about internal and external DNS + ereport Query and display error reports inventory Print information about collected hardware/software inventory physical-disks Print information about physical disks region Print information about regions From 93fb64d9eb2b2248098c156ee3fe825302c27594 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 9 Jun 2025 10:33:42 -0700 Subject: [PATCH 21/61] include whole error cause chain in statuses --- dev-tools/omdb/tests/successes.out | 4 ++-- nexus/src/app/background/tasks/ereport_ingester.rs | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/dev-tools/omdb/tests/successes.out b/dev-tools/omdb/tests/successes.out index b41e5733986..802e736ceb9 100644 --- a/dev-tools/omdb/tests/successes.out +++ b/dev-tools/omdb/tests/successes.out @@ -744,7 +744,7 @@ task: 
"sp_ereport_ingester" currently executing: no last completed activation: , triggered by a periodic timer firing started at (s ago) and ran for ms - last completion reported error: looking up MGS addresses + last completion reported error: looking up MGS addresses: no record found for Query { name: Name("_mgs._tcp.control-plane.oxide.internal."), query_type: SRV, query_class: IN } task: "support_bundle_collector" configured period: every days h m s @@ -1270,7 +1270,7 @@ task: "sp_ereport_ingester" currently executing: no last completed activation: , triggered by a periodic timer firing started at (s ago) and ran for ms - last completion reported error: looking up MGS addresses + last completion reported error: looking up MGS addresses: no record found for Query { name: Name("_mgs._tcp.control-plane.oxide.internal."), query_type: SRV, query_class: IN } task: "support_bundle_collector" configured period: every days h m s diff --git a/nexus/src/app/background/tasks/ereport_ingester.rs b/nexus/src/app/background/tasks/ereport_ingester.rs index e1e75c83dc6..c2951f6ad76 100644 --- a/nexus/src/app/background/tasks/ereport_ingester.rs +++ b/nexus/src/app/background/tasks/ereport_ingester.rs @@ -45,7 +45,7 @@ impl BackgroundTask for SpEreportIngester { let status = self.actually_activate(opctx).await.unwrap_or_else(|error| { SpEreportIngesterStatus { - error: Some(error.to_string()), + error: Some(format!("{error:#}")), ..Default::default() } }); @@ -161,7 +161,7 @@ impl Ingester { Err(error) => { return Some(EreporterStatus { errors: vec![format!( - "failed to query for latest ereport: {error}" + "failed to query for latest ereport: {error:#}" )], ..Default::default() }); @@ -245,7 +245,7 @@ impl Ingester { "error" => %e, ); status.errors.push(format!( - "failed to insert {received} ereports: {e}" + "failed to insert {received} ereports: {e:#}" )); break; } @@ -321,7 +321,7 @@ impl Ingester { Err(e) => { let stats = status.get_or_insert_default(); stats.requests += 1; - 
stats.errors.push(format!("MGS {addr}: {e}")); + stats.errors.push(format!("MGS {addr}: {e:#}")); } } } From 56ee5f3e207c01a36b5243a23fa123fbb3f88141 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 9 Jun 2025 11:13:01 -0700 Subject: [PATCH 22/61] make ereports soft-deleteable --- dev-tools/omdb/src/bin/omdb/db.rs | 2 +- dev-tools/omdb/src/bin/omdb/db/ereport.rs | 73 +++++++++++++++++-- nexus/db-model/src/ereport.rs | 7 ++ nexus/db-queries/src/db/datastore/ereport.rs | 2 + nexus/db-schema/src/schema.rs | 2 + .../app/background/tasks/ereport_ingester.rs | 1 + schema/crdb/dbinit.sql | 10 ++- 7 files changed, 87 insertions(+), 10 deletions(-) diff --git a/dev-tools/omdb/src/bin/omdb/db.rs b/dev-tools/omdb/src/bin/omdb/db.rs index ba589044ee4..93c427d65df 100644 --- a/dev-tools/omdb/src/bin/omdb/db.rs +++ b/dev-tools/omdb/src/bin/omdb/db.rs @@ -1500,7 +1500,7 @@ impl DbArgs { ).await }, DbCommands::Ereport(args) => { - cmd_db_ereport(&opctx, &datastore, &fetch_opts, &args).await + cmd_db_ereport(&datastore, &fetch_opts, &args).await } } } diff --git a/dev-tools/omdb/src/bin/omdb/db/ereport.rs b/dev-tools/omdb/src/bin/omdb/db/ereport.rs index d9bade20120..6d07a7f293b 100644 --- a/dev-tools/omdb/src/bin/omdb/db/ereport.rs +++ b/dev-tools/omdb/src/bin/omdb/db/ereport.rs @@ -19,10 +19,11 @@ use clap::Subcommand; use diesel::prelude::*; use ereport_types::Ena; use ereport_types::EreporterRestartUuid; +use nexus_db_lookup::DbConnection; use nexus_db_model::SpType; -use nexus_db_queries::context::OpContext; use nexus_db_queries::db; use nexus_db_queries::db::DataStore; +use nexus_db_schema::schema::host_ereport::dsl as host_dsl; use nexus_db_schema::schema::sp_ereport::dsl as sp_dsl; use omicron_uuid_kinds::GenericUuid; use tabled::Tabled; @@ -81,17 +82,16 @@ struct ReportersArgs { } pub(super) async fn cmd_db_ereport( - opctx: &OpContext, datastore: &DataStore, fetch_opts: &DbFetchOptions, args: &EreportArgs, ) -> anyhow::Result<()> { match args.command { 
Commands::List(ref args) => { - cmd_db_ereport_list(opctx, datastore, fetch_opts, args).await + cmd_db_ereport_list(datastore, fetch_opts, args).await } Commands::Info(ref args) => { - cmd_db_ereport_info(opctx, datastore, args).await + cmd_db_ereport_info(datastore, fetch_opts, args).await } Commands::Reporters(ref args) => { @@ -101,7 +101,6 @@ pub(super) async fn cmd_db_ereport( } async fn cmd_db_ereport_list( - _: &OpContext, datastore: &DataStore, fetch_opts: &DbFetchOptions, args: &ListArgs, @@ -193,6 +192,10 @@ async fn cmd_db_ereport_list( query = query.filter(sp_dsl::time_collected.gt(after)); } + if !fetch_opts.include_deleted { + query = query.filter(sp_dsl::time_deleted.is_null()); + } + let sp_ereports = query.load_async(&*conn).await.with_context(ctx)?; check_limit(&sp_ereports, fetch_opts.fetch_limit, ctx); @@ -236,23 +239,26 @@ impl From for SrcType { } async fn cmd_db_ereport_info( - opctx: &OpContext, datastore: &DataStore, + fetch_opts: &DbFetchOptions, args: &InfoArgs, ) -> anyhow::Result<()> { let &InfoArgs { restart_id, ena } = args; let ereport_id = ereport_types::EreportId { restart_id, ena }; + let conn = datastore.pool_connection_for_tests().await?; let db::model::Ereport { id, metadata, reporter, report } = - datastore.ereport_fetch(opctx, ereport_id).await?; + ereport_fetch(&*conn, fetch_opts, ereport_id).await?; let db::model::EreportMetadata { time_collected, + time_deleted, collector_id, part_number, serial_number, } = metadata; const ENA: &str = "ENA"; const TIME_COLLECTED: &str = "collected at"; + const TIME_DELETED: &str = "deleted at"; const COLLECTOR_ID: &str = "collected by"; const REPORTER: &str = "reported by"; const RESTART_ID: &str = "restart ID"; @@ -260,6 +266,7 @@ async fn cmd_db_ereport_info( const SERIAL_NUMBER: &str = " serial number"; const WIDTH: usize = const_max_len(&[ TIME_COLLECTED, + TIME_DELETED, COLLECTOR_ID, REPORTER, PART_NUMBER, @@ -267,6 +274,9 @@ async fn cmd_db_ereport_info( ]); println!("\n{:=<80}", "== 
EREPORT METADATA "); println!(" {ENA:>WIDTH$}: {}", id.ena); + if let Some(time_deleted) = time_deleted { + println!("(i) {TIME_DELETED:>WIDTH$}: {time_deleted}"); + } println!(" {TIME_COLLECTED:>WIDTH$}: {time_collected}"); println!(" {COLLECTOR_ID:>WIDTH$}: {collector_id}"); match reporter { @@ -296,6 +306,55 @@ async fn cmd_db_ereport_info( Ok(()) } +async fn ereport_fetch( + conn: &async_bb8_diesel::Connection, + fetch_opts: &DbFetchOptions, + id: ereport_types::EreportId, +) -> anyhow::Result { + let restart_id = id.restart_id.into_untyped_uuid(); + let ena = db::model::DbEna::from(id.ena); + + let sp_query = sp_dsl::sp_ereport + .filter(sp_dsl::restart_id.eq(restart_id.clone())) + .filter(sp_dsl::ena.eq(ena.clone())) + .filter(sp_dsl::time_deleted.is_null()) + .select(db::model::SpEreport::as_select()); + let sp_result = if !fetch_opts.include_deleted { + sp_query + .filter(sp_dsl::time_deleted.is_null()) + .first_async(&*conn) + .await + } else { + sp_query.first_async(&*conn).await + }; + if let Some(report) = sp_result.optional().with_context(|| { + format!("failed to query for SP ereport matching {id}") + })? { + return Ok(report.into()); + } + + let host_query = host_dsl::host_ereport + .filter(host_dsl::restart_id.eq(restart_id)) + .filter(host_dsl::ena.eq(ena)) + .filter(host_dsl::time_deleted.is_null()) + .select(db::model::HostEreport::as_select()); + let host_result = if !fetch_opts.include_deleted { + host_query + .filter(host_dsl::time_deleted.is_null()) + .first_async(&*conn) + .await + } else { + host_query.first_async(&*conn).await + }; + if let Some(report) = host_result.optional().with_context(|| { + format!("failed to query for host OS ereport matching {id}") + })? 
{ + return Ok(report.into()); + } + + Err(anyhow::anyhow!("no ereport {id} found")) +} + async fn cmd_db_ereporters( datastore: &DataStore, args: &ReportersArgs, diff --git a/nexus/db-model/src/ereport.rs b/nexus/db-model/src/ereport.rs index dc640c36cef..109450dbd60 100644 --- a/nexus/db-model/src/ereport.rs +++ b/nexus/db-model/src/ereport.rs @@ -76,6 +76,7 @@ impl From for Ereport { restart_id, ena, time_collected, + time_deleted, collector_id, part_number, serial_number, @@ -87,6 +88,7 @@ impl From for Ereport { id: EreportId { restart_id: restart_id.into(), ena: ena.into() }, metadata: EreportMetadata { time_collected, + time_deleted, collector_id: collector_id.into(), part_number, serial_number, @@ -103,6 +105,7 @@ impl From for Ereport { restart_id, ena, time_collected, + time_deleted, collector_id, sled_serial, sled_id, @@ -112,6 +115,7 @@ impl From for Ereport { id: EreportId { restart_id: restart_id.into(), ena: ena.into() }, metadata: EreportMetadata { time_collected, + time_deleted, collector_id: collector_id.into(), part_number: None, // TODO serial_number: Some(sled_serial), @@ -125,6 +129,7 @@ impl From for Ereport { #[derive(Clone, Debug)] pub struct EreportMetadata { pub time_collected: DateTime, + pub time_deleted: Option>, pub collector_id: OmicronZoneUuid, pub part_number: Option, pub serial_number: Option, @@ -141,6 +146,7 @@ pub enum Reporter { pub struct SpEreport { pub restart_id: DbTypedUuid, pub ena: DbEna, + pub time_deleted: Option>, pub time_collected: DateTime, pub collector_id: DbTypedUuid, @@ -182,6 +188,7 @@ pub struct SpEreport { pub struct HostEreport { pub restart_id: DbTypedUuid, pub ena: DbEna, + pub time_deleted: Option>, pub time_collected: DateTime, pub collector_id: DbTypedUuid, diff --git a/nexus/db-queries/src/db/datastore/ereport.rs b/nexus/db-queries/src/db/datastore/ereport.rs index 321cc801ed5..f1534dbd120 100644 --- a/nexus/db-queries/src/db/datastore/ereport.rs +++ b/nexus/db-queries/src/db/datastore/ereport.rs @@ 
-61,6 +61,7 @@ impl DataStore { if let Some(report) = sp_dsl::sp_ereport .filter(sp_dsl::restart_id.eq(restart_id)) .filter(sp_dsl::ena.eq(ena)) + .filter(sp_dsl::time_deleted.is_null()) .select(SpEreport::as_select()) .first_async(&*conn) .await @@ -73,6 +74,7 @@ impl DataStore { if let Some(report) = host_dsl::host_ereport .filter(host_dsl::restart_id.eq(restart_id)) .filter(host_dsl::ena.eq(ena)) + .filter(host_dsl::time_deleted.is_null()) .select(HostEreport::as_select()) .first_async(&*conn) .await diff --git a/nexus/db-schema/src/schema.rs b/nexus/db-schema/src/schema.rs index 6b86ddc357e..bd997d1bf14 100644 --- a/nexus/db-schema/src/schema.rs +++ b/nexus/db-schema/src/schema.rs @@ -2447,6 +2447,7 @@ table! { sp_ereport (restart_id, ena) { restart_id -> Uuid, ena -> Int8, + time_deleted -> Nullable, time_collected -> Timestamptz, collector_id -> Uuid, @@ -2464,6 +2465,7 @@ table! { host_ereport (restart_id, ena) { restart_id -> Uuid, ena -> Int8, + time_deleted -> Nullable, time_collected -> Timestamptz, collector_id -> Uuid, diff --git a/nexus/src/app/background/tasks/ereport_ingester.rs b/nexus/src/app/background/tasks/ereport_ingester.rs index c2951f6ad76..af8bd119139 100644 --- a/nexus/src/app/background/tasks/ereport_ingester.rs +++ b/nexus/src/app/background/tasks/ereport_ingester.rs @@ -213,6 +213,7 @@ impl Ingester { restart_id: restart_id.into(), ena: ereport.ena.into(), time_collected: Utc::now(), + time_deleted: None, collector_id: self.nexus_id.into(), sp_type: sp_type.into(), sp_slot: slot.into(), diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index afeaa381f23..949c07cf6ec 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -5757,6 +5757,7 @@ ON omicron.public.webhook_delivery_attempt ( CREATE TABLE IF NOT EXISTS omicron.public.sp_ereport ( restart_id UUID NOT NULL, ena INT8 NOT NULL, + time_deleted TIMESTAMPTZ, -- time at which the ereport was collected time_collected TIMESTAMPTZ NOT NULL, @@ -5796,12 +5797,15 @@ 
CREATE INDEX IF NOT EXISTS order_sp_ereports_by_timestamp ON omicron.public.sp_ereport USING BTREE ( time_collected -); +) +WHERE + time_deleted IS NULL; -- Ereports from the host operating system CREATE TABLE IF NOT EXISTS omicron.public.host_ereport ( restart_id UUID NOT NULL, ena INT8 NOT NULL, + time_deleted TIMESTAMPTZ, -- time at which the ereport was collected time_collected TIMESTAMPTZ NOT NULL, @@ -5827,7 +5831,9 @@ CREATE INDEX IF NOT EXISTS order_host_ereports_by_timestamp ON omicron.public.host_ereport USING BTREE ( time_collected -); +) +WHERE + time_deleted IS NULL; /* * Keep this at the end of file so that the database does not contain a version From b96e2e86ef515f8d43a6f3c3d65a1aeab2065d80 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 9 Jun 2025 16:25:52 -0700 Subject: [PATCH 23/61] stuff --- dev-tools/omdb/src/bin/omdb/db/ereport.rs | 111 +++++++++++++--------- dev-tools/omdb/src/bin/omdb/helpers.rs | 7 ++ 2 files changed, 74 insertions(+), 44 deletions(-) diff --git a/dev-tools/omdb/src/bin/omdb/db/ereport.rs b/dev-tools/omdb/src/bin/omdb/db/ereport.rs index 6d07a7f293b..9c4885e2f47 100644 --- a/dev-tools/omdb/src/bin/omdb/db/ereport.rs +++ b/dev-tools/omdb/src/bin/omdb/db/ereport.rs @@ -9,9 +9,12 @@ use super::check_limit; use crate::helpers::const_max_len; use crate::helpers::datetime_rfc3339_concise; use crate::helpers::display_option_blank; +use crate::helpers::display_option_error; use anyhow::Context; +use async_bb8_diesel::AsyncConnection; use async_bb8_diesel::AsyncRunQueryDsl; +use async_bb8_diesel::AsyncSimpleConnection; use chrono::DateTime; use chrono::Utc; use clap::Args; @@ -23,6 +26,7 @@ use nexus_db_lookup::DbConnection; use nexus_db_model::SpType; use nexus_db_queries::db; use nexus_db_queries::db::DataStore; +use nexus_db_queries::db::queries::ALLOW_FULL_TABLE_SCAN_SQL; use nexus_db_schema::schema::host_ereport::dsl as host_dsl; use nexus_db_schema::schema::sp_ereport::dsl as sp_dsl; use omicron_uuid_kinds::GenericUuid; 
@@ -362,42 +366,56 @@ async fn cmd_db_ereporters( let &ReportersArgs { slot, slot_type, ref serial } = args; let conn = datastore.pool_connection_for_tests().await?; - let mut sp_query = sp_dsl::sp_ereport - .select(( - sp_dsl::restart_id, - sp_dsl::sp_slot, - sp_dsl::sp_type, - sp_dsl::serial_number, - sp_dsl::part_number, - )) - .order_by(( - sp_dsl::sp_type, - sp_dsl::sp_slot, - sp_dsl::serial_number, - sp_dsl::restart_id, - )) - .distinct() - .into_boxed(); + let sp_ereporters = (*conn).transaction_async({ + let serial = serial.clone(); + async move |conn| { + // Selecting all reporters may require a full table scan, depending + // on filters. + conn.batch_execute_async(ALLOW_FULL_TABLE_SCAN_SQL).await?; + let mut sp_query = sp_dsl::sp_ereport + .select(( + sp_dsl::restart_id, + sp_dsl::sp_slot, + sp_dsl::sp_type, + sp_dsl::serial_number, + sp_dsl::part_number, + )) + .order_by(( + sp_dsl::sp_type, + sp_dsl::sp_slot, + sp_dsl::serial_number, + sp_dsl::restart_id, + )) + .distinct() + .into_boxed(); + + if let Some(slot) = slot { + if slot_type.is_some() { + sp_query = sp_query + .filter(sp_dsl::sp_slot.eq(db::model::SqlU16::new(slot))); + } else { + anyhow::bail!( + "cannot filter reporters by slot without a value for `--type`" + ) + } + } - if let Some(slot) = slot { - if slot_type.is_some() { - sp_query = sp_query - .filter(sp_dsl::sp_slot.eq(db::model::SqlU16::new(slot))); - } else { - anyhow::bail!( - "cannot filter reporters by slot without a value for `--type`" - ) - } - } + if let Some(slot_type) = slot_type { + sp_query = sp_query + .filter(sp_dsl::sp_type.eq(SpType::from(slot_type.clone()))); + } - if let Some(slot_type) = slot_type { - sp_query = sp_query - .filter(sp_dsl::sp_type.eq(SpType::from(slot_type.clone()))); - } + if let Some(serial) = serial { + sp_query = sp_query.filter(sp_dsl::serial_number.eq(serial.clone())); + } - if let Some(serial) = serial { - sp_query = sp_query.filter(sp_dsl::serial_number.eq(serial.clone())); - } + sp_query + 
.load_async::<(Uuid, db::model::SqlU16, SpType, Option, Option)>( + &conn, + ) + .await.context("listing SP reporter entries") + } + }).await?; #[derive(Tabled)] #[tabled(rename_all = "SCREAMING_SNAKE_CASE")] @@ -411,23 +429,28 @@ async fn cmd_db_ereporters( #[tabled(display_with = "display_option_blank", rename = "P/N")] part_number: Option, id: Uuid, - ereports: i64, + #[tabled(display_with = "display_option_error", rename = "P/N")] + ereports: Option, } - let sp_ereports = sp_query - .load_async::<(Uuid, db::model::SqlU16, SpType, Option, Option)>( - &*conn, - ) - .await - .context("listing SP reporter entries")?; - let mut rows = Vec::with_capacity(sp_ereports.len()); + let mut rows = Vec::with_capacity(sp_ereporters.len()); - for (restart_id, slot, sp_type, serial, part_number) in sp_ereports { - let ereports = sp_dsl::sp_ereport + for (restart_id, slot, sp_type, serial, part_number) in sp_ereporters { + let count_result = sp_dsl::sp_ereport .filter(sp_dsl::restart_id.eq(restart_id)) .count() .get_result_async(&*conn) - .await?; + .await; + let ereports = match count_result { + Ok(count) => Some(count), + Err(err) => { + eprintln!( + "WARN: could not count ereports for {restart_id} \ + ({sp_type} {slot}): {err}", + ); + None + } + }; rows.push(ReporterRow { ty: sp_type.into(), slot: slot.into(), diff --git a/dev-tools/omdb/src/bin/omdb/helpers.rs b/dev-tools/omdb/src/bin/omdb/helpers.rs index efa6f97fde3..d0a65f9d25f 100644 --- a/dev-tools/omdb/src/bin/omdb/helpers.rs +++ b/dev-tools/omdb/src/bin/omdb/helpers.rs @@ -45,6 +45,13 @@ pub(crate) fn display_option_blank( opt.as_ref().map(|x| x.to_string()).unwrap_or_else(|| "".to_string()) } +// Display the string "" for an Option if it's None. 
+pub(crate) fn display_option_error( + opt: &Option, +) -> String { + opt.as_ref().map(|x| x.to_string()).unwrap_or_else(|| "".to_string()) +} + // Format a `chrono::DateTime` in RFC3339 with milliseconds precision and using // `Z` rather than the UTC offset for UTC timestamps, to save a few characters // of line width in tabular output. From 69d6e8265d06e88e07bc644e5da2a21c48468629 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 13 Jun 2025 11:42:05 -0700 Subject: [PATCH 24/61] nicer reporter list query --- dev-tools/omdb/src/bin/omdb/db/ereport.rs | 64 +++++----- nexus/db-model/src/ereport.rs | 3 +- nexus/db-queries/src/db/datastore/ereport.rs | 119 +++++++++++++++++++ 3 files changed, 155 insertions(+), 31 deletions(-) diff --git a/dev-tools/omdb/src/bin/omdb/db/ereport.rs b/dev-tools/omdb/src/bin/omdb/db/ereport.rs index 9c4885e2f47..2d8ff6964b4 100644 --- a/dev-tools/omdb/src/bin/omdb/db/ereport.rs +++ b/dev-tools/omdb/src/bin/omdb/db/ereport.rs @@ -7,9 +7,9 @@ use super::DbFetchOptions; use super::check_limit; use crate::helpers::const_max_len; +use crate::helpers::datetime_opt_rfc3339_concise; use crate::helpers::datetime_rfc3339_concise; use crate::helpers::display_option_blank; -use crate::helpers::display_option_error; use anyhow::Context; use async_bb8_diesel::AsyncConnection; @@ -19,6 +19,7 @@ use chrono::DateTime; use chrono::Utc; use clap::Args; use clap::Subcommand; +use diesel::dsl::{count_distinct, min}; use diesel::prelude::*; use ereport_types::Ena; use ereport_types::EreporterRestartUuid; @@ -373,12 +374,21 @@ async fn cmd_db_ereporters( // on filters. 
conn.batch_execute_async(ALLOW_FULL_TABLE_SCAN_SQL).await?; let mut sp_query = sp_dsl::sp_ereport + .group_by(( + sp_dsl::restart_id, + sp_dsl::sp_slot, + sp_dsl::sp_type, + sp_dsl::serial_number, + sp_dsl::part_number + )) .select(( sp_dsl::restart_id, sp_dsl::sp_slot, sp_dsl::sp_type, sp_dsl::serial_number, sp_dsl::part_number, + min(sp_dsl::time_collected), + count_distinct(sp_dsl::ena), )) .order_by(( sp_dsl::sp_type, @@ -386,7 +396,6 @@ async fn cmd_db_ereporters( sp_dsl::serial_number, sp_dsl::restart_id, )) - .distinct() .into_boxed(); if let Some(slot) = slot { @@ -410,7 +419,7 @@ async fn cmd_db_ereporters( } sp_query - .load_async::<(Uuid, db::model::SqlU16, SpType, Option, Option)>( + .load_async::<(Uuid, db::model::SqlU16, SpType, Option, Option, Option>, i64)>( &conn, ) .await.context("listing SP reporter entries") @@ -420,6 +429,9 @@ async fn cmd_db_ereporters( #[derive(Tabled)] #[tabled(rename_all = "SCREAMING_SNAKE_CASE")] struct ReporterRow { + #[tabled(display_with = "datetime_opt_rfc3339_concise")] + first_seen: Option>, + id: Uuid, #[tabled(rename = "TYPE")] ty: SrcType, #[tabled(rename = "#")] @@ -428,38 +440,30 @@ async fn cmd_db_ereporters( serial: Option, #[tabled(display_with = "display_option_blank", rename = "P/N")] part_number: Option, - id: Uuid, - #[tabled(display_with = "display_option_error", rename = "P/N")] - ereports: Option, + ereports: i64, } - let mut rows = Vec::with_capacity(sp_ereporters.len()); - - for (restart_id, slot, sp_type, serial, part_number) in sp_ereporters { - let count_result = sp_dsl::sp_ereport - .filter(sp_dsl::restart_id.eq(restart_id)) - .count() - .get_result_async(&*conn) - .await; - let ereports = match count_result { - Ok(count) => Some(count), - Err(err) => { - eprintln!( - "WARN: could not count ereports for {restart_id} \ - ({sp_type} {slot}): {err}", - ); - None - } - }; - rows.push(ReporterRow { - ty: sp_type.into(), - slot: slot.into(), + let rows = sp_ereporters.into_iter().map( + |( + 
restart_id, + slot, + sp_type, serial, part_number, - id: restart_id, + first_seen, ereports, - }); - } + )| { + ReporterRow { + first_seen, + ty: sp_type.into(), + slot: slot.into(), + serial, + part_number, + id: restart_id, + ereports, + } + }, + ); let mut table = tabled::Table::new(rows.into_iter()); table diff --git a/nexus/db-model/src/ereport.rs b/nexus/db-model/src/ereport.rs index 109450dbd60..d9101b8989f 100644 --- a/nexus/db-model/src/ereport.rs +++ b/nexus/db-model/src/ereport.rs @@ -15,7 +15,8 @@ use diesel::sql_types; use ereport_types::{Ena, EreportId}; use nexus_db_schema::schema::{host_ereport, sp_ereport}; use omicron_uuid_kinds::{ - EreporterRestartKind, OmicronZoneKind, OmicronZoneUuid, SledKind, SledUuid, + EreporterRestartKind, EreporterRestartUuid, OmicronZoneKind, + OmicronZoneUuid, SledKind, SledUuid, }; use serde::{Deserialize, Serialize}; use std::convert::TryFrom; diff --git a/nexus/db-queries/src/db/datastore/ereport.rs b/nexus/db-queries/src/db/datastore/ereport.rs index f1534dbd120..1ac2ba57725 100644 --- a/nexus/db-queries/src/db/datastore/ereport.rs +++ b/nexus/db-queries/src/db/datastore/ereport.rs @@ -11,11 +11,15 @@ use crate::db::datastore::RunnableQuery; use crate::db::model::DbEna; use crate::db::model::Ereport; use crate::db::model::HostEreport; +use crate::db::model::Reporter; use crate::db::model::SpEreport; use crate::db::model::SpMgsSlot; use crate::db::model::SpType; use crate::db::model::SqlU16; +use crate::db::model::SqlU32; use async_bb8_diesel::AsyncRunQueryDsl; +use chrono::DateTime; +use chrono::Utc; use diesel::prelude::*; use nexus_db_errors::ErrorHandler; use nexus_db_errors::public_error_from_diesel; @@ -24,6 +28,7 @@ use nexus_db_schema::schema::host_ereport::dsl as host_dsl; use nexus_db_schema::schema::sp_ereport::dsl as sp_dsl; use omicron_common::api::external::CreateResult; use omicron_common::api::external::Error; +use omicron_common::api::external::ListResultVec; use 
omicron_common::api::external::LookupResult; use omicron_uuid_kinds::EreporterRestartUuid; use omicron_uuid_kinds::GenericUuid; @@ -32,6 +37,14 @@ use uuid::Uuid; type EreportIdTuple = (Uuid, DbEna); +#[derive(Clone, Debug)] +pub struct EreporterRestartBySerial { + pub id: EreporterRestartUuid, + pub first_seen_at: DateTime, + pub reporter: Reporter, + pub ereports: u32, +} + impl DataStore { // pub async fn sp_ereport_list_by_serial( // &self, @@ -87,6 +100,112 @@ impl DataStore { Err(Error::non_resourcetype_not_found(format!("ereport {id}"))) } + pub async fn ereporter_restart_list_by_serial( + &self, + opctx: &OpContext, + serial: String, + ) -> ListResultVec { + use diesel::dsl::{count_distinct, min}; + opctx.authorize(authz::Action::ListChildren, &authz::FLEET).await?; + + let conn = &*self.pool_connection_authorized(opctx).await?; + let sp_rows = sp_dsl::sp_ereport + .filter( + sp_dsl::serial_number + .eq(serial.clone()) + .and(sp_dsl::time_deleted.is_null()), + ) + .group_by((sp_dsl::restart_id, sp_dsl::sp_slot, sp_dsl::sp_type)) + .select(( + sp_dsl::restart_id, + sp_dsl::sp_type, + sp_dsl::sp_slot, + min(sp_dsl::time_collected), + count_distinct(sp_dsl::ena), + )) + .order_by(sp_dsl::restart_id) + .load_async::<(Uuid, SpType, SqlU16, Option>, SqlU32)>( + &*conn, + ) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; + let host_os_rows = host_dsl::host_ereport + .filter( + host_dsl::sled_serial + .eq(serial) + .and(host_dsl::time_deleted.is_null()), + ) + .group_by((host_dsl::restart_id, host_dsl::sled_id)) + .select(( + host_dsl::restart_id, + host_dsl::sled_id, + min(host_dsl::time_collected), + count_distinct(host_dsl::ena), + )) + .order_by(host_dsl::restart_id) + .load_async::<(Uuid, Uuid, Option>, SqlU32)>(&*conn) + .await + .map_err(|e| { + public_error_from_diesel(e, ErrorHandler::Server) + .internal_context("listing SP ereports") + })?; + + let sp_reporters = sp_rows.into_iter().filter_map(|(restart_id, sp_type, sp_slot, 
first_seen, ereports)| { + let ereports = ereports.into(); + let first_seen_at = match first_seen { + Some(t) => t, + None => { + debug_assert!(false, "ELIZA: why would min(time_collected) be none?"); + slog::warn!( + opctx.log, + "SP row returned by `ereporter_restart_list_by_serial` \ + had a `None` value for `min(time_collected)`"; + "restart_id" => ?restart_id, + "sp_type" => ?sp_type, + "sp_slot" => ?sp_slot, + "first_seen" => ?first_seen, + "count" => ?ereports, + ); + return None; + } + }; + + Some(EreporterRestartBySerial { + id: EreporterRestartUuid::from_untyped_uuid(restart_id), + reporter: Reporter::Sp { sp_type, slot: sp_slot.into() }, + first_seen_at, + ereports, + }) + }); + let host_reporters = host_os_rows.into_iter().filter_map(|(restart_id, sled_id, first_seen, ereports)| { + let ereports = ereports.into(); + let first_seen_at = match first_seen { + Some(t) => t, + None => { + debug_assert!(false, "ELIZA: why would min(time_collected) be none?"); + slog::warn!( + opctx.log, + "SP row returned by `ereporter_restart_list_by_serial` \ + had a `None` value for `min(time_collected)`"; + "restart_id" => ?restart_id, + "sled_id" => ?sled_id, + "first_seen" => ?first_seen, + "ereports" => ?ereports, + ); + return None; + } + }; + + Some(EreporterRestartBySerial { + id: EreporterRestartUuid::from_untyped_uuid(restart_id), + reporter: Reporter::HostOs { sled: SledUuid::from_untyped_uuid(sled_id) }, + first_seen_at, + ereports, + }) + }); + Ok(sp_reporters.chain(host_reporters).collect::>()) + } + pub async fn sp_latest_ereport_id( &self, opctx: &OpContext, From ce2858a41df82204b90f67973f68dcc05d08cee7 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 13 Jun 2025 15:54:12 -0700 Subject: [PATCH 25/61] start adding a test --- .../app/background/tasks/ereport_ingester.rs | 51 +++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/nexus/src/app/background/tasks/ereport_ingester.rs b/nexus/src/app/background/tasks/ereport_ingester.rs index 
af8bd119139..9dbec394f98 100644 --- a/nexus/src/app/background/tasks/ereport_ingester.rs +++ b/nexus/src/app/background/tasks/ereport_ingester.rs @@ -388,3 +388,54 @@ impl EreportQueryParams { } } } + +#[cfg(test)] +mod tests { + use super::*; + use nexus_test_utils_macros::nexus_test; + + type ControlPlaneTestContext = + nexus_test_utils::ControlPlaneTestContext; + + #[nexus_test(server = crate::Server)] + async fn test_sp_ereport_ingestion(cptestctx: &ControlPlaneTestContext) { + let nexus = &cptestctx.server.server_context().nexus; + let datastore = nexus.datastore(); + let opctx = OpContext::for_tests( + cptestctx.logctx.log.clone(), + datastore.clone(), + ); + let mut ingester = SpEreportIngester::new( + datastore.clone(), + nexus.internal_resolver.clone(), + nexus.id(), + ); + + let activation1 = ingester + .actually_activate(&opctx) + .await + .expect("activation should succeed"); + dbg!(&activation1); + assert_eq!( + activation1.error.as_ref(), + None, + "there should be no top-level activation error" + ); + assert_eq!( + activation1.sps.len(), + 4, + "ereports from 4 SPs should be observed: {:?}", + activation1.sps, + ); + + for SpEreporterStatus { sp_type, slot, status } in &activation1.sps { + assert_eq!( + &status.errors, + &Vec::::new(), + "there should be no errors from SP {sp_type:?} {slot}", + ); + } + + + } +} From 9887a9969be8e7acebb5da8d8b061c4f8ba16901 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 13 Jun 2025 15:54:34 -0700 Subject: [PATCH 26/61] fix incorrect detection of non-existing SP response --- nexus/src/app/background/tasks/ereport_ingester.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/nexus/src/app/background/tasks/ereport_ingester.rs b/nexus/src/app/background/tasks/ereport_ingester.rs index 9dbec394f98..bdd7110d58e 100644 --- a/nexus/src/app/background/tasks/ereport_ingester.rs +++ b/nexus/src/app/background/tasks/ereport_ingester.rs @@ -307,7 +307,9 @@ impl Ingester { 
status.get_or_insert_default().requests += 1; return Some(ereports.into_inner()); } - Err(e) if e.status() == Some(http::StatusCode::NOT_FOUND) => { + Err(gateway_client::Error::ErrorResponse(rsp)) + if rsp.error_code.as_deref() == Some("InvalidSp") => + { slog::debug!( &opctx.log, "ereport collection: MGS claims there is no SP in this slot"; From 1698fef9d08bdcee64a091d8308cdcf62739c1ef Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 13 Jun 2025 15:54:47 -0700 Subject: [PATCH 27/61] log errors --- nexus/src/app/background/tasks/ereport_ingester.rs | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/nexus/src/app/background/tasks/ereport_ingester.rs b/nexus/src/app/background/tasks/ereport_ingester.rs index bdd7110d58e..019f888a015 100644 --- a/nexus/src/app/background/tasks/ereport_ingester.rs +++ b/nexus/src/app/background/tasks/ereport_ingester.rs @@ -322,6 +322,17 @@ impl Ingester { ); } Err(e) => { + slog::warn!( + &opctx.log, + "ereport collection: unanticipated MGS request error: {e:#}"; + "sp_type" => ?sp_type, + "slot" => slot, + "committed_ena" => ?committed_ena, + "start_ena" => ?start_ena, + "restart_id" => ?restart_id, + "gateway_addr" => %addr, + "error" => ?e, + ); let stats = status.get_or_insert_default(); stats.requests += 1; stats.errors.push(format!("MGS {addr}: {e:#}")); From fe6e446ed222b6f77175239588fa509843d19743 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 13 Jun 2025 15:55:05 -0700 Subject: [PATCH 28/61] fix ereport serialization that `serde_urlencoded` dislikes --- ereport/types/src/lib.rs | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/ereport/types/src/lib.rs b/ereport/types/src/lib.rs index bb151cd7995..412f14658f3 100644 --- a/ereport/types/src/lib.rs +++ b/ereport/types/src/lib.rs @@ -34,6 +34,7 @@ pub struct Ereport { JsonSchema, )] #[repr(transparent)] +#[serde(from = "u64", into = "u64")] pub struct Ena(pub u64); impl fmt::Display for Ena { @@ -76,6 +77,18 @@ impl FromStr for Ena { } 
} +impl From for Ena { + fn from(value: u64) -> Self { + Self(value) + } +} + +impl From for u64 { + fn from(Ena(value): Ena) -> Self { + value + } +} + #[derive(Clone, Debug, thiserror::Error)] #[error("ENA value ({0}) is negative")] pub struct EnaNegativeError(i64); From c9a89c1ed7f1f6138e01ed87b1e1ab4ea2aecd7f Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 13 Jun 2025 15:55:28 -0700 Subject: [PATCH 29/61] reticulating queries --- nexus/db-queries/src/db/datastore/ereport.rs | 26 +++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/ereport.rs b/nexus/db-queries/src/db/datastore/ereport.rs index 1ac2ba57725..cd38996d592 100644 --- a/nexus/db-queries/src/db/datastore/ereport.rs +++ b/nexus/db-queries/src/db/datastore/ereport.rs @@ -17,6 +17,7 @@ use crate::db::model::SpMgsSlot; use crate::db::model::SpType; use crate::db::model::SqlU16; use crate::db::model::SqlU32; +use crate::db::pagination::paginated; use async_bb8_diesel::AsyncRunQueryDsl; use chrono::DateTime; use chrono::Utc; @@ -27,6 +28,7 @@ use nexus_db_lookup::DbConnection; use nexus_db_schema::schema::host_ereport::dsl as host_dsl; use nexus_db_schema::schema::sp_ereport::dsl as sp_dsl; use omicron_common::api::external::CreateResult; +use omicron_common::api::external::DataPageParams; use omicron_common::api::external::Error; use omicron_common::api::external::ListResultVec; use omicron_common::api::external::LookupResult; @@ -41,7 +43,7 @@ type EreportIdTuple = (Uuid, DbEna); pub struct EreporterRestartBySerial { pub id: EreporterRestartUuid, pub first_seen_at: DateTime, - pub reporter: Reporter, + pub reporter_kind: Reporter, pub ereports: u32, } @@ -100,6 +102,24 @@ impl DataStore { Err(Error::non_resourcetype_not_found(format!("ereport {id}"))) } + /// List ereports from the SP with the given restart ID. 
+ pub async fn sp_ereport_list_by_restart( + &self, + opctx: &OpContext, + restart_id: EreporterRestartUuid, + pagparams: &DataPageParams<'_, DbEna>, + ) -> ListResultVec { + opctx.authorize(authz::Action::ListChildren, &authz::FLEET).await?; + paginated(sp_dsl::sp_ereport, sp_dsl::ena, pagparams) + .filter(sp_dsl::restart_id.eq(restart_id.into_untyped_uuid())) + .filter(sp_dsl::time_deleted.is_null()) + .select(SpEreport::as_select()) + .load_async(&*self.pool_connection_authorized(opctx).await?) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + } + + /// List unique ereporter restarts for the given serial number. pub async fn ereporter_restart_list_by_serial( &self, opctx: &OpContext, @@ -172,7 +192,7 @@ impl DataStore { Some(EreporterRestartBySerial { id: EreporterRestartUuid::from_untyped_uuid(restart_id), - reporter: Reporter::Sp { sp_type, slot: sp_slot.into() }, + reporter_kind: Reporter::Sp { sp_type, slot: sp_slot.into() }, first_seen_at, ereports, }) @@ -198,7 +218,7 @@ impl DataStore { Some(EreporterRestartBySerial { id: EreporterRestartUuid::from_untyped_uuid(restart_id), - reporter: Reporter::HostOs { sled: SledUuid::from_untyped_uuid(sled_id) }, + reporter_kind: Reporter::HostOs { sled: SledUuid::from_untyped_uuid(sled_id) }, first_seen_at, ereports, }) From 97ab85cb6d53748c39431569275a62d01cdbfd1f Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Sat, 14 Jun 2025 12:08:08 -0700 Subject: [PATCH 30/61] add migration for ereport tables --- nexus/db-model/src/schema_versions.rs | 3 ++- schema/crdb/dbinit.sql | 2 +- schema/crdb/ereports/up01.sql | 31 +++++++++++++++++++++++++++ schema/crdb/ereports/up02.sql | 6 ++++++ schema/crdb/ereports/up03.sql | 7 ++++++ schema/crdb/ereports/up04.sql | 19 ++++++++++++++++ schema/crdb/ereports/up05.sql | 4 ++++ schema/crdb/ereports/up06.sql | 7 ++++++ 8 files changed, 77 insertions(+), 2 deletions(-) create mode 100644 schema/crdb/ereports/up01.sql create mode 100644 
schema/crdb/ereports/up02.sql create mode 100644 schema/crdb/ereports/up03.sql create mode 100644 schema/crdb/ereports/up04.sql create mode 100644 schema/crdb/ereports/up05.sql create mode 100644 schema/crdb/ereports/up06.sql diff --git a/nexus/db-model/src/schema_versions.rs b/nexus/db-model/src/schema_versions.rs index bd270fb4c78..bde10255778 100644 --- a/nexus/db-model/src/schema_versions.rs +++ b/nexus/db-model/src/schema_versions.rs @@ -16,7 +16,7 @@ use std::{collections::BTreeMap, sync::LazyLock}; /// /// This must be updated when you change the database schema. Refer to /// schema/crdb/README.adoc in the root of this repository for details. -pub const SCHEMA_VERSION: Version = Version::new(150, 0, 0); +pub const SCHEMA_VERSION: Version = Version::new(151, 0, 0); /// List of all past database schema versions, in *reverse* order /// @@ -28,6 +28,7 @@ static KNOWN_VERSIONS: LazyLock> = LazyLock::new(|| { // | leaving the first copy as an example for the next person. // v // KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"), + KnownVersion::new(151, "ereports"), KnownVersion::new(150, "add-last-reconciliation-orphaned-datasets"), KnownVersion::new(149, "bp-add-target-release-min-gen"), KnownVersion::new(148, "clean-misplaced-m2s"), diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 949c07cf6ec..c8e95026974 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -5846,7 +5846,7 @@ INSERT INTO omicron.public.db_metadata ( version, target_version ) VALUES - (TRUE, NOW(), NOW(), '150.0.0', NULL) + (TRUE, NOW(), NOW(), '151.0.0', NULL) ON CONFLICT DO NOTHING; COMMIT; diff --git a/schema/crdb/ereports/up01.sql b/schema/crdb/ereports/up01.sql new file mode 100644 index 00000000000..8c8164705c7 --- /dev/null +++ b/schema/crdb/ereports/up01.sql @@ -0,0 +1,31 @@ +CREATE TABLE IF NOT EXISTS omicron.public.sp_ereport ( + restart_id UUID NOT NULL, + ena INT8 NOT NULL, + time_deleted TIMESTAMPTZ, + + -- time at which the ereport 
was collected + time_collected TIMESTAMPTZ NOT NULL, + -- UUID of the Nexus instance that collected the ereport + collector_id UUID NOT NULL, + + -- physical location of the reporting SP + -- + -- these fields are always present, as they are how requests to collect + -- ereports are indexed by MGS. + sp_type omicron.public.sp_type NOT NULL, + sp_slot INT4 NOT NULL, + + -- VPD identity of the reporting SP. + -- + -- unlike the physical location, these fields are nullable, as an ereport + -- may be generated in a state where the SP doesn't know who or what it is. + -- consider that "i don't know my own identity" is a reasonable condition to + -- might want to generate an ereport about! + serial_number STRING, + part_number STRING, + + -- JSON representation of the ereport + report JSONB NOT NULL, + + PRIMARY KEY (restart_id, ena) +); \ No newline at end of file diff --git a/schema/crdb/ereports/up02.sql b/schema/crdb/ereports/up02.sql new file mode 100644 index 00000000000..2cf34912e35 --- /dev/null +++ b/schema/crdb/ereports/up02.sql @@ -0,0 +1,6 @@ +CREATE INDEX IF NOT EXISTS lookup_sp_ereports_by_slot +ON omicron.public.sp_ereport ( + sp_type, + sp_slot, + time_collected +); \ No newline at end of file diff --git a/schema/crdb/ereports/up03.sql b/schema/crdb/ereports/up03.sql new file mode 100644 index 00000000000..1a3b7960874 --- /dev/null +++ b/schema/crdb/ereports/up03.sql @@ -0,0 +1,7 @@ +CREATE INDEX IF NOT EXISTS order_sp_ereports_by_timestamp +ON omicron.public.sp_ereport +USING BTREE ( + time_collected +) +WHERE + time_deleted IS NULL; \ No newline at end of file diff --git a/schema/crdb/ereports/up04.sql b/schema/crdb/ereports/up04.sql new file mode 100644 index 00000000000..e58e9773ff9 --- /dev/null +++ b/schema/crdb/ereports/up04.sql @@ -0,0 +1,19 @@ +CREATE TABLE IF NOT EXISTS omicron.public.host_ereport ( + restart_id UUID NOT NULL, + ena INT8 NOT NULL, + time_deleted TIMESTAMPTZ, + + -- time at which the ereport was collected + time_collected
TIMESTAMPTZ NOT NULL, + -- UUID of the Nexus instance that collected the ereport + collector_id UUID NOT NULL, + + -- identity of the reporting sled + sled_id UUID NOT NULL, + sled_serial TEXT NOT NULL, + + -- JSON representation of the ereport + report JSONB NOT NULL, + + PRIMARY KEY (restart_id, ena) +); \ No newline at end of file diff --git a/schema/crdb/ereports/up05.sql b/schema/crdb/ereports/up05.sql new file mode 100644 index 00000000000..c749258887a --- /dev/null +++ b/schema/crdb/ereports/up05.sql @@ -0,0 +1,4 @@ +CREATE INDEX IF NOT EXISTS lookup_host_ereports_by_sled ON omicron.public.host_ereport ( + sled_id, + time_collected +); \ No newline at end of file diff --git a/schema/crdb/ereports/up06.sql b/schema/crdb/ereports/up06.sql new file mode 100644 index 00000000000..20141fd8e47 --- /dev/null +++ b/schema/crdb/ereports/up06.sql @@ -0,0 +1,7 @@ +CREATE INDEX IF NOT EXISTS order_host_ereports_by_timestamp +ON omicron.public.host_ereport +USING BTREE ( + time_collected +) +WHERE + time_deleted IS NULL; \ No newline at end of file From 19ffe30fba3a5d75d12849941733ee956dfa7db3 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Sat, 14 Jun 2025 12:47:25 -0700 Subject: [PATCH 31/61] finish test --- .../app/background/tasks/ereport_ingester.rs | 222 +++++++++++++++++- 1 file changed, 220 insertions(+), 2 deletions(-) diff --git a/nexus/src/app/background/tasks/ereport_ingester.rs b/nexus/src/app/background/tasks/ereport_ingester.rs index 019f888a015..8195b631720 100644 --- a/nexus/src/app/background/tasks/ereport_ingester.rs +++ b/nexus/src/app/background/tasks/ereport_ingester.rs @@ -405,7 +405,10 @@ impl EreportQueryParams { #[cfg(test)] mod tests { use super::*; + use nexus_db_queries::db::pagination::Paginator; use nexus_test_utils_macros::nexus_test; + use omicron_uuid_kinds::GenericUuid; + use uuid::uuid; type ControlPlaneTestContext = nexus_test_utils::ControlPlaneTestContext; @@ -448,7 +451,222 @@ mod tests { "there should be no errors from SP 
{sp_type:?} {slot}", ); } - - + + // Now, let's check that the ereports exist in the database. + + let sled0_restart_id = EreporterRestartUuid::from_untyped_uuid(uuid!( + "af1ebf85-36ba-4c31-bbec-b9825d6d9d8b" + )); + let sled1_restart_id = EreporterRestartUuid::from_untyped_uuid(uuid!( + "55e30cc7-a109-492f-aca9-735ed725df3c" + )); + + let sled0_ereports = [ + ( + Ena::from(1), + serde_json::json!({ + "baseboard_part_number": "SimGimletSp", + "baseboard_serial_number": "SimGimlet00", + "hubris_archive_id": "ffffffff", + "hubris_version": "0.0.2", + "hubris_task_name": "task_apollo_server", + "hubris_task_gen": 13, + "hubris_uptime_ms": 1233, + "ereport_message_version": 0, + "class": "gov.nasa.apollo.o2_tanks.stir.begin", + "message": "stirring the tanks", + }), + ), + ( + Ena::from(2), + serde_json::json!({ + "baseboard_part_number": "SimGimletSp", + "baseboard_serial_number": "SimGimlet00", + "hubris_archive_id": "ffffffff", + "hubris_version": "0.0.2", + "hubris_task_name": "drv_ae35_server", + "hubris_task_gen": 1, + "hubris_uptime_ms": 1234, + "ereport_message_version": 0, + "class": "io.discovery.ae35.fault", + "message": "i've just picked up a fault in the AE-35 unit", + "de": { + "scheme": "fmd", + "mod-name": "ae35-diagnosis", + "authority": { + "product-id": "HAL-9000-series computer", + "server-id": "HAL 9000", + }, + }, + "hours_to_failure": 72, + }), + ), + ( + Ena::from(3), + serde_json::json!({ + "baseboard_part_number": "SimGimletSp", + "baseboard_serial_number": "SimGimlet00", + "hubris_archive_id": "ffffffff", + "hubris_version": "0.0.2", + "hubris_task_name": "task_apollo_server", + "hubris_task_gen": 13, + "hubris_uptime_ms": 1237, + "ereport_message_version": 0, + "class": "gov.nasa.apollo.fault", + "message": "houston, we have a problem", + "crew": [ + "Lovell", + "Swigert", + "Haise", + ], + }), + ), + ( + Ena::from(4), + serde_json::json!({ + "baseboard_part_number": "SimGimletSp", + "baseboard_serial_number": "SimGimlet00", + 
"hubris_archive_id": "ffffffff", + "hubris_version": "0.0.2", + "hubris_task_name": "drv_thingy_server", + "hubris_task_gen": 2, + "hubris_uptime_ms": 1240, + "ereport_message_version": 0, + "class": "flagrant_error", + "computer": false, + }), + ), + ( + Ena::from(5), + serde_json::json!({ + "baseboard_part_number": "SimGimletSp", + "baseboard_serial_number": "SimGimlet00", + "hubris_archive_id": "ffffffff", + "hubris_version": "0.0.2", + "hubris_task_name": "task_latex_server", + "hubris_task_gen": 1, + "hubris_uptime_ms": 1245, + "ereport_message_version": 0, + "class": "overfull_hbox", + "badness": 10000, + }), + ), + ]; + + let sled1_ereports = [( + Ena::from(1), + serde_json::json!({ + "baseboard_part_number": "SimGimletSp", + "baseboard_serial_number": "SimGimlet01", + "hubris_archive_id": "ffffffff", + "hubris_version": "0.0.2", + "hubris_task_name": "task_thermal_server", + "hubris_task_gen": 1, + "hubris_uptime_ms": 1233, + "ereport_message_version": 0, + "class": "computer.oxide.gimlet.chassis_integrity.fault", + "nosub_class": "chassis_integrity.cat_hair_detected", + "message": "cat hair detected inside gimlet", + "de": { + "scheme": "fmd", + "mod-name": "hubris-thermal-diagnosis", + "mod-version": "1.0", + "authority": { + "product-id": "oxide", + "server-id": "SimGimlet1", + } + }, + "certainty": 0x64, + "cat_hair_amount": 10000, + }), + )]; + + check_sp_ereports_exist( + datastore, + &opctx, + sled0_restart_id, + &sled0_ereports, + ) + .await; + check_sp_ereports_exist( + datastore, + &opctx, + sled1_restart_id, + &sled1_ereports, + ) + .await; + + // Activate the task again and assert that no new ereports were + // ingested. 
+ let activation2 = ingester + .actually_activate(&opctx) + .await + .expect("activation 2 should succeed"); + dbg!(&activation1); + assert_eq!( + activation2.error.as_ref(), + None, + "there should be no top-level activation error for the second \ + activation" + ); + assert_eq!(activation2.sps, &[], "no new ereports should be observed"); + + check_sp_ereports_exist( + datastore, + &opctx, + sled0_restart_id, + &sled0_ereports, + ) + .await; + check_sp_ereports_exist( + datastore, + &opctx, + sled1_restart_id, + &sled1_ereports, + ) + .await; + } + + async fn check_sp_ereports_exist( + datastore: &DataStore, + opctx: &OpContext, + restart_id: EreporterRestartUuid, + expected_ereports: &[(Ena, serde_json::Value)], + ) { + let mut paginator = + Paginator::new(std::num::NonZeroU32::new(100).unwrap()); + let mut found_ereports = BTreeMap::new(); + while let Some(p) = paginator.next() { + let batch = datastore + .sp_ereport_list_by_restart( + &opctx, + restart_id, + &p.current_pagparams(), + ) + .await + .expect("should be able to query for ereports"); + paginator = p.found_batch(&batch, &|ereport| ereport.ena); + found_ereports.extend( + batch.into_iter().map(|ereport| (ereport.ena.0, ereport)), + ); + } + assert_eq!( + expected_ereports.len(), + found_ereports.len(), + "expected {} ereports for {restart_id:?}", + expected_ereports.len() + ); + + for (ena, report) in expected_ereports { + let Some(found_report) = found_ereports.get(ena) else { + panic!( + "expected ereport {restart_id:?}:{ena} not found in database" + ) + }; + assert_eq!( + report, &found_report.report, + "ereport data for {restart_id:?}:{ena} doesn't match expected" + ); + } } } From ed1c4d6e4a8bf200735e991f935dfad4892f1d98 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Sat, 14 Jun 2025 12:51:19 -0700 Subject: [PATCH 32/61] only create status when ereports are received --- nexus/src/app/background/tasks/ereport_ingester.rs | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff 
--git a/nexus/src/app/background/tasks/ereport_ingester.rs b/nexus/src/app/background/tasks/ereport_ingester.rs index 8195b631720..4bc9dcf89af 100644 --- a/nexus/src/app/background/tasks/ereport_ingester.rs +++ b/nexus/src/app/background/tasks/ereport_ingester.rs @@ -179,6 +179,9 @@ impl Ingester { .await { if items.is_empty() { + if let Some(ref mut status) = status { + status.requests += 1; + } slog::trace!( &opctx.log, "no ereports returned by SP"; @@ -193,6 +196,8 @@ impl Ingester { .map(|s| s.new_ereports), ); break; + } else { + status.get_or_insert_default().requests += 1; } let db_ereports = items .into_iter() @@ -304,7 +309,6 @@ impl Ingester { .await; match res { Ok(ereports) => { - status.get_or_insert_default().requests += 1; return Some(ereports.into_inner()); } Err(gateway_client::Error::ErrorResponse(rsp)) @@ -439,8 +443,8 @@ mod tests { ); assert_eq!( activation1.sps.len(), - 4, - "ereports from 4 SPs should be observed: {:?}", + 3, + "ereports from 3 SPs should be observed: {:?}", activation1.sps, ); @@ -450,6 +454,10 @@ mod tests { &Vec::::new(), "there should be no errors from SP {sp_type:?} {slot}", ); + assert_eq!( + status.requests, 2, + "two HTTP requests should have been sent for SP {sp_type:?} {slot}", + ); } // Now, let's check that the ereports exist in the database. 
From 3c3f5559ee27b4bad620542cb7ea47fa903ca5eb Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Sat, 14 Jun 2025 12:53:29 -0700 Subject: [PATCH 33/61] rm unused --- dev-tools/omdb/src/bin/omdb/helpers.rs | 7 ------- nexus/db-model/src/ereport.rs | 3 +-- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/dev-tools/omdb/src/bin/omdb/helpers.rs b/dev-tools/omdb/src/bin/omdb/helpers.rs index d0a65f9d25f..efa6f97fde3 100644 --- a/dev-tools/omdb/src/bin/omdb/helpers.rs +++ b/dev-tools/omdb/src/bin/omdb/helpers.rs @@ -45,13 +45,6 @@ pub(crate) fn display_option_blank( opt.as_ref().map(|x| x.to_string()).unwrap_or_else(|| "".to_string()) } -// Display the string "" for an Option if it's None. -pub(crate) fn display_option_error( - opt: &Option, -) -> String { - opt.as_ref().map(|x| x.to_string()).unwrap_or_else(|| "".to_string()) -} - // Format a `chrono::DateTime` in RFC3339 with milliseconds precision and using // `Z` rather than the UTC offset for UTC timestamps, to save a few characters // of line width in tabular output. 
diff --git a/nexus/db-model/src/ereport.rs b/nexus/db-model/src/ereport.rs index d9101b8989f..109450dbd60 100644 --- a/nexus/db-model/src/ereport.rs +++ b/nexus/db-model/src/ereport.rs @@ -15,8 +15,7 @@ use diesel::sql_types; use ereport_types::{Ena, EreportId}; use nexus_db_schema::schema::{host_ereport, sp_ereport}; use omicron_uuid_kinds::{ - EreporterRestartKind, EreporterRestartUuid, OmicronZoneKind, - OmicronZoneUuid, SledKind, SledUuid, + EreporterRestartKind, OmicronZoneKind, OmicronZoneUuid, SledKind, SledUuid, }; use serde::{Deserialize, Serialize}; use std::convert::TryFrom; From 79090dc0675aa95382a0fb40a1612cbfcd837898 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Sat, 14 Jun 2025 13:04:25 -0700 Subject: [PATCH 34/61] clippy --- nexus/db-model/src/ereport.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nexus/db-model/src/ereport.rs b/nexus/db-model/src/ereport.rs index 109450dbd60..5ca5d2a217c 100644 --- a/nexus/db-model/src/ereport.rs +++ b/nexus/db-model/src/ereport.rs @@ -93,7 +93,7 @@ impl From for Ereport { part_number, serial_number, }, - reporter: Reporter::Sp { sp_type, slot: sp_slot.0.into() }, + reporter: Reporter::Sp { sp_type, slot: sp_slot.0 }, report, } } From 82356cb4eb428a492a2d6da2f9f3d04dab36de5b Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Sat, 14 Jun 2025 13:18:21 -0700 Subject: [PATCH 35/61] more clippy placation --- nexus/db-queries/src/db/datastore/ereport.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/ereport.rs b/nexus/db-queries/src/db/datastore/ereport.rs index cd38996d592..781200a1b84 100644 --- a/nexus/db-queries/src/db/datastore/ereport.rs +++ b/nexus/db-queries/src/db/datastore/ereport.rs @@ -145,7 +145,7 @@ impl DataStore { )) .order_by(sp_dsl::restart_id) .load_async::<(Uuid, SpType, SqlU16, Option>, SqlU32)>( - &*conn, + &conn, ) .await .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; @@ -163,7 +163,7 @@ impl 
DataStore { count_distinct(host_dsl::ena), )) .order_by(host_dsl::restart_id) - .load_async::<(Uuid, Uuid, Option>, SqlU32)>(&*conn) + .load_async::<(Uuid, Uuid, Option>, SqlU32)>(&conn) .await .map_err(|e| { public_error_from_diesel(e, ErrorHandler::Server) @@ -315,7 +315,7 @@ impl DataStore { })?; let sp_type = sp_type.into(); let latest = self - .sp_latest_ereport_id_on_conn(&*conn, sp_type, slot) + .sp_latest_ereport_id_on_conn(&conn, sp_type, slot) .await .map_err(|e| { e.internal_context(format!( From 178d963b7162464d0ef4ad996e9ffc3665fb06d5 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Sat, 14 Jun 2025 13:19:25 -0700 Subject: [PATCH 36/61] oops, include_deleted won't do that --- dev-tools/omdb/src/bin/omdb/db/ereport.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/dev-tools/omdb/src/bin/omdb/db/ereport.rs b/dev-tools/omdb/src/bin/omdb/db/ereport.rs index 2d8ff6964b4..42a6f710266 100644 --- a/dev-tools/omdb/src/bin/omdb/db/ereport.rs +++ b/dev-tools/omdb/src/bin/omdb/db/ereport.rs @@ -322,7 +322,6 @@ async fn ereport_fetch( let sp_query = sp_dsl::sp_ereport .filter(sp_dsl::restart_id.eq(restart_id.clone())) .filter(sp_dsl::ena.eq(ena.clone())) - .filter(sp_dsl::time_deleted.is_null()) .select(db::model::SpEreport::as_select()); let sp_result = if !fetch_opts.include_deleted { sp_query @@ -341,7 +340,6 @@ async fn ereport_fetch( let host_query = host_dsl::host_ereport .filter(host_dsl::restart_id.eq(restart_id)) .filter(host_dsl::ena.eq(ena)) - .filter(host_dsl::time_deleted.is_null()) .select(db::model::HostEreport::as_select()); let host_result = if !fetch_opts.include_deleted { host_query From 72ed5df3a871823943fa2e7987a7d92fa2c3b07e Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Sat, 14 Jun 2025 13:30:54 -0700 Subject: [PATCH 37/61] put back the `*`s clippy said to remove --- nexus/db-queries/src/db/datastore/ereport.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/ereport.rs 
b/nexus/db-queries/src/db/datastore/ereport.rs index 781200a1b84..e86771dd6fe 100644 --- a/nexus/db-queries/src/db/datastore/ereport.rs +++ b/nexus/db-queries/src/db/datastore/ereport.rs @@ -145,7 +145,7 @@ impl DataStore { )) .order_by(sp_dsl::restart_id) .load_async::<(Uuid, SpType, SqlU16, Option>, SqlU32)>( - &conn, + &*conn, ) .await .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; @@ -163,7 +163,7 @@ impl DataStore { count_distinct(host_dsl::ena), )) .order_by(host_dsl::restart_id) - .load_async::<(Uuid, Uuid, Option>, SqlU32)>(&conn) + .load_async::<(Uuid, Uuid, Option>, SqlU32)>(&*conn) .await .map_err(|e| { public_error_from_diesel(e, ErrorHandler::Server) From 97dd83e19e57de437f89c7baedfcc082f19c64eb Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Sat, 14 Jun 2025 13:36:29 -0700 Subject: [PATCH 38/61] clippy placation (REAL VERSION) --- dev-tools/omdb/src/bin/omdb/db/ereport.rs | 21 ++++++++----------- nexus/db-queries/src/db/datastore/ereport.rs | 4 ++-- .../app/background/tasks/ereport_ingester.rs | 2 +- 3 files changed, 12 insertions(+), 15 deletions(-) diff --git a/dev-tools/omdb/src/bin/omdb/db/ereport.rs b/dev-tools/omdb/src/bin/omdb/db/ereport.rs index 42a6f710266..4a8e0be8220 100644 --- a/dev-tools/omdb/src/bin/omdb/db/ereport.rs +++ b/dev-tools/omdb/src/bin/omdb/db/ereport.rs @@ -148,7 +148,7 @@ async fn cmd_db_ereport_list( restart_id: restart_id.into_untyped_uuid(), ena: ena.into(), src: sp_type.into(), - slot: sp_slot.0.into(), + slot: sp_slot.0, } } } @@ -252,7 +252,7 @@ async fn cmd_db_ereport_info( let ereport_id = ereport_types::EreportId { restart_id, ena }; let conn = datastore.pool_connection_for_tests().await?; let db::model::Ereport { id, metadata, reporter, report } = - ereport_fetch(&*conn, fetch_opts, ereport_id).await?; + ereport_fetch(&conn, fetch_opts, ereport_id).await?; let db::model::EreportMetadata { time_collected, @@ -320,16 +320,13 @@ async fn ereport_fetch( let ena = db::model::DbEna::from(id.ena); 
let sp_query = sp_dsl::sp_ereport - .filter(sp_dsl::restart_id.eq(restart_id.clone())) - .filter(sp_dsl::ena.eq(ena.clone())) + .filter(sp_dsl::restart_id.eq(restart_id)) + .filter(sp_dsl::ena.eq(ena)) .select(db::model::SpEreport::as_select()); let sp_result = if !fetch_opts.include_deleted { - sp_query - .filter(sp_dsl::time_deleted.is_null()) - .first_async(&*conn) - .await + sp_query.filter(sp_dsl::time_deleted.is_null()).first_async(conn).await } else { - sp_query.first_async(&*conn).await + sp_query.first_async(conn).await }; if let Some(report) = sp_result.optional().with_context(|| { format!("failed to query for SP ereport matching {id}") @@ -344,10 +341,10 @@ async fn ereport_fetch( let host_result = if !fetch_opts.include_deleted { host_query .filter(host_dsl::time_deleted.is_null()) - .first_async(&*conn) + .first_async(conn) .await } else { - host_query.first_async(&*conn).await + host_query.first_async(conn).await }; if let Some(report) = host_result.optional().with_context(|| { format!("failed to query for host OS ereport matching {id}") @@ -409,7 +406,7 @@ async fn cmd_db_ereporters( if let Some(slot_type) = slot_type { sp_query = sp_query - .filter(sp_dsl::sp_type.eq(SpType::from(slot_type.clone()))); + .filter(sp_dsl::sp_type.eq(SpType::from(slot_type))); } if let Some(serial) = serial { diff --git a/nexus/db-queries/src/db/datastore/ereport.rs b/nexus/db-queries/src/db/datastore/ereport.rs index e86771dd6fe..7dc76f9c010 100644 --- a/nexus/db-queries/src/db/datastore/ereport.rs +++ b/nexus/db-queries/src/db/datastore/ereport.rs @@ -145,7 +145,7 @@ impl DataStore { )) .order_by(sp_dsl::restart_id) .load_async::<(Uuid, SpType, SqlU16, Option>, SqlU32)>( - &*conn, + conn, ) .await .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; @@ -163,7 +163,7 @@ impl DataStore { count_distinct(host_dsl::ena), )) .order_by(host_dsl::restart_id) - .load_async::<(Uuid, Uuid, Option>, SqlU32)>(&*conn) + .load_async::<(Uuid, Uuid, Option>, 
SqlU32)>(conn) .await .map_err(|e| { public_error_from_diesel(e, ErrorHandler::Server) diff --git a/nexus/src/app/background/tasks/ereport_ingester.rs b/nexus/src/app/background/tasks/ereport_ingester.rs index 4bc9dcf89af..6700ff524fb 100644 --- a/nexus/src/app/background/tasks/ereport_ingester.rs +++ b/nexus/src/app/background/tasks/ereport_ingester.rs @@ -300,7 +300,7 @@ impl Ingester { let res = client .sp_ereports_ingest( sp_type, - slot as u32, + u32::from(slot), committed_ena.as_ref(), LIMIT, &restart_id, From 7a48c6892a2d695fab1b6d6a293d40131e8067d1 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 16 Jun 2025 09:49:51 -0700 Subject: [PATCH 39/61] update expectorate query tests --- nexus/db-queries/tests/output/host_latest_ereport_id.sql | 2 +- nexus/db-queries/tests/output/sp_latest_ereport_id.sql | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/nexus/db-queries/tests/output/host_latest_ereport_id.sql b/nexus/db-queries/tests/output/host_latest_ereport_id.sql index 6cd39952557..a6ae2ffcd08 100644 --- a/nexus/db-queries/tests/output/host_latest_ereport_id.sql +++ b/nexus/db-queries/tests/output/host_latest_ereport_id.sql @@ -5,6 +5,6 @@ FROM WHERE host_ereport.sled_id = $1 ORDER BY - host_ereport.time_collected DESC + host_ereport.time_collected DESC, host_ereport.ena DESC LIMIT $2 diff --git a/nexus/db-queries/tests/output/sp_latest_ereport_id.sql b/nexus/db-queries/tests/output/sp_latest_ereport_id.sql index 14cf96cc93c..304afdcdf41 100644 --- a/nexus/db-queries/tests/output/sp_latest_ereport_id.sql +++ b/nexus/db-queries/tests/output/sp_latest_ereport_id.sql @@ -5,6 +5,6 @@ FROM WHERE sp_ereport.sp_type = $1 AND sp_ereport.sp_slot = $2 ORDER BY - sp_ereport.time_collected DESC + sp_ereport.time_collected DESC, sp_ereport.ena DESC LIMIT $3 From e48973f780e8ab08fdf288e12ea164733d7a95e0 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 16 Jun 2025 09:52:21 -0700 Subject: [PATCH 40/61] add restart ID filter to omdb ereport list 
--- dev-tools/omdb/src/bin/omdb/db/ereport.rs | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/dev-tools/omdb/src/bin/omdb/db/ereport.rs b/dev-tools/omdb/src/bin/omdb/db/ereport.rs index 4a8e0be8220..ec52a4c67ab 100644 --- a/dev-tools/omdb/src/bin/omdb/db/ereport.rs +++ b/dev-tools/omdb/src/bin/omdb/db/ereport.rs @@ -63,9 +63,13 @@ struct InfoArgs { #[derive(Debug, Args, Clone)] struct ListArgs { /// Include only ereports from the system with the provided serial number. - #[clap(long)] + #[clap(long, short)] serial: Option, + /// Include only ereports from the provided reporter restart ID. + #[clap(long, short)] + id: Option, + /// Include only ereports collected before this timestamp #[clap(long, short)] before: Option>, @@ -189,6 +193,10 @@ async fn cmd_db_ereport_list( query = query.filter(sp_dsl::serial_number.eq(serial.clone())); } + if let Some(id) = args.id { + query = query.filter(sp_dsl::restart_id.eq(id.into_untyped_uuid())); + } + if let Some(before) = args.before { query = query.filter(sp_dsl::time_collected.lt(before)); } From d4153e9bc4052dd85e0c6fe3cd5d12ccfe571a9b Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 16 Jun 2025 10:05:37 -0700 Subject: [PATCH 41/61] implement host ereports in omdb ereport list --- dev-tools/omdb/src/bin/omdb/db/ereport.rs | 86 ++++++++++++++++++++--- nexus/db-model/src/ereport.rs | 20 ++++++ 2 files changed, 98 insertions(+), 8 deletions(-) diff --git a/dev-tools/omdb/src/bin/omdb/db/ereport.rs b/dev-tools/omdb/src/bin/omdb/db/ereport.rs index ec52a4c67ab..11b8c32eaef 100644 --- a/dev-tools/omdb/src/bin/omdb/db/ereport.rs +++ b/dev-tools/omdb/src/bin/omdb/db/ereport.rs @@ -121,9 +121,7 @@ async fn cmd_db_ereport_list( time_collected: DateTime, restart_id: Uuid, ena: Ena, - src: SrcType, - #[tabled(rename = "#")] - slot: u16, + source: db::model::Reporter, } #[derive(Tabled)] @@ -151,8 +149,7 @@ async fn cmd_db_ereport_list( time_collected, restart_id: restart_id.into_untyped_uuid(), ena: 
ena.into(), - src: sp_type.into(), - slot: sp_slot.0, + source: db::model::Reporter::Sp { sp_type, slot: sp_slot.0 }, } } } @@ -172,6 +169,40 @@ async fn cmd_db_ereport_list( } } + impl From<&'_ db::model::HostEreport> for EreportRow { + fn from(ereport: &db::model::HostEreport) -> Self { + let &db::model::HostEreport { + time_collected, + restart_id, + ena, + sled_id, + .. + } = ereport; + EreportRow { + time_collected, + restart_id: restart_id.into_untyped_uuid(), + ena: ena.into(), + source: db::model::Reporter::HostOs { sled: sled_id.into() }, + } + } + } + + impl<'report, T> From<&'report db::model::HostEreport> + for WithSerial<'report, T> + where + T: Tabled, + T: From<&'report db::model::HostEreport>, + { + fn from(ereport: &'report db::model::HostEreport) -> Self { + let inner = T::from(ereport); + WithSerial { + inner, + serial: Some(&ereport.sled_serial), + part_number: None, // TODO(eliza): go get this from inventory? + } + } + } + if let (Some(before), Some(after)) = (args.before, args.after) { anyhow::ensure!( after < before, @@ -212,13 +243,52 @@ async fn cmd_db_ereport_list( let sp_ereports = query.load_async(&*conn).await.with_context(ctx)?; check_limit(&sp_ereports, fetch_opts.fetch_limit, ctx); - // let host_ereports: Vec = Vec::new(); // TODO + let ctx = || "loading host OS ereports"; + let mut query = host_dsl::host_ereport + .select(db::model::HostEreport::as_select()) + .limit(fetch_opts.fetch_limit.get().into()) + .order_by(( + host_dsl::time_collected, + host_dsl::restart_id, + host_dsl::ena, + )) + .into_boxed(); + + if let Some(ref serial) = args.serial { + query = query.filter(host_dsl::sled_serial.eq(serial.clone())); + } + + if let Some(id) = args.id { + query = query.filter(host_dsl::restart_id.eq(id.into_untyped_uuid())); + } + + if let Some(before) = args.before { + query = query.filter(host_dsl::time_collected.lt(before)); + } + + if let Some(after) = args.after { + query = query.filter(host_dsl::time_collected.gt(after)); + } + + 
if !fetch_opts.include_deleted { + query = query.filter(host_dsl::time_deleted.is_null()); + } + + let host_ereports = query.load_async(&*conn).await.with_context(ctx)?; + check_limit(&host_ereports, fetch_opts.fetch_limit, ctx); let mut table = if args.serial.is_some() { - tabled::Table::new(sp_ereports.iter().map(EreportRow::from)) + tabled::Table::new( + sp_ereports + .iter() + .map(EreportRow::from) + .chain(host_ereports.iter().map(EreportRow::from)), + ) } else { tabled::Table::new( - sp_ereports.iter().map(WithSerial::<'_, EreportRow>::from), + sp_ereports.iter().map(WithSerial::<'_, EreportRow>::from).chain( + host_ereports.iter().map(WithSerial::<'_, EreportRow>::from), + ), ) }; diff --git a/nexus/db-model/src/ereport.rs b/nexus/db-model/src/ereport.rs index 5ca5d2a217c..8d1c1bcd601 100644 --- a/nexus/db-model/src/ereport.rs +++ b/nexus/db-model/src/ereport.rs @@ -19,6 +19,7 @@ use omicron_uuid_kinds::{ }; use serde::{Deserialize, Serialize}; use std::convert::TryFrom; +use std::fmt; #[derive( Copy, @@ -141,6 +142,25 @@ pub enum Reporter { HostOs { sled: SledUuid }, } +impl std::fmt::Display for Reporter { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::Sp { sp_type: SpType::Sled, slot } => { + write!(f, "Sled (SP) {slot:02}") + } + Self::Sp { sp_type: SpType::Switch, slot } => { + write!(f, "Switch {slot}") + } + Self::Sp { sp_type: SpType::Power, slot } => { + write!(f, "PSC {slot}") + } + Self::HostOs { sled } => { + write!(f, "Sled (OS) {sled:?}") + } + } + } +} + #[derive(Clone, Debug, Insertable, Queryable, Selectable)] #[diesel(table_name = sp_ereport)] pub struct SpEreport { From ac0cc38cd3e26e9037077d323ccd665ef03b6f18 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 16 Jun 2025 10:23:47 -0700 Subject: [PATCH 42/61] more OMDB list improvements - allow filtering by multiple serials/IDs - sort everything by timestamp in one list - always include serial because there may be multiple filters --- 
dev-tools/omdb/src/bin/omdb/db/ereport.rs | 119 +++++++++------------- 1 file changed, 47 insertions(+), 72 deletions(-) diff --git a/dev-tools/omdb/src/bin/omdb/db/ereport.rs b/dev-tools/omdb/src/bin/omdb/db/ereport.rs index 11b8c32eaef..ebeab95b387 100644 --- a/dev-tools/omdb/src/bin/omdb/db/ereport.rs +++ b/dev-tools/omdb/src/bin/omdb/db/ereport.rs @@ -56,19 +56,21 @@ enum Commands { #[derive(Debug, Args, Clone)] struct InfoArgs { + /// The reporter restart UUID of the ereport to show restart_id: EreporterRestartUuid, + /// The ENA of the ereport within the reporter restart ena: Ena, } #[derive(Debug, Args, Clone)] struct ListArgs { - /// Include only ereports from the system with the provided serial number. - #[clap(long, short)] - serial: Option, + /// Include only ereports from systems with the provided serial numbers. + #[clap(long = "serial", short)] + serials: Vec, - /// Include only ereports from the provided reporter restart ID. - #[clap(long, short)] - id: Option, + /// Include only ereports from the provided reporter restart IDs. 
+ #[clap(long = "id", short)] + ids: Vec, /// Include only ereports collected before this timestamp #[clap(long, short)] @@ -116,33 +118,28 @@ async fn cmd_db_ereport_list( ) -> anyhow::Result<()> { #[derive(Tabled)] #[tabled(rename_all = "SCREAMING_SNAKE_CASE")] - struct EreportRow { + struct EreportRow<'report> { #[tabled(display_with = "datetime_rfc3339_concise")] time_collected: DateTime, restart_id: Uuid, ena: Ena, source: db::model::Reporter, - } - - #[derive(Tabled)] - #[tabled(rename_all = "SCREAMING_SNAKE_CASE")] - struct WithSerial<'a, T: Tabled> { - #[tabled(inline)] - inner: T, #[tabled(display_with = "display_option_blank", rename = "S/N")] - serial: Option<&'a str>, + serial: Option<&'report str>, #[tabled(display_with = "display_option_blank", rename = "P/N")] - part_number: Option<&'a str>, + part_number: Option<&'report str>, } - impl From<&'_ db::model::SpEreport> for EreportRow { - fn from(ereport: &db::model::SpEreport) -> Self { + impl<'report> From<&'report db::model::SpEreport> for EreportRow<'report> { + fn from(ereport: &'report db::model::SpEreport) -> Self { let &db::model::SpEreport { time_collected, restart_id, ena, sp_type, sp_slot, + ref serial_number, + ref part_number, .. 
} = ereport; EreportRow { @@ -150,32 +147,20 @@ async fn cmd_db_ereport_list( restart_id: restart_id.into_untyped_uuid(), ena: ena.into(), source: db::model::Reporter::Sp { sp_type, slot: sp_slot.0 }, + serial: serial_number.as_deref(), + part_number: part_number.as_deref(), } } } - impl<'report, T> From<&'report db::model::SpEreport> for WithSerial<'report, T> - where - T: Tabled, - T: From<&'report db::model::SpEreport>, - { - fn from(ereport: &'report db::model::SpEreport) -> Self { - let inner = T::from(ereport); - WithSerial { - inner, - serial: ereport.serial_number.as_deref(), - part_number: ereport.part_number.as_deref(), - } - } - } - - impl From<&'_ db::model::HostEreport> for EreportRow { - fn from(ereport: &db::model::HostEreport) -> Self { + impl<'report> From<&'report db::model::HostEreport> for EreportRow<'report> { + fn from(ereport: &'report db::model::HostEreport) -> Self { let &db::model::HostEreport { time_collected, restart_id, ena, sled_id, + ref sled_serial, .. } = ereport; EreportRow { @@ -183,21 +168,7 @@ async fn cmd_db_ereport_list( restart_id: restart_id.into_untyped_uuid(), ena: ena.into(), source: db::model::Reporter::HostOs { sled: sled_id.into() }, - } - } - } - - impl<'report, T> From<&'report db::model::HostEreport> - for WithSerial<'report, T> - where - T: Tabled, - T: From<&'report db::model::HostEreport>, - { - fn from(ereport: &'report db::model::HostEreport) -> Self { - let inner = T::from(ereport); - WithSerial { - inner, - serial: Some(&ereport.sled_serial), + serial: Some(&sled_serial), part_number: None, // TODO(eliza): go get this from inventory? 
} } @@ -220,14 +191,14 @@ async fn cmd_db_ereport_list( .order_by((sp_dsl::time_collected, sp_dsl::restart_id, sp_dsl::ena)) .into_boxed(); - if let Some(ref serial) = args.serial { - query = query.filter(sp_dsl::serial_number.eq(serial.clone())); + if !args.serials.is_empty() { + query = + query.filter(sp_dsl::serial_number.eq_any(args.serials.clone())); } - if let Some(id) = args.id { - query = query.filter(sp_dsl::restart_id.eq(id.into_untyped_uuid())); + if !args.ids.is_empty() { + query = query.filter(sp_dsl::restart_id.eq_any(args.ids.clone())); } - if let Some(before) = args.before { query = query.filter(sp_dsl::time_collected.lt(before)); } @@ -254,12 +225,13 @@ async fn cmd_db_ereport_list( )) .into_boxed(); - if let Some(ref serial) = args.serial { - query = query.filter(host_dsl::sled_serial.eq(serial.clone())); + if !args.serials.is_empty() { + query = + query.filter(host_dsl::sled_serial.eq_any(args.serials.clone())); } - if let Some(id) = args.id { - query = query.filter(host_dsl::restart_id.eq(id.into_untyped_uuid())); + if !args.ids.is_empty() { + query = query.filter(host_dsl::restart_id.eq_any(args.ids.clone())); } if let Some(before) = args.before { @@ -277,21 +249,24 @@ async fn cmd_db_ereport_list( let host_ereports = query.load_async(&*conn).await.with_context(ctx)?; check_limit(&host_ereports, fetch_opts.fetch_limit, ctx); - let mut table = if args.serial.is_some() { - tabled::Table::new( - sp_ereports - .iter() - .map(EreportRow::from) - .chain(host_ereports.iter().map(EreportRow::from)), + let mut rows = sp_ereports + .iter() + .map(EreportRow::from) + .chain(host_ereports.iter().map(EreportRow::from)) + .collect::>(); + + // Sort everything by time collected so that the host-OS and SP ereports are + // interspersed by time collected, reporter, and ENA. Use + // `std::cmp::Reverse` so that more recent ereports are displayed first. 
+ rows.sort_by_key(|row| { + ( + std::cmp::Reverse(row.time_collected), + row.restart_id, + std::cmp::Reverse(row.ena), ) - } else { - tabled::Table::new( - sp_ereports.iter().map(WithSerial::<'_, EreportRow>::from).chain( - host_ereports.iter().map(WithSerial::<'_, EreportRow>::from), - ), - ) - }; + }); + let mut table = tabled::Table::new(rows); table .with(tabled::settings::Style::empty()) .with(tabled::settings::Padding::new(0, 1, 0, 0)); From 3a34cd40c70b92950a974f3d35d86b7cce3abfe9 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 16 Jun 2025 10:24:00 -0700 Subject: [PATCH 43/61] add OMDB ereport commands to help output test --- dev-tools/omdb/tests/test_all_output.rs | 4 + dev-tools/omdb/tests/usage_errors.out | 131 ++++++++++++++++++++++++ 2 files changed, 135 insertions(+) diff --git a/dev-tools/omdb/tests/test_all_output.rs b/dev-tools/omdb/tests/test_all_output.rs index f59175b0c82..3fa997275c0 100644 --- a/dev-tools/omdb/tests/test_all_output.rs +++ b/dev-tools/omdb/tests/test_all_output.rs @@ -85,6 +85,10 @@ async fn test_omdb_usage_errors() { &["db", "inventory", "collections", "show", "--help"], &["db", "inventory", "collections", "show", "all", "--help"], &["db", "inventory", "collections", "show", "sp", "--help"], + &["db", "ereport"], + &["db", "ereport", "list", "--help"], + &["db", "ereport", "reporters", "--help"], + &["db", "ereport", "info", "--help"], &["db", "sleds", "--help"], &["db", "saga"], &["db", "snapshots"], diff --git a/dev-tools/omdb/tests/usage_errors.out b/dev-tools/omdb/tests/usage_errors.out index a92a51dfaf5..d5d51a423f4 100644 --- a/dev-tools/omdb/tests/usage_errors.out +++ b/dev-tools/omdb/tests/usage_errors.out @@ -492,6 +492,137 @@ Database Options: --include-deleted whether to include soft-deleted records when enumerating objects that can be soft-deleted +Safety Options: + -w, --destructive Allow potentially-destructive subcommands +--------------------------------------------- +stderr: 
+============================================= +EXECUTING COMMAND: omdb ["db", "ereport"] +termination: Exited(2) +--------------------------------------------- +stdout: +--------------------------------------------- +stderr: +Query and display error reports + +Usage: omdb db ereport [OPTIONS] + +Commands: + list List ereports + info Show an ereport + reporters List ereport reporters + help Print this message or the help of the given subcommand(s) + +Options: + --log-level log level filter [env: LOG_LEVEL=] [default: warn] + --color Color output [default: auto] [possible values: auto, always, never] + -h, --help Print help + +Connection Options: + --db-url URL of the database SQL interface [env: OMDB_DB_URL=] + --dns-server [env: OMDB_DNS_SERVER=] + +Database Options: + --fetch-limit limit to apply to queries that fetch rows [env: OMDB_FETCH_LIMIT=] + [default: 500] + --include-deleted whether to include soft-deleted records when enumerating objects that + can be soft-deleted + +Safety Options: + -w, --destructive Allow potentially-destructive subcommands +============================================= +EXECUTING COMMAND: omdb ["db", "ereport", "list", "--help"] +termination: Exited(0) +--------------------------------------------- +stdout: +List ereports + +Usage: omdb db ereport list [OPTIONS] + +Options: + --log-level log level filter [env: LOG_LEVEL=] [default: warn] + -s, --serial Include only ereports from systems with the provided serial numbers + -i, --id Include only ereports from the provided reporter restart IDs + -b, --before Include only ereports collected before this timestamp + -a, --after Include only ereports collected after this timestamp + --color Color output [default: auto] [possible values: auto, always, never] + -h, --help Print help + +Connection Options: + --db-url URL of the database SQL interface [env: OMDB_DB_URL=] + --dns-server [env: OMDB_DNS_SERVER=] + +Database Options: + --fetch-limit limit to apply to queries that fetch rows [env: 
OMDB_FETCH_LIMIT=] + [default: 500] + --include-deleted whether to include soft-deleted records when enumerating objects that + can be soft-deleted + +Safety Options: + -w, --destructive Allow potentially-destructive subcommands +--------------------------------------------- +stderr: +============================================= +EXECUTING COMMAND: omdb ["db", "ereport", "reporters", "--help"] +termination: Exited(0) +--------------------------------------------- +stdout: +List ereport reporters + +Usage: omdb db ereport reporters [OPTIONS] [SERIAL] + +Arguments: + [SERIAL] + +Options: + --log-level log level filter [env: LOG_LEVEL=] [default: warn] + -t, --type + -s, --slot + --color Color output [default: auto] [possible values: auto, always, never] + -h, --help Print help + +Connection Options: + --db-url URL of the database SQL interface [env: OMDB_DB_URL=] + --dns-server [env: OMDB_DNS_SERVER=] + +Database Options: + --fetch-limit limit to apply to queries that fetch rows [env: OMDB_FETCH_LIMIT=] + [default: 500] + --include-deleted whether to include soft-deleted records when enumerating objects that + can be soft-deleted + +Safety Options: + -w, --destructive Allow potentially-destructive subcommands +--------------------------------------------- +stderr: +============================================= +EXECUTING COMMAND: omdb ["db", "ereport", "info", "--help"] +termination: Exited(0) +--------------------------------------------- +stdout: +Show an ereport + +Usage: omdb db ereport info [OPTIONS] + +Arguments: + The reporter restart UUID of the ereport to show + The ENA of the ereport within the reporter restart + +Options: + --log-level log level filter [env: LOG_LEVEL=] [default: warn] + --color Color output [default: auto] [possible values: auto, always, never] + -h, --help Print help + +Connection Options: + --db-url URL of the database SQL interface [env: OMDB_DB_URL=] + --dns-server [env: OMDB_DNS_SERVER=] + +Database Options: + --fetch-limit limit to 
apply to queries that fetch rows [env: OMDB_FETCH_LIMIT=] + [default: 500] + --include-deleted whether to include soft-deleted records when enumerating objects that + can be soft-deleted + Safety Options: -w, --destructive Allow potentially-destructive subcommands --------------------------------------------- From fb1ffd0a3d2378a45d011a90a871ecc8764634cb Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 16 Jun 2025 21:44:57 -0700 Subject: [PATCH 44/61] Update dbinit.sql Co-authored-by: Sean Klein --- schema/crdb/dbinit.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index c8e95026974..19fec2bc639 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -5764,7 +5764,7 @@ CREATE TABLE IF NOT EXISTS omicron.public.sp_ereport ( -- UUID of the Nexus instance that collected the ereport collector_id UUID NOT NULL, - -- physical lcoation of the reporting SP + -- physical location of the reporting SP -- -- these fields are always present, as they are how requests to collect -- ereports are indexed by MGS. 
From 723f61e4a146f366bdcb8975e8eaf46183bc5333 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 17 Jun 2025 10:43:58 -0700 Subject: [PATCH 45/61] omdb tweaks --- dev-tools/omdb/src/bin/omdb/db/ereport.rs | 143 ++++++++++++++-------- 1 file changed, 89 insertions(+), 54 deletions(-) diff --git a/dev-tools/omdb/src/bin/omdb/db/ereport.rs b/dev-tools/omdb/src/bin/omdb/db/ereport.rs index ebeab95b387..1feb94f77e8 100644 --- a/dev-tools/omdb/src/bin/omdb/db/ereport.rs +++ b/dev-tools/omdb/src/bin/omdb/db/ereport.rs @@ -31,6 +31,7 @@ use nexus_db_queries::db::queries::ALLOW_FULL_TABLE_SCAN_SQL; use nexus_db_schema::schema::host_ereport::dsl as host_dsl; use nexus_db_schema::schema::sp_ereport::dsl as sp_dsl; use omicron_uuid_kinds::GenericUuid; +use omicron_uuid_kinds::SledUuid; use tabled::Tabled; use uuid::Uuid; @@ -276,26 +277,6 @@ async fn cmd_db_ereport_list( Ok(()) } -#[derive(strum::Display)] -enum SrcType { - Switch, - Power, - #[strum(to_string = "Sled (SP)")] - SledSp, - // #[strum(to_string = "Sled (OS)")] - // SledOS, -} - -impl From for SrcType { - fn from(sp_type: SpType) -> Self { - match sp_type { - SpType::Power => SrcType::Power, - SpType::Sled => SrcType::SledSp, - SpType::Switch => SrcType::Switch, - } - } -} - async fn cmd_db_ereport_info( datastore: &DataStore, fetch_opts: &DbFetchOptions, @@ -413,6 +394,7 @@ async fn cmd_db_ereporters( args: &ReportersArgs, ) -> anyhow::Result<()> { let &ReportersArgs { slot, slot_type, ref serial } = args; + let slot_type = slot_type.map(nexus_db_model::SpType::from); let conn = datastore.pool_connection_for_tests().await?; let sp_ereporters = (*conn).transaction_async({ @@ -421,7 +403,7 @@ async fn cmd_db_ereporters( // Selecting all reporters may require a full table scan, depending // on filters. 
conn.batch_execute_async(ALLOW_FULL_TABLE_SCAN_SQL).await?; - let mut sp_query = sp_dsl::sp_ereport + let mut query = sp_dsl::sp_ereport .group_by(( sp_dsl::restart_id, sp_dsl::sp_slot, @@ -438,17 +420,11 @@ async fn cmd_db_ereporters( min(sp_dsl::time_collected), count_distinct(sp_dsl::ena), )) - .order_by(( - sp_dsl::sp_type, - sp_dsl::sp_slot, - sp_dsl::serial_number, - sp_dsl::restart_id, - )) .into_boxed(); if let Some(slot) = slot { if slot_type.is_some() { - sp_query = sp_query + query = query .filter(sp_dsl::sp_slot.eq(db::model::SqlU16::new(slot))); } else { anyhow::bail!( @@ -458,15 +434,15 @@ async fn cmd_db_ereporters( } if let Some(slot_type) = slot_type { - sp_query = sp_query - .filter(sp_dsl::sp_type.eq(SpType::from(slot_type))); + query = query + .filter(sp_dsl::sp_type.eq(slot_type)); } if let Some(serial) = serial { - sp_query = sp_query.filter(sp_dsl::serial_number.eq(serial.clone())); + query = query.filter(sp_dsl::serial_number.eq(serial.clone())); } - sp_query + query .load_async::<(Uuid, db::model::SqlU16, SpType, Option, Option, Option>, i64)>( &conn, ) @@ -474,16 +450,58 @@ async fn cmd_db_ereporters( } }).await?; + let host_ereporters = if slot_type != Some(SpType::Sled) || slot.is_some() { + // If we have a SP type filter or a SP slot number, don't include host + // OS reporters (for now). + // TODO: if the SP type is "sled", can we get the sled UUID for that + // slot from inventory? + eprintln!( + "selecting reporters with type {slot_type:?} and slot {slot:?} will \ + skip host OS ereports", + ); + Vec::new() + } else { + (*conn).transaction_async({ + let serial = serial.clone(); + async move |conn| { + // Selecting all reporters may require a full table scan, depending + // on filters. 
+ conn.batch_execute_async(ALLOW_FULL_TABLE_SCAN_SQL).await?; + let mut query = host_dsl::host_ereport + .group_by(( + host_dsl::restart_id, + host_dsl::sled_id, + host_dsl::sled_serial, + )) + .select(( + host_dsl::restart_id, + host_dsl::sled_id, + host_dsl::sled_serial, + min(host_dsl::time_collected), + count_distinct(host_dsl::ena), + )) + .into_boxed(); + + if let Some(serial) = serial { + query = query.filter(host_dsl::sled_serial.eq(serial.clone())); + } + + query + .load_async::<(Uuid, Uuid, String, Option>, i64)>( + &conn, + ) + .await.context("listing host OS reporter entries") + } + }).await? + }; + #[derive(Tabled)] #[tabled(rename_all = "SCREAMING_SNAKE_CASE")] struct ReporterRow { #[tabled(display_with = "datetime_opt_rfc3339_concise")] first_seen: Option>, id: Uuid, - #[tabled(rename = "TYPE")] - ty: SrcType, - #[tabled(rename = "#")] - slot: u16, + identity: db::model::Reporter, #[tabled(display_with = "display_option_blank", rename = "S/N")] serial: Option, #[tabled(display_with = "display_option_blank", rename = "P/N")] @@ -491,28 +509,45 @@ async fn cmd_db_ereporters( ereports: i64, } - let rows = sp_ereporters.into_iter().map( - |( - restart_id, - slot, - sp_type, - serial, - part_number, - first_seen, - ereports, - )| { - ReporterRow { - first_seen, - ty: sp_type.into(), - slot: slot.into(), + let rows = { + let sp_rows = sp_ereporters.into_iter().map( + |( + restart_id, + slot, + sp_type, serial, part_number, + first_seen, + ereports, + )| { + ReporterRow { + first_seen, + identity: db::model::Reporter::Sp { slot: slot.0, sp_type }, + serial, + part_number, + id: restart_id, + ereports, + } + }, + ); + let host_rows = host_ereporters.into_iter().map( + |(restart_id, sled_id, serial, first_seen, ereports)| ReporterRow { + first_seen, + identity: db::model::Reporter::HostOs { + sled: SledUuid::from_untyped_uuid(sled_id), + }, + serial: Some(serial), + part_number: None, id: restart_id, ereports, - } - }, - ); - let mut table = 
tabled::Table::new(rows.into_iter()); + }, + ); + let mut rows = sp_rows.chain(host_rows).collect::>(); + rows.sort_by_key(|row| row.first_seen); + rows + }; + + let mut table = tabled::Table::new(rows); table .with(tabled::settings::Style::empty()) From 7d91abf38ad4f48e32261da7af8b248656323110 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 17 Jun 2025 10:45:49 -0700 Subject: [PATCH 46/61] remove query expectorate tests --- nexus/db-queries/src/db/datastore/ereport.rs | 26 ------------------- .../tests/output/host_latest_ereport_id.sql | 10 ------- .../tests/output/sp_latest_ereport_id.sql | 10 ------- 3 files changed, 46 deletions(-) delete mode 100644 nexus/db-queries/tests/output/host_latest_ereport_id.sql delete mode 100644 nexus/db-queries/tests/output/sp_latest_ereport_id.sql diff --git a/nexus/db-queries/src/db/datastore/ereport.rs b/nexus/db-queries/src/db/datastore/ereport.rs index 7dc76f9c010..602f711d4e3 100644 --- a/nexus/db-queries/src/db/datastore/ereport.rs +++ b/nexus/db-queries/src/db/datastore/ereport.rs @@ -340,23 +340,8 @@ mod tests { use super::*; use crate::db::explain::ExplainableAsync; use crate::db::pub_test_utils::TestDatabase; - use crate::db::raw_query_builder::expectorate_query_contents; use omicron_test_utils::dev; - #[tokio::test] - async fn expectorate_sp_latest_ereport_id() { - let query = DataStore::sp_latest_ereport_id_query( - SpType::Sled, - SpMgsSlot::from(SqlU16::new(1)), - ); - - expectorate_query_contents( - &query, - "tests/output/sp_latest_ereport_id.sql", - ) - .await; - } - #[tokio::test] async fn explain_sp_latest_ereport_id() { let logctx = dev::test_setup_log("explain_sp_latest_ereport_id"); @@ -385,17 +370,6 @@ mod tests { logctx.cleanup_successful(); } - #[tokio::test] - async fn expectorate_host_latest_ereport_id() { - let query = DataStore::host_latest_ereport_id_query(SledUuid::nil()); - - expectorate_query_contents( - &query, - "tests/output/host_latest_ereport_id.sql", - ) - .await; - } - 
#[tokio::test] async fn explain_host_latest_ereport_id() { let logctx = dev::test_setup_log("explain_host_latest_ereport_id"); diff --git a/nexus/db-queries/tests/output/host_latest_ereport_id.sql b/nexus/db-queries/tests/output/host_latest_ereport_id.sql deleted file mode 100644 index a6ae2ffcd08..00000000000 --- a/nexus/db-queries/tests/output/host_latest_ereport_id.sql +++ /dev/null @@ -1,10 +0,0 @@ -SELECT - host_ereport.restart_id, host_ereport.ena -FROM - host_ereport -WHERE - host_ereport.sled_id = $1 -ORDER BY - host_ereport.time_collected DESC, host_ereport.ena DESC -LIMIT - $2 diff --git a/nexus/db-queries/tests/output/sp_latest_ereport_id.sql b/nexus/db-queries/tests/output/sp_latest_ereport_id.sql deleted file mode 100644 index 304afdcdf41..00000000000 --- a/nexus/db-queries/tests/output/sp_latest_ereport_id.sql +++ /dev/null @@ -1,10 +0,0 @@ -SELECT - sp_ereport.restart_id, sp_ereport.ena -FROM - sp_ereport -WHERE - sp_ereport.sp_type = $1 AND sp_ereport.sp_slot = $2 -ORDER BY - sp_ereport.time_collected DESC, sp_ereport.ena DESC -LIMIT - $3 From f9c9fa03acf60f2933fc524fb917f8ef268a1f6d Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 17 Jun 2025 11:22:03 -0700 Subject: [PATCH 47/61] add EXPLAIN tests for restart ID list queries --- nexus/db-queries/src/db/datastore/ereport.rs | 136 ++++++++++++++----- 1 file changed, 100 insertions(+), 36 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/ereport.rs b/nexus/db-queries/src/db/datastore/ereport.rs index 602f711d4e3..9a331cb75e3 100644 --- a/nexus/db-queries/src/db/datastore/ereport.rs +++ b/nexus/db-queries/src/db/datastore/ereport.rs @@ -21,6 +21,7 @@ use crate::db::pagination::paginated; use async_bb8_diesel::AsyncRunQueryDsl; use chrono::DateTime; use chrono::Utc; +use diesel::dsl::{count_distinct, min}; use diesel::prelude::*; use nexus_db_errors::ErrorHandler; use nexus_db_errors::public_error_from_diesel; @@ -125,50 +126,23 @@ impl DataStore { opctx: &OpContext, serial: String, ) 
-> ListResultVec { - use diesel::dsl::{count_distinct, min}; opctx.authorize(authz::Action::ListChildren, &authz::FLEET).await?; let conn = &*self.pool_connection_authorized(opctx).await?; - let sp_rows = sp_dsl::sp_ereport - .filter( - sp_dsl::serial_number - .eq(serial.clone()) - .and(sp_dsl::time_deleted.is_null()), - ) - .group_by((sp_dsl::restart_id, sp_dsl::sp_slot, sp_dsl::sp_type)) - .select(( - sp_dsl::restart_id, - sp_dsl::sp_type, - sp_dsl::sp_slot, - min(sp_dsl::time_collected), - count_distinct(sp_dsl::ena), - )) - .order_by(sp_dsl::restart_id) + let sp_rows = Self::sp_restart_list_by_serial_query(serial.clone()) .load_async::<(Uuid, SpType, SqlU16, Option>, SqlU32)>( conn, ) .await .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; - let host_os_rows = host_dsl::host_ereport - .filter( - host_dsl::sled_serial - .eq(serial) - .and(host_dsl::time_deleted.is_null()), - ) - .group_by((host_dsl::restart_id, host_dsl::sled_id)) - .select(( - host_dsl::restart_id, - host_dsl::sled_id, - min(host_dsl::time_collected), - count_distinct(host_dsl::ena), - )) - .order_by(host_dsl::restart_id) - .load_async::<(Uuid, Uuid, Option>, SqlU32)>(conn) - .await - .map_err(|e| { - public_error_from_diesel(e, ErrorHandler::Server) - .internal_context("listing SP ereports") - })?; + let host_os_rows = + Self::host_restart_list_by_serial_query(serial.clone()) + .load_async::<(Uuid, Uuid, Option>, SqlU32)>(conn) + .await + .map_err(|e| { + public_error_from_diesel(e, ErrorHandler::Server) + .internal_context("listing SP ereports") + })?; let sp_reporters = sp_rows.into_iter().filter_map(|(restart_id, sp_type, sp_slot, first_seen, ereports)| { let ereports = ereports.into(); @@ -226,6 +200,46 @@ impl DataStore { Ok(sp_reporters.chain(host_reporters).collect::>()) } + fn sp_restart_list_by_serial_query( + serial: String, + ) -> impl RunnableQuery<(Uuid, SpType, SqlU16, Option>, SqlU32)> + { + sp_dsl::sp_ereport + .filter( + sp_dsl::serial_number + 
.eq(serial.clone()) + .and(sp_dsl::time_deleted.is_null()), + ) + .group_by((sp_dsl::restart_id, sp_dsl::sp_slot, sp_dsl::sp_type)) + .select(( + sp_dsl::restart_id, + sp_dsl::sp_type, + sp_dsl::sp_slot, + min(sp_dsl::time_collected), + count_distinct(sp_dsl::ena), + )) + .order_by(sp_dsl::restart_id) + } + + fn host_restart_list_by_serial_query( + serial: String, + ) -> impl RunnableQuery<(Uuid, Uuid, Option>, SqlU32)> { + host_dsl::host_ereport + .filter( + host_dsl::sled_serial + .eq(serial) + .and(host_dsl::time_deleted.is_null()), + ) + .group_by((host_dsl::restart_id, host_dsl::sled_id)) + .select(( + host_dsl::restart_id, + host_dsl::sled_id, + min(host_dsl::time_collected), + count_distinct(host_dsl::ena), + )) + .order_by(host_dsl::restart_id) + } + pub async fn sp_latest_ereport_id( &self, opctx: &OpContext, @@ -394,4 +408,54 @@ mod tests { db.terminate().await; logctx.cleanup_successful(); } + + #[tokio::test] + async fn explain_host_restart_list_by_serial() { + let logctx = dev::test_setup_log("explain_host_restart_list_by_serial"); + let db = TestDatabase::new_with_pool(&logctx.log).await; + let pool = db.pool(); + let conn = pool.claim().await.unwrap(); + + let query = DataStore::host_restart_list_by_serial_query(String::new()); + let explanation = query + .explain_async(&conn) + .await + .expect("Failed to explain query - is it valid SQL?"); + + eprintln!("{explanation}"); + + assert!( + !explanation.contains("FULL SCAN"), + "Found an unexpected FULL SCAN: {}", + explanation + ); + + db.terminate().await; + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn explain_sp_restart_list_by_serial() { + let logctx = dev::test_setup_log("explain_sp_restart_list_by_serial"); + let db = TestDatabase::new_with_pool(&logctx.log).await; + let pool = db.pool(); + let conn = pool.claim().await.unwrap(); + + let query = DataStore::sp_restart_list_by_serial_query(String::new()); + let explanation = query + .explain_async(&conn) + .await + .expect("Failed 
to explain query - is it valid SQL?"); + + eprintln!("{explanation}"); + + assert!( + !explanation.contains("FULL SCAN"), + "Found an unexpected FULL SCAN: {}", + explanation + ); + + db.terminate().await; + logctx.cleanup_successful(); + } } From 16f9e5b1e99678bdaf7328fcdb3b46807fe335d9 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 17 Jun 2025 11:24:45 -0700 Subject: [PATCH 48/61] add indices for ereports-by-serial queries --- schema/crdb/dbinit.sql | 16 +++++++++++++++- schema/crdb/ereports/up04.sql | 24 +++++------------------- schema/crdb/ereports/up05.sql | 21 ++++++++++++++++++--- schema/crdb/ereports/up06.sql | 9 +++------ schema/crdb/ereports/up07.sql | 7 +++++++ schema/crdb/ereports/up08.sql | 5 +++++ 6 files changed, 53 insertions(+), 29 deletions(-) create mode 100644 schema/crdb/ereports/up07.sql create mode 100644 schema/crdb/ereports/up08.sql diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 19fec2bc639..78a25f1af00 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -5801,6 +5801,13 @@ USING BTREE ( WHERE time_deleted IS NULL; + +CREATE INDEX IF NOT EXISTS lookup_sp_ereports_by_serial +ON omicron.public.sp_ereport ( + serial_number +) WHERE + time_deleted IS NULL; + -- Ereports from the host operating system CREATE TABLE IF NOT EXISTS omicron.public.host_ereport ( restart_id UUID NOT NULL, @@ -5822,7 +5829,8 @@ CREATE TABLE IF NOT EXISTS omicron.public.host_ereport ( PRIMARY KEY (restart_id, ena) ); -CREATE INDEX IF NOT EXISTS lookup_host_ereports_by_sled ON omicron.public.host_ereport ( +CREATE INDEX IF NOT EXISTS lookup_host_ereports_by_sled +ON omicron.public.host_ereport ( sled_id, time_collected ); @@ -5835,6 +5843,12 @@ USING BTREE ( WHERE time_deleted IS NULL; +CREATE INDEX IF NOT EXISTS lookup_host_ereports_by_serial +ON omicron.public.host_ereport ( + sled_serial +) WHERE + time_deleted IS NULL; + /* * Keep this at the end of file so that the database does not contain a version * until it is fully 
populated. diff --git a/schema/crdb/ereports/up04.sql b/schema/crdb/ereports/up04.sql index e58e9773ff9..0c59da0d5bb 100644 --- a/schema/crdb/ereports/up04.sql +++ b/schema/crdb/ereports/up04.sql @@ -1,19 +1,5 @@ -CREATE TABLE IF NOT EXISTS omicron.public.host_ereport ( - restart_id UUID NOT NULL, - ena INT8 NOT NULL, - time_deleted TIMESTAMPTZ, - - -- time at which the ereport was collected - time_collected TIMESTAMPTZ NOT NULL, - -- UUID of the Nexus instance that collected the ereport - collector_id UUID NOT NULL, - - -- identity of the reporting sled - sled_id UUID NOT NULL, - sled_serial TEXT NOT NULL, - - -- JSON representation of the ereport - report JSONB NOT NULL, - - PRIMARY KEY (restart_id, ena) -); \ No newline at end of file +CREATE INDEX IF NOT EXISTS lookup_sp_ereports_by_serial +ON omicron.public.sp_ereport ( + serial_number +) WHERE + time_deleted IS NULL; diff --git a/schema/crdb/ereports/up05.sql b/schema/crdb/ereports/up05.sql index c749258887a..e58e9773ff9 100644 --- a/schema/crdb/ereports/up05.sql +++ b/schema/crdb/ereports/up05.sql @@ -1,4 +1,19 @@ -CREATE INDEX IF NOT EXISTS lookup_host_ereports_by_sled ON omicron.public.host_ereport ( - sled_id, - time_collected +CREATE TABLE IF NOT EXISTS omicron.public.host_ereport ( + restart_id UUID NOT NULL, + ena INT8 NOT NULL, + time_deleted TIMESTAMPTZ, + + -- time at which the ereport was collected + time_collected TIMESTAMPTZ NOT NULL, + -- UUID of the Nexus instance that collected the ereport + collector_id UUID NOT NULL, + + -- identity of the reporting sled + sled_id UUID NOT NULL, + sled_serial TEXT NOT NULL, + + -- JSON representation of the ereport + report JSONB NOT NULL, + + PRIMARY KEY (restart_id, ena) ); \ No newline at end of file diff --git a/schema/crdb/ereports/up06.sql b/schema/crdb/ereports/up06.sql index 20141fd8e47..c749258887a 100644 --- a/schema/crdb/ereports/up06.sql +++ b/schema/crdb/ereports/up06.sql @@ -1,7 +1,4 @@ -CREATE INDEX IF NOT EXISTS 
order_host_ereports_by_timestamp -ON omicron.public.host_ereport -USING BTREE ( +CREATE INDEX IF NOT EXISTS lookup_host_ereports_by_sled ON omicron.public.host_ereport ( + sled_id, time_collected -) -WHERE - time_deleted IS NULL; \ No newline at end of file +); \ No newline at end of file diff --git a/schema/crdb/ereports/up07.sql b/schema/crdb/ereports/up07.sql new file mode 100644 index 00000000000..20141fd8e47 --- /dev/null +++ b/schema/crdb/ereports/up07.sql @@ -0,0 +1,7 @@ +CREATE INDEX IF NOT EXISTS order_host_ereports_by_timestamp +ON omicron.public.host_ereport +USING BTREE ( + time_collected +) +WHERE + time_deleted IS NULL; \ No newline at end of file diff --git a/schema/crdb/ereports/up08.sql b/schema/crdb/ereports/up08.sql new file mode 100644 index 00000000000..22290b2bde6 --- /dev/null +++ b/schema/crdb/ereports/up08.sql @@ -0,0 +1,5 @@ +CREATE INDEX IF NOT EXISTS lookup_host_ereports_by_serial +ON omicron.public.host_ereport ( + sled_serial +) WHERE + time_deleted IS NULL; From e0d26ae3896d1b820a17d39895203046e7fdf340 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 17 Jun 2025 11:25:46 -0700 Subject: [PATCH 49/61] rm dead code --- nexus/db-queries/src/db/datastore/ereport.rs | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/ereport.rs b/nexus/db-queries/src/db/datastore/ereport.rs index 9a331cb75e3..721bc796e3f 100644 --- a/nexus/db-queries/src/db/datastore/ereport.rs +++ b/nexus/db-queries/src/db/datastore/ereport.rs @@ -49,16 +49,6 @@ pub struct EreporterRestartBySerial { } impl DataStore { - // pub async fn sp_ereport_list_by_serial( - // &self, - // opctx: &OpContext, - // serial: String, - // time_range: impl RangeBounds>, - // pagparams: - // ) -> ListResultVec { - // todo!() - // } - /// Fetch an ereport by its restart ID and ENA. 
/// /// This function queries both the service-processor and host OS ereport From 372d582602d61d7fa98e80eee99a5f7a937d9bda Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 17 Jun 2025 11:41:00 -0700 Subject: [PATCH 50/61] we can reasonably panic if first_seen_at is null --- nexus/db-queries/src/db/datastore/ereport.rs | 79 +++++++------------- 1 file changed, 28 insertions(+), 51 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/ereport.rs b/nexus/db-queries/src/db/datastore/ereport.rs index 721bc796e3f..f96a0c6172c 100644 --- a/nexus/db-queries/src/db/datastore/ereport.rs +++ b/nexus/db-queries/src/db/datastore/ereport.rs @@ -134,59 +134,36 @@ impl DataStore { .internal_context("listing SP ereports") })?; - let sp_reporters = sp_rows.into_iter().filter_map(|(restart_id, sp_type, sp_slot, first_seen, ereports)| { - let ereports = ereports.into(); - let first_seen_at = match first_seen { - Some(t) => t, - None => { - debug_assert!(false, "ELIZA: why would min(time_collected) be none?"); - slog::warn!( - opctx.log, - "SP row returned by `ereporter_restart_list_by_serial` \ - had a `None` value for `min(time_collected)`"; - "restart_id" => ?restart_id, - "sp_type" => ?sp_type, - "sp_slot" => ?sp_slot, - "first_seen" => ?first_seen, - "count" => ?ereports, - ); - return None; + const FIRST_SEEN_NOT_NULL: &str = "`min(time_collected)` should never \ + return `NULL`, since the `time_collected` column is not nullable, \ + and the `SELECT` clause should return nothing if the result set \ + is empty"; + + let sp_reporters = sp_rows.into_iter().map( + |(restart_id, sp_type, sp_slot, first_seen, ereports)| { + EreporterRestartBySerial { + id: EreporterRestartUuid::from_untyped_uuid(restart_id), + reporter_kind: Reporter::Sp { + sp_type, + slot: sp_slot.into(), + }, + first_seen_at: first_seen.expect(FIRST_SEEN_NOT_NULL), + ereports: ereports.into(), } - }; - - Some(EreporterRestartBySerial { - id: EreporterRestartUuid::from_untyped_uuid(restart_id), - 
reporter_kind: Reporter::Sp { sp_type, slot: sp_slot.into() }, - first_seen_at, - ereports, - }) - }); - let host_reporters = host_os_rows.into_iter().filter_map(|(restart_id, sled_id, first_seen, ereports)| { - let ereports = ereports.into(); - let first_seen_at = match first_seen { - Some(t) => t, - None => { - debug_assert!(false, "ELIZA: why would min(time_collected) be none?"); - slog::warn!( - opctx.log, - "SP row returned by `ereporter_restart_list_by_serial` \ - had a `None` value for `min(time_collected)`"; - "restart_id" => ?restart_id, - "sled_id" => ?sled_id, - "first_seen" => ?first_seen, - "ereports" => ?ereports, - ); - return None; + }, + ); + let host_reporters = host_os_rows.into_iter().map( + |(restart_id, sled_id, first_seen, ereports)| { + EreporterRestartBySerial { + id: EreporterRestartUuid::from_untyped_uuid(restart_id), + reporter_kind: Reporter::HostOs { + sled: SledUuid::from_untyped_uuid(sled_id), + }, + first_seen_at: first_seen.expect(FIRST_SEEN_NOT_NULL), + ereports: ereports.into(), } - }; - - Some(EreporterRestartBySerial { - id: EreporterRestartUuid::from_untyped_uuid(restart_id), - reporter_kind: Reporter::HostOs { sled: SledUuid::from_untyped_uuid(sled_id) }, - first_seen_at, - ereports, - }) - }); + }, + ); Ok(sp_reporters.chain(host_reporters).collect::>()) } From d9f80193e186811dcdd9bc379f46997e434949d0 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 18 Jun 2025 10:10:20 -0700 Subject: [PATCH 51/61] add tests for parsing ENAs, make sure FromStr always roundtrips --- ereport/types/src/lib.rs | 42 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 39 insertions(+), 3 deletions(-) diff --git a/ereport/types/src/lib.rs b/ereport/types/src/lib.rs index 412f14658f3..9727684a6c8 100644 --- a/ereport/types/src/lib.rs +++ b/ereport/types/src/lib.rs @@ -39,7 +39,7 @@ pub struct Ena(pub u64); impl fmt::Display for Ena { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "{:x}", self.0) + write!(f, 
"{:#x}", self.0) } } @@ -51,13 +51,13 @@ impl fmt::Debug for Ena { impl fmt::UpperHex for Ena { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - fmt::UpperHex::fmt(&self.0, f) + write!(f, "{:#X}", self.0) } } impl fmt::LowerHex for Ena { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - fmt::LowerHex::fmt(&self.0, f) + write!(f, "{:#x}", self.0) } } @@ -200,4 +200,40 @@ mod tests { .expect("ereport should deserialize"); eprintln!("EREPORT: {deserialized:#?}"); } + + #[test] + fn test_ena_from_str() { + let expected = [ + ("1", Ena(1)), + ("0x1", Ena(1)), + // Should parse in base 10 + ("16", Ena(16)), + // Should parse hexadecimally + ("0x16", Ena(0x16)), + ("0X16", Ena(0x16)), + // ENA spotted "in the wild", taken from: + // https://rfd.shared.oxide.computer/rfd/0520#_ereports_in_the_fma + ("0x3cae76440c100001", Ena(0x3cae76440c100001)), + ]; + + for (input, expected) in expected { + let actual = Ena::from_str(dbg!(input)).expect("should parse"); + assert_eq!(dbg!(expected), dbg!(actual)); + } + } + + #[test] + fn test_ena_from_str_roundtrip() { + let enas = [Ena(1), Ena(16), Ena(4200), Ena(0x3cae76440c100001)]; + + for ena in enas { + let ena = dbg!(ena); + let display = format!("{ena}"); + assert_eq!(dbg!(display).parse::(), Ok(ena)); + let upperhex = format!("{ena:X}"); + assert_eq!(dbg!(upperhex).parse::(), Ok(ena)); + let lowerhex = format!("{ena:x}"); + assert_eq!(dbg!(lowerhex).parse::(), Ok(ena)); + } + } } From 06ce0a8feb70063e1a40533b5a26246a59fde985 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 18 Jun 2025 14:41:58 -0700 Subject: [PATCH 52/61] add `WHERE time_deleted IS NULL` to ereport indices --- schema/crdb/dbinit.sql | 8 ++++++-- schema/crdb/ereports/up02.sql | 4 +++- schema/crdb/ereports/up06.sql | 4 +++- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 78a25f1af00..623436fa8b8 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ 
-5791,7 +5791,9 @@ ON omicron.public.sp_ereport ( sp_type, sp_slot, time_collected -); +) +where + time_deleted IS NULL; CREATE INDEX IF NOT EXISTS order_sp_ereports_by_timestamp ON omicron.public.sp_ereport @@ -5833,7 +5835,9 @@ CREATE INDEX IF NOT EXISTS lookup_host_ereports_by_sled ON omicron.public.host_ereport ( sled_id, time_collected -); +) +WHERE + time_deleted IS NULL; CREATE INDEX IF NOT EXISTS order_host_ereports_by_timestamp ON omicron.public.host_ereport diff --git a/schema/crdb/ereports/up02.sql b/schema/crdb/ereports/up02.sql index 2cf34912e35..8b5a8afbf42 100644 --- a/schema/crdb/ereports/up02.sql +++ b/schema/crdb/ereports/up02.sql @@ -3,4 +3,6 @@ ON omicron.public.sp_ereport ( sp_type, sp_slot, time_collected -); \ No newline at end of file +) +WHERE + time_deleted IS NULL; diff --git a/schema/crdb/ereports/up06.sql b/schema/crdb/ereports/up06.sql index c749258887a..6a43c7fa924 100644 --- a/schema/crdb/ereports/up06.sql +++ b/schema/crdb/ereports/up06.sql @@ -1,4 +1,6 @@ CREATE INDEX IF NOT EXISTS lookup_host_ereports_by_sled ON omicron.public.host_ereport ( sled_id, time_collected -); \ No newline at end of file +) +WHERE + time_deleted IS NULL; From b1bce521d54bb846bc1523983a5cca87bc34503e Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 23 Jun 2025 10:40:40 -0700 Subject: [PATCH 53/61] add not deleted filter to latest ereport queries this ensures that the index is actually used --- the change in 06ce0a8feb70063e1a40533b5a26246a59fde985 added `WHERE time_deleted IS NULL` to the indices to look up ereports by sled/SP slot, and this made the latest-ereport queries start doing a full table scan. adding the filter clause fixes that. 
--- nexus/db-queries/src/db/datastore/ereport.rs | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/ereport.rs b/nexus/db-queries/src/db/datastore/ereport.rs index f96a0c6172c..a7cab6f9513 100644 --- a/nexus/db-queries/src/db/datastore/ereport.rs +++ b/nexus/db-queries/src/db/datastore/ereport.rs @@ -244,7 +244,12 @@ impl DataStore { slot: SpMgsSlot, ) -> impl RunnableQuery { sp_dsl::sp_ereport - .filter(sp_dsl::sp_type.eq(sp_type).and(sp_dsl::sp_slot.eq(slot))) + .filter( + sp_dsl::sp_type + .eq(sp_type) + .and(sp_dsl::sp_slot.eq(slot)) + .and(sp_dsl::time_deleted.is_null()), + ) .order_by((sp_dsl::time_collected.desc(), sp_dsl::ena.desc())) .limit(1) .select((sp_dsl::restart_id, sp_dsl::ena)) @@ -269,7 +274,11 @@ impl DataStore { sled_id: SledUuid, ) -> impl RunnableQuery { host_dsl::host_ereport - .filter(host_dsl::sled_id.eq(sled_id.into_untyped_uuid())) + .filter( + host_dsl::sled_id + .eq(sled_id.into_untyped_uuid()) + .and(host_dsl::time_deleted.is_null()), + ) .order_by((host_dsl::time_collected.desc(), host_dsl::ena.desc())) .limit(1) .select((host_dsl::restart_id, host_dsl::ena)) From c2914b17ad60cb5e3b3d70e6ecbe75afd80f7e50 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 23 Jun 2025 10:42:56 -0700 Subject: [PATCH 54/61] get list of SP IDs from MGS instead of making them up as suggested by @jgallagher in https://github.com/oxidecomputer/omicron/pull/8296#discussion_r2155401104. this way, we no longer hard-code SP IDs in Nexus, which seems much nicer. 
--- dev-tools/omdb/src/bin/omdb/nexus.rs | 11 +- .../app/background/tasks/ereport_ingester.rs | 142 +++++++++++------- nexus/types/src/internal_api/background.rs | 2 +- 3 files changed, 96 insertions(+), 59 deletions(-) diff --git a/dev-tools/omdb/src/bin/omdb/nexus.rs b/dev-tools/omdb/src/bin/omdb/nexus.rs index dfd4e029b07..b6b1412414d 100644 --- a/dev-tools/omdb/src/bin/omdb/nexus.rs +++ b/dev-tools/omdb/src/bin/omdb/nexus.rs @@ -2728,7 +2728,7 @@ fn print_task_sp_ereport_ingester(details: &serde_json::Value) { use nexus_types::internal_api::background::SpEreportIngesterStatus; use nexus_types::internal_api::background::SpEreporterStatus; - let SpEreportIngesterStatus { sps, error } = + let SpEreportIngesterStatus { sps, errors } = match serde_json::from_value(details.clone()) { Err(error) => { eprintln!( @@ -2740,8 +2740,11 @@ fn print_task_sp_ereport_ingester(details: &serde_json::Value) { Ok(status) => status, }; - if let Some(error) = error { - println!("{ERRICON} task activation failed: {error}"); + if !errors.is_empty() { + println!("{ERRICON} errors:"); + for error in errors { + println!(" > {error}"); + } } print_ereporter_status_totals(sps.iter().map(|sp| &sp.status)); @@ -2754,7 +2757,7 @@ fn print_task_sp_ereport_ingester(details: &serde_json::Value) { const NUM_WIDTH: usize = 3; if !sps.is_empty() { - println!("\n Service processors:"); + println!("\n service processors:"); for SpEreporterStatus { sp_type, slot, status } in &sps { println!( " - {sp_type:<6} {slot:02}: {:>NUM_WIDTH$} ereports", diff --git a/nexus/src/app/background/tasks/ereport_ingester.rs b/nexus/src/app/background/tasks/ereport_ingester.rs index 6700ff524fb..c31140f65cb 100644 --- a/nexus/src/app/background/tasks/ereport_ingester.rs +++ b/nexus/src/app/background/tasks/ereport_ingester.rs @@ -5,7 +5,6 @@ //! 
Background tasks for ereport ingestion use crate::app::background::BackgroundTask; -use anyhow::Context; use chrono::Utc; use ereport_types::Ena; use ereport_types::Ereport; @@ -42,13 +41,7 @@ impl BackgroundTask for SpEreportIngester { opctx: &'a OpContext, ) -> BoxFuture<'a, serde_json::Value> { Box::pin(async move { - let status = - self.actually_activate(opctx).await.unwrap_or_else(|error| { - SpEreportIngesterStatus { - error: Some(format!("{error:#}")), - ..Default::default() - } - }); + let status = self.actually_activate(opctx).await; serde_json::json!(status) }) } @@ -67,50 +60,97 @@ impl SpEreportIngester { async fn actually_activate( &mut self, opctx: &OpContext, - ) -> anyhow::Result { + ) -> SpEreportIngesterStatus { + let mut status = SpEreportIngesterStatus::default(); // Find MGS clients. // TODO(eliza): reuse the same client across activations; qorb, etc. - let mgs_clients = self - .resolver - .lookup_all_socket_v6(ServiceName::ManagementGatewayService) - .await - .context("looking up MGS addresses")? 
- .into_iter() - .map(|addr| { - let url = format!("http://{addr}"); - let log = opctx.log.new(o!("gateway_url" => url.clone())); - let client = gateway_client::Client::new(&url, log); - GatewayClient { addr, client } - }) - .collect::>(); + let mgs_clients = { + let lookup = self + .resolver + .lookup_all_socket_v6(ServiceName::ManagementGatewayService) + .await; + let addrs = match lookup { + Err(error) => { + const MSG: &str = "failed to resolve MGS addresses"; + error!(opctx.log, "{MSG}"; "error" => ?error); + status.errors.push(format!("{MSG}: {error}")); + return status; + } + Ok(addrs) => addrs, + }; + + addrs + .into_iter() + .map(|addr| { + let url = format!("http://{addr}"); + let log = opctx.log.new(o!("gateway_url" => url.clone())); + let client = gateway_client::Client::new(&url, log); + GatewayClient { addr, client } + }) + .collect::>() + }; + + if mgs_clients.is_empty() { + const MSG: &str = "no MGS addresses resolved"; + error!(opctx.log, "{MSG}"); + status.errors.push(MSG.to_string()); + return status; + }; + + // Ask MGS for the list of all SP identifiers. If a request to the first + // gateway fails, we'll try again for every resolved MGS before giving + // up. + let sps = { + let mut gateways = mgs_clients.iter(); + loop { + let Some(GatewayClient { addr, client }) = gateways.next() + else { + const MSG: &str = "no MGS successfully returned SP ID list"; + error!(opctx.log, "{MSG}"); + status.errors.push(MSG.to_string()); + return status; + }; + match client.sp_all_ids().await { + Ok(ids) => break ids.into_inner(), + Err(err) => { + const MSG: &str = "failed to list SP IDs from MGS"; + warn!( + opctx.log, + "{MSG}"; + "error" => %err, + "gateway_addr" => %addr, + ); + status.errors.push(format!("{MSG} ({addr}): {err}")); + } + } + } + }; // TODO(eliza): what seems like an appropriate parallelism? should we // just do 16? 
let mut tasks = ParallelTaskSet::new(); - let mut status = SpEreportIngesterStatus::default(); - let sleds = - (0..32u16).map(|slot| (nexus_types::inventory::SpType::Sled, slot)); - let switches = (0..2u16) - .map(|slot| (nexus_types::inventory::SpType::Switch, slot)); - let pscs = - (0..2u16).map(|slot| (nexus_types::inventory::SpType::Power, slot)); - - for (sp_type, slot) in switches.chain(pscs).chain(sleds) { + for gateway_client::types::SpIdentifier { type_, slot } in sps { + let Ok(slot) = u16::try_from(slot) else { + const MSG: &str = "invalid slot number received from MGS"; + error!(opctx.log, "{MSG}"; "sp_type" => %type_, "slot" => %slot); + status.errors.push(format!("{MSG}: {type_} {slot}")); + continue; + }; let sp_result = tasks .spawn({ let opctx = opctx.child(BTreeMap::from([ // XXX(eliza): that's so many little strings... :( - ("sp_type".to_string(), sp_type.to_string()), + ("sp_type".to_string(), type_.to_string()), ("slot".to_string(), slot.to_string()), ])); let clients = mgs_clients.clone(); let ingester = self.inner.clone(); async move { let status = ingester - .ingest_sp_ereports(opctx, &clients, sp_type, slot) + .ingest_sp_ereports(opctx, &clients, type_, slot) .await?; - Some(SpEreporterStatus { sp_type, slot, status }) + Some(SpEreporterStatus { sp_type: type_, slot, status }) } }) .await; @@ -129,7 +169,7 @@ impl SpEreportIngester { // Sort statuses for consistent output in OMDB commands. 
status.sps.sort_unstable_by_key(|sp| (sp.sp_type, sp.slot)); - Ok(status) + status } } @@ -431,16 +471,13 @@ mod tests { nexus.id(), ); - let activation1 = ingester - .actually_activate(&opctx) - .await - .expect("activation should succeed"); - dbg!(&activation1); - assert_eq!( - activation1.error.as_ref(), - None, - "there should be no top-level activation error" + let activation1 = ingester.actually_activate(&opctx).await; + assert!( + activation1.errors.is_empty(), + "activation 1 should succeed without errors, but saw: {:#?}", + activation1.errors ); + dbg!(&activation1); assert_eq!( activation1.sps.len(), 3, @@ -606,17 +643,14 @@ mod tests { // Activate the task again and assert that no new ereports were // ingested. - let activation2 = ingester - .actually_activate(&opctx) - .await - .expect("activation 2 should succeed"); - dbg!(&activation1); - assert_eq!( - activation2.error.as_ref(), - None, - "there should be no top-level activation error for the second \ - activation" + let activation2 = ingester.actually_activate(&opctx).await; + assert!( + activation2.errors.is_empty(), + "activation 2 should succeed without errors, but saw: {:#?}", + activation2.errors ); + dbg!(&activation2); + assert_eq!(activation2.sps, &[], "no new ereports should be observed"); check_sp_ereports_exist( diff --git a/nexus/types/src/internal_api/background.rs b/nexus/types/src/internal_api/background.rs index b164337997f..e724795bceb 100644 --- a/nexus/types/src/internal_api/background.rs +++ b/nexus/types/src/internal_api/background.rs @@ -533,7 +533,7 @@ pub struct ReadOnlyRegionReplacementStartStatus { #[derive(Serialize, Deserialize, Default, Debug, PartialEq, Eq)] pub struct SpEreportIngesterStatus { pub sps: Vec, - pub error: Option, + pub errors: Vec, } #[derive(Serialize, Deserialize, Debug, PartialEq, Eq)] From 543a99fc7e663ea56cb7ef533379cff25a07ff26 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 23 Jun 2025 10:45:21 -0700 Subject: [PATCH 55/61] remove duplicate 
SP type/slot keys in logs these are now added to the OpContext metadata, so no sense in repeating them. --- nexus/src/app/background/tasks/ereport_ingester.rs | 6 ------ 1 file changed, 6 deletions(-) diff --git a/nexus/src/app/background/tasks/ereport_ingester.rs b/nexus/src/app/background/tasks/ereport_ingester.rs index c31140f65cb..6fbcbd7b21e 100644 --- a/nexus/src/app/background/tasks/ereport_ingester.rs +++ b/nexus/src/app/background/tasks/ereport_ingester.rs @@ -330,8 +330,6 @@ impl Ingester { slog::debug!( &opctx.log, "attempting ereport collection from MGS"; - "sp_type" => ?sp_type, - "slot" => slot, "committed_ena" => ?committed_ena, "start_ena" => ?start_ena, "restart_id" => ?restart_id, @@ -357,8 +355,6 @@ impl Ingester { slog::debug!( &opctx.log, "ereport collection: MGS claims there is no SP in this slot"; - "sp_type" => ?sp_type, - "slot" => slot, "committed_ena" => ?committed_ena, "start_ena" => ?start_ena, "restart_id" => ?restart_id, @@ -369,8 +365,6 @@ impl Ingester { slog::warn!( &opctx.log, "ereport collection: unanticipated MGS request error: {e:#}"; - "sp_type" => ?sp_type, - "slot" => slot, "committed_ena" => ?committed_ena, "start_ena" => ?start_ena, "restart_id" => ?restart_id, From bd8c5a50e7fbd1c2db78bbfa5e983115a98e6856 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 23 Jun 2025 11:02:26 -0700 Subject: [PATCH 56/61] various documentation embetterment --- .../app/background/tasks/ereport_ingester.rs | 10 ++- schema/crdb/dbinit.sql | 89 +++++++++++++++---- schema/crdb/ereports/up01.sql | 55 +++++++++--- schema/crdb/ereports/up05.sql | 33 +++++-- 4 files changed, 148 insertions(+), 39 deletions(-) diff --git a/nexus/src/app/background/tasks/ereport_ingester.rs b/nexus/src/app/background/tasks/ereport_ingester.rs index 6fbcbd7b21e..1e2bb151a20 100644 --- a/nexus/src/app/background/tasks/ereport_ingester.rs +++ b/nexus/src/app/background/tasks/ereport_ingester.rs @@ -2,7 +2,12 @@ // License, v. 2.0. 
If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -//! Background tasks for ereport ingestion +//! Background tasks for ereport ingestion. +//! +//! The tasks in this module implement the Nexus portion of the ereport +//! ingestion protocol [described in RFD 520][rfd520]. +//! +//! [rfd520]: https://rfd.shared.oxide.computer/rfd/520#_determinations use crate::app::background::BackgroundTask; use chrono::Utc; @@ -210,6 +215,9 @@ impl Ingester { let mut params = EreportQueryParams::from_latest(latest); let mut status = None; + + // Continue requesting ereports from this SP in a loop until we have + // received all its ereports. while let Some(gateway_client::types::Ereports { restart_id, items, diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 623436fa8b8..5321a629474 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -5751,36 +5751,65 @@ ON omicron.public.webhook_delivery_attempt ( /* * Ereports + * + * See RFD 520 for details: + * https://rfd.shared.oxide.computer/rfd/520 */ --- Ereports from service processors +/* Ereports from service processors */ CREATE TABLE IF NOT EXISTS omicron.public.sp_ereport ( + /* + * the primary key for an ereport is formed from the tuple of the + * reporter's restart ID (a randomly generated UUID) and the ereport's ENA + * (a 64-bit integer that uniquely identifies the ereport within that + * restart of the reporter). 
+ * + * see: https://rfd.shared.oxide.computer/rfd/520#ereport-metadata + */ restart_id UUID NOT NULL, ena INT8 NOT NULL, time_deleted TIMESTAMPTZ, - -- time at which the ereport was collected + /* time at which the ereport was collected */ time_collected TIMESTAMPTZ NOT NULL, - -- UUID of the Nexus instance that collected the ereport + /* UUID of the Nexus instance that collected the ereport */ collector_id UUID NOT NULL, - -- physical location of the reporting SP - -- - -- these fields are always present, as they are how requests to collect - -- ereports are indexed by MGS. + /* + * physical location of the reporting SP + * + * these fields are always present, as they are how requests to collect + * ereports are indexed by MGS. + */ sp_type omicron.public.sp_type NOT NULL, sp_slot INT4 NOT NULL, - -- VPD identity of the reporting SP. - -- - -- unlike the physical location, these fields are nullable, as an ereport - -- may be generated in a state where the SP doesn't know who or what it is. - -- consider that "i don't know my own identity" is a reasonable condition to - -- might want to generate an ereport about! + /* + * VPD identity of the reporting SP. + * + * unlike the physical location, these fields are nullable, as an ereport + * may be generated in a state where the SP doesn't know who or what it is. + * consider that "i don't know my own identity" is a reasonable condition + * to want to generate an ereport about! + */ serial_number STRING, part_number STRING, - -- JSON representation of the ereport + /* + * JSON representation of the ereport as received from the SP. + * + * the raw JSON representation of the ereport is always stored, alongside + * any more structured data that we extract from it, as extracting data + * from the received ereport requires additional knowledge of the ereport + * formats generated by the SP and its various tasks. 
as these may change, + * and new ereports may be added which Nexus may not yet be aware of, + * we always store the raw JSON representation of the ereport. as Nexus + * becomes aware of new ereport schemas, it can go back and extract + * structured data from previously collected ereports with those schemas, + * but this is only possible if the JSON blob is persisted. + * + * see also: https://rfd.shared.oxide.computer/rfd/520#data-model + */ report JSONB NOT NULL, PRIMARY KEY (restart_id, ena) @@ -5810,22 +5839,44 @@ ON omicron.public.sp_ereport ( ) WHERE time_deleted IS NULL; --- Ereports from the host operating system +/* Ereports from the host operating system */ CREATE TABLE IF NOT EXISTS omicron.public.host_ereport ( + /* + * the primary key for an ereport is formed from the tuple of the + * reporter's restart ID (a randomly generated UUID) and the ereport's ENA + * (a 64-bit integer that uniquely identifies the ereport within that + * restart of the reporter). + * + * see: https://rfd.shared.oxide.computer/rfd/520#ereport-metadata + */ restart_id UUID NOT NULL, ena INT8 NOT NULL, time_deleted TIMESTAMPTZ, - -- time at which the ereport was collected + /* time at which the ereport was collected */ time_collected TIMESTAMPTZ NOT NULL, - -- UUID of the Nexus instance that collected the ereport + /* UUID of the Nexus instance that collected the ereport */ collector_id UUID NOT NULL, - -- identity of the reporting sled + /* identity of the reporting sled */ sled_id UUID NOT NULL, sled_serial TEXT NOT NULL, - -- JSON representation of the ereport + /* + * JSON representation of the ereport as received from the sled-agent. + * + * the raw JSON representation of the ereport is always stored, alongside + * any more structured data that we extract from it, as extracting data + * from the received ereport requires additional knowledge of the ereport + * formats generated by the host OS' fault management system. 
as these may + * change, and new ereports may be added which Nexus may not yet be aware + * of, we always store the raw JSON representation of the ereport. as Nexus + * becomes aware of new ereport schemas, it can go back and extract + * structured data from previously collected ereports with those schemas, + * but this is only possible if the JSON blob is persisted. + * + * see also: https://rfd.shared.oxide.computer/rfd/520#data-model + */ report JSONB NOT NULL, PRIMARY KEY (restart_id, ena) diff --git a/schema/crdb/ereports/up01.sql b/schema/crdb/ereports/up01.sql index 8c8164705c7..2641ab758ca 100644 --- a/schema/crdb/ereports/up01.sql +++ b/schema/crdb/ereports/up01.sql @@ -1,31 +1,58 @@ +/* Ereports from service processors */ CREATE TABLE IF NOT EXISTS omicron.public.sp_ereport ( + /* + * the primary key for an ereport is formed from the tuple of the + * reporter's restart ID (a randomly generated UUID) and the ereport's ENA + * (a 64-bit integer that uniquely identifies the ereport within that + * restart of the reporter). + * + * see: https://rfd.shared.oxide.computer/rfd/520#ereport-metadata + */ restart_id UUID NOT NULL, ena INT8 NOT NULL, time_deleted TIMESTAMPTZ, - -- time at which the ereport was collected + /* time at which the ereport was collected */ time_collected TIMESTAMPTZ NOT NULL, - -- UUID of the Nexus instance that collected the ereport + /* UUID of the Nexus instance that collected the ereport */ collector_id UUID NOT NULL, - -- physical lcoation of the reporting SP - -- - -- these fields are always present, as they are how requests to collect - -- ereports are indexed by MGS. + /* + * physical location of the reporting SP + * + * these fields are always present, as they are how requests to collect + * ereports are indexed by MGS. + */ sp_type omicron.public.sp_type NOT NULL, sp_slot INT4 NOT NULL, - -- VPD identity of the reporting SP. 
- -- - -- unlike the physical location, these fields are nullable, as an ereport - -- may be generated in a state where the SP doesn't know who or what it is. - -- consider that "i don't know my own identity" is a reasonable condition to - -- might want to generate an ereport about! + /* + * VPD identity of the reporting SP. + * + * unlike the physical location, these fields are nullable, as an ereport + * may be generated in a state where the SP doesn't know who or what it is. + * consider that "i don't know my own identity" is a reasonable condition + * to want to generate an ereport about! + */ serial_number STRING, part_number STRING, - -- JSON representation of the ereport + /* + * JSON representation of the ereport as received from the SP. + * + * the raw JSON representation of the ereport is always stored, alongside + * any more structured data that we extract from it, as extracting data + * from the received ereport requires additional knowledge of the ereport + * formats generated by the SP and its various tasks. as these may change, + * and new ereports may be added which Nexus may not yet be aware of, + * we always store the raw JSON representation of the ereport. as Nexus + * becomes aware of new ereport schemas, it can go back and extract + * structured data from previously collected ereports with those schemas, + * but this is only possible if the JSON blob is persisted. 
+ * + * see also: https://rfd.shared.oxide.computer/rfd/520#data-model + */ report JSONB NOT NULL, PRIMARY KEY (restart_id, ena) -); \ No newline at end of file +); diff --git a/schema/crdb/ereports/up05.sql b/schema/crdb/ereports/up05.sql index e58e9773ff9..289fec3c42e 100644 --- a/schema/crdb/ereports/up05.sql +++ b/schema/crdb/ereports/up05.sql @@ -1,19 +1,42 @@ +/* Ereports from the host operating system */ CREATE TABLE IF NOT EXISTS omicron.public.host_ereport ( + /* + * the primary key for an ereport is formed from the tuple of the + * reporter's restart ID (a randomly generated UUID) and the ereport's ENA + * (a 64-bit integer that uniquely identifies the ereport within that + * restart of the reporter). + * + * see: https://rfd.shared.oxide.computer/rfd/520#ereport-metadata + */ restart_id UUID NOT NULL, ena INT8 NOT NULL, time_deleted TIMESTAMPTZ, - -- time at which the ereport was collected + /* time at which the ereport was collected */ time_collected TIMESTAMPTZ NOT NULL, - -- UUID of the Nexus instance that collected the ereport + /* UUID of the Nexus instance that collected the ereport */ collector_id UUID NOT NULL, - -- identity of the reporting sled + /* identity of the reporting sled */ sled_id UUID NOT NULL, sled_serial TEXT NOT NULL, - -- JSON representation of the ereport + /* + * JSON representation of the ereport as received from the sled-agent. + * + * the raw JSON representation of the ereport is always stored, alongside + * any more structured data that we extract from it, as extracting data + * from the received ereport requires additional knowledge of the ereport + * formats generated by the host OS' fault management system. as these may + * change, and new ereports may be added which Nexus may not yet be aware + * of, we always store the raw JSON representation of the ereport. 
as Nexus + * becomes aware of new ereport schemas, it can go back and extract + * structured data from previously collected ereports with those schemas, + * but this is only possible if the JSON blob is persisted. + * + * see also: https://rfd.shared.oxide.computer/rfd/520#data-model + */ report JSONB NOT NULL, PRIMARY KEY (restart_id, ena) -); \ No newline at end of file +); From b7322b90bbf97c8e11bb6dbb1d03d4b2cc45d1a4 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 23 Jun 2025 12:55:22 -0700 Subject: [PATCH 57/61] omdb output wiggling --- dev-tools/omdb/src/bin/omdb/nexus.rs | 20 ++++++++++---------- dev-tools/omdb/tests/successes.out | 18 ++++++++++++++++-- 2 files changed, 26 insertions(+), 12 deletions(-) diff --git a/dev-tools/omdb/src/bin/omdb/nexus.rs b/dev-tools/omdb/src/bin/omdb/nexus.rs index b6b1412414d..71c21afe59c 100644 --- a/dev-tools/omdb/src/bin/omdb/nexus.rs +++ b/dev-tools/omdb/src/bin/omdb/nexus.rs @@ -2740,15 +2740,6 @@ fn print_task_sp_ereport_ingester(details: &serde_json::Value) { Ok(status) => status, }; - if !errors.is_empty() { - println!("{ERRICON} errors:"); - for error in errors { - println!(" > {error}"); - } - } - - print_ereporter_status_totals(sps.iter().map(|sp| &sp.status)); - const NEW_EREPORTS: &str = "new ereports ingested:"; const HTTP_REQUESTS: &str = "HTTP requests sent:"; const ERRORS: &str = "errors:"; @@ -2756,6 +2747,15 @@ fn print_task_sp_ereport_ingester(details: &serde_json::Value) { const_max_len(&[NEW_EREPORTS, HTTP_REQUESTS, ERRORS]) + 1; const NUM_WIDTH: usize = 3; + if !errors.is_empty() { + println!("{ERRICON} {ERRORS:NUM_WIDTH$}", errors.len()); + for error in errors { + println!(" - {error}"); + } + } + + print_ereporter_status_totals(sps.iter().map(|sp| &sp.status)); + if !sps.is_empty() { println!("\n service processors:"); for SpEreporterStatus { sp_type, slot, status } in &sps { @@ -2778,7 +2778,7 @@ fn print_task_sp_ereport_ingester(details: &serde_json::Value) { status.errors.len() ); for 
error in &status.errors { - println!(" - {error}"); + println!(" - {error}"); } } } diff --git a/dev-tools/omdb/tests/successes.out b/dev-tools/omdb/tests/successes.out index 802e736ceb9..c9862cc89aa 100644 --- a/dev-tools/omdb/tests/successes.out +++ b/dev-tools/omdb/tests/successes.out @@ -744,7 +744,14 @@ task: "sp_ereport_ingester" currently executing: no last completed activation: , triggered by a periodic timer firing started at (s ago) and ran for ms - last completion reported error: looking up MGS addresses: no record found for Query { name: Name("_mgs._tcp.control-plane.oxide.internal."), query_type: SRV, query_class: IN } +/!\ errors: 1 + - failed to resolve MGS addresses: no record found for Query { name: Name("_mgs._tcp.control-plane.oxide.internal."), query_type: SRV, query_class: IN } + total ereports received: 0 + new ereports ingested: 0 + total HTTP requests sent: 0 + total collection errors: 0 + reporters with ereports: 0 + reporters with collection errors: 0 task: "support_bundle_collector" configured period: every days h m s @@ -1270,7 +1277,14 @@ task: "sp_ereport_ingester" currently executing: no last completed activation: , triggered by a periodic timer firing started at (s ago) and ran for ms - last completion reported error: looking up MGS addresses: no record found for Query { name: Name("_mgs._tcp.control-plane.oxide.internal."), query_type: SRV, query_class: IN } +/!\ errors: 1 + - failed to resolve MGS addresses: no record found for Query { name: Name("_mgs._tcp.control-plane.oxide.internal."), query_type: SRV, query_class: IN } + total ereports received: 0 + new ereports ingested: 0 + total HTTP requests sent: 0 + total collection errors: 0 + reporters with ereports: 0 + reporters with collection errors: 0 task: "support_bundle_collector" configured period: every days h m s From 5e676c69ff36f63fedb5922db58efd862cbce794 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 23 Jun 2025 15:35:46 -0700 Subject: [PATCH 58/61] change 
ereport-by-time-collected indices to USING HASH --- schema/crdb/dbinit.sql | 5 ++--- schema/crdb/ereports/up03.sql | 4 ++-- schema/crdb/ereports/up06.sql | 3 ++- schema/crdb/ereports/up07.sql | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 5321a629474..5e6fdb46943 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -5826,13 +5826,12 @@ where CREATE INDEX IF NOT EXISTS order_sp_ereports_by_timestamp ON omicron.public.sp_ereport -USING BTREE ( +USING HASH ( time_collected ) WHERE time_deleted IS NULL; - CREATE INDEX IF NOT EXISTS lookup_sp_ereports_by_serial ON omicron.public.sp_ereport ( serial_number @@ -5892,7 +5891,7 @@ WHERE CREATE INDEX IF NOT EXISTS order_host_ereports_by_timestamp ON omicron.public.host_ereport -USING BTREE ( +USING HASH ( time_collected ) WHERE diff --git a/schema/crdb/ereports/up03.sql b/schema/crdb/ereports/up03.sql index 1a3b7960874..90cff43e5f6 100644 --- a/schema/crdb/ereports/up03.sql +++ b/schema/crdb/ereports/up03.sql @@ -1,7 +1,7 @@ CREATE INDEX IF NOT EXISTS order_sp_ereports_by_timestamp ON omicron.public.sp_ereport -USING BTREE ( +USING HASH ( time_collected ) WHERE - time_deleted IS NULL; \ No newline at end of file + time_deleted IS NULL; diff --git a/schema/crdb/ereports/up06.sql b/schema/crdb/ereports/up06.sql index 6a43c7fa924..65684f6ba75 100644 --- a/schema/crdb/ereports/up06.sql +++ b/schema/crdb/ereports/up06.sql @@ -1,4 +1,5 @@ -CREATE INDEX IF NOT EXISTS lookup_host_ereports_by_sled ON omicron.public.host_ereport ( +CREATE INDEX IF NOT EXISTS lookup_host_ereports_by_sled +ON omicron.public.host_ereport ( sled_id, time_collected ) diff --git a/schema/crdb/ereports/up07.sql b/schema/crdb/ereports/up07.sql index 20141fd8e47..c86749caa19 100644 --- a/schema/crdb/ereports/up07.sql +++ b/schema/crdb/ereports/up07.sql @@ -1,7 +1,7 @@ CREATE INDEX IF NOT EXISTS order_host_ereports_by_timestamp ON omicron.public.host_ereport -USING 
BTREE ( +USING HASH ( time_collected ) WHERE - time_deleted IS NULL; \ No newline at end of file + time_deleted IS NULL; From 778bb67a0fed180104bbca18d010e25b449f040f Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 24 Jun 2025 13:09:08 -0700 Subject: [PATCH 59/61] turns out our CRDB version doesn't support hash-sharded indices :( --- schema/crdb/dbinit.sql | 6 ++---- schema/crdb/ereports/up03.sql | 3 +-- schema/crdb/ereports/up07.sql | 3 +-- 3 files changed, 4 insertions(+), 8 deletions(-) diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 5e6fdb46943..0bd5b31e003 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -5825,8 +5825,7 @@ where time_deleted IS NULL; CREATE INDEX IF NOT EXISTS order_sp_ereports_by_timestamp -ON omicron.public.sp_ereport -USING HASH ( +ON omicron.public.sp_ereport( time_collected ) WHERE @@ -5890,8 +5889,7 @@ WHERE time_deleted IS NULL; CREATE INDEX IF NOT EXISTS order_host_ereports_by_timestamp -ON omicron.public.host_ereport -USING HASH ( +ON omicron.public.host_ereport ( time_collected ) WHERE diff --git a/schema/crdb/ereports/up03.sql b/schema/crdb/ereports/up03.sql index 90cff43e5f6..484bbbca635 100644 --- a/schema/crdb/ereports/up03.sql +++ b/schema/crdb/ereports/up03.sql @@ -1,6 +1,5 @@ CREATE INDEX IF NOT EXISTS order_sp_ereports_by_timestamp -ON omicron.public.sp_ereport -USING HASH ( +ON omicron.public.sp_ereport ( time_collected ) WHERE diff --git a/schema/crdb/ereports/up07.sql b/schema/crdb/ereports/up07.sql index c86749caa19..e63fe78d5a4 100644 --- a/schema/crdb/ereports/up07.sql +++ b/schema/crdb/ereports/up07.sql @@ -1,6 +1,5 @@ CREATE INDEX IF NOT EXISTS order_host_ereports_by_timestamp -ON omicron.public.host_ereport -USING HASH ( +ON omicron.public.host_ereport ( time_collected ) WHERE From a134491f129a141963ae633c312b4bf472cdb09c Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 24 Jun 2025 13:52:05 -0700 Subject: [PATCH 60/61] add `class` column to ereports --- 
dev-tools/omdb/src/bin/omdb/db/ereport.rs | 25 +++++++++++++++++++ nexus/db-model/src/ereport.rs | 15 +++++++++++ nexus/db-schema/src/schema.rs | 2 ++ .../app/background/tasks/ereport_ingester.rs | 17 +++++++++++-- schema/crdb/dbinit.sql | 16 ++++++++++++ schema/crdb/ereports/up01.sql | 8 ++++++ schema/crdb/ereports/up05.sql | 8 ++++++ 7 files changed, 89 insertions(+), 2 deletions(-) diff --git a/dev-tools/omdb/src/bin/omdb/db/ereport.rs b/dev-tools/omdb/src/bin/omdb/db/ereport.rs index 1feb94f77e8..e0d2d184d31 100644 --- a/dev-tools/omdb/src/bin/omdb/db/ereport.rs +++ b/dev-tools/omdb/src/bin/omdb/db/ereport.rs @@ -73,6 +73,10 @@ struct ListArgs { #[clap(long = "id", short)] ids: Vec, + /// Include only ereports with the provided class strings. + #[clap(long = "class", short)] + classes: Vec, + /// Include only ereports collected before this timestamp #[clap(long, short)] before: Option>, @@ -124,6 +128,8 @@ async fn cmd_db_ereport_list( time_collected: DateTime, restart_id: Uuid, ena: Ena, + #[tabled(display_with = "display_option_blank")] + class: Option, source: db::model::Reporter, #[tabled(display_with = "display_option_blank", rename = "S/N")] serial: Option<&'report str>, @@ -137,6 +143,7 @@ async fn cmd_db_ereport_list( time_collected, restart_id, ena, + ref class, sp_type, sp_slot, ref serial_number, @@ -147,6 +154,7 @@ async fn cmd_db_ereport_list( time_collected, restart_id: restart_id.into_untyped_uuid(), ena: ena.into(), + class: class.clone(), source: db::model::Reporter::Sp { sp_type, slot: sp_slot.0 }, serial: serial_number.as_deref(), part_number: part_number.as_deref(), @@ -160,6 +168,7 @@ async fn cmd_db_ereport_list( time_collected, restart_id, ena, + ref class, sled_id, ref sled_serial, .. 
@@ -168,6 +177,7 @@ async fn cmd_db_ereport_list( time_collected, restart_id: restart_id.into_untyped_uuid(), ena: ena.into(), + class: class.clone(), source: db::model::Reporter::HostOs { sled: sled_id.into() }, serial: Some(&sled_serial), part_number: None, // TODO(eliza): go get this from inventory? @@ -197,6 +207,10 @@ async fn cmd_db_ereport_list( query.filter(sp_dsl::serial_number.eq_any(args.serials.clone())); } + if !args.classes.is_empty() { + query = query.filter(sp_dsl::class.eq_any(args.classes.clone())); + } + if !args.ids.is_empty() { query = query.filter(sp_dsl::restart_id.eq_any(args.ids.clone())); } @@ -231,6 +245,10 @@ async fn cmd_db_ereport_list( query.filter(host_dsl::sled_serial.eq_any(args.serials.clone())); } + if !args.classes.is_empty() { + query = query.filter(host_dsl::class.eq_any(args.classes.clone())); + } + if !args.ids.is_empty() { query = query.filter(host_dsl::restart_id.eq_any(args.ids.clone())); } @@ -294,16 +312,19 @@ async fn cmd_db_ereport_info( collector_id, part_number, serial_number, + class, } = metadata; const ENA: &str = "ENA"; const TIME_COLLECTED: &str = "collected at"; const TIME_DELETED: &str = "deleted at"; const COLLECTOR_ID: &str = "collected by"; + const CLASS: &str = "class"; const REPORTER: &str = "reported by"; const RESTART_ID: &str = "restart ID"; const PART_NUMBER: &str = " part number"; const SERIAL_NUMBER: &str = " serial number"; const WIDTH: usize = const_max_len(&[ + CLASS, TIME_COLLECTED, TIME_DELETED, COLLECTOR_ID, @@ -313,6 +334,10 @@ async fn cmd_db_ereport_info( ]); println!("\n{:=<80}", "== EREPORT METADATA "); println!(" {ENA:>WIDTH$}: {}", id.ena); + match class { + Some(class) => println!(" {CLASS:>WIDTH$}: {class}"), + None => println!("/!\\ {CLASS:>WIDTH$}: "), + } if let Some(time_deleted) = time_deleted { println!("(i) {TIME_DELETED:>WIDTH$}: {time_deleted}"); } diff --git a/nexus/db-model/src/ereport.rs b/nexus/db-model/src/ereport.rs index 8d1c1bcd601..12320836b50 100644 --- 
a/nexus/db-model/src/ereport.rs +++ b/nexus/db-model/src/ereport.rs @@ -83,6 +83,7 @@ impl From for Ereport { serial_number, sp_type, sp_slot, + class, report, } = sp_report; Ereport { @@ -93,6 +94,7 @@ impl From for Ereport { collector_id: collector_id.into(), part_number, serial_number, + class, }, reporter: Reporter::Sp { sp_type, slot: sp_slot.0 }, report, @@ -110,6 +112,7 @@ impl From for Ereport { collector_id, sled_serial, sled_id, + class, report, } = host_report; Ereport { @@ -120,6 +123,7 @@ impl From for Ereport { collector_id: collector_id.into(), part_number: None, // TODO serial_number: Some(sled_serial), + class, }, reporter: Reporter::HostOs { sled: sled_id.into() }, report, @@ -134,6 +138,7 @@ pub struct EreportMetadata { pub collector_id: OmicronZoneUuid, pub part_number: Option, pub serial_number: Option, + pub class: Option, } #[derive(Clone, Debug, Eq, PartialEq, Ord, PartialOrd)] @@ -199,6 +204,11 @@ pub struct SpEreport { /// "I don't know who I am!" is an error condition for which we might want /// to generate an ereport! pub serial_number: Option, + /// The ereport class, which indicates the category of event reported. + /// + /// This is nullable, as it is extracted from the report JSON, and reports + /// missing class information must still be ingested. + pub class: Option, pub report: serde_json::Value, } @@ -215,6 +225,11 @@ pub struct HostEreport { pub sled_id: DbTypedUuid, pub sled_serial: String, + /// The ereport class, which indicates the category of event reported. + /// + /// This is nullable, as it is extracted from the report JSON, and reports + /// missing class information must still be ingested. + pub class: Option, pub report: serde_json::Value, } diff --git a/nexus/db-schema/src/schema.rs b/nexus/db-schema/src/schema.rs index bd997d1bf14..ced10793bfc 100644 --- a/nexus/db-schema/src/schema.rs +++ b/nexus/db-schema/src/schema.rs @@ -2456,6 +2456,7 @@ table! 
{ part_number -> Nullable, serial_number -> Nullable, + class -> Nullable, report -> Jsonb, } @@ -2471,6 +2472,7 @@ table! { sled_id -> Uuid, sled_serial -> Text, + class -> Nullable, report -> Jsonb, } diff --git a/nexus/src/app/background/tasks/ereport_ingester.rs b/nexus/src/app/background/tasks/ereport_ingester.rs index 1e2bb151a20..7acddfdfa47 100644 --- a/nexus/src/app/background/tasks/ereport_ingester.rs +++ b/nexus/src/app/background/tasks/ereport_ingester.rs @@ -250,18 +250,30 @@ impl Ingester { let db_ereports = items .into_iter() .map(|ereport| { + const MISSING_VPD: &str = + " (perhaps the SP doesn't know its own VPD?)"; let part_number = get_sp_metadata_string( "baseboard_part_number", &ereport, &restart_id, &opctx.log, + MISSING_VPD, ); let serial_number = get_sp_metadata_string( "baseboard_serial_number", &ereport, &restart_id, &opctx.log, + MISSING_VPD, ); + let class = get_sp_metadata_string( + "class", + &ereport, + &restart_id, + &opctx.log, + "", + ); + db::model::SpEreport { restart_id: restart_id.into(), ena: ereport.ena.into(), @@ -272,6 +284,7 @@ impl Ingester { sp_slot: slot.into(), part_number, serial_number, + class, report: serde_json::Value::Object(ereport.data), } }) @@ -399,6 +412,7 @@ fn get_sp_metadata_string( ereport: &Ereport, restart_id: &EreporterRestartUuid, log: &slog::Logger, + extra_context: &'static str, ) -> Option { match ereport.data.get(key) { Some(serde_json::Value::String(s)) => Some(s.clone()), @@ -415,8 +429,7 @@ fn get_sp_metadata_string( None => { slog::warn!( &log, - "ereport missing '{key}'; perhaps the SP doesn't know its own \ - VPD!"; + "ereport missing '{key}'{extra_context}"; "ena" => ?ereport.ena, "restart_id" => ?restart_id, ); diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 0bd5b31e003..277147b2bb7 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -5795,6 +5795,14 @@ CREATE TABLE IF NOT EXISTS omicron.public.sp_ereport ( serial_number STRING, part_number STRING, + /* 
+ * The ereport class, which indicates the category of event reported. + * + * This is nullable, as it is extracted from the report JSON, and reports + * missing class information must still be ingested. + */ + class STRING, + /* * JSON representation of the ereport as received from the SP. * @@ -5860,6 +5868,14 @@ CREATE TABLE IF NOT EXISTS omicron.public.host_ereport ( sled_id UUID NOT NULL, sled_serial TEXT NOT NULL, + /* + * The ereport class, which indicates the category of event reported. + * + * This is nullable, as it is extracted from the report JSON, and reports + * missing class information must still be ingested. + */ + class STRING, + /* * JSON representation of the ereport as received from the sled-agent. * diff --git a/schema/crdb/ereports/up01.sql b/schema/crdb/ereports/up01.sql index 2641ab758ca..922bfc602e1 100644 --- a/schema/crdb/ereports/up01.sql +++ b/schema/crdb/ereports/up01.sql @@ -37,6 +37,14 @@ CREATE TABLE IF NOT EXISTS omicron.public.sp_ereport ( serial_number STRING, part_number STRING, + /* + * The ereport class, which indicates the category of event reported. + * + * This is nullable, as it is extracted from the report JSON, and reports + * missing class information must still be ingested. + */ + class STRING, + /* * JSON representation of the ereport as received from the SP. * diff --git a/schema/crdb/ereports/up05.sql b/schema/crdb/ereports/up05.sql index 289fec3c42e..4fc61fffac1 100644 --- a/schema/crdb/ereports/up05.sql +++ b/schema/crdb/ereports/up05.sql @@ -21,6 +21,14 @@ CREATE TABLE IF NOT EXISTS omicron.public.host_ereport ( sled_id UUID NOT NULL, sled_serial TEXT NOT NULL, + /* + * The ereport class, which indicates the category of event reported. + * + * This is nullable, as it is extracted from the report JSON, and reports + * missing class information must still be ingested. + */ + class STRING, + /* * JSON representation of the ereport as received from the sled-agent. 
* From 917c6edc5903bdeb14dd91e247bb167d2259700a Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 24 Jun 2025 15:41:46 -0700 Subject: [PATCH 61/61] update OMDB expectorate output again --- dev-tools/omdb/tests/successes.out | 4 +-- dev-tools/omdb/tests/usage_errors.out | 51 ++++++++++++++------------- 2 files changed, 28 insertions(+), 27 deletions(-) diff --git a/dev-tools/omdb/tests/successes.out b/dev-tools/omdb/tests/successes.out index e547cfdda3b..8a92d094ec8 100644 --- a/dev-tools/omdb/tests/successes.out +++ b/dev-tools/omdb/tests/successes.out @@ -756,7 +756,7 @@ task: "sp_ereport_ingester" last completed activation: , triggered by a periodic timer firing started at (s ago) and ran for ms /!\ errors: 1 - - failed to resolve MGS addresses: no record found for Query { name: Name("_mgs._tcp.control-plane.oxide.internal."), query_type: SRV, query_class: IN } + - failed to resolve MGS addresses: proto error: no records found for Query { name: Name("_mgs._tcp.control-plane.oxide.internal."), query_type: SRV, query_class: IN } total ereports received: 0 new ereports ingested: 0 total HTTP requests sent: 0 @@ -1296,7 +1296,7 @@ task: "sp_ereport_ingester" last completed activation: , triggered by a periodic timer firing started at (s ago) and ran for ms /!\ errors: 1 - - failed to resolve MGS addresses: no record found for Query { name: Name("_mgs._tcp.control-plane.oxide.internal."), query_type: SRV, query_class: IN } + - failed to resolve MGS addresses: proto error: no records found for Query { name: Name("_mgs._tcp.control-plane.oxide.internal."), query_type: SRV, query_class: IN } total ereports received: 0 new ereports ingested: 0 total HTTP requests sent: 0 diff --git a/dev-tools/omdb/tests/usage_errors.out b/dev-tools/omdb/tests/usage_errors.out index f55e0dcf0d5..d85524842bc 100644 --- a/dev-tools/omdb/tests/usage_errors.out +++ b/dev-tools/omdb/tests/usage_errors.out @@ -519,14 +519,14 @@ Options: -h, --help Print help Connection Options: - --db-url 
URL of the database SQL interface [env: OMDB_DB_URL=] - --dns-server [env: OMDB_DNS_SERVER=] + --db-url URL of the database SQL interface [env: OMDB_DB_URL=] + --dns-server [env: OMDB_DNS_SERVER=] Database Options: - --fetch-limit limit to apply to queries that fetch rows [env: OMDB_FETCH_LIMIT=] - [default: 500] - --include-deleted whether to include soft-deleted records when enumerating objects that - can be soft-deleted + --fetch-limit limit to apply to queries that fetch rows [env: + OMDB_FETCH_LIMIT=] [default: 500] + --include-deleted whether to include soft-deleted records when enumerating objects + that can be soft-deleted Safety Options: -w, --destructive Allow potentially-destructive subcommands @@ -543,20 +543,21 @@ Options: --log-level log level filter [env: LOG_LEVEL=] [default: warn] -s, --serial Include only ereports from systems with the provided serial numbers -i, --id Include only ereports from the provided reporter restart IDs + -c, --class Include only ereports with the provided class strings -b, --before Include only ereports collected before this timestamp - -a, --after Include only ereports collected after this timestamp --color Color output [default: auto] [possible values: auto, always, never] + -a, --after Include only ereports collected after this timestamp -h, --help Print help Connection Options: - --db-url URL of the database SQL interface [env: OMDB_DB_URL=] - --dns-server [env: OMDB_DNS_SERVER=] + --db-url URL of the database SQL interface [env: OMDB_DB_URL=] + --dns-server [env: OMDB_DNS_SERVER=] Database Options: - --fetch-limit limit to apply to queries that fetch rows [env: OMDB_FETCH_LIMIT=] - [default: 500] - --include-deleted whether to include soft-deleted records when enumerating objects that - can be soft-deleted + --fetch-limit limit to apply to queries that fetch rows [env: + OMDB_FETCH_LIMIT=] [default: 500] + --include-deleted whether to include soft-deleted records when enumerating objects + that can be soft-deleted 
Safety Options: -w, --destructive Allow potentially-destructive subcommands @@ -582,14 +583,14 @@ Options: -h, --help Print help Connection Options: - --db-url URL of the database SQL interface [env: OMDB_DB_URL=] - --dns-server [env: OMDB_DNS_SERVER=] + --db-url URL of the database SQL interface [env: OMDB_DB_URL=] + --dns-server [env: OMDB_DNS_SERVER=] Database Options: - --fetch-limit limit to apply to queries that fetch rows [env: OMDB_FETCH_LIMIT=] - [default: 500] - --include-deleted whether to include soft-deleted records when enumerating objects that - can be soft-deleted + --fetch-limit limit to apply to queries that fetch rows [env: + OMDB_FETCH_LIMIT=] [default: 500] + --include-deleted whether to include soft-deleted records when enumerating objects + that can be soft-deleted Safety Options: -w, --destructive Allow potentially-destructive subcommands @@ -614,14 +615,14 @@ Options: -h, --help Print help Connection Options: - --db-url URL of the database SQL interface [env: OMDB_DB_URL=] - --dns-server [env: OMDB_DNS_SERVER=] + --db-url URL of the database SQL interface [env: OMDB_DB_URL=] + --dns-server [env: OMDB_DNS_SERVER=] Database Options: - --fetch-limit limit to apply to queries that fetch rows [env: OMDB_FETCH_LIMIT=] - [default: 500] - --include-deleted whether to include soft-deleted records when enumerating objects that - can be soft-deleted + --fetch-limit limit to apply to queries that fetch rows [env: + OMDB_FETCH_LIMIT=] [default: 500] + --include-deleted whether to include soft-deleted records when enumerating objects + that can be soft-deleted Safety Options: -w, --destructive Allow potentially-destructive subcommands