diff --git a/sled-agent/src/bootstrap/agent.rs b/sled-agent/src/bootstrap/agent.rs index 09c31d4c8ea..165e3fe782b 100644 --- a/sled-agent/src/bootstrap/agent.rs +++ b/sled-agent/src/bootstrap/agent.rs @@ -10,7 +10,10 @@ use super::ddm_admin_client::{DdmAdminClient, DdmError}; use super::params::SledAgentRequest; use super::rss_handle::RssHandle; use super::server::TrustQuorumMembership; -use super::trust_quorum::{RackSecret, ShareDistribution, TrustQuorumError}; +use super::trust_quorum::{ + RackSecret, SerializableShareDistribution, ShareDistribution, + TrustQuorumError, +}; use super::views::SledAgentResponse; use crate::config::Config as SledConfig; use crate::illumos::dladm::{self, Dladm, PhysicalLink}; @@ -22,7 +25,9 @@ use omicron_common::api::external::{Error as ExternalError, MacAddr}; use omicron_common::backoff::{ internal_service_policy, retry_notify, BackoffError, }; +use serde::{Deserialize, Serialize}; use slog::Logger; +use std::borrow::Cow; use std::collections::HashSet; use std::net::{Ipv6Addr, SocketAddrV6}; use std::path::{Path, PathBuf}; @@ -192,7 +197,7 @@ impl Agent { let request_path = get_sled_agent_request_path(); let trust_quorum = if request_path.exists() { info!(agent.log, "Sled already configured, loading sled agent"); - let sled_request: SledAgentRequest = toml::from_str( + let sled_request: PersistentSledAgentRequest = toml::from_str( &tokio::fs::read_to_string(&request_path).await.map_err( |err| BootstrapError::Io { message: format!( @@ -203,10 +208,13 @@ impl Agent { )?, ) .map_err(|err| BootstrapError::Toml { path: request_path, err })?; - agent.request_agent(&sled_request).await?; - TrustQuorumMembership::Known(Arc::new( - sled_request.trust_quorum_share, - )) + + let trust_quorum_share = + sled_request.trust_quorum_share.map(ShareDistribution::from); + agent + .request_agent(&*sled_request.request, &trust_quorum_share) + .await?; + TrustQuorumMembership::Known(Arc::new(trust_quorum_share)) } else { TrustQuorumMembership::Uninitialized }; @@ -219,6 +227,7 @@ impl Agent { pub async fn request_agent( &self, request: &SledAgentRequest, + trust_quorum_share: &Option, ) -> Result { info!(&self.log, "Loading Sled Agent: {:?}", request); @@ -243,7 +252,7 @@ impl Agent { // partially-initialized rack where we may have a share from a // previously-started-but-not-completed init process? If rerunning // it produces different shares this check will fail. - if request.trust_quorum_share != *self.share.lock().await { + if *trust_quorum_share != *self.share.lock().await { let err_str = concat!( "Sled Agent already running with", " a different trust quorum share" @@ -270,23 +279,26 @@ impl Agent { maybe_agent.replace(server); info!(&self.log, "Sled Agent loaded; recording configuration"); - *self.share.lock().await = request.trust_quorum_share.clone(); + *self.share.lock().await = trust_quorum_share.clone(); // Record this request so the sled agent can be automatically // initialized on the next boot. + // + // danger handling: `serialized_request` contains our trust quorum + // share; we do not log it and only write it to the designated path. + let serialized_request = PersistentSledAgentRequest { + request: Cow::Borrowed(request), + trust_quorum_share: trust_quorum_share.clone().map(Into::into), + } + .danger_serialize_as_toml() + .expect("Cannot serialize request"); + let path = get_sled_agent_request_path(); - tokio::fs::write( - &path, - &toml::to_string( - &toml::Value::try_from(&request) - .expect("Cannot serialize request"), - ) - .expect("Cannot convert toml to string"), - ) - .await - .map_err(|err| BootstrapError::Io { - message: format!("Recording Sled Agent request to {path:?}"), - err, + tokio::fs::write(&path, &serialized_request).await.map_err(|err| { + BootstrapError::Io { + message: format!("Recording Sled Agent request to {path:?}"), + err, + } })?; // Start trying to notify ddmd of our sled prefix so it can @@ -481,10 +493,38 @@ impl Agent { } } +// We intentionally DO NOT derive `Debug` or `Serialize`; both provide avenues +// by which we may accidentally log the contents of our trust quorum share. +#[derive(Deserialize, PartialEq)] +struct PersistentSledAgentRequest<'a> { + request: Cow<'a, SledAgentRequest>, + trust_quorum_share: Option, +} + +impl PersistentSledAgentRequest<'_> { + /// On success, the returned string will contain our raw + /// `trust_quorum_share`. This method is named `danger_*` to remind the + /// caller that they must not log this string. + fn danger_serialize_as_toml(&self) -> Result { + #[derive(Serialize)] + #[serde(remote = "PersistentSledAgentRequest")] + struct PersistentSledAgentRequestDef<'a> { + request: Cow<'a, SledAgentRequest>, + trust_quorum_share: Option, + } + + let mut out = String::with_capacity(128); + let mut serializer = toml::Serializer::new(&mut out); + PersistentSledAgentRequestDef::serialize(self, &mut serializer)?; + Ok(out) + } +} + #[cfg(test)] mod tests { use super::*; use macaddr::MacAddr6; + use uuid::Uuid; #[test] fn test_mac_to_socket_addr() { @@ -495,4 +535,33 @@ mod tests { &"fdb0:a840:2510:1::1".parse::().unwrap(), ); } + + #[test] + fn persistent_sled_agent_request_serialization_round_trips() { + let secret = RackSecret::new(); + let (mut shares, verifier) = secret.split(2, 4).unwrap(); + + let request = PersistentSledAgentRequest { + request: Cow::Owned(SledAgentRequest { + id: Uuid::new_v4(), + subnet: Ipv6Subnet::new(Ipv6Addr::LOCALHOST), + rack_id: Uuid::new_v4(), + }), + trust_quorum_share: Some( + ShareDistribution { + threshold: 2, + verifier, + share: shares.pop().unwrap(), + member_device_id_certs: vec![], + } + .into(), + ), + }; + + let serialized = request.danger_serialize_as_toml().unwrap(); + let deserialized: PersistentSledAgentRequest = + toml::from_slice(serialized.as_bytes()).unwrap(); + + assert!(request == deserialized, "serialization round trip failed"); + } } diff --git a/sled-agent/src/bootstrap/client.rs b/sled-agent/src/bootstrap/client.rs index bed5d1fe87f..22541ffd7d4 100644 --- a/sled-agent/src/bootstrap/client.rs +++ b/sled-agent/src/bootstrap/client.rs @@ -8,6 +8,7 @@ use super::params::version; use super::params::Request; use super::params::RequestEnvelope; use super::params::SledAgentRequest; +use super::trust_quorum::ShareDistribution; use super::views::SledAgentResponse; use crate::bootstrap::views::Response; use crate::bootstrap::views::ResponseEnvelope; @@ -90,8 +91,12 @@ impl<'a> Client<'a> { pub(crate) async fn start_sled( &self, request: &SledAgentRequest, + trust_quorum_share: Option, ) -> Result { - let request = Request::SledAgentRequest(Cow::Borrowed(request)); + let request = Request::SledAgentRequest( + Cow::Borrowed(request), + trust_quorum_share.map(Into::into), + ); match self.request_response(request).await? { Response::SledAgentResponse(response) => Ok(response), @@ -142,8 +147,11 @@ impl<'a> Client<'a> { // Build and serialize our request. let envelope = RequestEnvelope { version: version::V1, request }; - let mut buf = - serde_json::to_vec(&envelope).map_err(Error::Serialize)?; + + // "danger" note: `buf` contains a raw trust quorum share; we must not + // log or otherwise persist it! We only write it to `stream`. + let buf = + envelope.danger_serialize_as_json().map_err(Error::Serialize)?; let request_length = u32::try_from(buf.len()) .expect("serialized bootstrap-agent request length overflowed u32"); @@ -163,7 +171,7 @@ impl<'a> Client<'a> { return Err(Error::BadResponseLength(response_length)); } - buf.resize(response_length as usize, 0); + let mut buf = vec![0; response_length as usize]; stream.read_exact(&mut buf).await.map_err(Error::ReadResponse)?; // Deserialize and handle the response. diff --git a/sled-agent/src/bootstrap/params.rs b/sled-agent/src/bootstrap/params.rs index fdbbf2c4295..d778adeab4a 100644 --- a/sled-agent/src/bootstrap/params.rs +++ b/sled-agent/src/bootstrap/params.rs @@ -4,7 +4,7 @@ //! Request types for the bootstrap agent -use super::trust_quorum::ShareDistribution; +use super::trust_quorum::SerializableShareDistribution; use omicron_common::address::{Ipv6Subnet, SLED_PREFIX}; use serde::{Deserialize, Serialize}; use std::borrow::Cow; @@ -16,38 +16,118 @@ pub struct SledAgentRequest { /// Uuid of the Sled Agent to be created. pub id: Uuid, - /// Portion of the IP space to be managed by the Sled Agent. - pub subnet: Ipv6Subnet, - /// Uuid of the rack to which this sled agent belongs. pub rack_id: Uuid, - /// Share of the rack secret for this Sled Agent. - // TODO-cleanup This is currently optional because we don't do trust quorum - // shares for single-node deployments (i.e., most dev/test environments), - // but eventually this should be required. - pub trust_quorum_share: Option, + // Note: The order of these fields is load bearing, because we serialize + // `SledAgentRequest`s as toml. `subnet` serializes as a TOML table, so it + // must come after non-table fields. + /// Portion of the IP space to be managed by the Sled Agent. + pub subnet: Ipv6Subnet, } -#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] +// We intentionally DO NOT derive `Debug` or `Serialize`; both provide avenues +// by which we may accidentally log the contents of our `share`. To serialize a +// request, use `RequestEnvelope::danger_serialize_as_json()`. +#[derive(Clone, Deserialize, PartialEq)] // Clippy wants us to put the SledAgentRequest in a Box, but (a) it's not _that_ // big (a couple hundred bytes), and (b) that makes matching annoying. // `Request`s are relatively rare over the life of a sled agent. #[allow(clippy::large_enum_variant)] pub enum Request<'a> { /// Send configuration information for launching a Sled Agent. - SledAgentRequest(Cow<'a, SledAgentRequest>), + SledAgentRequest( + Cow<'a, SledAgentRequest>, + Option, + ), /// Request the sled's share of the rack secret. ShareRequest, } -#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] +#[derive(Clone, Deserialize, PartialEq)] pub struct RequestEnvelope<'a> { pub version: u32, pub request: Request<'a>, } +impl RequestEnvelope<'_> { + /// On success, the returned `Vec` will contain our raw + /// trust quorum share. This method is named `danger_*` to remind the + /// caller that they must not log it. + pub(crate) fn danger_serialize_as_json( + &self, + ) -> Result, serde_json::Error> { + #[derive(Serialize)] + #[serde(remote = "Request")] + #[allow(clippy::large_enum_variant)] + pub enum RequestDef<'a> { + /// Send configuration information for launching a Sled Agent. + SledAgentRequest( + Cow<'a, SledAgentRequest>, + Option, + ), + + /// Request the sled's share of the rack secret. + ShareRequest, + } + + #[derive(Serialize)] + #[serde(remote = "RequestEnvelope")] + struct RequestEnvelopeDef<'a> { + version: u32, + #[serde(borrow, with = "RequestDef")] + request: Request<'a>, + } + + let mut writer = Vec::with_capacity(128); + let mut serializer = serde_json::Serializer::new(&mut writer); + RequestEnvelopeDef::serialize(self, &mut serializer)?; + Ok(writer) + } +} + pub(super) mod version { pub(crate) const V1: u32 = 1; } + +#[cfg(test)] +mod tests { + use std::net::Ipv6Addr; + + use super::*; + use crate::bootstrap::trust_quorum::RackSecret; + use crate::bootstrap::trust_quorum::ShareDistribution; + + #[test] + fn json_serialization_round_trips() { + let secret = RackSecret::new(); + let (mut shares, verifier) = secret.split(2, 4).unwrap(); + + let envelope = RequestEnvelope { + version: 1, + request: Request::SledAgentRequest( + Cow::Owned(SledAgentRequest { + id: Uuid::new_v4(), + subnet: Ipv6Subnet::new(Ipv6Addr::LOCALHOST), + rack_id: Uuid::new_v4(), + }), + Some( + ShareDistribution { + threshold: 2, + verifier, + share: shares.pop().unwrap(), + member_device_id_certs: vec![], + } + .into(), + ), + ), + }; + + let serialized = envelope.danger_serialize_as_json().unwrap(); + let deserialized: RequestEnvelope = + serde_json::from_slice(&serialized).unwrap(); + + assert!(envelope == deserialized, "serialization round trip failed"); + } +} diff --git a/sled-agent/src/bootstrap/rss_handle.rs b/sled-agent/src/bootstrap/rss_handle.rs index c4aac6d5425..9bc3b6555d1 100644 --- a/sled-agent/src/bootstrap/rss_handle.rs +++ b/sled-agent/src/bootstrap/rss_handle.rs @@ -6,6 +6,7 @@ use super::client as bootstrap_agent_client; use super::params::SledAgentRequest; +use super::trust_quorum::ShareDistribution; use crate::rack_setup::config::SetupServiceConfig; use crate::rack_setup::service::Service; use crate::sp::SpHandle; @@ -67,6 +68,7 @@ async fn initialize_sled_agent( log: &Logger, bootstrap_addr: SocketAddrV6, request: &SledAgentRequest, + trust_quorum_share: Option, sp: &Option, ) -> Result<(), bootstrap_agent_client::Error> { let client = bootstrap_agent_client::Client::new( @@ -74,22 +76,24 @@ async fn initialize_sled_agent( sp, // TODO-cleanup: Creating a bootstrap client requires the list of trust // quorum members (as clients should always know the set of possible - // servers they can connect to), but `request.trust_quorum_share` is + // servers they can connect to), but `trust_quorum_share` is // optional for now because we don't yet require trust quorum in all // sled-agent deployments. We use `.map_or(&[], ...)` here to pass an // empty set of trust quorum members if we're in such a // trust-quorum-free deployment. This would cause any sprockets // connections to fail with unknown peers, but in a trust-quorum-free // deployment we don't actually wrap connections in sprockets. - request - .trust_quorum_share + trust_quorum_share .as_ref() .map_or(&[], |share| share.member_device_id_certs.as_slice()), log.new(o!("BootstrapAgentClient" => bootstrap_addr.to_string())), ); let sled_agent_initialize = || async { - client.start_sled(request).await.map_err(BackoffError::transient)?; + client + .start_sled(request, trust_quorum_share.clone()) + .await + .map_err(BackoffError::transient)?; Ok::<(), BackoffError>(()) }; @@ -121,7 +125,7 @@ fn rss_channel( } type InnerInitRequest = ( - Vec<(SocketAddrV6, SledAgentRequest)>, + Vec<(SocketAddrV6, SledAgentRequest, Option)>, oneshot::Sender>, ); @@ -141,7 +145,11 @@ impl BootstrapAgentHandle { /// that failed to initialize). pub(crate) async fn initialize_sleds( self, - requests: Vec<(SocketAddrV6, SledAgentRequest)>, + requests: Vec<( + SocketAddrV6, + SledAgentRequest, + Option, + )>, ) -> Result<(), String> { let (tx, rx) = oneshot::channel(); @@ -181,21 +189,27 @@ impl BootstrapAgentHandleReceiver { // of the initialization requests, allowing them to run concurrently. let mut futs = requests .into_iter() - .map(|(bootstrap_addr, request)| async move { + .map(|(bootstrap_addr, request, trust_quorum_share)| async move { info!( log, "Received initialization request from RSS"; "request" => ?request, "target_sled" => %bootstrap_addr, ); - initialize_sled_agent(log, bootstrap_addr, &request, sp) - .await - .map_err(|err| { - format!( - "Failed to initialize sled agent at {}: {}", - bootstrap_addr, err - ) - })?; + initialize_sled_agent( + log, + bootstrap_addr, + &request, + trust_quorum_share, + sp, + ) + .await + .map_err(|err| { + format!( + "Failed to initialize sled agent at {}: {}", + bootstrap_addr, err + ) + })?; info!( log, "Initialized sled agent"; diff --git a/sled-agent/src/bootstrap/server.rs b/sled-agent/src/bootstrap/server.rs index 45a8cc349cb..a24b65dbf24 100644 --- a/sled-agent/src/bootstrap/server.rs +++ b/sled-agent/src/bootstrap/server.rs @@ -270,16 +270,20 @@ async fn serve_request_before_quorum_initialization( .map_err(|err| format!("Failed to establish sprockets session: {err}"))?; let response = match read_request(&mut stream).await? { - Request::SledAgentRequest(request) => { - match bootstrap_agent.request_agent(&*request).await { + Request::SledAgentRequest(request, trust_quorum_share) => { + let trust_quorum_share = + trust_quorum_share.map(ShareDistribution::from); + match bootstrap_agent + .request_agent(&*request, &trust_quorum_share) + .await + { Ok(response) => { // If this send fails, it means our caller already received // our share from a different // `serve_request_before_quorum_initialization()` task // (i.e., from another incoming request from RSS). We'll // ignore such failures. - let _ = - tx_share.send(request.trust_quorum_share.clone()).await; + let _ = tx_share.send(trust_quorum_share).await; Ok(Response::SledAgentResponse(response)) } @@ -320,7 +324,7 @@ async fn serve_request_after_quorum_initialization( .map_err(|err| format!("Failed to establish sprockets session: {err}"))?; let response = match read_request(&mut stream).await? { - Request::SledAgentRequest(request) => { + Request::SledAgentRequest(request, _trust_quorum_share) => { warn!( log, "Received sled agent request after we're initialized"; "request" => ?request, diff --git a/sled-agent/src/bootstrap/trust_quorum/mod.rs b/sled-agent/src/bootstrap/trust_quorum/mod.rs index 75ed9960e4a..5aa249f48e8 100644 --- a/sled-agent/src/bootstrap/trust_quorum/mod.rs +++ b/sled-agent/src/bootstrap/trust_quorum/mod.rs @@ -16,4 +16,5 @@ mod share_distribution; pub use error::TrustQuorumError; pub use rack_secret::RackSecret; +pub use share_distribution::SerializableShareDistribution; pub use share_distribution::ShareDistribution; diff --git a/sled-agent/src/bootstrap/trust_quorum/share_distribution.rs b/sled-agent/src/bootstrap/trust_quorum/share_distribution.rs index d044c880c15..c8019ef8040 100644 --- a/sled-agent/src/bootstrap/trust_quorum/share_distribution.rs +++ b/sled-agent/src/bootstrap/trust_quorum/share_distribution.rs @@ -2,7 +2,8 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -use serde::{Deserialize, Serialize}; +use serde::Deserialize; +use serde::Serialize; use sprockets_host::Ed25519Certificate; use std::fmt; use vsss_rs::Share; @@ -12,7 +13,9 @@ use super::rack_secret::Verifier; /// A ShareDistribution is an individual share of a secret along with all the /// metadata required to allow a server in possession of the share to know how /// to correctly recreate a split secret. -#[derive(Clone, PartialEq, Serialize, Deserialize)] +// We intentionally DO NOT derive `Debug` or `Serialize`; both provide avenues +// by which we may accidentally log the contents of our `share`. +#[derive(Clone, PartialEq, Deserialize)] pub struct ShareDistribution { pub threshold: usize, pub verifier: Verifier, @@ -38,3 +41,39 @@ impl fmt::Debug for ShareDistribution { .finish() } } + +/// This type is equivalent to `ShareDistribution` but implements `Serialize`. +/// It should be used _very carefully_; `ShareDistribution` should be preferred +/// in almost all cases to avoid accidental spillage of our `Share` contents. +/// This type should only be used to build careful serialization routines that +/// need to deal with trust quorum shares; e.g., +/// `RequestEnvelope::danger_serialize_as_json()`. +#[derive(Clone, PartialEq, Serialize, Deserialize)] +pub struct SerializableShareDistribution { + pub threshold: usize, + pub share: Share, + pub member_device_id_certs: Vec, + pub verifier: Verifier, +} + +impl From for SerializableShareDistribution { + fn from(dist: ShareDistribution) -> Self { + Self { + threshold: dist.threshold, + verifier: dist.verifier, + share: dist.share, + member_device_id_certs: dist.member_device_id_certs, + } + } +} + +impl From for ShareDistribution { + fn from(dist: SerializableShareDistribution) -> Self { + Self { + threshold: dist.threshold, + verifier: dist.verifier, + share: dist.share, + member_device_id_certs: dist.member_device_id_certs, + } + } +} diff --git a/sled-agent/src/rack_setup/service.rs b/sled-agent/src/rack_setup/service.rs index 359a8b6dec3..45f133a5eb6 100644 --- a/sled-agent/src/rack_setup/service.rs +++ b/sled-agent/src/rack_setup/service.rs @@ -289,33 +289,7 @@ impl ServiceInner { &self, config: &Config, bootstrap_addrs: Vec, - member_device_id_certs: &[Ed25519Certificate], ) -> Result, SetupServiceError> { - // Create a rack secret, unless we're in the single-sled case. - let mut maybe_rack_secret_shares = generate_rack_secret( - config.rack_secret_threshold, - member_device_id_certs, - &self.log, - )?; - - // Confirm that the returned iterator (if we got one) is the length we - // expect. - if let Some(rack_secret_shares) = maybe_rack_secret_shares.as_ref() { - // TODO-cleanup Asserting here seems fine as long as - // `member_device_id_certs` is hard-coded from a config file, but - // once we start collecting them over the management network we - // should probably attach them at the type level to the bootstrap - // addrs, which would remove the need for this assertion. - assert_eq!( - rack_secret_shares.len(), - bootstrap_addrs.len(), - concat!( - "Number of trust quorum members does not match ", - "number of bootstrap addresses" - ) - ); - } - let bootstrap_addrs = bootstrap_addrs.into_iter().enumerate(); let reserved_rack_subnet = ReservedRackSubnet::new(config.az_subnet()); let dns_subnets = reserved_rack_subnet.get_dns_subnets(); @@ -382,15 +356,6 @@ impl ServiceInner { id: Uuid::new_v4(), subnet, rack_id, - trust_quorum_share: maybe_rack_secret_shares - .as_mut() - .map(|shares_iter| { - // We asserted when creating - // `maybe_rack_secret_shares` that it contained - // exactly the number of shares as we have - // bootstrap addrs, so we can unwrap here. - shares_iter.next().unwrap() - }), }, services_request: request, }, @@ -563,17 +528,45 @@ impl ServiceInner { plan } else { info!(self.log, "Creating new allocation plan"); - self.create_plan(config, addrs, member_device_id_certs).await? + self.create_plan(config, addrs).await? }; + // Generate our rack secret, unless we're in the single-sled case. + let mut maybe_rack_secret_shares = generate_rack_secret( + config.rack_secret_threshold, + member_device_id_certs, + &self.log, + )?; + + // Confirm that the returned iterator (if we got one) is the length we + // expect. + if let Some(rack_secret_shares) = maybe_rack_secret_shares.as_ref() { + // TODO-cleanup Asserting here seems fine as long as + // `member_device_id_certs` is hard-coded from a config file, but + // once we start collecting them over the management network we + // should probably attach them at the type level to the bootstrap + // addrs, which would remove the need for this assertion. + assert_eq!( + rack_secret_shares.len(), + plan.len(), + concat!( + "Number of trust quorum members does not match ", + "number of sleds in the plan" + ) + ); + } + // Forward the sled initialization requests to our sled-agent. local_bootstrap_agent .initialize_sleds( plan.iter() - .map(|(bootstrap_addr, allocation)| { + .map(move |(bootstrap_addr, allocation)| { ( *bootstrap_addr, allocation.initialization_request.clone(), + maybe_rack_secret_shares + .as_mut() + .map(|shares| shares.next().unwrap()), ) }) .collect(),