diff --git a/account_manager/src/validator/slashing_protection.rs b/account_manager/src/validator/slashing_protection.rs index bcd860a4847..57d532d0ae2 100644 --- a/account_manager/src/validator/slashing_protection.rs +++ b/account_manager/src/validator/slashing_protection.rs @@ -90,7 +90,7 @@ pub fn cli_run( let slashing_protection_database = SlashingDatabase::open_or_create(&slashing_protection_db_path).map_err(|e| { format!( - "Unable to open database at {}: {:?}", + "Unable to open slashing protection database at {}: {:?}", slashing_protection_db_path.display(), e ) @@ -198,7 +198,7 @@ pub fn cli_run( let slashing_protection_database = SlashingDatabase::open(&slashing_protection_db_path) .map_err(|e| { format!( - "Unable to open database at {}: {:?}", + "Unable to open slashing protection database at {}: {:?}", slashing_protection_db_path.display(), e ) diff --git a/beacon_node/beacon_chain/src/beacon_chain.rs b/beacon_node/beacon_chain/src/beacon_chain.rs index 990f4b6099c..bf87d3afca2 100644 --- a/beacon_node/beacon_chain/src/beacon_chain.rs +++ b/beacon_node/beacon_chain/src/beacon_chain.rs @@ -120,7 +120,7 @@ use std::time::Duration; use store::iter::{BlockRootsIterator, ParentRootBlockIterator, StateRootsIterator}; use store::{ BlobSidecarListFromRoot, DatabaseBlock, Error as DBError, HotColdDB, HotStateSummary, - KeyValueStore, KeyValueStoreOp, StoreItem, StoreOp, + KeyValueStoreOp, StoreItem, StoreOp, }; use task_executor::{ShutdownReason, TaskExecutor}; use tokio_stream::Stream; @@ -3986,8 +3986,6 @@ impl BeaconChain { ops.push(StoreOp::PutBlock(block_root, signed_block.clone())); ops.push(StoreOp::PutState(block.state_root(), &state)); - let txn_lock = self.store.hot_db.begin_rw_transaction(); - if let Err(e) = self.store.do_atomically_with_block_and_blobs_cache(ops) { error!( msg = "Restoring fork choice from disk", @@ -3999,7 +3997,6 @@ impl BeaconChain { .err() .unwrap_or(e.into())); } - drop(txn_lock); // The fork choice write-lock is dropped *after* the on-disk database has been updated. // This prevents inconsistency between the two at the expense of concurrency. @@ -6794,13 +6791,22 @@ impl BeaconChain { #[allow(clippy::type_complexity)] pub fn chain_dump( &self, + ) -> Result>>, Error> { + self.chain_dump_from_slot(Slot::new(0)) + } + + /// As for `chain_dump` but dumping only the portion of the chain newer than `from_slot`. + #[allow(clippy::type_complexity)] + pub fn chain_dump_from_slot( + &self, + from_slot: Slot, ) -> Result>>, Error> { let mut dump = vec![]; let mut prev_block_root = None; let mut prev_beacon_state = None; - for res in self.forwards_iter_block_roots(Slot::new(0))? { + for res in self.forwards_iter_block_roots(from_slot)? { let (beacon_block_root, _) = res?; // Do not include snapshots at skipped slots. diff --git a/beacon_node/beacon_chain/src/block_verification.rs b/beacon_node/beacon_chain/src/block_verification.rs index 1bbf845fa5f..72104e439c0 100644 --- a/beacon_node/beacon_chain/src/block_verification.rs +++ b/beacon_node/beacon_chain/src/block_verification.rs @@ -90,7 +90,7 @@ use std::fmt::Debug; use std::fs; use std::io::Write; use std::sync::Arc; -use store::{Error as DBError, HotStateSummary, KeyValueStore, StoreOp}; +use store::{Error as DBError, KeyValueStore}; use strum::AsRefStr; use task_executor::JoinHandle; use tracing::{debug, error}; @@ -1477,28 +1477,19 @@ impl ExecutionPendingBlock { // processing, but we get early access to it. let state_root = state.update_tree_hash_cache()?; - // Store the state immediately. 
- let txn_lock = chain.store.hot_db.begin_rw_transaction(); + // Store the state immediately. States are ONLY deleted on finalization pruning, so + // we won't have race conditions where we should have written a state and didn't. let state_already_exists = chain.store.load_hot_state_summary(&state_root)?.is_some(); - let state_batch = if state_already_exists { + if state_already_exists { // If the state exists, we do not need to re-write it. - vec![] } else { - vec![if state.slot() % T::EthSpec::slots_per_epoch() == 0 { - StoreOp::PutState(state_root, &state) - } else { - StoreOp::PutStateSummary( - state_root, - HotStateSummary::new(&state_root, &state)?, - ) - }] + // Recycle store codepath to create a state summary and store the state / diff + let mut ops = vec![]; + chain.store.store_hot_state(&state_root, &state, &mut ops)?; + chain.store.hot_db.do_atomically(ops)?; }; - chain - .store - .do_atomically_with_block_and_blobs_cache(state_batch)?; - drop(txn_lock); state_root }; diff --git a/beacon_node/beacon_chain/src/builder.rs b/beacon_node/beacon_chain/src/builder.rs index 2346aca00b5..43c529c4116 100644 --- a/beacon_node/beacon_chain/src/builder.rs +++ b/beacon_node/beacon_chain/src/builder.rs @@ -42,8 +42,8 @@ use store::{Error as StoreError, HotColdDB, ItemStore, KeyValueStoreOp}; use task_executor::{ShutdownReason, TaskExecutor}; use tracing::{debug, error, info}; use types::{ - BeaconBlock, BeaconState, BlobSidecarList, ChainSpec, Checkpoint, DataColumnSidecarList, Epoch, - EthSpec, FixedBytesExtended, Hash256, Signature, SignedBeaconBlock, Slot, + BeaconBlock, BeaconState, BlobSidecarList, ChainSpec, DataColumnSidecarList, Epoch, EthSpec, + FixedBytesExtended, Hash256, Signature, SignedBeaconBlock, Slot, }; /// An empty struct used to "witness" all the `BeaconChainTypes` traits. It has no user-facing @@ -380,21 +380,29 @@ where } /// Starts a new chain from a genesis state. - pub fn genesis_state(mut self, beacon_state: BeaconState) -> Result { + pub fn genesis_state(mut self, mut beacon_state: BeaconState) -> Result { let store = self.store.clone().ok_or("genesis_state requires a store")?; - let (genesis, updated_builder) = self.set_genesis_state(beacon_state)?; - self = updated_builder; - - // Stage the database's metadata fields for atomic storage when `build` is called. + // Initialize anchor info before attempting to write the genesis state. // Since v4.4.0 we will set the anchor with a dummy state upper limit in order to prevent // historic states from being retained (unless `--reconstruct-historic-states` is set). let retain_historic_states = self.chain_config.reconstruct_historic_states; + let genesis_beacon_block = genesis_block(&mut beacon_state, &self.spec)?; self.pending_io_batch.push( store - .init_anchor_info(genesis.beacon_block.message(), retain_historic_states) + .init_anchor_info( + genesis_beacon_block.parent_root(), + genesis_beacon_block.slot(), + Slot::new(0), + retain_historic_states, + ) .map_err(|e| format!("Failed to initialize genesis anchor: {:?}", e))?, ); + + let (genesis, updated_builder) = self.set_genesis_state(beacon_state)?; + self = updated_builder; + + // Stage the database's metadata fields for atomic storage when `build` is called. 
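+        // (Everything pushed to `pending_io_batch` is committed in a single atomic write when
+        // `build` is called, so a crash before then leaves no partial metadata on disk.)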
self.pending_io_batch.push( store .init_blob_info(genesis.beacon_block.slot()) @@ -519,6 +527,13 @@ where } } + debug!( + slot = %weak_subj_slot, + state_root = ?weak_subj_state_root, + block_root = ?weak_subj_block_root, + "Storing split from weak subjectivity state" + ); + // Set the store's split point *before* storing genesis so that genesis is stored // immediately in the freezer DB. store.set_split(weak_subj_slot, weak_subj_state_root, weak_subj_block_root); @@ -539,6 +554,26 @@ where .cold_db .do_atomically(block_root_batch) .map_err(|e| format!("Error writing frozen block roots: {e:?}"))?; + debug!( + from = %weak_subj_block.slot(), + to_excl = %weak_subj_state.slot(), + block_root = ?weak_subj_block_root, + "Stored frozen block roots at skipped slots" + ); + + // Write the anchor to memory before calling `put_state` otherwise hot hdiff can't store + // states that do not align with the `start_slot` grid. + let retain_historic_states = self.chain_config.reconstruct_historic_states; + self.pending_io_batch.push( + store + .init_anchor_info( + weak_subj_block.parent_root(), + weak_subj_block.slot(), + weak_subj_slot, + retain_historic_states, + ) + .map_err(|e| format!("Failed to initialize anchor info: {:?}", e))?, + ); // Write the state, block and blobs non-atomically, it doesn't matter if they're forgotten // about on a crash restart. @@ -549,6 +584,8 @@ where weak_subj_state.clone(), ) .map_err(|e| format!("Failed to set checkpoint state as finalized state: {:?}", e))?; + // Note: post hot hdiff must update the anchor info before attempting to put_state otherwise + // the write will fail if the weak_subj_slot is not aligned with the snapshot moduli. store .put_state(&weak_subj_state_root, &weak_subj_state) .map_err(|e| format!("Failed to store weak subjectivity state: {e:?}"))?; @@ -578,13 +615,7 @@ where // Stage the database's metadata fields for atomic storage when `build` is called. // This prevents the database from restarting in an inconsistent state if the anchor // info or split point is written before the `PersistedBeaconChain`. - let retain_historic_states = self.chain_config.reconstruct_historic_states; self.pending_io_batch.push(store.store_split_in_batch()); - self.pending_io_batch.push( - store - .init_anchor_info(weak_subj_block.message(), retain_historic_states) - .map_err(|e| format!("Failed to initialize anchor info: {:?}", e))?, - ); self.pending_io_batch.push( store .init_blob_info(weak_subj_block.slot()) @@ -596,13 +627,6 @@ where .map_err(|e| format!("Failed to initialize data column info: {:?}", e))?, ); - // Store pruning checkpoint to prevent attempting to prune before the anchor state. - self.pending_io_batch - .push(store.pruning_checkpoint_store_op(Checkpoint { - root: weak_subj_block_root, - epoch: weak_subj_state.slot().epoch(E::slots_per_epoch()), - })); - let snapshot = BeaconSnapshot { beacon_block_root: weak_subj_block_root, beacon_block: Arc::new(weak_subj_block), diff --git a/beacon_node/beacon_chain/src/historical_blocks.rs b/beacon_node/beacon_chain/src/historical_blocks.rs index 348e6d52a64..57e19393165 100644 --- a/beacon_node/beacon_chain/src/historical_blocks.rs +++ b/beacon_node/beacon_chain/src/historical_blocks.rs @@ -151,6 +151,7 @@ impl BeaconChain { // Store block roots, including at all skip slots in the freezer DB. 
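+        // For example, if `block.slot() == 5` and `prev_block_slot == 8`, the loop below writes
+        // the mappings `7 -> block_root`, `6 -> block_root` and `5 -> block_root`, so that
+        // lookups at the skipped slots 6 and 7 resolve to the most recent block at or before them.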
        for slot in (block.slot().as_u64()..prev_block_slot.as_u64()).rev() {
+            debug!(%slot, ?block_root, "Storing frozen block to root mapping");
            cold_batch.push(KeyValueStoreOp::PutKeyValue(
                DBColumn::BeaconBlockRoots,
                slot.to_be_bytes().to_vec(),
diff --git a/beacon_node/beacon_chain/src/migrate.rs b/beacon_node/beacon_chain/src/migrate.rs
index 03c468a35ef..09534fc4ccf 100644
--- a/beacon_node/beacon_chain/src/migrate.rs
+++ b/beacon_node/beacon_chain/src/migrate.rs
@@ -1,5 +1,5 @@
 use crate::errors::BeaconChainError;
-use crate::summaries_dag::{DAGStateSummaryV22, Error as SummariesDagError, StateSummariesDAG};
+use crate::summaries_dag::{DAGStateSummary, Error as SummariesDagError, StateSummariesDAG};
 use parking_lot::Mutex;
 use std::collections::HashSet;
 use std::mem;
@@ -7,7 +7,7 @@ use std::sync::{mpsc, Arc};
 use std::thread;
 use std::time::{Duration, SystemTime, UNIX_EPOCH};
 use store::hot_cold_store::{migrate_database, HotColdDBError};
-use store::{Error, ItemStore, StoreOp};
+use store::{Error, ItemStore, Split, StoreOp};
 pub use store::{HotColdDB, MemoryStore};
 use tracing::{debug, error, info, warn};
 use types::{BeaconState, BeaconStateHash, Checkpoint, Epoch, EthSpec, Hash256, Slot};
@@ -343,18 +343,23 @@ impl<E: EthSpec, Hot: ItemStore<E>, Cold: ItemStore<E>> BackgroundMigrator<E, Hot, Cold>
-            Ok(()) => {}
+            Ok(split_change) => {
+                // The migration ran; return the split from before the migration.
+                split_change.previous
+            }
            Err(Error::HotColdDBError(HotColdDBError::FreezeSlotUnaligned(slot))) => {
                debug!(
                    slot = slot.as_u64(),
                    "Database migration postponed, unaligned finalized block"
                );
+                // The migration did not run; return the current split info.
+                db.get_split_info()
            }
            Err(e) => {
                warn!(error = ?e, "Database migration failed");
@@ -367,6 +372,7 @@ impl<E: EthSpec, Hot: ItemStore<E>, Cold: ItemStore<E>> BackgroundMigrator<E, Hot, Cold>
        new_finalized_checkpoint: Checkpoint,
+        split_prior_to_migration: Split,
    ) -> Result<PruningOutcome, BeaconChainError> {
        let new_finalized_slot = new_finalized_checkpoint
            .epoch
@@ -519,6 +526,7 @@ impl<E: EthSpec, Hot: ItemStore<E>, Cold: ItemStore<E>> BackgroundMigrator<E, Hot, Cold>
-            .collect::<Result<Vec<_>, BeaconChainError>>()?;
-
-        // De-duplicate block roots to reduce block reads below
-        let summary_block_roots = HashSet::<Hash256>::from_iter(
-            state_summaries
-                .iter()
-                .map(|(_, summary)| summary.latest_block_root),
-        );
+            .map(|(state_root, summary)| (state_root, summary.into()))
+            .collect::<Vec<_>>();

        // Sanity check, there is at least one summary with the new finalized block root
-        if !summary_block_roots.contains(&new_finalized_checkpoint.root) {
+        if !state_summaries
+            .iter()
+            .any(|(_, s)| s.latest_block_root == new_finalized_checkpoint.root)
+        {
            return Err(BeaconChainError::PruningError(
                PruningError::MissingSummaryForFinalizedCheckpoint(
                    new_finalized_checkpoint.root,
@@ -562,16 +551,31 @@ impl<E: EthSpec, Hot: ItemStore<E>, Cold: ItemStore<E>> BackgroundMigrator<E, Hot, Cold>
-        if state_summaries_dag_roots.len() > 1 {
+        let state_summaries_dag_roots_post_split = state_summaries_dag_roots
+            .iter()
+            .filter(|(_, s)| s.slot >= split_prior_to_migration.slot)
+            .collect::<Vec<_>>();
+
+        // Because of the additional HDiffs kept for the grid prior to finalization, the
+        // `tree_roots` function will consider them roots. Those are expected. We just want to
+        // assert that the relevant tree of states (post-split) is well-formed.
+
+        // This warning could also fire if we have imported a block that doesn't descend from the
+        // new finalized state and has had its ancestor state summaries pruned by a previous
+        // run. See: https://github.com/sigp/lighthouse/issues/7270.
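+        //
+        // For example (illustrative slots): with the split at slot 64 and hot diffs retained at
+        // slots 0 and 32 to support the grid, the summaries at slots 0 and 32 have no parent in
+        // the DAG and show up as extra roots; only roots at or after the split are suspicious.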
+        if state_summaries_dag_roots_post_split.len() > 1 {
            warn!(
-                state_summaries_dag_roots = ?state_summaries_dag_roots,
+                location = "pruning",
+                new_finalized_state_root = ?new_finalized_state_root,
+                split_prior_to_migration_slot = %split_prior_to_migration.slot,
+                state_summaries_dag_roots_post_split = ?state_summaries_dag_roots_post_split,
                error = "summaries dag found more than one root",
                "Notify the devs your hot DB has some inconsistency. Pruning will fix it but devs want to know about it",
            );
@@ -626,10 +630,17 @@ impl<E: EthSpec, Hot: ItemStore<E>, Cold: ItemStore<E>> BackgroundMigrator<E, Hot, Cold>
        let mut blocks_to_prune: HashSet<Hash256> = HashSet::new();
        let mut states_to_prune: HashSet<(Slot, Hash256)> = HashSet::new();
+        let mut kept_summaries_for_hdiff = vec![];

        // Consider the following block tree where we finalize block `[0]` at the checkpoint `(f)`.
        // There's a block `[3]` that descends from the finalized block but NOT from the
@@ -650,6 +661,30 @@ impl<E: EthSpec, Hot: ItemStore<E>, Cold: ItemStore<E>> BackgroundMigrator<E, Hot, Cold>
diff --git a/beacon_node/beacon_chain/src/schema_change.rs b/beacon_node/beacon_chain/src/schema_change.rs
--- a/beacon_node/beacon_chain/src/schema_change.rs
+++ b/beacon_node/beacon_chain/src/schema_change.rs
        (SchemaVersion(23), SchemaVersion(22)) => {
            let ops = migration_schema_v23::downgrade_from_v23::<T>(db.clone())?;
            db.store_schema_version_atomically(to, ops)
        }
+        (SchemaVersion(23), SchemaVersion(24)) => {
+            let ops = migration_schema_v24::upgrade_to_v24::<T>(db.clone())?;
+            db.store_schema_version_atomically(to, ops)
+        }
+        (SchemaVersion(24), SchemaVersion(23)) => {
+            let ops = migration_schema_v24::downgrade_from_v24::<T>(db.clone())?;
+            db.store_schema_version_atomically(to, ops)
+        }
        // Anything else is an error.
        (_, _) => Err(HotColdDBError::UnsupportedSchemaVersion {
            target_version: to,
diff --git a/beacon_node/beacon_chain/src/schema_change/migration_schema_v23.rs b/beacon_node/beacon_chain/src/schema_change/migration_schema_v23.rs
index d0f8202679c..d70f41bb7eb 100644
--- a/beacon_node/beacon_chain/src/schema_change/migration_schema_v23.rs
+++ b/beacon_node/beacon_chain/src/schema_change/migration_schema_v23.rs
@@ -43,21 +43,26 @@ pub fn upgrade_to_v23(
        let state_root = state_root_result?;
        debug!(
            ?state_root,
-            "Deleting temporary state flag on v23 schema migration"
+            "Deleting temporary state on v23 schema migration"
        );
        ops.push(KeyValueStoreOp::DeleteKey(
            DBColumn::BeaconStateTemporary,
            state_root.as_slice().to_vec(),
        ));
-        // Here we SHOULD delete the items for key `state_root` in columns `BeaconState` and
-        // `BeaconStateSummary`. However, in the event we have dangling temporary states at the time
-        // of the migration, the first pruning routine will prune them. They will be a tree branch /
-        // root not part of the finalized tree and trigger a warning log once.
-        //
-        // We believe there may be race conditions concerning temporary flags where a necessary
-        // canonical state is marked as temporary. In current stable, a restart with that DB will
-        // corrupt the DB. In the unlikely case this happens we choose to leave the states and
-        // allow pruning to clean them.
+
+        // We also delete the temporary states themselves. Although there are known issues with
+        // temporary states and this could lead to DB corruption, we will only corrupt the DB in
+        // cases where the DB would be corrupted by restarting on v7.0.x. We consider these DBs
+        // "too far gone". Deleting here has the advantage of not generating warnings about
+        // disjoint state DAGs in the v24 upgrade, or the first pruning after migration.
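+        // The state itself and its summary are keyed by the same state root, so the two deletes
+        // below remove both records for each temporary state.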
+ ops.push(KeyValueStoreOp::DeleteKey( + DBColumn::BeaconState, + state_root.as_slice().to_vec(), + )); + ops.push(KeyValueStoreOp::DeleteKey( + DBColumn::BeaconStateSummary, + state_root.as_slice().to_vec(), + )); } Ok(ops) diff --git a/beacon_node/beacon_chain/src/schema_change/migration_schema_v24.rs b/beacon_node/beacon_chain/src/schema_change/migration_schema_v24.rs new file mode 100644 index 00000000000..6901c99ceec --- /dev/null +++ b/beacon_node/beacon_chain/src/schema_change/migration_schema_v24.rs @@ -0,0 +1,605 @@ +use crate::{ + beacon_chain::BeaconChainTypes, + summaries_dag::{DAGStateSummary, DAGStateSummaryV22, StateSummariesDAG}, +}; +use ssz::{Decode, DecodeError, Encode}; +use ssz_derive::Encode; +use std::{ + sync::Arc, + time::{Duration, Instant}, +}; +use store::{ + hdiff::StorageStrategy, + hot_cold_store::{HotStateSummaryV22, OptionalDiffBaseState}, + DBColumn, Error, HotColdDB, HotStateSummary, KeyValueStore, KeyValueStoreOp, StoreItem, +}; +use tracing::{debug, info, warn}; +use types::{ + BeaconState, ChainSpec, Checkpoint, CommitteeCache, EthSpec, Hash256, Slot, CACHED_EPOCHS, +}; + +/// We stopped using the pruning checkpoint in schema v23 but never explicitly deleted it. +/// +/// We delete it as part of the v24 migration. +pub const PRUNING_CHECKPOINT_KEY: Hash256 = Hash256::repeat_byte(3); + +pub fn store_full_state_v22( + state_root: &Hash256, + state: &BeaconState, + ops: &mut Vec, +) -> Result<(), Error> { + let bytes = StorageContainer::new(state).as_ssz_bytes(); + ops.push(KeyValueStoreOp::PutKeyValue( + DBColumn::BeaconState, + state_root.as_slice().to_vec(), + bytes, + )); + Ok(()) +} + +/// Fetch a V22 state from the database either as a full state or using block replay. +pub fn get_state_v22( + db: &Arc>, + state_root: &Hash256, + spec: &ChainSpec, +) -> Result>, Error> { + let Some(summary) = db.get_item::(state_root)? else { + return Ok(None); + }; + let Some(base_state) = + get_full_state_v22(&db.hot_db, &summary.epoch_boundary_state_root, spec)? + else { + return Ok(None); + }; + // Loading hot states via block replay doesn't care about the schema version, so we can use + // the DB's current method for this. + let update_cache = false; + db.load_hot_state_using_replay( + base_state, + summary.slot, + summary.latest_block_root, + update_cache, + ) + .map(Some) +} + +pub fn get_full_state_v22, E: EthSpec>( + db: &KV, + state_root: &Hash256, + spec: &ChainSpec, +) -> Result>, Error> { + match db.get_bytes(DBColumn::BeaconState, state_root.as_slice())? { + Some(bytes) => { + let container = StorageContainer::from_ssz_bytes(&bytes, spec)?; + Ok(Some(container.try_into()?)) + } + None => Ok(None), + } +} + +/// A container for storing `BeaconState` components. +/// +/// DEPRECATED. +#[derive(Encode)] +pub struct StorageContainer { + state: BeaconState, + committee_caches: Vec>, +} + +impl StorageContainer { + /// Create a new instance for storing a `BeaconState`. + pub fn new(state: &BeaconState) -> Self { + Self { + state: state.clone(), + committee_caches: state.committee_caches().to_vec(), + } + } + + pub fn from_ssz_bytes(bytes: &[u8], spec: &ChainSpec) -> Result { + // We need to use the slot-switching `from_ssz_bytes` of `BeaconState`, which doesn't + // compose with the other SSZ utils, so we duplicate some parts of `ssz_derive` here. 
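+        // The container is encoded as two variable-length SSZ fields: the state bytes (which
+        // require the fork-aware `BeaconState::from_ssz_bytes`), followed by the committee caches.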
+ let mut builder = ssz::SszDecoderBuilder::new(bytes); + + builder.register_anonymous_variable_length_item()?; + builder.register_type::>()?; + + let mut decoder = builder.build()?; + + let state = decoder.decode_next_with(|bytes| BeaconState::from_ssz_bytes(bytes, spec))?; + let committee_caches = decoder.decode_next()?; + + Ok(Self { + state, + committee_caches, + }) + } +} + +impl TryInto> for StorageContainer { + type Error = Error; + + fn try_into(mut self) -> Result, Error> { + let mut state = self.state; + + for i in (0..CACHED_EPOCHS).rev() { + if i >= self.committee_caches.len() { + return Err(Error::SszDecodeError(DecodeError::BytesInvalid( + "Insufficient committees for BeaconState".to_string(), + ))); + }; + + state.committee_caches_mut()[i] = self.committee_caches.remove(i); + } + + Ok(state) + } +} + +/// The checkpoint used for pruning the database. +/// +/// Updated whenever pruning is successful. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct PruningCheckpoint { + pub checkpoint: Checkpoint, +} + +impl StoreItem for PruningCheckpoint { + fn db_column() -> DBColumn { + DBColumn::BeaconMeta + } + + fn as_store_bytes(&self) -> Vec { + self.checkpoint.as_ssz_bytes() + } + + fn from_store_bytes(bytes: &[u8]) -> Result { + Ok(PruningCheckpoint { + checkpoint: Checkpoint::from_ssz_bytes(bytes)?, + }) + } +} + +pub fn upgrade_to_v24( + db: Arc>, +) -> Result, Error> { + let mut migrate_ops = vec![]; + let split = db.get_split_info(); + let hot_hdiff_start_slot = split.slot; + + // Delete the `PruningCheckpoint` (no longer used). + migrate_ops.push(KeyValueStoreOp::DeleteKey( + DBColumn::BeaconMeta, + PRUNING_CHECKPOINT_KEY.as_slice().to_vec(), + )); + + // Sanity check to make sure the HDiff grid is aligned with the epoch start + if hot_hdiff_start_slot % T::EthSpec::slots_per_epoch() != 0 { + return Err(Error::MigrationError(format!( + "hot_hdiff_start_slot is not first slot in epoch {hot_hdiff_start_slot}" + ))); + } + + // After V24 hot tree states, the in-memory `anchor_info.anchor_slot` is the start slot of the + // hot HDiff grid. Before the migration, it's set to the slot of the anchor state in the DB: + // - the genesis state on a genesis sync, or + // - the checkpoint state on a checkpoint sync. + // + // If the node has been running for a while the `anchor_slot` might be less than the finalized + // checkpoint. This upgrade constructs a grid only with unfinalized states, rooted in the + // current finalized state. So we set the `anchor_slot` to `split.slot` to root the grid in the + // current finalized state. Each migration sets the split to + // ``` + // Split { slot: finalized_state.slot(), state_root: finalized_state_root } + // ``` + { + let anchor_info = db.get_anchor_info(); + + // If the node is already an archive node, we can set the anchor slot to 0 and copy + // snapshots and diffs from the freezer DB to the hot DB in order to establish an initial + // hot grid that is aligned/"perfect" (no `start_slot`/`anchor_slot` to worry about). + // + // This only works if all of the following are true: + // + // - We have the previous snapshot for the split state stored in the freezer DB, i.e. + // if `previous_snapshot_slot >= state_upper_limit`. + // - The split state itself will be stored as a diff or snapshot in the new grid. We choose + // not to support a split state that requires block replay, because computing its previous + // state root from the DAG is not straight-forward. 
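+        //
+        // Using a dummy start slot of 0 below asks the hierarchy for the "perfect" grid: the
+        // diff/snapshot layer points an archive node would have if it had synced from genesis.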
+        let dummy_start_slot = Slot::new(0);
+        let closest_layer_points = db
+            .hierarchy
+            .closest_layer_points(split.slot, dummy_start_slot);
+
+        let previous_snapshot_slot =
+            closest_layer_points
+                .iter()
+                .copied()
+                .min()
+                .ok_or(Error::MigrationError(
+                    "closest_layer_points must not be empty".to_string(),
+                ))?;
+
+        if previous_snapshot_slot >= anchor_info.state_upper_limit
+            && db
+                .hierarchy
+                .storage_strategy(split.slot, dummy_start_slot)
+                .is_ok_and(|strategy| !strategy.is_replay_from())
+        {
+            info!(
+                %previous_snapshot_slot,
+                split_slot = %split.slot,
+                "Aligning hot diff grid to freezer"
+            );
+
+            // Set anchor slot to 0 in case it was set to something else by a previous checkpoint
+            // sync.
+            let mut new_anchor_info = anchor_info.clone();
+            new_anchor_info.anchor_slot = Slot::new(0);
+
+            // Update the anchor on disk atomically if the migration is successful.
+            migrate_ops.push(db.compare_and_set_anchor_info(anchor_info, new_anchor_info)?);
+
+            // Copy each of the freezer layers to the hot DB in slot ascending order.
+            for layer_slot in closest_layer_points.into_iter().rev() {
+                // Do not try to load the split state itself from the freezer, it won't be there.
+                // It will be migrated in the main loop below.
+                if layer_slot == split.slot {
+                    continue;
+                }
+
+                let mut freezer_state = db.load_cold_state_by_slot(layer_slot)?;
+
+                let state_root = freezer_state.canonical_root()?;
+
+                let mut state_ops = vec![];
+                db.store_hot_state(&state_root, &freezer_state, &mut state_ops)?;
+                db.hot_db.do_atomically(state_ops)?;
+            }
+        } else {
+            // Otherwise for non-archive nodes, set the anchor slot for the hot grid to the current
+            // split slot (the oldest slot available).
+            let mut new_anchor_info = anchor_info.clone();
+            new_anchor_info.anchor_slot = hot_hdiff_start_slot;
+
+            // Update the anchor on disk atomically if the migration is successful.
+            migrate_ops.push(db.compare_and_set_anchor_info(anchor_info, new_anchor_info)?);
+        }
+    }
+
+    let state_summaries_dag = new_dag::<T>(&db)?;
+
+    // We compute the state summaries DAG outside of a DB migration. Therefore if the DB is
+    // properly pruned, it should have a single root equal to the split.
+    let state_summaries_dag_roots = state_summaries_dag.tree_roots();
+    if state_summaries_dag_roots.len() == 1 {
+        let (root_summary_state_root, root_summary) =
+            state_summaries_dag_roots.first().expect("len == 1");
+        if *root_summary_state_root != split.state_root {
+            warn!(
+                ?root_summary_state_root,
+                ?root_summary,
+                ?split,
+                "State summaries DAG root is not the split"
+            );
+        }
+    } else {
+        warn!(
+            location = "migration",
+            state_summaries_dag_roots = ?state_summaries_dag_roots,
+            "State summaries DAG found more than one root"
+        );
+    }
+
+    // Sort summaries by slot so we have their ancestor diffs already stored when we store them.
+    // If the summaries are sorted topologically we can insert them into the DB as if they were a
+    // new state, re-using existing code. As states are likely to be sequential, the diff cache
+    // should kick in, making the migration more efficient. If we just iterated the column of
+    // summaries we might get a distant state on each iteration.
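+    //
+    // E.g. if the grid stores a diff at slot 32 based on the snapshot at slot 0, ascending slot
+    // order guarantees the slot-0 state is written (and likely still cached) before the slot-32
+    // diff needs it as a base.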
+    let summaries_by_slot = state_summaries_dag.summaries_by_slot_ascending();
+    debug!(
+        summaries_count = state_summaries_dag.summaries_count(),
+        slots_count = summaries_by_slot.len(),
+        min_slot = ?summaries_by_slot.first_key_value().map(|(slot, _)| slot),
+        max_slot = ?summaries_by_slot.last_key_value().map(|(slot, _)| slot),
+        ?state_summaries_dag_roots,
+        %hot_hdiff_start_slot,
+        split_state_root = ?split.state_root,
+        "Starting hot states migration"
+    );
+
+    // Upgrade all hot DB state summaries to the new type:
+    // - Set all summaries of boundary states to `Snapshot` type
+    // - Set all others to `Replay` pointing to `epoch_boundary_state_root`
+
+    let mut diffs_written = 0;
+    let mut summaries_written = 0;
+    let mut last_log_time = Instant::now();
+
+    for (slot, old_hot_state_summaries) in summaries_by_slot {
+        for (state_root, old_summary) in old_hot_state_summaries {
+            if slot < hot_hdiff_start_slot {
+                // To reach here, there must be some pruning issue with the DB where we still have
+                // hot states below the split slot. These states can't be migrated as we can't
+                // compute a storage strategy for them. After this if-else block, the summary and
+                // state are scheduled for deletion.
+                debug!(
+                    %slot,
+                    ?state_root,
+                    "Ignoring state summary prior to split slot"
+                );
+            } else {
+                // 1. Store snapshot or diff at this slot (if required).
+                let storage_strategy = db.hot_storage_strategy(slot)?;
+                debug!(
+                    %slot,
+                    ?state_root,
+                    ?storage_strategy,
+                    "Migrating state summary"
+                );
+
+                match storage_strategy {
+                    StorageStrategy::DiffFrom(_) | StorageStrategy::Snapshot => {
+                        // Load the state and re-store it as a snapshot or diff.
+                        let state = get_state_v22::<T>(&db, &state_root, &db.spec)?
+                            .ok_or(Error::MissingState(state_root))?;
+
+                        // Store immediately so that future diffs can load and diff from it.
+                        let mut ops = vec![];
+                        // We must commit the hot state summary immediately, otherwise we can't diff
+                        // against it and future writes will fail. That's why we write the new hot
+                        // summaries in a different column, to have both new and old data present at
+                        // once. Otherwise, if the process crashes during the migration, the
+                        // database will be broken.
+                        db.store_hot_state_summary(&state_root, &state, &mut ops)?;
+                        db.store_hot_state_diffs(&state_root, &state, &mut ops)?;
+                        db.hot_db.do_atomically(ops)?;
+                        diffs_written += 1;
+                    }
+                    StorageStrategy::ReplayFrom(diff_base_slot) => {
+                        // Optimization: instead of having to load the state of each summary, we
+                        // load 32x fewer states by manually computing the HotStateSummary roots
+                        // using the computed state DAG.
+                        //
+                        // No need to store diffs for states that will be reconstructed by replaying
+                        // blocks.
+                        //
+                        // 2. Convert the summary to the new format.
+                        if state_root == split.state_root {
+                            return Err(Error::MigrationError(
+                                "unreachable: split state should be stored as a snapshot or diff"
+                                    .to_string(),
+                            ));
+                        }
+                        let previous_state_root = state_summaries_dag
+                            .previous_state_root(state_root)
+                            .map_err(|e| {
+                                Error::MigrationError(format!(
+                                    "error computing previous_state_root {e:?}"
+                                ))
+                            })?;
+
+                        let diff_base_state = OptionalDiffBaseState::new(
+                            diff_base_slot,
+                            state_summaries_dag
+                                .ancestor_state_root_at_slot(state_root, diff_base_slot)
+                                .map_err(|e| {
+                                    Error::MigrationError(format!(
+                                        "error computing ancestor_state_root_at_slot \
+                                         ({state_root:?}, {diff_base_slot}): {e:?}"
+                                    ))
+                                })?,
+                        );
+
+                        let new_summary = HotStateSummary {
+                            slot,
+                            latest_block_root: old_summary.latest_block_root,
+                            latest_block_slot: old_summary.latest_block_slot,
+                            previous_state_root,
+                            diff_base_state,
+                        };
+                        let op = new_summary.as_kv_store_op(state_root);
+                        // It's not necessary to immediately commit the summaries of states that are
+                        // ReplayFrom. However, we do so for simplicity.
+                        db.hot_db.do_atomically(vec![op])?;
+                    }
+                }
+            }
+
+            // 3. Stage old data for deletion.
+            if slot % T::EthSpec::slots_per_epoch() == 0 {
+                migrate_ops.push(KeyValueStoreOp::DeleteKey(
+                    DBColumn::BeaconState,
+                    state_root.as_slice().to_vec(),
+                ));
+            }
+
+            // Delete previous summaries.
+            migrate_ops.push(KeyValueStoreOp::DeleteKey(
+                DBColumn::BeaconStateSummary,
+                state_root.as_slice().to_vec(),
+            ));
+
+            summaries_written += 1;
+            if last_log_time.elapsed() > Duration::from_secs(5) {
+                last_log_time = Instant::now();
+                info!(
+                    diffs_written,
+                    summaries_written,
+                    summaries_count = state_summaries_dag.summaries_count(),
+                    "Hot states migration in progress"
+                );
+            }
+        }
+    }
+
+    info!(
+        diffs_written,
+        summaries_written,
+        summaries_count = state_summaries_dag.summaries_count(),
+        "Hot states migration complete"
+    );
+
+    Ok(migrate_ops)
+}
+
+pub fn downgrade_from_v24<T: BeaconChainTypes>(
+    db: Arc<HotColdDB<T::EthSpec, T::HotStore, T::ColdStore>>,
+) -> Result<Vec<KeyValueStoreOp>, Error> {
+    let state_summaries = db
+        .load_hot_state_summaries()?
+        .into_iter()
+        .map(|(state_root, summary)| (state_root, summary.into()))
+        .collect::<Vec<_>>();
+
+    info!(
+        summaries_count = state_summaries.len(),
+        "DB downgrade of v24 state summaries started"
+    );
+
+    let state_summaries_dag = StateSummariesDAG::new(state_summaries)
+        .map_err(|e| Error::MigrationError(format!("Error on new StateSummariesDAG {e:?}")))?;
+
+    let mut migrate_ops = vec![];
+    let mut states_written = 0;
+    let mut summaries_written = 0;
+    let mut summaries_skipped = 0;
+    let mut last_log_time = Instant::now();
+
+    // Rebuild the PruningCheckpoint from the split.
+    let split = db.get_split_info();
+    let pruning_checkpoint = PruningCheckpoint {
+        checkpoint: Checkpoint {
+            epoch: split.slot.epoch(T::EthSpec::slots_per_epoch()),
+            root: split.block_root,
+        },
+    };
+    migrate_ops.push(pruning_checkpoint.as_kv_store_op(PRUNING_CHECKPOINT_KEY));
+
+    // Convert state summaries back to the old format.
+    for (state_root, summary) in state_summaries_dag
+        .summaries_by_slot_ascending()
+        .into_iter()
+        .flat_map(|(_, summaries)| summaries)
+    {
+        // No need to migrate any states prior to the split. The v22 schema does not need them, and
+        // they would generate warnings about a disjoint DAG when re-upgrading to v24.
+        if summary.slot < split.slot {
+            debug!(
+                slot = %summary.slot,
+                ?state_root,
+                "Skipping migration of pre-split state"
+            );
+            summaries_skipped += 1;
+            continue;
+        }
+
+        // If boundary state: persist.
+        // Do not cache these states as they are unlikely to be relevant later.
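+        // (Boundary states are loaded once here only so they can be re-encoded in the legacy
+        // full-state format.)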
+        let update_cache = false;
+        if summary.slot % T::EthSpec::slots_per_epoch() == 0 {
+            let (state, _) = db
+                .load_hot_state(&state_root, update_cache)?
+                .ok_or(Error::MissingState(state_root))?;
+
+            // Immediately commit the state, so we don't OOM. It's stored in a different
+            // column, so if the migration crashes we'll just store extra harmless junk in the DB.
+            let mut state_write_ops = vec![];
+            store_full_state_v22(&state_root, &state, &mut state_write_ops)?;
+            db.hot_db.do_atomically(state_write_ops)?;
+            states_written += 1;
+        }
+
+        // Persist the old summary.
+        let epoch_boundary_state_slot = summary.slot - summary.slot % T::EthSpec::slots_per_epoch();
+        let old_summary = HotStateSummaryV22 {
+            slot: summary.slot,
+            latest_block_root: summary.latest_block_root,
+            epoch_boundary_state_root: state_summaries_dag
+                .ancestor_state_root_at_slot(state_root, epoch_boundary_state_slot)
+                .map_err(|e| {
+                    Error::MigrationError(format!(
+                        "error computing ancestor_state_root_at_slot({state_root:?}, {epoch_boundary_state_slot}) {e:?}"
+                    ))
+                })?,
+        };
+        migrate_ops.push(KeyValueStoreOp::PutKeyValue(
+            DBColumn::BeaconStateSummary,
+            state_root.as_slice().to_vec(),
+            old_summary.as_ssz_bytes(),
+        ));
+        summaries_written += 1;
+
+        if last_log_time.elapsed() > Duration::from_secs(5) {
+            last_log_time = Instant::now();
+            info!(
+                states_written,
+                summaries_written,
+                summaries_count = state_summaries_dag.summaries_count(),
+                "DB downgrade of v24 state summaries in progress"
+            );
+        }
+    }
+
+    // Delete all v24 schema data. We do this outside the loop over summaries to ensure we cover
+    // every piece of data and to simplify logic around skipping certain summaries that do not get
+    // migrated.
+    for db_column in [
+        DBColumn::BeaconStateHotSummary,
+        DBColumn::BeaconStateHotDiff,
+        DBColumn::BeaconStateHotSnapshot,
+    ] {
+        for key in db.hot_db.iter_column_keys::<Hash256>(db_column) {
+            let state_root = key?;
+            migrate_ops.push(KeyValueStoreOp::DeleteKey(
+                db_column,
+                state_root.as_slice().to_vec(),
+            ));
+        }
+    }
+
+    info!(
+        states_written,
+        summaries_written,
+        summaries_skipped,
+        summaries_count = state_summaries_dag.summaries_count(),
+        "DB downgrade of v24 state summaries completed"
+    );
+
+    Ok(migrate_ops)
+}
+
+fn new_dag<T: BeaconChainTypes>(
+    db: &HotColdDB<T::EthSpec, T::HotStore, T::ColdStore>,
+) -> Result<StateSummariesDAG, Error> {
+    // Collect all summaries for unfinalized states
+    let state_summaries_v22 = db
+        .hot_db
+        // Collect summaries from the legacy V22 column BeaconStateSummary
+        .iter_column::<Hash256>(DBColumn::BeaconStateSummary)
+        .map(|res| {
+            let (key, value) = res?;
+            let state_root: Hash256 = key;
+            let summary = HotStateSummaryV22::from_ssz_bytes(&value)?;
+            let block_root = summary.latest_block_root;
+            // Read blocks to get the block slot and parent root. In the Holesky forced
+            // finalization it took 5100 ms to read 15072 state summaries, so it's not really
+            // necessary to de-duplicate block reads.
+            let block = db
+                .get_blinded_block(&block_root)?
+ .ok_or(Error::MissingBlock(block_root))?; + + Ok(( + state_root, + DAGStateSummaryV22 { + slot: summary.slot, + latest_block_root: summary.latest_block_root, + block_slot: block.slot(), + block_parent_root: block.parent_root(), + }, + )) + }) + .collect::, Error>>()?; + + StateSummariesDAG::new_from_v22(state_summaries_v22) + .map_err(|e| Error::MigrationError(format!("error computing states summaries dag {e:?}"))) +} diff --git a/beacon_node/beacon_chain/src/summaries_dag.rs b/beacon_node/beacon_chain/src/summaries_dag.rs index 8dff2ac7be1..42d078baebe 100644 --- a/beacon_node/beacon_chain/src/summaries_dag.rs +++ b/beacon_node/beacon_chain/src/summaries_dag.rs @@ -3,6 +3,7 @@ use std::{ cmp::Ordering, collections::{btree_map::Entry, BTreeMap, HashMap}, }; +use store::HotStateSummary; use types::{Hash256, Slot}; #[derive(Debug, Clone, Copy)] @@ -57,6 +58,12 @@ pub enum Error { root_state_root: Hash256, root_state_slot: Slot, }, + CircularAncestorChain { + state_root: Hash256, + previous_state_root: Hash256, + slot: Slot, + last_slot: Slot, + }, } impl StateSummariesDAG { @@ -311,10 +318,24 @@ impl StateSummariesDAG { } let mut ancestors = vec![]; + let mut last_slot = None; loop { if let Some(summary) = self.state_summaries_by_state_root.get(&state_root) { + // Detect cycles, including the case where `previous_state_root == state_root`. + if let Some(last_slot) = last_slot { + if summary.slot >= last_slot { + return Err(Error::CircularAncestorChain { + state_root, + previous_state_root: summary.previous_state_root, + slot: summary.slot, + last_slot, + }); + } + } + ancestors.push((state_root, summary.slot)); - state_root = summary.previous_state_root + last_slot = Some(summary.slot); + state_root = summary.previous_state_root; } else { return Ok(ancestors); } @@ -336,6 +357,17 @@ impl StateSummariesDAG { } } +impl From for DAGStateSummary { + fn from(value: HotStateSummary) -> Self { + Self { + slot: value.slot, + latest_block_root: value.latest_block_root, + latest_block_slot: value.latest_block_slot, + previous_state_root: value.previous_state_root, + } + } +} + #[cfg(test)] mod tests { use super::{DAGStateSummaryV22, Error, StateSummariesDAG}; diff --git a/beacon_node/beacon_chain/tests/store_tests.rs b/beacon_node/beacon_chain/tests/store_tests.rs index 51c7f0c289e..d6b76ac2ca6 100644 --- a/beacon_node/beacon_chain/tests/store_tests.rs +++ b/beacon_node/beacon_chain/tests/store_tests.rs @@ -24,15 +24,18 @@ use state_processing::{state_advance::complete_state_advance, BlockReplayer}; use std::collections::HashMap; use std::collections::HashSet; use std::convert::TryInto; +use std::str::FromStr; use std::sync::{Arc, LazyLock}; use std::time::Duration; use store::database::interface::BeaconNodeBackend; use store::metadata::{SchemaVersion, CURRENT_SCHEMA_VERSION, STATE_UPPER_LIMIT_NO_RETAIN}; use store::{ + hdiff::HierarchyConfig, iter::{BlockRootsIterator, StateRootsIterator}, BlobInfo, DBColumn, HotColdDB, StoreConfig, }; use tempfile::{tempdir, TempDir}; +use tracing::info; use types::test_utils::{SeedableRng, XorShiftRng}; use types::*; @@ -121,15 +124,16 @@ fn get_harness_generic( harness } -fn count_states_descendant_of_block( +fn get_states_descendant_of_block( store: &HotColdDB, BeaconNodeBackend>, block_root: Hash256, -) -> usize { +) -> Vec<(Hash256, Slot)> { let summaries = store.load_hot_state_summaries().unwrap(); summaries .iter() .filter(|(_, s)| s.latest_block_root == block_root) - .count() + .map(|(state_root, summary)| (*state_root, summary.slot)) + .collect() } 
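+
+// Illustrative usage of the helper above (hypothetical roots, mirroring the garbage collection
+// test below):
+//
+//     let states = get_states_descendant_of_block(&store, genesis_block_root);
+//     assert_eq!(states, vec![(genesis_state_root, Slot::new(0))]);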
#[tokio::test] @@ -516,15 +520,18 @@ async fn epoch_boundary_state_attestation_processing() { .get_blinded_block(&block_root) .unwrap() .expect("block exists"); + // Use get_state as the state may be finalized by this point let mut epoch_boundary_state = store - .load_epoch_boundary_state(&block.state_root()) + .get_state(&block.state_root(), None, CACHE_STATE_IN_TESTS) .expect("no error") - .expect("epoch boundary state exists"); + .unwrap_or_else(|| { + panic!("epoch boundary state should exist {:?}", block.state_root()) + }); let ebs_state_root = epoch_boundary_state.update_tree_hash_cache().unwrap(); let mut ebs_of_ebs = store - .load_epoch_boundary_state(&ebs_state_root) + .get_state(&ebs_state_root, None, CACHE_STATE_IN_TESTS) .expect("no error") - .expect("ebs of ebs exists"); + .unwrap_or_else(|| panic!("ebs of ebs should exist {ebs_state_root:?}")); ebs_of_ebs.apply_pending_mutations().unwrap(); assert_eq!(epoch_boundary_state, ebs_of_ebs); @@ -2171,7 +2178,8 @@ async fn garbage_collect_temp_states_from_failed_block_on_finalization() { let slots_per_epoch = E::slots_per_epoch(); - let genesis_state = harness.get_current_state(); + let mut genesis_state = harness.get_current_state(); + let genesis_state_root = genesis_state.update_tree_hash_cache().unwrap(); let block_slot = Slot::new(2 * slots_per_epoch); let ((signed_block, _), state) = harness.make_block(genesis_state, block_slot).await; @@ -2198,7 +2206,7 @@ async fn garbage_collect_temp_states_from_failed_block_on_finalization() { // The bad block parent root is the genesis block root. There's `block_slot - 1` temporary // states to remove + the genesis state = block_slot. assert_eq!( - count_states_descendant_of_block(&store, bad_block_parent_root), + get_states_descendant_of_block(&store, bad_block_parent_root).len(), block_slot.as_usize(), ); @@ -2216,11 +2224,12 @@ async fn garbage_collect_temp_states_from_failed_block_on_finalization() { // Check that the finalization migration ran. assert_ne!(store.get_split_slot(), 0); - // Check that temporary states have been pruned. The genesis block is not a descendant of the - // latest finalized checkpoint, so all its states have been pruned from the hot DB, = 0. + // Check that temporary states have been pruned. assert_eq!( - count_states_descendant_of_block(&store, bad_block_parent_root), - 0 + get_states_descendant_of_block(&store, bad_block_parent_root), + // The genesis state is kept to support the HDiff grid + vec![(genesis_state_root, Slot::new(0))], + "get_states_descendant_of_block({bad_block_parent_root:?})" ); } @@ -2322,6 +2331,8 @@ async fn weak_subjectivity_sync_test(slots: Vec, checkpoint_slot: Slot) { .get_state(&wss_state_root, Some(checkpoint_slot), CACHE_STATE_IN_TESTS) .unwrap() .unwrap(); + let wss_state_slot = wss_state.slot(); + let wss_block_slot = wss_block.slot(); // Add more blocks that advance finalization further. 
harness.advance_slot(); @@ -2414,12 +2425,14 @@ async fn weak_subjectivity_sync_test(slots: Vec, checkpoint_slot: Slot) { .unwrap(); let slot = full_block.slot(); + let full_block_root = full_block.canonical_root(); let state_root = full_block.state_root(); + info!(block_root = ?full_block_root, ?state_root, %slot, "Importing block from chain dump"); beacon_chain.slot_clock.set_slot(slot.as_u64()); beacon_chain .process_block( - full_block.canonical_root(), + full_block_root, harness.build_rpc_block_from_store_blobs(Some(block_root), Arc::new(full_block)), NotifyExecutionLayer::Yes, BlockImportSource::Lookup, @@ -2506,8 +2519,19 @@ async fn weak_subjectivity_sync_test(slots: Vec, checkpoint_slot: Slot) { HistoricalBlockError::InvalidSignature )); + let available_blocks_slots = available_blocks + .iter() + .map(|block| (block.block().slot(), block.block().canonical_root())) + .collect::>(); + info!( + ?available_blocks_slots, + "wss_block_slot" = wss_block.slot().as_usize(), + "Importing historical block batch" + ); + // Importing the batch with valid signatures should succeed. let available_blocks_dup = available_blocks.iter().map(clone_block).collect::>(); + assert_eq!(beacon_chain.store.get_oldest_block_slot(), wss_block.slot()); beacon_chain .import_historical_block_batch(available_blocks_dup) .unwrap(); @@ -2518,6 +2542,17 @@ async fn weak_subjectivity_sync_test(slots: Vec, checkpoint_slot: Slot) { .import_historical_block_batch(available_blocks) .unwrap(); + // Sanity check for non-aligned WSS starts, to make sure the WSS block is persisted properly + if wss_block_slot != wss_state_slot { + let new_node_block_root_at_wss_block = beacon_chain + .store + .get_cold_block_root(wss_block_slot) + .unwrap() + .unwrap(); + info!(?new_node_block_root_at_wss_block, %wss_block_slot); + assert_eq!(new_node_block_root_at_wss_block, wss_block.canonical_root()); + } + // The forwards iterator should now match the original chain let forwards = beacon_chain .forwards_iter_block_roots(Slot::new(0)) @@ -2571,11 +2606,25 @@ async fn weak_subjectivity_sync_test(slots: Vec, checkpoint_slot: Slot) { } // Anchor slot is still set to the slot of the checkpoint block. - assert_eq!(store.get_anchor_info().anchor_slot, wss_block.slot()); + // Note: since hot tree states the anchor slot is set to the aligned ws state slot + // https://github.com/sigp/lighthouse/pull/6750 + let wss_aligned_slot = if checkpoint_slot % E::slots_per_epoch() == 0 { + checkpoint_slot + } else { + (checkpoint_slot.epoch(E::slots_per_epoch()) + Epoch::new(1)) + .start_slot(E::slots_per_epoch()) + }; + assert_eq!(store.get_anchor_info().anchor_slot, wss_aligned_slot); + assert_eq!( + store.get_anchor_info().state_upper_limit, + Slot::new(u64::MAX) + ); + info!(anchor = ?store.get_anchor_info(), "anchor pre"); // Reconstruct states. store.clone().reconstruct_historic_states(None).unwrap(); - assert_eq!(store.get_anchor_info().anchor_slot, 0); + assert_eq!(store.get_anchor_info().anchor_slot, wss_aligned_slot); + assert_eq!(store.get_anchor_info().state_upper_limit, Slot::new(0)); } /// Test that blocks and attestations that refer to states around an unaligned split state are @@ -3007,12 +3056,27 @@ async fn revert_minority_fork_on_resume() { // version is correct. This is the easiest schema test to write without historic versions of // Lighthouse on-hand, but has the disadvantage that the min version needs to be adjusted manually // as old downgrades are deprecated. 
-#[tokio::test] -async fn schema_downgrade_to_min_version() { +async fn schema_downgrade_to_min_version( + store_config: StoreConfig, + reconstruct_historic_states: bool, +) { let num_blocks_produced = E::slots_per_epoch() * 4; let db_path = tempdir().unwrap(); - let store = get_store(&db_path); - let harness = get_harness(store.clone(), LOW_VALIDATOR_COUNT); + let spec = test_spec::(); + + let chain_config = ChainConfig { + reconstruct_historic_states, + ..ChainConfig::default() + }; + let import_all_data_columns = false; + + let store = get_store_generic(&db_path, store_config.clone(), spec.clone()); + let harness = get_harness_generic( + store.clone(), + LOW_VALIDATOR_COUNT, + chain_config.clone(), + import_all_data_columns, + ); harness .extend_chain( @@ -3032,7 +3096,7 @@ async fn schema_downgrade_to_min_version() { drop(harness); // Re-open the store. - let store = get_store(&db_path); + let store = get_store_generic(&db_path, store_config, spec); // Downgrade. migrate_schema::>(store.clone(), CURRENT_SCHEMA_VERSION, min_version) @@ -3045,16 +3109,28 @@ async fn schema_downgrade_to_min_version() { // Recreate the harness. let harness = BeaconChainHarness::builder(MinimalEthSpec) .default_spec() + .chain_config(chain_config) .keypairs(KEYPAIRS[0..LOW_VALIDATOR_COUNT].to_vec()) .testing_slot_clock(slot_clock) .resumed_disk_store(store.clone()) .mock_execution_layer() .build(); + // Check chain dump for appropriate range depending on whether this is an archive node. + let chain_dump_start_slot = if reconstruct_historic_states { + Slot::new(0) + } else { + store.get_split_slot() + }; + check_finalization(&harness, num_blocks_produced); check_split_slot(&harness, store.clone()); - check_chain_dump(&harness, num_blocks_produced + 1); - check_iterators(&harness); + check_chain_dump_from_slot( + &harness, + chain_dump_start_slot, + num_blocks_produced + 1 - chain_dump_start_slot.as_u64(), + ); + check_iterators_from_slot(&harness, chain_dump_start_slot); // Check that downgrading beyond the minimum version fails (bound is *tight*). let min_version_sub_1 = SchemaVersion(min_version.as_u64().checked_sub(1).unwrap()); @@ -3062,6 +3138,66 @@ async fn schema_downgrade_to_min_version() { .expect_err("should not downgrade below minimum version"); } +// Schema upgrade/downgrade on an archive node where the optimised migration does apply due +// to the split state being aligned to a diff layer. +#[tokio::test] +async fn schema_downgrade_to_min_version_archive_node_grid_aligned() { + // Need to use 3 as the hierarchy exponent to get diffs on every epoch boundary with minimal + // spec. + schema_downgrade_to_min_version( + StoreConfig { + hierarchy_config: HierarchyConfig::from_str("3,4,5").unwrap(), + prune_payloads: false, + ..StoreConfig::default() + }, + true, + ) + .await +} + +// Schema upgrade/downgrade on an archive node where the optimised migration DOES NOT apply +// due to the split state NOT being aligned to a diff layer. +#[tokio::test] +async fn schema_downgrade_to_min_version_archive_node_grid_unaligned() { + schema_downgrade_to_min_version( + StoreConfig { + hierarchy_config: HierarchyConfig::from_str("7").unwrap(), + prune_payloads: false, + ..StoreConfig::default() + }, + true, + ) + .await +} + +// Schema upgrade/downgrade on a full node with a fairly normal per-epoch diff config. 
+#[tokio::test]
+async fn schema_downgrade_to_min_version_full_node_per_epoch_diffs() {
+    schema_downgrade_to_min_version(
+        StoreConfig {
+            hierarchy_config: HierarchyConfig::from_str("3,4,5").unwrap(),
+            prune_payloads: false,
+            ..StoreConfig::default()
+        },
+        false,
+    )
+    .await
+}
+
+// Schema upgrade/downgrade on a full node with dense per-slot diffs.
+#[tokio::test]
+async fn schema_downgrade_to_min_version_full_node_dense_diffs() {
+    schema_downgrade_to_min_version(
+        StoreConfig {
+            hierarchy_config: HierarchyConfig::from_str("0,3,4,5").unwrap(),
+            prune_payloads: false,
+            ..StoreConfig::default()
+        },
+        false,
+    )
+    .await
+}
+
 /// Check that blob pruning prunes blobs older than the data availability boundary.
 #[tokio::test]
 async fn deneb_prune_blobs_happy_case() {
@@ -3447,6 +3583,163 @@ async fn prune_historic_states() {
     check_split_slot(&harness, store);
 }

+// Test the function `get_ancestor_state_root` for slots prior to the split where we only have
+// sparse summaries stored.
+#[tokio::test]
+async fn ancestor_state_root_prior_to_split() {
+    let db_path = tempdir().unwrap();
+
+    let spec = test_spec::<E>();
+
+    let store_config = StoreConfig {
+        prune_payloads: false,
+        hierarchy_config: HierarchyConfig::from_str("5,7,8").unwrap(),
+        ..StoreConfig::default()
+    };
+    let chain_config = ChainConfig {
+        reconstruct_historic_states: false,
+        ..ChainConfig::default()
+    };
+    let import_all_data_columns = false;
+
+    let store = get_store_generic(&db_path, store_config, spec);
+    let harness = get_harness_generic(
+        store.clone(),
+        LOW_VALIDATOR_COUNT,
+        chain_config,
+        import_all_data_columns,
+    );
+
+    // Produce blocks until we have passed through two full snapshot periods. This period length is
+    // determined by the hierarchy config set above.
+    let num_blocks = 2 * store
+        .hierarchy
+        .next_snapshot_slot(Slot::new(1))
+        .unwrap()
+        .as_u64();
+
+    for num_blocks_so_far in 0..num_blocks {
+        harness
+            .extend_chain(
+                1,
+                BlockStrategy::OnCanonicalHead,
+                AttestationStrategy::AllValidators,
+            )
+            .await;
+        harness.advance_slot();
+
+        // Check that `get_ancestor_state_root` can look up the grid-aligned ancestors of every hot
+        // state, even at ancestor slots prior to the split.
+        let head_state = harness.get_current_state();
+        assert_eq!(head_state.slot().as_u64(), num_blocks_so_far + 1);
+
+        let split_slot = store.get_split_slot();
+        let anchor_slot = store.get_anchor_info().anchor_slot;
+
+        for state_slot in (split_slot.as_u64()..=num_blocks_so_far).map(Slot::new) {
+            for ancestor_slot in store
+                .hierarchy
+                .closest_layer_points(state_slot, anchor_slot)
+            {
+                // The function currently doesn't consider a state an ancestor of itself, so we
+                // skip that case.
+                if ancestor_slot == state_slot {
+                    continue;
+                }
+                let ancestor_state_root = store::hot_cold_store::get_ancestor_state_root(
+                    &store,
+                    &head_state,
+                    ancestor_slot,
+                )
+                .unwrap_or_else(|e| {
+                    panic!(
+                        "get_ancestor_state_root failed for state_slot={state_slot}, \
+                         ancestor_slot={ancestor_slot}, head_slot={}. error: {e:?}",
+                        head_state.slot()
+                    )
+                });
+
+                // Check state root correctness.
+                assert_eq!(
+                    store
+                        .load_hot_state_summary(&ancestor_state_root)
+                        .unwrap()
+                        .unwrap_or_else(|| panic!(
+                            "no summary found for {ancestor_state_root:?} (slot {ancestor_slot})"
+                        ))
+                        .slot,
+                    ancestor_slot,
+                )
+            }
+        }
+    }
+
+    // This test only makes sense if the split is non-zero by the end.
+ assert_ne!(store.get_split_slot(), 0); +} + +// Test that the chain operates correctly when the split state is stored as a ReplayFrom. +#[tokio::test] +async fn replay_from_split_state() { + let db_path = tempdir().unwrap(); + + let spec = test_spec::(); + + let store_config = StoreConfig { + prune_payloads: false, + hierarchy_config: HierarchyConfig::from_str("5").unwrap(), + ..StoreConfig::default() + }; + let chain_config = ChainConfig { + reconstruct_historic_states: false, + ..ChainConfig::default() + }; + let import_all_data_columns = false; + + let store = get_store_generic(&db_path, store_config.clone(), spec.clone()); + let harness = get_harness_generic( + store.clone(), + LOW_VALIDATOR_COUNT, + chain_config, + import_all_data_columns, + ); + + // Produce blocks until we finalize epoch 3 which will not be stored as a snapshot. + let num_blocks = 5 * E::slots_per_epoch() as usize; + + harness + .extend_chain( + num_blocks, + BlockStrategy::OnCanonicalHead, + AttestationStrategy::AllValidators, + ) + .await; + + let split = store.get_split_info(); + let anchor_slot = store.get_anchor_info().anchor_slot; + assert_eq!(split.slot, 3 * E::slots_per_epoch()); + assert_eq!(anchor_slot, 0); + assert!(store + .hierarchy + .storage_strategy(split.slot, anchor_slot) + .unwrap() + .is_replay_from()); + + // Close the database and reopen it. + drop(store); + drop(harness); + + let store = get_store_generic(&db_path, store_config, spec); + + // Check that the split state is still accessible. + assert_eq!(store.get_split_slot(), split.slot); + let state = store + .get_hot_state(&split.state_root, false) + .unwrap() + .expect("split state should be present"); + assert_eq!(state.slot(), split.slot); +} + /// Checks that two chains are the same, for the purpose of these tests. /// /// Several fields that are hard/impossible to check are ignored (e.g., the store). @@ -3540,7 +3833,11 @@ fn check_split_slot( /// Check that all the states in a chain dump have the correct tree hash. fn check_chain_dump(harness: &TestHarness, expected_len: u64) { - let mut chain_dump = harness.chain.chain_dump().unwrap(); + check_chain_dump_from_slot(harness, Slot::new(0), expected_len) +} + +fn check_chain_dump_from_slot(harness: &TestHarness, from_slot: Slot, expected_len: u64) { + let mut chain_dump = harness.chain.chain_dump_from_slot(from_slot).unwrap(); assert_eq!(chain_dump.len() as u64, expected_len); @@ -3588,7 +3885,7 @@ fn check_chain_dump(harness: &TestHarness, expected_len: u64) { let mut forward_block_roots = harness .chain - .forwards_iter_block_roots(Slot::new(0)) + .forwards_iter_block_roots(from_slot) .expect("should get iter") .map(Result::unwrap) .collect::>(); @@ -3609,10 +3906,14 @@ fn check_chain_dump(harness: &TestHarness, expected_len: u64) { /// Check that every state from the canonical chain is in the database, and that the /// reverse state and block root iterators reach genesis. 
fn check_iterators(harness: &TestHarness) { + check_iterators_from_slot(harness, Slot::new(0)) +} + +fn check_iterators_from_slot(harness: &TestHarness, slot: Slot) { let mut max_slot = None; for (state_root, slot) in harness .chain - .forwards_iter_state_roots(Slot::new(0)) + .forwards_iter_state_roots(slot) .expect("should get iter") .map(Result::unwrap) { @@ -3634,7 +3935,7 @@ fn check_iterators(harness: &TestHarness) { assert_eq!( harness .chain - .forwards_iter_block_roots(Slot::new(0)) + .forwards_iter_block_roots(slot) .expect("should get iter") .last() .map(Result::unwrap) diff --git a/beacon_node/src/cli.rs b/beacon_node/src/cli.rs index 7d086dcc326..1c384aa4113 100644 --- a/beacon_node/src/cli.rs +++ b/beacon_node/src/cli.rs @@ -808,14 +808,26 @@ pub fn cli_app() -> Command { Arg::new("hdiff-buffer-cache-size") .long("hdiff-buffer-cache-size") .value_name("SIZE") - .help("Number of hierarchical diff (hdiff) buffers to cache in memory. Each buffer \ - is around the size of a BeaconState so you should be cautious about setting \ - this value too high. This flag is irrelevant for most nodes, which run with \ - state pruning enabled.") + .help("Number of cold hierarchical diff (hdiff) buffers to cache in memory. Each \ + buffer is around the size of a BeaconState so you should be cautious about \ + setting this value too high. This flag is irrelevant for most nodes, which \ + run with state pruning enabled.") .default_value("16") .action(ArgAction::Set) .display_order(0) ) + .arg( + Arg::new("hot-hdiff-buffer-cache-size") + .long("hot-hdiff-buffer-cache-size") + .value_name("SIZE") + .help("Number of hot hierarchical diff (hdiff) buffers to cache in memory. Each \ + buffer is around the size of a BeaconState so you should be cautious about \ + setting this value too high. Setting this value higher can reduce the time \ + taken to store new states on disk at the cost of higher memory usage.") + .default_value("1") + .action(ArgAction::Set) + .display_order(0) + ) .arg( Arg::new("state-cache-size") .long("state-cache-size") @@ -1645,7 +1657,7 @@ pub fn cli_app() -> Command { .arg( Arg::new("delay-data-column-publishing") .long("delay-data-column-publishing") - .value_name("SECONDS") + .value_name("SECONDS") .action(ArgAction::Set) .help_heading(FLAG_HEADER) .help("TESTING ONLY: Artificially delay data column publishing by the specified number of seconds. \ diff --git a/beacon_node/src/config.rs b/beacon_node/src/config.rs index e887aa9abce..67245a22148 100644 --- a/beacon_node/src/config.rs +++ b/beacon_node/src/config.rs @@ -418,7 +418,13 @@ pub fn get_config( if let Some(hdiff_buffer_cache_size) = clap_utils::parse_optional(cli_args, "hdiff-buffer-cache-size")? { - client_config.store.hdiff_buffer_cache_size = hdiff_buffer_cache_size; + client_config.store.cold_hdiff_buffer_cache_size = hdiff_buffer_cache_size; + } + + if let Some(hdiff_buffer_cache_size) = + clap_utils::parse_optional(cli_args, "hot-hdiff-buffer-cache-size")? 
+ { + client_config.store.hot_hdiff_buffer_cache_size = hdiff_buffer_cache_size; } client_config.store.compact_on_init = cli_args.get_flag("compact-db"); diff --git a/beacon_node/store/src/config.rs b/beacon_node/store/src/config.rs index a84573eb406..c16573df5e4 100644 --- a/beacon_node/store/src/config.rs +++ b/beacon_node/store/src/config.rs @@ -1,6 +1,6 @@ use crate::hdiff::HierarchyConfig; use crate::superstruct; -use crate::{AnchorInfo, DBColumn, Error, Split, StoreItem}; +use crate::{DBColumn, Error, StoreItem}; use serde::{Deserialize, Serialize}; use ssz::{Decode, Encode}; use ssz_derive::{Decode, Encode}; @@ -24,7 +24,8 @@ pub const DEFAULT_STATE_CACHE_SIZE: NonZeroUsize = new_non_zero_usize(128); pub const DEFAULT_STATE_CACHE_HEADROOM: NonZeroUsize = new_non_zero_usize(1); pub const DEFAULT_COMPRESSION_LEVEL: i32 = 1; pub const DEFAULT_HISTORIC_STATE_CACHE_SIZE: NonZeroUsize = new_non_zero_usize(1); -pub const DEFAULT_HDIFF_BUFFER_CACHE_SIZE: NonZeroUsize = new_non_zero_usize(16); +pub const DEFAULT_COLD_HDIFF_BUFFER_CACHE_SIZE: NonZeroUsize = new_non_zero_usize(16); +pub const DEFAULT_HOT_HDIFF_BUFFER_CACHE_SIZE: NonZeroUsize = new_non_zero_usize(1); const EST_COMPRESSION_FACTOR: usize = 2; pub const DEFAULT_EPOCHS_PER_BLOB_PRUNE: u64 = 1; pub const DEFAULT_BLOB_PUNE_MARGIN_EPOCHS: u64 = 0; @@ -42,8 +43,10 @@ pub struct StoreConfig { pub compression_level: i32, /// Maximum number of historic states to store in the in-memory historic state cache. pub historic_state_cache_size: NonZeroUsize, - /// Maximum number of `HDiffBuffer`s to store in memory. - pub hdiff_buffer_cache_size: NonZeroUsize, + /// Maximum number of cold `HDiffBuffer`s to store in memory. + pub cold_hdiff_buffer_cache_size: NonZeroUsize, + /// Maximum number of hot `HDiffBuffers` to store in memory. + pub hot_hdiff_buffer_cache_size: NonZeroUsize, /// Whether to compact the database on initialization. pub compact_on_init: bool, /// Whether to compact the database during database pruning. @@ -65,14 +68,12 @@ pub struct StoreConfig { /// Variant of `StoreConfig` that gets written to disk. Contains immutable configuration params. #[superstruct( - variants(V1, V22), + variants(V22), variant_attributes(derive(Debug, Clone, PartialEq, Eq, Encode, Decode)) )] #[derive(Clone, Debug, PartialEq, Eq)] pub struct OnDiskStoreConfig { - #[superstruct(only(V1))] - pub slots_per_restore_point: u64, - /// Prefix byte to future-proof versions of the `OnDiskStoreConfig` post V1 + /// Prefix byte to future-proof versions of the `OnDiskStoreConfig`. 
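+ /// For example, a V22 config's SSZ encoding begins with the byte `22` (see the
+ /// `from_store_bytes` match and the roundtrip test below), so readers can dispatch
+ /// on the first byte.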
#[superstruct(only(V22))] version_byte: u8, #[superstruct(only(V22))] @@ -90,10 +91,6 @@ impl OnDiskStoreConfigV22 { #[derive(Debug, Clone)] pub enum StoreConfigError { - MismatchedSlotsPerRestorePoint { - config: u64, - on_disk: u64, - }, InvalidCompressionLevel { level: i32, }, @@ -112,7 +109,8 @@ impl Default for StoreConfig { state_cache_size: DEFAULT_STATE_CACHE_SIZE, state_cache_headroom: DEFAULT_STATE_CACHE_HEADROOM, historic_state_cache_size: DEFAULT_HISTORIC_STATE_CACHE_SIZE, - hdiff_buffer_cache_size: DEFAULT_HDIFF_BUFFER_CACHE_SIZE, + cold_hdiff_buffer_cache_size: DEFAULT_COLD_HDIFF_BUFFER_CACHE_SIZE, + hot_hdiff_buffer_cache_size: DEFAULT_HOT_HDIFF_BUFFER_CACHE_SIZE, compression_level: DEFAULT_COMPRESSION_LEVEL, compact_on_init: false, compact_on_prune: true, @@ -134,21 +132,13 @@ impl StoreConfig { pub fn check_compatibility( &self, on_disk_config: &OnDiskStoreConfig, - split: &Split, - anchor: &AnchorInfo, ) -> Result<(), StoreConfigError> { - // Allow changing the hierarchy exponents if no historic states are stored. - let no_historic_states_stored = anchor.no_historic_states_stored(split.slot); - let hierarchy_config_changed = - if let Ok(on_disk_hierarchy_config) = on_disk_config.hierarchy_config() { - *on_disk_hierarchy_config != self.hierarchy_config - } else { - false - }; - - if hierarchy_config_changed && !no_historic_states_stored { + // We previously allowed the hierarchy exponents to change on non-archive nodes, but since + // schema v24 and the use of hdiffs in the hot DB, changing will require a resync. + let current_config = self.as_disk_config(); + if current_config != *on_disk_config { Err(StoreConfigError::IncompatibleStoreConfig { - config: self.as_disk_config(), + config: current_config, on_disk: on_disk_config.clone(), }) } else { @@ -222,32 +212,21 @@ impl StoreItem for OnDiskStoreConfig { fn as_store_bytes(&self) -> Vec { match self { - OnDiskStoreConfig::V1(value) => value.as_ssz_bytes(), OnDiskStoreConfig::V22(value) => value.as_ssz_bytes(), } } fn from_store_bytes(bytes: &[u8]) -> Result { - // NOTE: V22 config can never be deserialized as a V1 because the minimum length of its - // serialization is: 1 prefix byte + 1 offset (OnDiskStoreConfigV1 container) + - // 1 offset (HierarchyConfig container) = 9. 
- if let Ok(value) = OnDiskStoreConfigV1::from_ssz_bytes(bytes) { - return Ok(Self::V1(value)); + match bytes.first() { + Some(22) => Ok(Self::V22(OnDiskStoreConfigV22::from_ssz_bytes(bytes)?)), + version_byte => Err(StoreConfigError::InvalidVersionByte(version_byte.copied()).into()), } - - Ok(Self::V22(OnDiskStoreConfigV22::from_ssz_bytes(bytes)?)) } } #[cfg(test)] mod test { use super::*; - use crate::{ - metadata::{ANCHOR_FOR_ARCHIVE_NODE, ANCHOR_UNINITIALIZED, STATE_UPPER_LIMIT_NO_RETAIN}, - AnchorInfo, Split, - }; - use ssz::DecodeError; - use types::{Hash256, Slot}; #[test] fn check_compatibility_ok() { @@ -257,24 +236,7 @@ mod test { let on_disk_config = OnDiskStoreConfig::V22(OnDiskStoreConfigV22::new( store_config.hierarchy_config.clone(), )); - let split = Split::default(); - assert!(store_config - .check_compatibility(&on_disk_config, &split, &ANCHOR_UNINITIALIZED) - .is_ok()); - } - - #[test] - fn check_compatibility_after_migration() { - let store_config = StoreConfig { - ..Default::default() - }; - let on_disk_config = OnDiskStoreConfig::V1(OnDiskStoreConfigV1 { - slots_per_restore_point: 8192, - }); - let split = Split::default(); - assert!(store_config - .check_compatibility(&on_disk_config, &split, &ANCHOR_UNINITIALIZED) - .is_ok()); + assert!(store_config.check_compatibility(&on_disk_config).is_ok()); } #[test] @@ -283,70 +245,11 @@ mod test { let on_disk_config = OnDiskStoreConfig::V22(OnDiskStoreConfigV22::new(HierarchyConfig { exponents: vec![5, 8, 11, 13, 16, 18, 21], })); - let split = Split { - slot: Slot::new(32), - ..Default::default() - }; - assert!(store_config - .check_compatibility(&on_disk_config, &split, &ANCHOR_FOR_ARCHIVE_NODE) - .is_err()); - } - - #[test] - fn check_compatibility_hierarchy_config_update() { - let store_config = StoreConfig { - ..Default::default() - }; - let on_disk_config = OnDiskStoreConfig::V22(OnDiskStoreConfigV22::new(HierarchyConfig { - exponents: vec![5, 8, 11, 13, 16, 18, 21], - })); - let split = Split::default(); - let anchor = AnchorInfo { - anchor_slot: Slot::new(0), - oldest_block_slot: Slot::new(0), - oldest_block_parent: Hash256::ZERO, - state_upper_limit: STATE_UPPER_LIMIT_NO_RETAIN, - state_lower_limit: Slot::new(0), - }; - assert!(store_config - .check_compatibility(&on_disk_config, &split, &anchor) - .is_ok()); - } - - #[test] - fn serde_on_disk_config_v0_from_v1_default() { - let config = OnDiskStoreConfig::V22(OnDiskStoreConfigV22::new(<_>::default())); - let config_bytes = config.as_store_bytes(); - // On a downgrade, the previous version of lighthouse will attempt to deserialize the - // prefixed V22 as just the V1 version. - assert_eq!( - OnDiskStoreConfigV1::from_ssz_bytes(&config_bytes).unwrap_err(), - DecodeError::InvalidByteLength { - len: 16, - expected: 8 - }, - ); - } - - #[test] - fn serde_on_disk_config_v0_from_v1_empty() { - let config = OnDiskStoreConfig::V22(OnDiskStoreConfigV22::new(HierarchyConfig { - exponents: vec![], - })); - let config_bytes = config.as_store_bytes(); - // On a downgrade, the previous version of lighthouse will attempt to deserialize the - // prefixed V22 as just the V1 version. 
- assert_eq!( - OnDiskStoreConfigV1::from_ssz_bytes(&config_bytes).unwrap_err(), - DecodeError::InvalidByteLength { - len: 9, - expected: 8 - }, - ); + assert!(store_config.check_compatibility(&on_disk_config).is_err()); } #[test] - fn serde_on_disk_config_v1_roundtrip() { + fn on_disk_config_v22_roundtrip() { let config = OnDiskStoreConfig::V22(OnDiskStoreConfigV22::new(<_>::default())); let bytes = config.as_store_bytes(); assert_eq!(bytes[0], 22); diff --git a/beacon_node/store/src/database/interface.rs b/beacon_node/store/src/database/interface.rs index b213433241c..bccf7996177 100644 --- a/beacon_node/store/src/database/interface.rs +++ b/beacon_node/store/src/database/interface.rs @@ -105,15 +105,6 @@ impl KeyValueStore for BeaconNodeBackend { } } - fn begin_rw_transaction(&self) -> parking_lot::MutexGuard<()> { - match self { - #[cfg(feature = "leveldb")] - BeaconNodeBackend::LevelDb(txn) => leveldb_impl::LevelDB::begin_rw_transaction(txn), - #[cfg(feature = "redb")] - BeaconNodeBackend::Redb(txn) => redb_impl::Redb::begin_rw_transaction(txn), - } - } - fn compact(&self) -> Result<(), Error> { match self { #[cfg(feature = "leveldb")] diff --git a/beacon_node/store/src/database/leveldb_impl.rs b/beacon_node/store/src/database/leveldb_impl.rs index 81d6d1d4bd2..e990333fa3a 100644 --- a/beacon_node/store/src/database/leveldb_impl.rs +++ b/beacon_node/store/src/database/leveldb_impl.rs @@ -13,7 +13,6 @@ use leveldb::{ iterator::{Iterable, LevelDBIterator}, options::{Options, ReadOptions}, }; -use parking_lot::{Mutex, MutexGuard}; use std::collections::HashSet; use std::marker::PhantomData; use std::path::Path; @@ -23,8 +22,6 @@ use super::interface::WriteOptions; pub struct LevelDB { db: Database, - /// A mutex to synchronise sensitive read-write transactions. - transaction_mutex: Mutex<()>, _phantom: PhantomData, } @@ -43,11 +40,9 @@ impl LevelDB { options.create_if_missing = true; let db = Database::open(path, options)?; - let transaction_mutex = Mutex::new(()); Ok(Self { db, - transaction_mutex, _phantom: PhantomData, }) } @@ -177,10 +172,6 @@ impl LevelDB { Ok(()) } - pub fn begin_rw_transaction(&self) -> MutexGuard<()> { - self.transaction_mutex.lock() - } - /// Compact all values in the states and states flag columns. 
pub fn compact(&self) -> Result<(), Error> { let _timer = metrics::start_timer(&metrics::DISK_DB_COMPACT_TIMES); diff --git a/beacon_node/store/src/database/redb_impl.rs b/beacon_node/store/src/database/redb_impl.rs index cbe575d184e..10d387adc8a 100644 --- a/beacon_node/store/src/database/redb_impl.rs +++ b/beacon_node/store/src/database/redb_impl.rs @@ -1,6 +1,6 @@ use crate::{metrics, ColumnIter, ColumnKeyIter, Key}; use crate::{DBColumn, Error, KeyValueStoreOp}; -use parking_lot::{Mutex, MutexGuard, RwLock}; +use parking_lot::RwLock; use redb::TableDefinition; use std::collections::HashSet; use std::{borrow::BorrowMut, marker::PhantomData, path::Path}; @@ -13,7 +13,6 @@ pub const DB_FILE_NAME: &str = "database.redb"; pub struct Redb { db: RwLock, - transaction_mutex: Mutex<()>, _phantom: PhantomData, } @@ -31,7 +30,6 @@ impl Redb { pub fn open(path: &Path) -> Result { let db_file = path.join(DB_FILE_NAME); let db = redb::Database::create(db_file)?; - let transaction_mutex = Mutex::new(()); for column in DBColumn::iter() { Redb::::create_table(&db, column.into())?; @@ -39,7 +37,6 @@ impl Redb { Ok(Self { db: db.into(), - transaction_mutex, _phantom: PhantomData, }) } @@ -61,10 +58,6 @@ impl Redb { opts } - pub fn begin_rw_transaction(&self) -> MutexGuard<()> { - self.transaction_mutex.lock() - } - pub fn put_bytes_with_options( &self, col: DBColumn, diff --git a/beacon_node/store/src/errors.rs b/beacon_node/store/src/errors.rs index cff08bc6557..eb1fb647187 100644 --- a/beacon_node/store/src/errors.rs +++ b/beacon_node/store/src/errors.rs @@ -1,6 +1,6 @@ use crate::chunked_vector::ChunkError; use crate::config::StoreConfigError; -use crate::hot_cold_store::HotColdDBError; +use crate::hot_cold_store::{HotColdDBError, StateSummaryIteratorError}; use crate::{hdiff, DBColumn}; #[cfg(feature = "leveldb")] use leveldb::error::Error as LevelDBError; @@ -26,6 +26,9 @@ pub enum Error { SplitPointModified(Slot, Slot), ConfigError(StoreConfigError), MigrationError(String), + /// The store's `anchor_info` is still the default uninitialized value when attempting a state + /// write + AnchorUninitialized, /// The store's `anchor_info` was mutated concurrently, the latest modification wasn't applied. AnchorInfoConcurrentMutation, /// The store's `blob_info` was mutated concurrently, the latest modification wasn't applied. 
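Several of the variants added in the next hunk (`LoadAnchorInfo`, `LoadSplit`, `LoadingHotStateError`, and friends) follow a context-wrapping pattern: the operation that failed is recorded in the variant name and the source error is boxed to keep `Error` small. A minimal, self-contained sketch of the pattern, using simplified stand-in types rather than the real `store::Error`:

```rust
// Illustrative stand-ins only; not the real `store::Error` definition.
#[derive(Debug)]
enum StoreError {
    // Placeholder for a low-level backend failure.
    Database(String),
    // Context-wrapping variant, as in `LoadAnchorInfo(Box<Error>)` below:
    // it records which metadata load failed and boxes the cause.
    LoadAnchorInfo(Box<StoreError>),
}

fn load_anchor_info(raw: Result<Vec<u8>, StoreError>) -> Result<Vec<u8>, StoreError> {
    // Mirrors the `map_err(|e| Error::LoadAnchorInfo(e.into()))` calls in this diff.
    raw.map_err(|e| StoreError::LoadAnchorInfo(Box::new(e)))
}

fn main() {
    let err = load_anchor_info(Err(StoreError::Database("corrupt key".into())));
    // Prints: Err(LoadAnchorInfo(Database("corrupt key"))), pinpointing the failed operation.
    println!("{err:?}");
}
```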
@@ -47,11 +50,16 @@ pub enum Error {
expected: Hash256,
computed: Hash256,
},
+ MissingState(Hash256),
+ MissingHotStateSummary(Hash256),
+ MissingHotStateSnapshot(Hash256, Slot),
MissingGenesisState,
MissingSnapshot(Slot),
+ LoadingHotHdiffBufferError(String, Hash256, Box<Error>),
+ LoadingHotStateError(String, Hash256, Box<Error>),
BlockReplayError(BlockReplayError),
AddPayloadLogicError,
- InvalidKey,
+ InvalidKey(String),
InvalidBytes,
InconsistentFork(InconsistentFork),
#[cfg(feature = "leveldb")]
@@ -75,6 +83,26 @@ pub enum Error {
MissingBlock(Hash256),
GenesisStateUnknown,
ArithError(safe_arith::ArithError),
+ MismatchedDiffBaseState {
+ expected_slot: Slot,
+ stored_slot: Slot,
+ },
+ SnapshotDiffBaseState {
+ slot: Slot,
+ },
+ LoadAnchorInfo(Box<Error>),
+ LoadSplit(Box<Error>),
+ LoadBlobInfo(Box<Error>),
+ LoadDataColumnInfo(Box<Error>),
+ LoadConfig(Box<Error>),
+ LoadHotStateSummary(Hash256, Box<Error>),
+ LoadHotStateSummaryForSplit(Box<Error>),
+ StateSummaryIteratorError {
+ error: StateSummaryIteratorError,
+ from_state_root: Hash256,
+ from_state_slot: Slot,
+ target_slot: Slot,
+ },
}

pub trait HandleUnavailable {
diff --git a/beacon_node/store/src/hdiff.rs b/beacon_node/store/src/hdiff.rs
index a659c654520..5731ebcbe0e 100644
--- a/beacon_node/store/src/hdiff.rs
+++ b/beacon_node/store/src/hdiff.rs
@@ -27,6 +27,7 @@ pub enum Error {
Compression(std::io::Error),
InvalidSszState(ssz::DecodeError),
InvalidBalancesLength,
+ LessThanStart(Slot, Slot),
}

#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Encode, Decode)]
@@ -67,6 +68,10 @@ impl FromStr for HierarchyConfig {
return Err("hierarchy-exponents must be in ascending order".to_string());
}

+ if exponents.is_empty() {
+ return Err("empty exponents".to_string());
+ }
+
Ok(HierarchyConfig { exponents })
}
}
@@ -478,7 +483,9 @@ impl ValidatorsDiff {
Hash256::ZERO
},
// effective_balance can increase and decrease
- effective_balance: y.effective_balance - x.effective_balance,
+ effective_balance: y
+ .effective_balance
+ .wrapping_sub(x.effective_balance),
// slashed can only change from false into true. In an index re-use it can
// switch back to false, but in that case the pubkey will also change.
slashed: y.slashed,
@@ -642,10 +649,26 @@ impl HierarchyConfig {
Err(Error::InvalidHierarchy)
}
}
+
+ pub fn exponent_for_slot(slot: Slot) -> u32 {
+ slot.as_u64().trailing_zeros()
+ }
}

impl HierarchyModuli {
- pub fn storage_strategy(&self, slot: Slot) -> Result<StorageStrategy, Error> {
+ /// * `slot` - Slot of the storage strategy.
+ /// * `start_slot` - Slot before which states are not available. This is the initial snapshot
+ /// point, which may not be aligned to the hierarchy moduli values. For example, with
+ /// exponents [5,13,21], when reconstructing the state at slot 3,000,003: if
+ /// start = 3,000,002, the layer-2 diff will point to the start snapshot instead of the
+ /// layer-1 diff at slot 2,998,272.
+ pub fn storage_strategy(&self, slot: Slot, start_slot: Slot) -> Result<StorageStrategy, Error> {
+ match slot.cmp(&start_slot) {
+ Ordering::Less => return Err(Error::LessThanStart(slot, start_slot)),
+ Ordering::Equal => return Ok(StorageStrategy::Snapshot),
+ Ordering::Greater => {} // continue
+ }
+
// last = full snapshot interval
let last = self.moduli.last().copied().ok_or(Error::InvalidHierarchy)?;
// first = most frequent diff layer, need to replay blocks from this layer
@@ -667,14 +690,22 @@ impl HierarchyModuli {
.find_map(|(&n_big, &n_small)| {
if slot % n_small == 0 {
// Diff from the previous layer.
- Some(StorageStrategy::DiffFrom(slot / n_big * n_big)) + let from = slot / n_big * n_big; + // Or from start point + let from = std::cmp::max(from, start_slot); + Some(StorageStrategy::DiffFrom(from)) } else { // Keep trying with next layer None } }) // Exhausted layers, need to replay from most frequent layer - .unwrap_or(StorageStrategy::ReplayFrom(slot / first * first))) + .unwrap_or_else(|| { + let from = slot / first * first; + // Or from start point + let from = std::cmp::max(from, start_slot); + StorageStrategy::ReplayFrom(from) + })) } /// Return the smallest slot greater than or equal to `slot` at which a full snapshot should @@ -703,6 +734,26 @@ impl HierarchyModuli { |second_layer_moduli| Ok(slot % *second_layer_moduli == 0), ) } + + /// For each layer, returns the closest diff less than or equal to `slot`. + pub fn closest_layer_points(&self, slot: Slot, start_slot: Slot) -> Vec { + let mut layers = self + .moduli + .iter() + .map(|&n| { + let from = slot / n * n; + // Or from start point + std::cmp::max(from, start_slot) + }) + .collect::>(); + + // Remove duplication caused by the capping at `start_slot` (multiple + // layers may have the same slot equal to `start_slot`), or shared multiples (a slot that is + // a multiple of 2**n will also be a multiple of 2**m for all m < n). + layers.dedup(); + + layers + } } impl StorageStrategy { @@ -732,6 +783,27 @@ impl StorageStrategy { } .map(Slot::from) } + + /// Returns the slot that storage_strategy points to. + pub fn diff_base_slot(&self) -> Option { + match self { + Self::ReplayFrom(from) => Some(*from), + Self::DiffFrom(from) => Some(*from), + Self::Snapshot => None, + } + } + + pub fn is_replay_from(&self) -> bool { + matches!(self, Self::ReplayFrom(_)) + } + + pub fn is_diff_from(&self) -> bool { + matches!(self, Self::DiffFrom(_)) + } + + pub fn is_snapshot(&self) -> bool { + matches!(self, Self::Snapshot) + } } #[cfg(test)] @@ -743,34 +815,37 @@ mod tests { fn default_storage_strategy() { let config = HierarchyConfig::default(); config.validate().unwrap(); + let sslot = Slot::new(0); let moduli = config.to_moduli().unwrap(); // Full snapshots at multiples of 2^21. let snapshot_freq = Slot::new(1 << 21); assert_eq!( - moduli.storage_strategy(Slot::new(0)).unwrap(), + moduli.storage_strategy(Slot::new(0), sslot).unwrap(), StorageStrategy::Snapshot ); assert_eq!( - moduli.storage_strategy(snapshot_freq).unwrap(), + moduli.storage_strategy(snapshot_freq, sslot).unwrap(), StorageStrategy::Snapshot ); assert_eq!( - moduli.storage_strategy(snapshot_freq * 3).unwrap(), + moduli.storage_strategy(snapshot_freq * 3, sslot).unwrap(), StorageStrategy::Snapshot ); // Diffs should be from the previous layer (the snapshot in this case), and not the previous diff in the same layer. let first_layer = Slot::new(1 << 18); assert_eq!( - moduli.storage_strategy(first_layer * 2).unwrap(), + moduli.storage_strategy(first_layer * 2, sslot).unwrap(), StorageStrategy::DiffFrom(Slot::new(0)) ); let replay_strategy_slot = first_layer + 1; assert_eq!( - moduli.storage_strategy(replay_strategy_slot).unwrap(), + moduli + .storage_strategy(replay_strategy_slot, sslot) + .unwrap(), StorageStrategy::ReplayFrom(first_layer) ); } @@ -940,4 +1015,93 @@ mod tests { ] ); } + + // Test that the diffs and snapshots required for storage of split states are retained in the + // hot DB as the split slot advances, if we begin from an initial configuration where this + // invariant holds. 
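+ //
+ // As a concrete illustration (editor's example, not part of the original patch): with
+ // exponents [5,13] the moduli are 2^5 = 32 and 2^13 = 8192, and
+ // `closest_layer_points(Slot::new(100), Slot::new(40))` returns `[96, 40]`: the closest
+ // layer-1 point 100 / 32 * 32 = 96, plus the layer-2 point 0 capped up to the start
+ // snapshot at slot 40.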
+ fn test_slots_retained_invariant(hierarchy: HierarchyModuli, start_slot: u64, epoch_jump: u64) {
+ let start_slot = Slot::new(start_slot);
+ let mut finalized_slot = start_slot;
+
+ // Initially we have just one snapshot stored at the `start_slot`. This is what checkpoint
+ // sync sets up (or the V24 migration).
+ let mut retained_slots = vec![finalized_slot];
+
+ // Iterate until we've reached two snapshots in the future.
+ let stop_at = hierarchy
+ .next_snapshot_slot(hierarchy.next_snapshot_slot(start_slot).unwrap() + 1)
+ .unwrap();
+
+ while finalized_slot <= stop_at {
+ // Jump multiple epochs at a time because inter-epoch states are not interesting and
+ // would take too long to iterate over.
+ let new_finalized_slot = finalized_slot + 32 * epoch_jump;
+
+ let new_retained_slots = hierarchy.closest_layer_points(new_finalized_slot, start_slot);
+
+ for slot in &new_retained_slots {
+ // All new retained slots must either be already stored prior to the old finalized
+ // slot, OR newer than the finalized slot (i.e. stored in the hot DB as part of
+ // regular state storage).
+ assert!(retained_slots.contains(slot) || *slot >= finalized_slot);
+ }
+
+ retained_slots = new_retained_slots;
+ finalized_slot = new_finalized_slot;
+ }
+ }
+
+ #[test]
+ fn slots_retained_invariant() {
+ let cases = [
+ // Default hierarchy with a start_slot between the 2^13 and 2^16 layers.
+ (
+ HierarchyConfig::default().to_moduli().unwrap(),
+ 2 * (1 << 14) - 5 * 32,
+ 1,
+ ),
+ // Default hierarchy with a start_slot between the 2^13 and 2^16 layers, with 8 epochs
+ // finalizing at a time (should not make any difference).
+ (
+ HierarchyConfig::default().to_moduli().unwrap(),
+ 2 * (1 << 14) - 5 * 32,
+ 8,
+ ),
+ // Very dense hierarchy config.
+ (
+ HierarchyConfig::from_str("5,7")
+ .unwrap()
+ .to_moduli()
+ .unwrap(),
+ 32,
+ 1,
+ ),
+ // Very dense hierarchy config that skips a whole snapshot on its first finalization.
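+ // With exponents [5,7] snapshots recur every 2^7 = 128 slots, while each jump advances
+ // 1 << 7 = 128 epochs (4096 slots), so whole snapshot intervals fall between successive
+ // finalizations.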
+ ( + HierarchyConfig::from_str("5,7") + .unwrap() + .to_moduli() + .unwrap(), + 32, + 1 << 7, + ), + ]; + + for (hierarchy, start_slot, epoch_jump) in cases { + test_slots_retained_invariant(hierarchy, start_slot, epoch_jump); + } + } + + #[test] + fn closest_layer_points_unique() { + let hierarchy = HierarchyConfig::default().to_moduli().unwrap(); + + let start_slot = Slot::new(0); + let end_slot = hierarchy.next_snapshot_slot(Slot::new(1)).unwrap(); + + for slot in (0..end_slot.as_u64()).map(Slot::new) { + let closest_layer_points = hierarchy.closest_layer_points(slot, start_slot); + assert!(closest_layer_points.is_sorted_by(|a, b| a > b)); + } + } } diff --git a/beacon_node/store/src/historic_state_cache.rs b/beacon_node/store/src/historic_state_cache.rs index c0e8f8346c9..e5abb04c076 100644 --- a/beacon_node/store/src/historic_state_cache.rs +++ b/beacon_node/store/src/historic_state_cache.rs @@ -34,11 +34,17 @@ impl HistoricStateCache { pub fn get_hdiff_buffer(&mut self, slot: Slot) -> Option { if let Some(buffer_ref) = self.hdiff_buffers.get(&slot) { - let _timer = metrics::start_timer(&metrics::BEACON_HDIFF_BUFFER_CLONE_TIMES); + let _timer = metrics::start_timer_vec( + &metrics::BEACON_HDIFF_BUFFER_CLONE_TIME, + metrics::COLD_METRIC, + ); Some(buffer_ref.clone()) } else if let Some(state) = self.states.get(&slot) { let buffer = HDiffBuffer::from_state(state.clone()); - let _timer = metrics::start_timer(&metrics::BEACON_HDIFF_BUFFER_CLONE_TIMES); + let _timer = metrics::start_timer_vec( + &metrics::BEACON_HDIFF_BUFFER_CLONE_TIME, + metrics::COLD_METRIC, + ); let cloned = buffer.clone(); drop(_timer); self.hdiff_buffers.put(slot, cloned); diff --git a/beacon_node/store/src/hot_cold_store.rs b/beacon_node/store/src/hot_cold_store.rs index 1663ec7b4d4..4d94042b5b0 100644 --- a/beacon_node/store/src/hot_cold_store.rs +++ b/beacon_node/store/src/hot_cold_store.rs @@ -1,21 +1,22 @@ use crate::config::{OnDiskStoreConfig, StoreConfig}; use crate::database::interface::BeaconNodeBackend; use crate::forwards_iter::{HybridForwardsBlockRootsIterator, HybridForwardsStateRootsIterator}; -use crate::hdiff::{HDiff, HDiffBuffer, HierarchyModuli, StorageStrategy}; +use crate::hdiff::{HDiff, HDiffBuffer, HierarchyConfig, HierarchyModuli, StorageStrategy}; use crate::historic_state_cache::HistoricStateCache; -use crate::impls::beacon_state::{get_full_state, store_full_state}; use crate::iter::{BlockRootsIterator, ParentRootBlockIterator, RootsIterator}; use crate::memory_store::MemoryStore; use crate::metadata::{ - AnchorInfo, BlobInfo, CompactionTimestamp, DataColumnInfo, PruningCheckpoint, SchemaVersion, - ANCHOR_FOR_ARCHIVE_NODE, ANCHOR_INFO_KEY, ANCHOR_UNINITIALIZED, BLOB_INFO_KEY, - COMPACTION_TIMESTAMP_KEY, CONFIG_KEY, CURRENT_SCHEMA_VERSION, DATA_COLUMN_INFO_KEY, - PRUNING_CHECKPOINT_KEY, SCHEMA_VERSION_KEY, SPLIT_KEY, STATE_UPPER_LIMIT_NO_RETAIN, + AnchorInfo, BlobInfo, CompactionTimestamp, DataColumnInfo, SchemaVersion, ANCHOR_INFO_KEY, + ANCHOR_UNINITIALIZED, BLOB_INFO_KEY, COMPACTION_TIMESTAMP_KEY, CONFIG_KEY, + CURRENT_SCHEMA_VERSION, DATA_COLUMN_INFO_KEY, SCHEMA_VERSION_KEY, SPLIT_KEY, + STATE_UPPER_LIMIT_NO_RETAIN, }; use crate::state_cache::{PutStateOutcome, StateCache}; use crate::{ - get_data_column_key, metrics, parse_data_column_key, BlobSidecarListFromRoot, DBColumn, - DatabaseBlock, Error, ItemStore, KeyValueStoreOp, StoreItem, StoreOp, + get_data_column_key, + metrics::{self, COLD_METRIC, HOT_METRIC}, + parse_data_column_key, BlobSidecarListFromRoot, DBColumn, DatabaseBlock, 
Error, ItemStore, + KeyValueStoreOp, StoreItem, StoreOp, }; use itertools::{process_results, Itertools}; use lru::LruCache; @@ -28,7 +29,7 @@ use state_processing::{ block_replayer::PreSlotHook, AllCaches, BlockProcessingError, BlockReplayer, SlotProcessingError, }; -use std::cmp::min; +use std::cmp::{min, Ordering}; use std::collections::{HashMap, HashSet}; use std::io::{Read, Write}; use std::marker::PhantomData; @@ -59,7 +60,7 @@ pub struct HotColdDB, Cold: ItemStore> { /// The starting slots for the range of data columns stored in the database. data_column_info: RwLock, pub(crate) config: StoreConfig, - pub(crate) hierarchy: HierarchyModuli, + pub hierarchy: HierarchyModuli, /// Cold database containing compact historical data. pub cold_db: Cold, /// Database containing blobs. If None, store falls back to use `cold_db`. @@ -159,9 +160,13 @@ pub enum HotColdDBError { MissingColdStateSummary(Hash256), MissingHotStateSummary(Hash256), MissingEpochBoundaryState(Hash256, Hash256), + MissingHotState { + state_root: Hash256, + requested_by_state_summary: (Hash256, Slot), + }, MissingPrevState(Hash256), MissingSplitState(Hash256, Slot), - MissingStateDiff(Hash256), + MissingHotHDiff(Hash256), MissingHDiff(Slot), MissingExecutionPayload(Hash256), MissingFullBlockExecutionPayloadPruned(Hash256, Slot), @@ -170,7 +175,7 @@ pub enum HotColdDBError { MissingFrozenBlock(Slot), MissingPathToBlobsDatabase, BlobsPreviouslyInDefaultStore, - HotStateSummaryError(BeaconStateError), + HdiffGetPriorStateRootError(Slot, Slot), RestorePointDecodeError(ssz::DecodeError), BlockReplayBeaconError(BeaconStateError), BlockReplaySlotError(SlotProcessingError), @@ -203,6 +208,8 @@ impl HotColdDB, MemoryStore> { let hierarchy = config.hierarchy_config.to_moduli()?; + // NOTE: Anchor slot is initialized to 0, which is only valid for new DBs. We shouldn't + // be reusing memory stores, but if we want to do that we should redo this. let db = HotColdDB { split: RwLock::new(Split::default()), anchor_info: RwLock::new(ANCHOR_UNINITIALIZED), @@ -215,9 +222,10 @@ impl HotColdDB, MemoryStore> { state_cache: Mutex::new(StateCache::new( config.state_cache_size, config.state_cache_headroom, + config.hot_hdiff_buffer_cache_size, )), historic_state_cache: Mutex::new(HistoricStateCache::new( - config.hdiff_buffer_cache_size, + config.cold_hdiff_buffer_cache_size, config.historic_state_cache_size, )), config, @@ -243,12 +251,16 @@ impl HotColdDB, BeaconNodeBackend> { config: StoreConfig, spec: Arc, ) -> Result, Error> { + debug!("Opening HotColdDB"); config.verify::()?; let hierarchy = config.hierarchy_config.to_moduli()?; + debug!(?hot_path, "Opening LevelDB"); let hot_db = BeaconNodeBackend::open(&config, hot_path)?; + let anchor_info = RwLock::new(Self::load_anchor_info(&hot_db)?); + debug!(?anchor_info, "Loaded anchor info"); let db = HotColdDB { split: RwLock::new(Split::default()), @@ -262,9 +274,10 @@ impl HotColdDB, BeaconNodeBackend> { state_cache: Mutex::new(StateCache::new( config.state_cache_size, config.state_cache_headroom, + config.hot_hdiff_buffer_cache_size, )), historic_state_cache: Mutex::new(HistoricStateCache::new( - config.hdiff_buffer_cache_size, + config.cold_hdiff_buffer_cache_size, config.historic_state_cache_size, )), config, @@ -279,12 +292,27 @@ impl HotColdDB, BeaconNodeBackend> { // Load the previous split slot from the database (if any). This ensures we can // stop and restart correctly. This needs to occur *before* running any migrations // because some migrations load states and depend on the split. 
+ //
+ // We use a method that is agnostic to whether the state summaries are V22 or V24, because
+ // we need to support several scenarios:
+ //
+ // - Migrating from V22 to V24: initially summaries are V22, and we need
+ // to be able to load a block root from them. Loading the split partially at first
+ // (without reading a V24 summary) and then completing the full load after the migration
+ // runs is possible in this case, but not in the next case.
+ // - Migrating from V24 to V22: initially summaries are V24, but after the migration runs
+ // they will be V22. If we used the "load full split after migration" approach with strict
+ // V24 summaries, it would break when trying to read V22 summaries after the migration.
+ //
+ // Therefore we take the most flexible approach of reading _either_ a V22 or V24 summary and
+ // using this to load the split correctly the first time.
if let Some(split) = db.load_split()? {
*db.split.write() = split;
info!(
%split.slot,
- split_state = ?split.state_root,
+ ?split.state_root,
+ ?split.block_root,
"Hot-Cold DB initialized"
);
}
@@ -353,6 +381,16 @@ impl HotColdDB<E, BeaconNodeBackend<E>, BeaconNodeBackend<E>> {
"Blob DB initialized"
);

+ // Ensure that any on-disk config is compatible with the supplied config.
+ //
+ // We do this prior to the migration now, because we don't want the migration using the
+ // in-memory config if it is inconsistent with the on-disk config. In future we may need
+ // to put this in/after the migration if the migration changes the config format.
+ if let Some(disk_config) = db.load_config()? {
+ db.config.check_compatibility(&disk_config)?;
+ }
+ db.store_config()?;
+
// Ensure that the schema version of the on-disk database matches the software.
// If the version is mismatched, an automatic migration will be attempted.
let db = Arc::new(db);
@@ -362,31 +400,16 @@ impl HotColdDB<E, BeaconNodeBackend<E>, BeaconNodeBackend<E>> {
to_version = CURRENT_SCHEMA_VERSION.as_u64(),
"Attempting schema migration"
);
- migrate_schema(db.clone(), schema_version, CURRENT_SCHEMA_VERSION)?;
+ migrate_schema(db.clone(), schema_version, CURRENT_SCHEMA_VERSION).map_err(|e| {
+ Error::MigrationError(format!(
+ "Migrating from {:?} to {:?}: {:?}",
+ schema_version, CURRENT_SCHEMA_VERSION, e
+ ))
+ })?;
} else {
db.store_schema_version(CURRENT_SCHEMA_VERSION)?;
}

- // Ensure that any on-disk config is compatible with the supplied config.
- if let Some(disk_config) = db.load_config()? {
- let split = db.get_split_info();
- let anchor = db.get_anchor_info();
- db.config
- .check_compatibility(&disk_config, &split, &anchor)?;
-
- // Inform user if hierarchy config is changing.
- if let Ok(hierarchy_config) = disk_config.hierarchy_config() {
- if &db.config.hierarchy_config != hierarchy_config {
- info!(
- previous_config = %hierarchy_config,
- new_config = %db.config.hierarchy_config,
- "Updating historic state config"
- );
- }
- }
- }
- db.store_config()?;
-
// TODO(tree-states): Here we can choose to prune advanced states to reclaim disk space. As
// it's a foreground task there's no risk of race condition that can corrupt the DB.
// Advanced states for invalid blocks that were never written to the DB, or descendants of @@ -400,20 +423,51 @@ impl HotColdDB, BeaconNodeBackend> { info!("Foreground compaction complete"); } + debug!(anchor = ?db.get_anchor_info(), "Store anchor info"); + Ok(db) } } impl, Cold: ItemStore> HotColdDB { + fn cold_storage_strategy(&self, slot: Slot) -> Result { + // The start slot for the freezer HDiff is always 0 + Ok(self.hierarchy.storage_strategy(slot, Slot::new(0))?) + } + + pub fn hot_storage_strategy(&self, slot: Slot) -> Result { + Ok(self + .hierarchy + .storage_strategy(slot, self.hot_hdiff_start_slot()?)?) + } + + pub fn hot_hdiff_start_slot(&self) -> Result { + let anchor_slot = self.anchor_info.read_recursive().anchor_slot; + if anchor_slot == u64::MAX { + // If hot_hdiff_start_slot returns such a high value all writes will fail. This should + // never happen, but it's best to stop this useless value from propagating downstream + Err(Error::AnchorUninitialized) + } else { + Ok(anchor_slot) + } + } + pub fn update_finalized_state( &self, state_root: Hash256, block_root: Hash256, state: BeaconState, ) -> Result<(), Error> { - self.state_cache - .lock() - .update_finalized_state(state_root, block_root, state) + let start_slot = self.get_anchor_info().anchor_slot; + let pre_finalized_slots_to_retain = self + .hierarchy + .closest_layer_points(state.slot(), start_slot); + self.state_cache.lock().update_finalized_state( + state_root, + block_root, + state, + &pre_finalized_slots_to_retain, + ) } pub fn state_cache_len(&self) -> usize { @@ -431,20 +485,34 @@ impl, Cold: ItemStore> HotColdDB &metrics::STORE_BEACON_BLOB_CACHE_SIZE, self.block_cache.lock().blob_cache.len() as i64, ); + let state_cache = self.state_cache.lock(); metrics::set_gauge( &metrics::STORE_BEACON_STATE_CACHE_SIZE, - self.state_cache.lock().len() as i64, + state_cache.len() as i64, ); + metrics::set_gauge_vec( + &metrics::STORE_BEACON_HDIFF_BUFFER_CACHE_SIZE, + HOT_METRIC, + state_cache.num_hdiff_buffers() as i64, + ); + metrics::set_gauge_vec( + &metrics::STORE_BEACON_HDIFF_BUFFER_CACHE_BYTE_SIZE, + HOT_METRIC, + state_cache.hdiff_buffer_mem_usage() as i64, + ); + drop(state_cache); metrics::set_gauge( &metrics::STORE_BEACON_HISTORIC_STATE_CACHE_SIZE, hsc_metrics.num_state as i64, ); - metrics::set_gauge( + metrics::set_gauge_vec( &metrics::STORE_BEACON_HDIFF_BUFFER_CACHE_SIZE, + COLD_METRIC, hsc_metrics.num_hdiff as i64, ); - metrics::set_gauge( + metrics::set_gauge_vec( &metrics::STORE_BEACON_HDIFF_BUFFER_CACHE_BYTE_SIZE, + COLD_METRIC, hsc_metrics.hdiff_byte_size as i64, ); @@ -887,14 +955,6 @@ impl, Cold: ItemStore> HotColdDB } } - pub fn put_state_summary( - &self, - state_root: &Hash256, - summary: HotStateSummary, - ) -> Result<(), Error> { - self.hot_db.put(state_root, &summary) - } - /// Store a state in the store. pub fn put_state(&self, state_root: &Hash256, state: &BeaconState) -> Result<(), Error> { let mut ops: Vec = Vec::new(); @@ -986,7 +1046,14 @@ impl, Cold: ItemStore> HotColdDB }; // It's a bit redundant but we elect to cache the state here and down below. let mut opt_state = self - .load_hot_state(&state_root, true)? + .load_hot_state(&state_root, true) + .map_err(|e| { + Error::LoadingHotStateError( + format!("get advanced {block_root} {max_slot}"), + state_root, + e.into(), + ) + })? 
.map(|(state, _block_root)| (state_root, state)); if let Some((state_root, state)) = opt_state.as_mut() { @@ -1098,41 +1165,6 @@ impl, Cold: ItemStore> HotColdDB ) } - /// Load an epoch boundary state by using the hot state summary look-up. - /// - /// Will fall back to the cold DB if a hot state summary is not found. - /// - /// NOTE: only used in tests at the moment - pub fn load_epoch_boundary_state( - &self, - state_root: &Hash256, - ) -> Result>, Error> { - if let Some(HotStateSummary { - epoch_boundary_state_root, - .. - }) = self.load_hot_state_summary(state_root)? - { - // NOTE: minor inefficiency here because we load an unnecessary hot state summary - let (state, _) = self - .load_hot_state(&epoch_boundary_state_root, true)? - .ok_or(HotColdDBError::MissingEpochBoundaryState( - epoch_boundary_state_root, - *state_root, - ))?; - Ok(Some(state)) - } else { - // Try the cold DB - match self.load_cold_state_slot(state_root)? { - Some(state_slot) => { - let epoch_boundary_slot = - state_slot / E::slots_per_epoch() * E::slots_per_epoch(); - self.load_cold_state_by_slot(epoch_boundary_slot).map(Some) - } - None => Ok(None), - } - } - } - pub fn put_item(&self, key: &Hash256, item: &I) -> Result<(), Error> { self.hot_db.put(key, item) } @@ -1206,13 +1238,39 @@ impl, Cold: ItemStore> HotColdDB StoreOp::DeleteState(state_root, slot) => { // Delete the hot state summary. key_value_batch.push(KeyValueStoreOp::DeleteKey( - DBColumn::BeaconStateSummary, + DBColumn::BeaconStateHotSummary, state_root.as_slice().to_vec(), )); - if slot.is_none_or(|slot| slot % E::slots_per_epoch() == 0) { + if let Some(slot) = slot { + match self.hot_storage_strategy(slot)? { + StorageStrategy::Snapshot => { + // Full state stored in this position + key_value_batch.push(KeyValueStoreOp::DeleteKey( + DBColumn::BeaconStateHotSnapshot, + state_root.as_slice().to_vec(), + )); + } + StorageStrategy::DiffFrom(_) => { + // Diff stored in this position + key_value_batch.push(KeyValueStoreOp::DeleteKey( + DBColumn::BeaconStateHotDiff, + state_root.as_slice().to_vec(), + )); + } + StorageStrategy::ReplayFrom(_) => { + // Nothing else to delete + } + } + } else { + // NOTE(hdiff): Attempt to delete both snapshots and diffs if we don't know + // the slot. + key_value_batch.push(KeyValueStoreOp::DeleteKey( + DBColumn::BeaconStateHotSnapshot, + state_root.as_slice().to_vec(), + )); key_value_batch.push(KeyValueStoreOp::DeleteKey( - DBColumn::BeaconState, + DBColumn::BeaconStateHotDiff, state_root.as_slice().to_vec(), )); } @@ -1420,9 +1478,6 @@ impl, Cold: ItemStore> HotColdDB state: &BeaconState, ops: &mut Vec, ) -> Result<(), Error> { - // Avoid storing states in the database if they already exist in the state cache. - // The exception to this is the finalized state, which must exist in the cache before it - // is stored on disk. match self.state_cache.lock().put_state( *state_root, state.get_latest_block_root(*state_root), @@ -1443,28 +1498,127 @@ impl, Cold: ItemStore> HotColdDB state_slot = %state.slot(), "State already exists in state cache", ); - return Ok(()); + // NOTE: We used to return early here, but had some issues with states being + // in the cache but not on disk. Instead of relying on the cache we try loading + // the state summary below and rely on that instead. } - PutStateOutcome::Finalized => {} // Continue to store. + // Continue to store. + PutStateOutcome::Finalized | PutStateOutcome::PreFinalizedHDiffBuffer => {} } - // On the epoch boundary, store the full state. 
- if state.slot() % E::slots_per_epoch() == 0 { + // Computing diffs is expensive so we avoid it if we already have this state stored on + // disk. + if self.load_hot_state_summary(state_root)?.is_some() { debug!( slot = %state.slot(), ?state_root, - "Storing full state on epoch boundary" + "Skipping storage of state already in the DB" ); - store_full_state(state_root, state, ops)?; + return Ok(()); } + let summary = self.store_hot_state_summary(state_root, state, ops)?; + self.store_hot_state_diffs(state_root, state, ops)?; + + debug!( + ?state_root, + slot = %state.slot(), + storage_strategy = ?self.hot_storage_strategy(state.slot())?, + diff_base_state = %summary.diff_base_state, + previous_state_root = ?summary.previous_state_root, + "Storing hot state summary and diffs" + ); + + Ok(()) + } + + /// Store a post-finalization state efficiently in the hot database. + pub fn store_hot_state_summary( + &self, + state_root: &Hash256, + state: &BeaconState, + ops: &mut Vec, + ) -> Result { // Store a summary of the state. // We store one even for the epoch boundary states, as we may need their slots // when doing a look up by state root. - let hot_state_summary = HotStateSummary::new(state_root, state)?; - let op = hot_state_summary.as_kv_store_op(*state_root); - ops.push(op); + let hot_state_summary = HotStateSummary::new( + self, + *state_root, + state, + self.hot_storage_strategy(state.slot())?, + )?; + ops.push(hot_state_summary.as_kv_store_op(*state_root)); + Ok(hot_state_summary) + } + pub fn store_hot_state_diffs( + &self, + state_root: &Hash256, + state: &BeaconState, + ops: &mut Vec, + ) -> Result<(), Error> { + let slot = state.slot(); + let storage_strategy = self.hot_storage_strategy(slot)?; + match storage_strategy { + StorageStrategy::ReplayFrom(_) => { + // Already have persisted the state summary, don't persist anything else + } + StorageStrategy::Snapshot => { + self.store_hot_state_as_snapshot(state_root, state, ops)?; + } + StorageStrategy::DiffFrom(from_slot) => { + let from_root = get_ancestor_state_root(self, state, from_slot).map_err(|e| { + Error::StateSummaryIteratorError { + error: e, + from_state_root: *state_root, + from_state_slot: state.slot(), + target_slot: slot, + } + })?; + self.store_hot_state_as_diff(state_root, state, from_root, ops)?; + } + } + Ok(()) + } + + fn store_hot_state_as_diff( + &self, + state_root: &Hash256, + state: &BeaconState, + from_root: Hash256, + ops: &mut Vec, + ) -> Result<(), Error> { + let base_buffer = { + let _t = metrics::start_timer_vec( + &metrics::BEACON_HDIFF_BUFFER_LOAD_BEFORE_STORE_TIME, + HOT_METRIC, + ); + self.load_hot_hdiff_buffer(from_root).map_err(|e| { + Error::LoadingHotHdiffBufferError( + format!("store state as diff {state_root:?} {}", state.slot()), + from_root, + e.into(), + ) + })? + }; + let target_buffer = HDiffBuffer::from_state(state.clone()); + let diff = { + let _timer = metrics::start_timer_vec(&metrics::BEACON_HDIFF_COMPUTE_TIME, HOT_METRIC); + HDiff::compute(&base_buffer, &target_buffer, &self.config)? 
+ }; + let diff_bytes = diff.as_ssz_bytes(); + let layer = HierarchyConfig::exponent_for_slot(state.slot()); + metrics::observe_vec( + &metrics::BEACON_HDIFF_SIZES, + &[&layer.to_string()], + diff_bytes.len() as f64, + ); + ops.push(KeyValueStoreOp::PutKeyValue( + DBColumn::BeaconStateHotDiff, + state_root.as_slice().to_vec(), + diff_bytes, + )); Ok(()) } @@ -1483,7 +1637,9 @@ impl, Cold: ItemStore> HotColdDB warn!(?state_root, "State cache missed"); } - let state_from_disk = self.load_hot_state(state_root, update_cache)?; + let state_from_disk = self.load_hot_state(state_root, update_cache).map_err(|e| { + Error::LoadingHotStateError("get state".to_owned(), *state_root, e.into()) + })?; if let Some((mut state, block_root)) = state_from_disk { state.update_tree_hash_cache()?; @@ -1516,6 +1672,88 @@ impl, Cold: ItemStore> HotColdDB } } + fn load_hot_hdiff_buffer(&self, state_root: Hash256) -> Result { + if let Some(buffer) = self + .state_cache + .lock() + .get_hdiff_buffer_by_state_root(state_root) + { + return Ok(buffer); + } + + let Some(HotStateSummary { + slot, + diff_base_state, + .. + }) = self.load_hot_state_summary(&state_root)? + else { + return Err(Error::MissingHotStateSummary(state_root)); + }; + + let buffer = match self.hot_storage_strategy(slot)? { + StorageStrategy::Snapshot => { + let Some(state) = self.load_hot_state_as_snapshot(state_root)? else { + let existing_snapshots = self.load_hot_state_snapshot_roots()?; + debug!( + requested = ?state_root, + existing_snapshots = ?existing_snapshots, + "Missing hot state snapshot" + ); + return Err(Error::MissingHotStateSnapshot(state_root, slot)); + }; + HDiffBuffer::from_state(state) + } + StorageStrategy::DiffFrom(from_slot) => { + let from_state_root = diff_base_state.get_root(from_slot)?; + let mut buffer = self.load_hot_hdiff_buffer(from_state_root).map_err(|e| { + Error::LoadingHotHdiffBufferError( + format!("load hdiff DiffFrom {from_slot} {state_root}"), + from_state_root, + e.into(), + ) + })?; + let diff = self.load_hot_hdiff(state_root)?; + { + let _timer = + metrics::start_timer_vec(&metrics::BEACON_HDIFF_APPLY_TIME, HOT_METRIC); + diff.apply(&mut buffer, &self.config)?; + } + buffer + } + StorageStrategy::ReplayFrom(from_slot) => { + let from_state_root = diff_base_state.get_root(from_slot)?; + self.load_hot_hdiff_buffer(from_state_root).map_err(|e| { + Error::LoadingHotHdiffBufferError( + format!("load hdiff ReplayFrom {from_slot} {state_root}"), + from_state_root, + e.into(), + ) + })? + } + }; + + // Add buffer to cache for future calls. + self.state_cache + .lock() + .put_hdiff_buffer(state_root, slot, &buffer); + + Ok(buffer) + } + + fn load_hot_hdiff(&self, state_root: Hash256) -> Result { + let bytes = { + let _t = metrics::start_timer_vec(&metrics::BEACON_HDIFF_READ_TIME, HOT_METRIC); + self.hot_db + .get_bytes(DBColumn::BeaconStateHotDiff, state_root.as_slice())? + .ok_or(HotColdDBError::MissingHotHDiff(state_root))? + }; + let hdiff = { + let _t = metrics::start_timer_vec(&metrics::BEACON_HDIFF_DECODE_TIME, HOT_METRIC); + HDiff::from_ssz_bytes(&bytes)? + }; + Ok(hdiff) + } + /// Load a post-finalization state from the hot database. /// /// Will replay blocks from the nearest epoch boundary. @@ -1532,64 +1770,64 @@ impl, Cold: ItemStore> HotColdDB if let Some(HotStateSummary { slot, latest_block_root, - epoch_boundary_state_root, + diff_base_state, + .. }) = self.load_hot_state_summary(state_root)? 
{ - let mut boundary_state = - get_full_state(&self.hot_db, &epoch_boundary_state_root, &self.spec)?.ok_or( - HotColdDBError::MissingEpochBoundaryState( - epoch_boundary_state_root, - *state_root, - ), - )?; - - // Immediately rebase the state from disk on the finalized state so that we can reuse - // parts of the tree for state root calculation in `replay_blocks`. - self.state_cache - .lock() - .rebase_on_finalized(&mut boundary_state, &self.spec)?; + let mut state = match self.hot_storage_strategy(slot)? { + strat @ StorageStrategy::Snapshot | strat @ StorageStrategy::DiffFrom(_) => { + let buffer_timer = metrics::start_timer_vec( + &metrics::BEACON_HDIFF_BUFFER_LOAD_TIME, + HOT_METRIC, + ); + let buffer = self.load_hot_hdiff_buffer(*state_root).map_err(|e| { + Error::LoadingHotHdiffBufferError( + format!("load state {strat:?} {slot}"), + *state_root, + e.into(), + ) + })?; + drop(buffer_timer); + let mut state = buffer.as_state(&self.spec)?; + + // Immediately rebase the state from diffs on the finalized state so that we + // can utilise structural sharing and don't consume excess memory. + self.state_cache + .lock() + .rebase_on_finalized(&mut state, &self.spec)?; - // Optimization to avoid even *thinking* about replaying blocks if we're already - // on an epoch boundary. - let mut state = if slot % E::slots_per_epoch() == 0 { - boundary_state - } else { - // If replaying blocks, and `update_cache` is true, also cache the epoch boundary - // state that this state is based on. It may be useful as the basis of more states - // in the same epoch. - let state_cache_hook = |state_root, state: &mut BeaconState| { - if !update_cache || state.slot() % E::slots_per_epoch() != 0 { - return Ok(()); - } - // Ensure all caches are built before attempting to cache. - state.update_tree_hash_cache()?; - state.build_all_caches(&self.spec)?; - - let latest_block_root = state.get_latest_block_root(state_root); - if let PutStateOutcome::New(_) = - self.state_cache - .lock() - .put_state(state_root, latest_block_root, state)? - { - debug!( - ?state_root, - state_slot = %state.slot(), - descendant_slot = %slot, - "Cached ancestor state", - ); - } - Ok(()) - }; - let blocks = - self.load_blocks_to_replay(boundary_state.slot(), slot, latest_block_root)?; - let _t = metrics::start_timer(&metrics::STORE_BEACON_REPLAY_HOT_BLOCKS_TIME); - self.replay_blocks( - boundary_state, - blocks, - slot, - no_state_root_iter(), - Some(Box::new(state_cache_hook)), - )? + state + } + StorageStrategy::ReplayFrom(from_slot) => { + let from_state_root = diff_base_state.get_root(from_slot)?; + + let (mut base_state, _) = self + .load_hot_state(&from_state_root, update_cache) + .map_err(|e| { + Error::LoadingHotStateError( + format!("load state ReplayFrom {from_slot}"), + *state_root, + e.into(), + ) + })? + .ok_or(HotColdDBError::MissingHotState { + state_root: from_state_root, + requested_by_state_summary: (*state_root, slot), + })?; + + // Immediately rebase the state from disk on the finalized state so that we can + // reuse parts of the tree for state root calculation in `replay_blocks`. + self.state_cache + .lock() + .rebase_on_finalized(&mut base_state, &self.spec)?; + + self.load_hot_state_using_replay( + base_state, + slot, + latest_block_root, + update_cache, + )? 
+ } }; state.apply_pending_mutations()?; @@ -1599,6 +1837,56 @@ impl, Cold: ItemStore> HotColdDB } } + pub fn load_hot_state_using_replay( + &self, + base_state: BeaconState, + slot: Slot, + latest_block_root: Hash256, + update_cache: bool, + ) -> Result, Error> { + if base_state.slot() == slot { + return Ok(base_state); + } + + let blocks = self.load_blocks_to_replay(base_state.slot(), slot, latest_block_root)?; + let _t = metrics::start_timer(&metrics::STORE_BEACON_REPLAY_HOT_BLOCKS_TIME); + + // If replaying blocks, and `update_cache` is true, also cache the epoch boundary + // state that this state is based on. It may be useful as the basis of more states + // in the same epoch. + let state_cache_hook = |state_root, state: &mut BeaconState| { + if !update_cache || state.slot() % E::slots_per_epoch() != 0 { + return Ok(()); + } + // Ensure all caches are built before attempting to cache. + state.update_tree_hash_cache()?; + state.build_all_caches(&self.spec)?; + + let latest_block_root = state.get_latest_block_root(state_root); + if let PutStateOutcome::New(_) = + self.state_cache + .lock() + .put_state(state_root, latest_block_root, state)? + { + debug!( + ?state_root, + state_slot = %state.slot(), + descendant_slot = %slot, + "Cached ancestor state", + ); + } + Ok(()) + }; + + self.replay_blocks( + base_state, + blocks, + slot, + no_state_root_iter(), + Some(Box::new(state_cache_hook)), + ) + } + pub fn store_cold_state_summary( &self, state_root: &Hash256, @@ -1624,7 +1912,7 @@ impl, Cold: ItemStore> HotColdDB self.store_cold_state_summary(state_root, state.slot(), ops)?; let slot = state.slot(); - match self.hierarchy.storage_strategy(slot)? { + match self.cold_storage_strategy(slot)? { StorageStrategy::ReplayFrom(from) => { debug!( strategy = "replay", @@ -1699,6 +1987,54 @@ impl, Cold: ItemStore> HotColdDB } } + pub fn store_hot_state_as_snapshot( + &self, + state_root: &Hash256, + state: &BeaconState, + ops: &mut Vec, + ) -> Result<(), Error> { + let bytes = state.as_ssz_bytes(); + let compressed_value = { + let _timer = metrics::start_timer(&metrics::STORE_BEACON_STATE_FREEZER_COMPRESS_TIME); + let mut out = Vec::with_capacity(self.config.estimate_compressed_size(bytes.len())); + let mut encoder = Encoder::new(&mut out, self.config.compression_level) + .map_err(Error::Compression)?; + encoder.write_all(&bytes).map_err(Error::Compression)?; + encoder.finish().map_err(Error::Compression)?; + out + }; + + ops.push(KeyValueStoreOp::PutKeyValue( + DBColumn::BeaconStateHotSnapshot, + state_root.as_slice().to_vec(), + compressed_value, + )); + Ok(()) + } + + fn load_hot_state_bytes_as_snapshot( + &self, + state_root: Hash256, + ) -> Result>, Error> { + match self + .hot_db + .get_bytes(DBColumn::BeaconStateHotSnapshot, state_root.as_slice())? + { + Some(bytes) => { + let _timer = + metrics::start_timer(&metrics::STORE_BEACON_STATE_FREEZER_DECOMPRESS_TIME); + let mut ssz_bytes = + Vec::with_capacity(self.config.estimate_decompressed_size(bytes.len())); + let mut decoder = Decoder::new(&*bytes).map_err(Error::Compression)?; + decoder + .read_to_end(&mut ssz_bytes) + .map_err(Error::Compression)?; + Ok(Some(ssz_bytes)) + } + None => Ok(None), + } + } + fn load_cold_state_as_snapshot(&self, slot: Slot) -> Result>, Error> { Ok(self .load_cold_state_bytes_as_snapshot(slot)? @@ -1706,6 +2042,22 @@ impl, Cold: ItemStore> HotColdDB .transpose()?) } + fn load_hot_state_as_snapshot( + &self, + state_root: Hash256, + ) -> Result>, Error> { + Ok(self + .load_hot_state_bytes_as_snapshot(state_root)? 
+ .map(|bytes| BeaconState::from_ssz_bytes(&bytes, &self.spec)) + .transpose()?) + } + + fn load_hot_state_snapshot_roots(&self) -> Result, Error> { + self.hot_db + .iter_column_keys::(DBColumn::BeaconStateHotSnapshot) + .collect() + } + pub fn store_cold_state_as_diff( &self, state: &BeaconState, @@ -1714,15 +2066,24 @@ impl, Cold: ItemStore> HotColdDB ) -> Result<(), Error> { // Load diff base state bytes. let (_, base_buffer) = { - let _t = metrics::start_timer(&metrics::STORE_BEACON_HDIFF_BUFFER_LOAD_FOR_STORE_TIME); + let _t = metrics::start_timer_vec( + &metrics::BEACON_HDIFF_BUFFER_LOAD_BEFORE_STORE_TIME, + COLD_METRIC, + ); self.load_hdiff_buffer_for_slot(from_slot)? }; let target_buffer = HDiffBuffer::from_state(state.clone()); let diff = { - let _timer = metrics::start_timer(&metrics::STORE_BEACON_HDIFF_BUFFER_COMPUTE_TIME); + let _timer = metrics::start_timer_vec(&metrics::BEACON_HDIFF_COMPUTE_TIME, COLD_METRIC); HDiff::compute(&base_buffer, &target_buffer, &self.config)? }; let diff_bytes = diff.as_ssz_bytes(); + let layer = HierarchyConfig::exponent_for_slot(state.slot()); + metrics::observe_vec( + &metrics::BEACON_HDIFF_SIZES, + &[&layer.to_string()], + diff_bytes.len() as f64, + ); ops.push(KeyValueStoreOp::PutKeyValue( DBColumn::BeaconStateDiff, @@ -1746,7 +2107,7 @@ impl, Cold: ItemStore> HotColdDB /// /// Will reconstruct the state if it lies between restore points. pub fn load_cold_state_by_slot(&self, slot: Slot) -> Result, Error> { - let storage_strategy = self.hierarchy.storage_strategy(slot)?; + let storage_strategy = self.cold_storage_strategy(slot)?; // Search for a state from this slot or a recent prior slot in the historic state cache. let mut historic_state_cache = self.historic_state_cache.lock(); @@ -1775,10 +2136,10 @@ impl, Cold: ItemStore> HotColdDB // Load using the diff hierarchy. For states that require replay we recurse into this // function so that we can try to get their pre-state *as a state* rather than an hdiff // buffer. - match self.hierarchy.storage_strategy(slot)? { + match self.cold_storage_strategy(slot)? { StorageStrategy::Snapshot | StorageStrategy::DiffFrom(_) => { let buffer_timer = - metrics::start_timer(&metrics::STORE_BEACON_HDIFF_BUFFER_LOAD_TIME); + metrics::start_timer_vec(&metrics::BEACON_HDIFF_BUFFER_LOAD_TIME, COLD_METRIC); let (_, buffer) = self.load_hdiff_buffer_for_slot(slot)?; drop(buffer_timer); let state = buffer.as_state(&self.spec)?; @@ -1847,13 +2208,13 @@ impl, Cold: ItemStore> HotColdDB fn load_hdiff_for_slot(&self, slot: Slot) -> Result { let bytes = { - let _t = metrics::start_timer(&metrics::BEACON_HDIFF_READ_TIMES); + let _t = metrics::start_timer_vec(&metrics::BEACON_HDIFF_READ_TIME, COLD_METRIC); self.cold_db .get_bytes(DBColumn::BeaconStateDiff, &slot.as_u64().to_be_bytes())? .ok_or(HotColdDBError::MissingHDiff(slot))? }; let hdiff = { - let _t = metrics::start_timer(&metrics::BEACON_HDIFF_DECODE_TIMES); + let _t = metrics::start_timer_vec(&metrics::BEACON_HDIFF_DECODE_TIME, COLD_METRIC); HDiff::from_ssz_bytes(&bytes)? }; Ok(hdiff) @@ -1867,15 +2228,15 @@ impl, Cold: ItemStore> HotColdDB %slot, "Hit hdiff buffer cache" ); - metrics::inc_counter(&metrics::STORE_BEACON_HDIFF_BUFFER_CACHE_HIT); + metrics::inc_counter_vec(&metrics::STORE_BEACON_HDIFF_BUFFER_CACHE_HIT, COLD_METRIC); return Ok((slot, buffer)); } - metrics::inc_counter(&metrics::STORE_BEACON_HDIFF_BUFFER_CACHE_MISS); + metrics::inc_counter_vec(&metrics::STORE_BEACON_HDIFF_BUFFER_CACHE_MISS, COLD_METRIC); // Load buffer for the previous state. 
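+ // (Each level of recursion applies one diff layer, so the depth is bounded by the
+ // number of hierarchy exponents.)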
// This amount of recursion (<10 levels) should be OK. let t = std::time::Instant::now(); - match self.hierarchy.storage_strategy(slot)? { + match self.cold_storage_strategy(slot)? { // Base case. StorageStrategy::Snapshot => { let state = self @@ -1904,7 +2265,7 @@ impl, Cold: ItemStore> HotColdDB let diff = self.load_hdiff_for_slot(slot)?; { let _timer = - metrics::start_timer(&metrics::STORE_BEACON_HDIFF_BUFFER_APPLY_TIME); + metrics::start_timer_vec(&metrics::BEACON_HDIFF_APPLY_TIME, COLD_METRIC); diff.apply(&mut buffer, &self.config)?; } @@ -2176,11 +2537,11 @@ impl, Cold: ItemStore> HotColdDB /// Initialise the anchor info for checkpoint sync starting from `block`. pub fn init_anchor_info( &self, - block: BeaconBlockRef<'_, E>, + oldest_block_parent: Hash256, + oldest_block_slot: Slot, + anchor_slot: Slot, retain_historic_states: bool, ) -> Result { - let anchor_slot = block.slot(); - // Set the `state_upper_limit` to the slot of the *next* checkpoint. let next_snapshot_slot = self.hierarchy.next_snapshot_slot(anchor_slot)?; let state_upper_limit = if !retain_historic_states { @@ -2188,17 +2549,12 @@ impl, Cold: ItemStore> HotColdDB } else { next_snapshot_slot }; - let anchor_info = if state_upper_limit == 0 && anchor_slot == 0 { - // Genesis archive node: no anchor because we *will* store all states. - ANCHOR_FOR_ARCHIVE_NODE - } else { - AnchorInfo { - anchor_slot, - oldest_block_slot: anchor_slot, - oldest_block_parent: block.parent_root(), - state_upper_limit, - state_lower_limit: self.spec.genesis_slot, - } + let anchor_info = AnchorInfo { + anchor_slot, + oldest_block_slot, + oldest_block_parent, + state_upper_limit, + state_lower_limit: self.spec.genesis_slot, }; self.compare_and_set_anchor_info(ANCHOR_UNINITIALIZED, anchor_info) } @@ -2245,7 +2601,8 @@ impl, Cold: ItemStore> HotColdDB /// Load the anchor info from disk. fn load_anchor_info(hot_db: &Hot) -> Result { Ok(hot_db - .get(&ANCHOR_INFO_KEY)? + .get(&ANCHOR_INFO_KEY) + .map_err(|e| Error::LoadAnchorInfo(e.into()))? .unwrap_or(ANCHOR_UNINITIALIZED)) } @@ -2328,7 +2685,9 @@ impl, Cold: ItemStore> HotColdDB /// Load the blob info from disk, but do not set `self.blob_info`. fn load_blob_info(&self) -> Result, Error> { - self.hot_db.get(&BLOB_INFO_KEY) + self.hot_db + .get(&BLOB_INFO_KEY) + .map_err(|e| Error::LoadBlobInfo(e.into())) } /// Store the given `blob_info` to disk. @@ -2373,7 +2732,9 @@ impl, Cold: ItemStore> HotColdDB /// Load the blob info from disk, but do not set `self.data_column_info`. fn load_data_column_info(&self) -> Result, Error> { - self.hot_db.get(&DATA_COLUMN_INFO_KEY) + self.hot_db + .get(&DATA_COLUMN_INFO_KEY) + .map_err(|e| Error::LoadDataColumnInfo(e.into())) } /// Store the given `data_column_info` to disk. @@ -2432,7 +2793,9 @@ impl, Cold: ItemStore> HotColdDB /// Load previously-stored config from disk. fn load_config(&self) -> Result, Error> { - self.hot_db.get(&CONFIG_KEY) + self.hot_db + .get(&CONFIG_KEY) + .map_err(|e| Error::LoadConfig(e.into())) } /// Write the config to disk. @@ -2442,18 +2805,24 @@ impl, Cold: ItemStore> HotColdDB /// Load the split point from disk, sans block root. fn load_split_partial(&self) -> Result, Error> { - self.hot_db.get(&SPLIT_KEY) + self.hot_db + .get(&SPLIT_KEY) + .map_err(|e| Error::LoadSplit(e.into())) } /// Load the split point from disk, including block root. fn load_split(&self) -> Result, Error> { match self.load_split_partial()? { Some(mut split) => { + debug!(?split, "Loaded split partial"); // Load the hot state summary to get the block root. 
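+ // The summary on disk may be in either V22 or V24 format at this point (before or
+ // after a schema migration), so use the version-agnostic helper below.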
- let summary = self.load_hot_state_summary(&split.state_root)?.ok_or( - HotColdDBError::MissingSplitState(split.state_root, split.slot), - )?; - split.block_root = summary.latest_block_root; + let latest_block_root = self + .load_block_root_from_summary_any_version(&split.state_root) + .ok_or(HotColdDBError::MissingSplitState( + split.state_root, + split.slot, + ))?; + split.block_root = latest_block_root; Ok(Some(split)) } None => Ok(None), @@ -2478,13 +2847,41 @@ impl, Cold: ItemStore> HotColdDB &self, state_root: &Hash256, ) -> Result, Error> { - self.hot_db.get(state_root) + self.hot_db + .get(state_root) + .map_err(|e| Error::LoadHotStateSummary(*state_root, e.into())) + } + + /// Load a hot state's summary in V22 format, given its root. + pub fn load_hot_state_summary_v22( + &self, + state_root: &Hash256, + ) -> Result, Error> { + self.hot_db + .get(state_root) + .map_err(|e| Error::LoadHotStateSummary(*state_root, e.into())) + } + + /// Load the latest block root for a hot state summary either in modern form, or V22 form. + /// + /// This function is required to open a V22 database for migration to V24, or vice versa. + pub fn load_block_root_from_summary_any_version( + &self, + state_root: &Hash256, + ) -> Option { + if let Ok(Some(summary)) = self.load_hot_state_summary(state_root) { + return Some(summary.latest_block_root); + } + if let Ok(Some(summary)) = self.load_hot_state_summary_v22(state_root) { + return Some(summary.latest_block_root); + } + None } /// Load all hot state summaries present in the hot DB pub fn load_hot_state_summaries(&self) -> Result, Error> { self.hot_db - .iter_column::(DBColumn::BeaconStateSummary) + .iter_column::(DBColumn::BeaconStateHotSummary) .map(|res| { let (state_root, value) = res?; let summary = HotStateSummary::from_ssz_bytes(&value)?; @@ -2521,25 +2918,6 @@ impl, Cold: ItemStore> HotColdDB self.config.compact_on_prune } - /// Load the checkpoint to begin pruning from (the "old finalized checkpoint"). - pub fn load_pruning_checkpoint(&self) -> Result, Error> { - Ok(self - .hot_db - .get(&PRUNING_CHECKPOINT_KEY)? - .map(|pc: PruningCheckpoint| pc.checkpoint)) - } - - /// Store the checkpoint to begin pruning from (the "old finalized checkpoint"). - pub fn store_pruning_checkpoint(&self, checkpoint: Checkpoint) -> Result<(), Error> { - self.hot_db - .do_atomically(vec![self.pruning_checkpoint_store_op(checkpoint)]) - } - - /// Create a staged store for the pruning checkpoint. - pub fn pruning_checkpoint_store_op(&self, checkpoint: Checkpoint) -> KeyValueStoreOp { - PruningCheckpoint { checkpoint }.as_kv_store_op(PRUNING_CHECKPOINT_KEY) - } - /// Load the timestamp of the last compaction as a `Duration` since the UNIX epoch. pub fn load_compaction_timestamp(&self) -> Result, Error> { Ok(self @@ -2576,6 +2954,30 @@ impl, Cold: ItemStore> HotColdDB Ok(ops) } + /// Return a single block root from the cold DB. + /// + /// If the slot is unavailable due to partial block history, `Ok(None)` will be returned. + pub fn get_cold_block_root(&self, slot: Slot) -> Result, Error> { + Ok(self + .cold_db + .get_bytes(DBColumn::BeaconBlockRoots, &slot.as_u64().to_be_bytes())? + .map(|bytes| Hash256::from_ssz_bytes(&bytes)) + .transpose()?) + } + + /// Return a single state root from the cold DB. + /// + /// If the slot is unavailable due to partial state history, `Ok(None)` will be returned. + /// + /// This function will usually only work on an archive node. 
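+    ///
+    /// A usage sketch (hypothetical slot value; assumes an archive node with full state
+    /// history):
+    ///
+    /// ```ignore
+    /// if let Some(state_root) = store.get_cold_state_root(Slot::new(4096))? {
+    ///     // `state_root` is the root recorded for slot 4096 in the freezer's linear index.
+    /// }
+    /// ```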
+ pub fn get_cold_state_root(&self, slot: Slot) -> Result, Error> { + Ok(self + .cold_db + .get_bytes(DBColumn::BeaconStateRoots, &slot.as_u64().to_be_bytes())? + .map(|bytes| Hash256::from_ssz_bytes(&bytes)) + .transpose()?) + } + /// Try to prune all execution payloads, returning early if there is no need to prune. pub fn try_prune_execution_payloads(&self, force: bool) -> Result<(), Error> { let split = self.get_split_info(); @@ -2901,7 +3303,7 @@ pub fn migrate_database, Cold: ItemStore>( finalized_state_root: Hash256, finalized_block_root: Hash256, finalized_state: &BeaconState, -) -> Result<(), Error> { +) -> Result { debug!( slot = %finalized_state.slot(), "Freezer migration started" @@ -2910,12 +3312,12 @@ pub fn migrate_database, Cold: ItemStore>( // 0. Check that the migration is sensible. // The new finalized state must increase the current split slot, and lie on an epoch // boundary (in order for the hot state summary scheme to work). - let current_split_slot = store.split.read_recursive().slot; + let current_split = *store.split.read_recursive(); let anchor_info = store.anchor_info.read_recursive().clone(); - if finalized_state.slot() < current_split_slot { + if finalized_state.slot() < current_split.slot { return Err(HotColdDBError::FreezeSlotError { - current_split_slot, + current_split_slot: current_split.slot, proposed_split_slot: finalized_state.slot(), } .into()); @@ -2932,7 +3334,7 @@ pub fn migrate_database, Cold: ItemStore>( // Iterate in descending order until the current split slot let state_roots: Vec<_> = process_results(RootsIterator::new(&store, finalized_state), |iter| { - iter.take_while(|(_, _, slot)| *slot >= current_split_slot) + iter.take_while(|(_, _, slot)| *slot >= current_split.slot) .collect() })?; @@ -2957,7 +3359,7 @@ pub fn migrate_database, Cold: ItemStore>( // Only store the cold state if it's on a diff boundary. // Calling `store_cold_state_summary` instead of `store_cold_state` for those allows us // to skip loading many hot states. - if let StorageStrategy::ReplayFrom(from) = store.hierarchy.storage_strategy(slot)? { + if let StorageStrategy::ReplayFrom(from) = store.cold_storage_strategy(slot)? { // Store slot -> state_root and state_root -> slot mappings. debug!( strategy = "replay", @@ -2991,40 +3393,41 @@ pub fn migrate_database, Cold: ItemStore>( // in the worst case we will restart with the old split and re-run the migration. store.cold_db.do_atomically(cold_db_block_ops)?; store.cold_db.sync()?; - { + let new_split = { let mut split_guard = store.split.write(); - let latest_split_slot = split_guard.slot; + let latest_split = *split_guard; // Detect a situation where the split point is (erroneously) changed from more than one // place in code. - if latest_split_slot != current_split_slot { + if latest_split.slot != current_split.slot { error!( - previous_split_slot = %current_split_slot, - current_split_slot = %latest_split_slot, + previous_split_slot = %current_split.slot, + current_split_slot = %latest_split.slot, "Race condition detected: Split point changed while copying states to the freezer" ); // Assume the freezing procedure will be retried in case this happens. return Err(Error::SplitPointModified( - current_split_slot, - latest_split_slot, + current_split.slot, + latest_split.slot, )); } // Before updating the in-memory split value, we flush it to disk first, so that should the // OS process die at this point, we pick up from the right place after a restart. 
- let split = Split { + let new_split = Split { slot: finalized_state.slot(), state_root: finalized_state_root, block_root: finalized_block_root, }; - store.hot_db.put_sync(&SPLIT_KEY, &split)?; + store.hot_db.put_sync(&SPLIT_KEY, &new_split)?; // Split point is now persisted in the hot database on disk. The in-memory split point // hasn't been modified elsewhere since we keep a write lock on it. It's safe to update // the in-memory split point now. - *split_guard = split; - } + *split_guard = new_split; + new_split + }; // Update the cache's view of the finalized state. store.update_finalized_state( @@ -3038,7 +3441,16 @@ pub fn migrate_database, Cold: ItemStore>( "Freezer migration complete" ); - Ok(()) + Ok(SplitChange { + previous: current_split, + new: new_split, + }) +} + +#[derive(Debug)] +pub struct SplitChange { + pub previous: Split, + pub new: Split, } /// Struct for storing the split slot and state root in the database. @@ -3075,19 +3487,221 @@ fn no_state_root_iter() -> Option), + LoadStateRootError(Box), + MissingStateRoot { + target_slot: Slot, + state_upper_limit: Slot, + }, + OutOfBoundsInitialSlot, +} + +/// Return the ancestor state root of a state beyond SlotsPerHistoricalRoot using the roots iterator +/// and the store +pub fn get_ancestor_state_root<'a, E: EthSpec, Hot: ItemStore, Cold: ItemStore>( + store: &'a HotColdDB, + from_state: &'a BeaconState, + target_slot: Slot, +) -> Result { + // Use the state itself for recent roots + if let Ok(target_state_root) = from_state.get_state_root(target_slot) { + return Ok(*target_state_root); + } + + // Fetch the anchor info prior to obtaining the split lock. We don't need to hold a lock because + // the `state_upper_limit` can't increase (and rug us) unless state pruning runs, and it never + // runs concurrently. + let state_upper_limit = store.get_anchor_info().state_upper_limit; + + // Hold the split lock so that state summaries are not pruned concurrently with this function + // running. + let split = store.split.read_recursive(); + + // If the state root is in range of the freezer DB's linear state root storage, fetch it + // directly from there. This is useful on archive nodes to avoid some of the complexity of + // traversing the sparse portion of the hdiff grid (prior to the split slot). It is also + // necessary for the v24 schema migration on archive nodes, where there isn't yet any grid + // to traverse. + if target_slot < split.slot && target_slot >= state_upper_limit { + drop(split); + return store + .get_cold_state_root(target_slot) + .map_err(Box::new) + .map_err(StateSummaryIteratorError::LoadStateRootError)? + .ok_or_else(|| StateSummaryIteratorError::MissingStateRoot { + target_slot, + state_upper_limit, + }); + } + + let mut state_root = { + // We can not start loading summaries from `state_root` since its summary has not yet been + // imported. This code path is called during block import. + // + // We need to choose a state_root to start that is + // - An ancestor of `from_state`, AND + // - Its state summary is already written (and not pruned) in the DB + // - Its slot is >= target_slot + // + // If we get to this codepath, (target_slot not in state's state_roots) it means that + // `state.slot()` is greater than `SlotsPerHistoricalRoot`, and `target_slot < state.slot() + // - SlotsPerHistoricalRoot`. 
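+    // Illustrative numbers (not from this diff): with SlotsPerHistoricalRoot = 8192 and
+    // state.slot() = 20_000, this code path is only reached for target_slot < 11_808.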
+    //
+    // Values we could start from:
+    // - `state.slot() - 1`: TODO: if we don't immediately commit each state to the DB
+    //   individually, we may be attempting to read a state summary that is stored in a DB ops
+    //   vector but not yet written to the DB. Also starting from this slot is wasteful as we
+    //   know that the target slot is `< state.slot() - SlotsPerHistoricalRoot`.
+    // - `state.slot() - SlotsPerHistoricalRoot`: The most efficient slot to start. But we risk
+    //   jumping to a state summary that has already been pruned. See the `max(.., split_slot)`
+    //   below.
+    let oldest_slot_in_state_roots = from_state
+        .slot()
+        .saturating_sub(Slot::new(E::SlotsPerHistoricalRoot::to_u64()));
+
+    // Don't start with a slot that is prior to the finalized state slot. We may be attempting
+    // to read a hot state summary that has already been pruned as part of the migration, which
+    // would cause an error. HDiffs can reference diffs with a slot prior to the finalized
+    // checkpoint. But those are sparse, so the probability of hitting a `MissingSummary` error
+    // is high. Instead, the summary for the finalized state is always available.
+    let start_slot = std::cmp::max(oldest_slot_in_state_roots, split.slot);
+
+    *from_state
+        .get_state_root(start_slot)
+        .map_err(|_| StateSummaryIteratorError::OutOfBoundsInitialSlot)?
+    };
+
+    let mut previous_slot = None;
+
+    loop {
+        let state_summary = store
+            .load_hot_state_summary(&state_root)
+            .map_err(|e| StateSummaryIteratorError::LoadSummaryError(Box::new(e)))?
+            .ok_or(StateSummaryIteratorError::MissingSummary(state_root))?;
+
+        // Protect against infinite loops if the state summaries are not strictly descending
+        if let Some(previous_slot) = previous_slot {
+            if state_summary.slot >= previous_slot {
+                drop(split);
+                return Err(StateSummaryIteratorError::CircularSummaries {
+                    state_root,
+                    state_slot: state_summary.slot,
+                    previous_slot,
+                });
+            }
+        }
+        previous_slot = Some(state_summary.slot);
+
+        match state_summary.slot.cmp(&target_slot) {
+            Ordering::Less => {
+                drop(split);
+                return Err(StateSummaryIteratorError::BelowTarget(state_summary.slot));
+            }
+            Ordering::Equal => return Ok(state_root),
+            Ordering::Greater => {} // keep going
+        }
+
+        // Jump to an older state summary that is an ancestor of `state_root`
+        if let OptionalDiffBaseState::BaseState(DiffBaseState {
+            slot,
+            state_root: diff_base_state_root,
+        }) = state_summary.diff_base_state
+        {
+            if target_slot <= slot {
+                // As an optimization use the HDiff state root to jump states faster
+                state_root = diff_base_state_root;
+            }
+            continue;
+        }
+        // Else jump slot by slot
+        state_root = state_summary.previous_state_root;
+    }
+}
+
 /// Struct for summarising a state in the hot database.
 ///
 /// Allows full reconstruction by replaying blocks.
-#[derive(Debug, Clone, Copy, Default, Encode, Decode)]
+#[derive(Debug, Clone, Copy, Encode, Decode)]
 pub struct HotStateSummary {
     pub slot: Slot,
     pub latest_block_root: Hash256,
-    epoch_boundary_state_root: Hash256,
+    pub latest_block_slot: Slot,
+    pub diff_base_state: OptionalDiffBaseState,
+    pub previous_state_root: Hash256,
+}
+
+/// Information about the state that a hot state is diffed from or replays blocks from, if any.
+///
+/// In the case of a snapshot, there is no diff base state, so this value will be
+/// `OptionalDiffBaseState::Snapshot`.
+#[derive(Debug, Clone, Copy, Encode, Decode)]
+#[ssz(enum_behaviour = "union")]
+pub enum OptionalDiffBaseState {
+    // The SSZ crate requires *something* in each variant so we just store a u8 set to 0.
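+    // (For reference: SSZ unions encode a one-byte selector followed by the variant's body,
+    // so `Snapshot(0)` serializes to two bytes in total.)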
+    Snapshot(u8),
+    BaseState(DiffBaseState),
+}
+
+#[derive(Debug, Clone, Copy, Encode, Decode)]
+pub struct DiffBaseState {
+    slot: Slot,
+    state_root: Hash256,
+}
+
+impl OptionalDiffBaseState {
+    pub fn new(slot: Slot, state_root: Hash256) -> Self {
+        Self::BaseState(DiffBaseState { slot, state_root })
+    }
+
+    pub fn get_root(&self, slot: Slot) -> Result {
+        match *self {
+            Self::Snapshot(_) => Err(Error::SnapshotDiffBaseState { slot }),
+            Self::BaseState(DiffBaseState {
+                slot: stored_slot,
+                state_root,
+            }) => {
+                if stored_slot == slot {
+                    Ok(state_root)
+                } else {
+                    Err(Error::MismatchedDiffBaseState {
+                        expected_slot: slot,
+                        stored_slot,
+                    })
+                }
+            }
+        }
+    }
+}
+
+// Succinct rendering of the (slot, state_root) pair for the "Storing hot state summary and diffs" log
+impl std::fmt::Display for OptionalDiffBaseState {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            Self::Snapshot(_) => write!(f, "snapshot"),
+            Self::BaseState(base_state) => write!(f, "{base_state}"),
+        }
+    }
+}
+
+impl std::fmt::Display for DiffBaseState {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{}/{:?}", self.slot, self.state_root)
+    }
+}

 impl StoreItem for HotStateSummary {
     fn db_column() -> DBColumn {
-        DBColumn::BeaconStateSummary
+        DBColumn::BeaconStateHotSummary
     }

     fn as_store_bytes(&self) -> Vec {
@@ -3101,27 +3715,78 @@ impl StoreItem for HotStateSummary {

 impl HotStateSummary {
     /// Construct a new summary of the given state.
-    pub fn new(state_root: &Hash256, state: &BeaconState) -> Result {
+    pub fn new, Cold: ItemStore>(
+        store: &HotColdDB,
+        state_root: Hash256,
+        state: &BeaconState,
+        storage_strategy: StorageStrategy,
+    ) -> Result {
         // Fill in the state root on the latest block header if necessary (this happens on all
         // slots where there isn't a skip).
-        let latest_block_root = state.get_latest_block_root(*state_root);
-        let epoch_boundary_slot = state.slot() / E::slots_per_epoch() * E::slots_per_epoch();
-        let epoch_boundary_state_root = if epoch_boundary_slot == state.slot() {
-            *state_root
+        let latest_block_root = state.get_latest_block_root(state_root);
+
+        let get_state_root = |slot| {
+            if slot == state.slot() {
+                Ok::<_, Error>(state_root)
+            } else {
+                Ok(get_ancestor_state_root(store, state, slot).map_err(|e| {
+                    Error::StateSummaryIteratorError {
+                        error: e,
+                        from_state_root: state_root,
+                        from_state_slot: state.slot(),
+                        target_slot: slot,
+                    }
+                })?)
+            }
+        };
+        let diff_base_slot = storage_strategy.diff_base_slot();
+        let diff_base_state = if let Some(diff_base_slot) = diff_base_slot {
+            OptionalDiffBaseState::new(diff_base_slot, get_state_root(diff_base_slot)?)
+        } else {
+            OptionalDiffBaseState::Snapshot(0)
+        };
+
+        let previous_state_root = if state.slot() == 0 {
+            // Set to 0x0 for genesis state to prevent any sort of circular reference.
+            Hash256::zero()
         } else {
-            *state
-                .get_state_root(epoch_boundary_slot)
-                .map_err(HotColdDBError::HotStateSummaryError)?
+            get_state_root(state.slot().safe_sub(1_u64)?)?
         };

         Ok(HotStateSummary {
             slot: state.slot(),
             latest_block_root,
-            epoch_boundary_state_root,
+            latest_block_slot: state.latest_block_header().slot,
+            diff_base_state,
+            previous_state_root,
         })
     }
 }

+/// Legacy hot state summary used in schema V22 and before.
+///
+/// This can be deleted when we remove V22 support.
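+///
+/// A minimal read-path sketch using names from this diff (illustrative only):
+///
+/// ```ignore
+/// // Prefer the modern summary, falling back to the V22 format during migration.
+/// let block_root = store.load_block_root_from_summary_any_version(&state_root);
+/// ```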
+#[derive(Debug, Clone, Copy, Encode, Decode)] +pub struct HotStateSummaryV22 { + pub slot: Slot, + pub latest_block_root: Hash256, + pub epoch_boundary_state_root: Hash256, +} + +impl StoreItem for HotStateSummaryV22 { + fn db_column() -> DBColumn { + DBColumn::BeaconStateSummary + } + + fn as_store_bytes(&self) -> Vec { + self.as_ssz_bytes() + } + + fn from_store_bytes(bytes: &[u8]) -> Result { + Ok(Self::from_ssz_bytes(bytes)?) + } +} + /// Struct for summarising a state in the freezer database. #[derive(Debug, Clone, Copy, Default, Encode, Decode)] pub(crate) struct ColdStateSummary { diff --git a/beacon_node/store/src/impls.rs b/beacon_node/store/src/impls.rs index 736585a72aa..691c79ace76 100644 --- a/beacon_node/store/src/impls.rs +++ b/beacon_node/store/src/impls.rs @@ -1,2 +1 @@ -pub mod beacon_state; pub mod execution_payload; diff --git a/beacon_node/store/src/impls/beacon_state.rs b/beacon_node/store/src/impls/beacon_state.rs deleted file mode 100644 index fd08e547f13..00000000000 --- a/beacon_node/store/src/impls/beacon_state.rs +++ /dev/null @@ -1,102 +0,0 @@ -use crate::*; -use ssz::{DecodeError, Encode}; -use ssz_derive::Encode; - -pub fn store_full_state( - state_root: &Hash256, - state: &BeaconState, - ops: &mut Vec, -) -> Result<(), Error> { - let bytes = { - let _overhead_timer = metrics::start_timer(&metrics::BEACON_STATE_WRITE_OVERHEAD_TIMES); - StorageContainer::new(state).as_ssz_bytes() - }; - metrics::inc_counter_by(&metrics::BEACON_STATE_WRITE_BYTES, bytes.len() as u64); - metrics::inc_counter(&metrics::BEACON_STATE_WRITE_COUNT); - ops.push(KeyValueStoreOp::PutKeyValue( - DBColumn::BeaconState, - state_root.as_slice().to_vec(), - bytes, - )); - Ok(()) -} - -pub fn get_full_state, E: EthSpec>( - db: &KV, - state_root: &Hash256, - spec: &ChainSpec, -) -> Result>, Error> { - let total_timer = metrics::start_timer(&metrics::BEACON_STATE_READ_TIMES); - - match db.get_bytes(DBColumn::BeaconState, state_root.as_slice())? { - Some(bytes) => { - let overhead_timer = metrics::start_timer(&metrics::BEACON_STATE_READ_OVERHEAD_TIMES); - let container = StorageContainer::from_ssz_bytes(&bytes, spec)?; - - metrics::stop_timer(overhead_timer); - metrics::stop_timer(total_timer); - metrics::inc_counter(&metrics::BEACON_STATE_READ_COUNT); - metrics::inc_counter_by(&metrics::BEACON_STATE_READ_BYTES, bytes.len() as u64); - - Ok(Some(container.try_into()?)) - } - None => Ok(None), - } -} - -/// A container for storing `BeaconState` components. -// TODO: would be more space efficient with the caches stored separately and referenced by hash -#[derive(Encode)] -pub struct StorageContainer { - state: BeaconState, - committee_caches: Vec>, -} - -impl StorageContainer { - /// Create a new instance for storing a `BeaconState`. - pub fn new(state: &BeaconState) -> Self { - Self { - state: state.clone(), - committee_caches: state.committee_caches().to_vec(), - } - } - - pub fn from_ssz_bytes(bytes: &[u8], spec: &ChainSpec) -> Result { - // We need to use the slot-switching `from_ssz_bytes` of `BeaconState`, which doesn't - // compose with the other SSZ utils, so we duplicate some parts of `ssz_derive` here. 
-        let mut builder = ssz::SszDecoderBuilder::new(bytes);
-
-        builder.register_anonymous_variable_length_item()?;
-        builder.register_type::>()?;
-
-        let mut decoder = builder.build()?;
-
-        let state = decoder.decode_next_with(|bytes| BeaconState::from_ssz_bytes(bytes, spec))?;
-        let committee_caches = decoder.decode_next()?;
-
-        Ok(Self {
-            state,
-            committee_caches,
-        })
-    }
-}
-
-impl TryInto> for StorageContainer {
-    type Error = Error;
-
-    fn try_into(mut self) -> Result, Error> {
-        let mut state = self.state;
-
-        for i in (0..CACHED_EPOCHS).rev() {
-            if i >= self.committee_caches.len() {
-                return Err(Error::SszDecodeError(DecodeError::BytesInvalid(
-                    "Insufficient committees for BeaconState".to_string(),
-                )));
-            };
-
-            state.committee_caches_mut()[i] = self.committee_caches.remove(i);
-        }
-
-        Ok(state)
-    }
-}
diff --git a/beacon_node/store/src/iter.rs b/beacon_node/store/src/iter.rs
index 8419dde4a2c..47c5a1d9d82 100644
--- a/beacon_node/store/src/iter.rs
+++ b/beacon_node/store/src/iter.rs
@@ -384,9 +384,9 @@ fn slot_of_prev_restore_point(current_slot: Slot) -> Slot {
 #[cfg(test)]
 mod test {
     use super::*;
-    use crate::StoreConfig as Config;
+    use crate::{MemoryStore, StoreConfig as Config};
     use beacon_chain::test_utils::BeaconChainHarness;
-    use beacon_chain::types::{ChainSpec, MainnetEthSpec};
+    use beacon_chain::types::MainnetEthSpec;
     use std::sync::Arc;
     use types::FixedBytesExtended;
@@ -400,10 +400,31 @@ mod test {
         harness.get_current_state()
     }
+    fn get_store() -> HotColdDB, MemoryStore> {
+        let store =
+            HotColdDB::open_ephemeral(Config::default(), Arc::new(E::default_spec())).unwrap();
+        // Init anchor info so the anchor slot is set. Use a random block as it is only used for
+        // the parent_root.
+        let _ = store
+            .init_anchor_info(Hash256::ZERO, Slot::new(0), Slot::new(0), false)
+            .unwrap();
+        // Write a state with state root 0, which is the base that `put_state` below tries to
+        // diff from.
+        {
+            let harness = BeaconChainHarness::builder(E::default())
+                .default_spec()
+                .deterministic_keypairs(1)
+                .fresh_ephemeral_store()
+                .build();
+            let genesis_state = harness.get_current_state();
+            store.put_state(&Hash256::ZERO, &genesis_state).unwrap();
+        }
+        store
+    }
+
     #[test]
     fn block_root_iter() {
-        let store =
-            HotColdDB::open_ephemeral(Config::default(), Arc::new(ChainSpec::minimal())).unwrap();
+        let store = get_store::();
+
         let slots_per_historical_root = MainnetEthSpec::slots_per_historical_root();
         let mut state_a: BeaconState = get_state();
@@ -449,8 +470,8 @@ mod test {
     #[test]
     fn state_root_iter() {
-        let store =
-            HotColdDB::open_ephemeral(Config::default(), Arc::new(ChainSpec::minimal())).unwrap();
+        let store = get_store::();
+
         let slots_per_historical_root = MainnetEthSpec::slots_per_historical_root();
         let mut state_a: BeaconState = get_state();
diff --git a/beacon_node/store/src/lib.rs b/beacon_node/store/src/lib.rs
index 5b30971fd8e..7762d892c2d 100644
--- a/beacon_node/store/src/lib.rs
+++ b/beacon_node/store/src/lib.rs
@@ -35,10 +35,8 @@ pub use self::hot_cold_store::{HotColdDB, HotStateSummary, Split};
 pub use self::memory_store::MemoryStore;
 pub use crate::metadata::BlobInfo;
 pub use errors::Error;
-pub use impls::beacon_state::StorageContainer as BeaconStateStorageContainer;
 pub use metadata::AnchorInfo;
 pub use metrics::scrape_for_metrics;
-use parking_lot::MutexGuard;
 use std::collections::HashSet;
 use std::sync::Arc;
 use strum::{EnumIter, EnumString, IntoStaticStr};
@@ -76,12 +74,6 @@ pub trait KeyValueStore: Sync + Send + Sized + 'static {
    /// Execute either all of the
operations in `batch` or none at all, returning an error. fn do_atomically(&self, batch: Vec) -> Result<(), Error>; - /// Return a mutex guard that can be used to synchronize sensitive transactions. - /// - /// This doesn't prevent other threads writing to the DB unless they also use - /// this method. In future we may implement a safer mandatory locking scheme. - fn begin_rw_transaction(&self) -> MutexGuard<()>; - /// Compact a single column in the database, freeing space used by deleted items. fn compact_column(&self, column: DBColumn) -> Result<(), Error>; @@ -91,7 +83,7 @@ pub trait KeyValueStore: Sync + Send + Sized + 'static { // i.e. entries being created and deleted. for column in [ DBColumn::BeaconState, - DBColumn::BeaconStateSummary, + DBColumn::BeaconStateHotSummary, DBColumn::BeaconBlock, ] { self.compact_column(column)?; @@ -130,7 +122,10 @@ impl Key for Hash256 { if key.len() == 32 { Ok(Hash256::from_slice(key)) } else { - Err(Error::InvalidKey) + Err(Error::InvalidKey(format!( + "Hash256 key unexpected len {}", + key.len() + ))) } } } @@ -162,7 +157,10 @@ pub fn get_data_column_key(block_root: &Hash256, column_index: &ColumnIndex) -> pub fn parse_data_column_key(data: Vec) -> Result<(Hash256, ColumnIndex), Error> { if data.len() != DBColumn::BeaconDataColumn.key_size() { - return Err(Error::InvalidKey); + return Err(Error::InvalidKey(format!( + "Unexpected BeaconDataColumn key len {}", + data.len() + ))); } // split_at panics if 32 < 40 which will never happen after the length check above let (block_root_bytes, column_index_bytes) = data.split_at(32); @@ -171,7 +169,7 @@ pub fn parse_data_column_key(data: Vec) -> Result<(Hash256, ColumnIndex), Er let column_index = ColumnIndex::from_le_bytes( column_index_bytes .try_into() - .map_err(|_| Error::InvalidKey)?, + .map_err(|e| Error::InvalidKey(format!("Invalid ColumnIndex {e:?}")))?, ); Ok((block_root, column_index)) } @@ -267,20 +265,40 @@ pub enum DBColumn { #[strum(serialize = "bdc")] BeaconDataColumn, /// For full `BeaconState`s in the hot database (finalized or fork-boundary states). + /// + /// DEPRECATED. #[strum(serialize = "ste")] BeaconState, + /// For compact `BeaconStateDiff`'s in the hot DB. + /// + /// hsd = Hot State Diff. + #[strum(serialize = "hsd")] + BeaconStateHotDiff, + /// For beacon state snapshots in the hot DB. + /// + /// hsn = Hot Snapshot. + #[strum(serialize = "hsn")] + BeaconStateHotSnapshot, /// For beacon state snapshots in the freezer DB. #[strum(serialize = "bsn")] BeaconStateSnapshot, /// For compact `BeaconStateDiff`s in the freezer DB. #[strum(serialize = "bsd")] BeaconStateDiff, - /// Mapping from state root to `HotStateSummary` in the hot DB. + /// DEPRECATED + /// + /// Mapping from state root to `HotStateSummaryV22` in the hot DB. /// /// Previously this column also served a role in the freezer DB, mapping state roots to /// `ColdStateSummary`. However that role is now filled by `BeaconColdStateSummary`. #[strum(serialize = "bss")] BeaconStateSummary, + /// Mapping from state root to `HotStateSummaryV23` in the hot DB. + /// + /// This column is populated after DB schema version 23 superseding `BeaconStateSummary`. The + /// new column is necessary to have a safe migration without data loss. + #[strum(serialize = "bs3")] + BeaconStateHotSummary, /// Mapping from state root to `ColdStateSummary` in the cold DB. 
#[strum(serialize = "bcs")] BeaconColdStateSummary, @@ -387,6 +405,9 @@ impl DBColumn { | Self::BeaconState | Self::BeaconBlob | Self::BeaconStateSummary + | Self::BeaconStateHotDiff + | Self::BeaconStateHotSnapshot + | Self::BeaconStateHotSummary | Self::BeaconColdStateSummary | Self::BeaconStateTemporary | Self::ExecPayload diff --git a/beacon_node/store/src/memory_store.rs b/beacon_node/store/src/memory_store.rs index 6070a2d3f0c..e53417ef0ec 100644 --- a/beacon_node/store/src/memory_store.rs +++ b/beacon_node/store/src/memory_store.rs @@ -2,7 +2,7 @@ use crate::{ errors::Error as DBError, get_key_for_col, hot_cold_store::BytesKey, ColumnIter, ColumnKeyIter, DBColumn, Error, ItemStore, Key, KeyValueStore, KeyValueStoreOp, }; -use parking_lot::{Mutex, MutexGuard, RwLock}; +use parking_lot::RwLock; use std::collections::{BTreeMap, HashSet}; use std::marker::PhantomData; use types::*; @@ -12,7 +12,6 @@ type DBMap = BTreeMap>; /// A thread-safe `BTreeMap` wrapper. pub struct MemoryStore { db: RwLock, - transaction_mutex: Mutex<()>, _phantom: PhantomData, } @@ -21,7 +20,6 @@ impl MemoryStore { pub fn open() -> Self { Self { db: RwLock::new(BTreeMap::new()), - transaction_mutex: Mutex::new(()), _phantom: PhantomData, } } @@ -107,10 +105,6 @@ impl KeyValueStore for MemoryStore { Box::new(self.iter_column(column).map(|res| res.map(|(k, _)| k))) } - fn begin_rw_transaction(&self) -> MutexGuard<()> { - self.transaction_mutex.lock() - } - fn compact_column(&self, _column: DBColumn) -> Result<(), Error> { Ok(()) } diff --git a/beacon_node/store/src/metadata.rs b/beacon_node/store/src/metadata.rs index 55c64bf8508..bc9d708e14a 100644 --- a/beacon_node/store/src/metadata.rs +++ b/beacon_node/store/src/metadata.rs @@ -2,9 +2,9 @@ use crate::{DBColumn, Error, StoreItem}; use serde::{Deserialize, Serialize}; use ssz::{Decode, Encode}; use ssz_derive::{Decode, Encode}; -use types::{Checkpoint, Hash256, Slot}; +use types::{Hash256, Slot}; -pub const CURRENT_SCHEMA_VERSION: SchemaVersion = SchemaVersion(23); +pub const CURRENT_SCHEMA_VERSION: SchemaVersion = SchemaVersion(24); // All the keys that get stored under the `BeaconMeta` column. // @@ -12,7 +12,8 @@ pub const CURRENT_SCHEMA_VERSION: SchemaVersion = SchemaVersion(23); pub const SCHEMA_VERSION_KEY: Hash256 = Hash256::repeat_byte(0); pub const CONFIG_KEY: Hash256 = Hash256::repeat_byte(1); pub const SPLIT_KEY: Hash256 = Hash256::repeat_byte(2); -pub const PRUNING_CHECKPOINT_KEY: Hash256 = Hash256::repeat_byte(3); +// DEPRECATED +// pub const PRUNING_CHECKPOINT_KEY: Hash256 = Hash256::repeat_byte(3); pub const COMPACTION_TIMESTAMP_KEY: Hash256 = Hash256::repeat_byte(4); pub const ANCHOR_INFO_KEY: Hash256 = Hash256::repeat_byte(5); pub const BLOB_INFO_KEY: Hash256 = Hash256::repeat_byte(6); @@ -21,15 +22,6 @@ pub const DATA_COLUMN_INFO_KEY: Hash256 = Hash256::repeat_byte(7); /// State upper limit value used to indicate that a node is not storing historic states. pub const STATE_UPPER_LIMIT_NO_RETAIN: Slot = Slot::new(u64::MAX); -/// The `AnchorInfo` encoding full availability of all historic blocks & states. -pub const ANCHOR_FOR_ARCHIVE_NODE: AnchorInfo = AnchorInfo { - anchor_slot: Slot::new(0), - oldest_block_slot: Slot::new(0), - oldest_block_parent: Hash256::ZERO, - state_upper_limit: Slot::new(0), - state_lower_limit: Slot::new(0), -}; - /// The `AnchorInfo` encoding an uninitialized anchor. 
/// /// This value should never exist except on initial start-up prior to the anchor being initialised @@ -65,30 +57,6 @@ impl StoreItem for SchemaVersion { } } -/// The checkpoint used for pruning the database. -/// -/// Updated whenever pruning is successful. -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub struct PruningCheckpoint { - pub checkpoint: Checkpoint, -} - -impl StoreItem for PruningCheckpoint { - fn db_column() -> DBColumn { - DBColumn::BeaconMeta - } - - fn as_store_bytes(&self) -> Vec { - self.checkpoint.as_ssz_bytes() - } - - fn from_store_bytes(bytes: &[u8]) -> Result { - Ok(PruningCheckpoint { - checkpoint: Checkpoint::from_ssz_bytes(bytes)?, - }) - } -} - /// The last time the database was compacted. pub struct CompactionTimestamp(pub u64); @@ -111,7 +79,8 @@ impl StoreItem for CompactionTimestamp { pub struct AnchorInfo { /// The slot at which the anchor state is present and which we cannot revert. Values on start: /// - Genesis start: 0 - /// - Checkpoint sync: Slot of the finalized checkpoint block + /// - Checkpoint sync: Slot of the finalized state advanced to the checkpoint epoch + /// - Existing DB prior to v23: Finalized state slot at the migration moment /// /// Immutable pub anchor_slot: Slot, @@ -175,6 +144,21 @@ impl AnchorInfo { pub fn full_state_pruning_enabled(&self) -> bool { self.state_lower_limit == 0 && self.state_upper_limit == STATE_UPPER_LIMIT_NO_RETAIN } + + /// Compute the correct `AnchorInfo` for an archive node created from the current node. + /// + /// This method ensures that the `anchor_slot` which is used for the hot database's diff grid is + /// preserved. + pub fn as_archive_anchor(&self) -> Self { + Self { + // Anchor slot MUST be the same. It is immutable. + anchor_slot: self.anchor_slot, + oldest_block_slot: Slot::new(0), + oldest_block_parent: Hash256::ZERO, + state_upper_limit: Slot::new(0), + state_lower_limit: Slot::new(0), + } + } } impl StoreItem for AnchorInfo { diff --git a/beacon_node/store/src/metrics.rs b/beacon_node/store/src/metrics.rs index 5da73c3cad8..44b61e1ebe2 100644 --- a/beacon_node/store/src/metrics.rs +++ b/beacon_node/store/src/metrics.rs @@ -4,6 +4,10 @@ use directory::size_of_dir; use std::path::Path; use std::sync::LazyLock; +// Labels used for histogram timer vecs that are tracked per DB (hot and cold). 
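+//
+// For example, `metrics::start_timer_vec(&BEACON_HDIFF_READ_TIME, COLD_METRIC)` records a
+// read under the `db="cold"` label of the shared histogram.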
+pub const HOT_METRIC: &[&str] = &["hot"]; +pub const COLD_METRIC: &[&str] = &["cold"]; + /* * General */ @@ -142,66 +146,61 @@ pub static BEACON_STATE_HOT_GET_COUNT: LazyLock> = LazyLock:: "Total number of hot beacon states requested from the store (cache or DB)", ) }); -pub static BEACON_STATE_READ_TIMES: LazyLock> = LazyLock::new(|| { - try_create_histogram( - "store_beacon_state_read_seconds", - "Total time required to read a BeaconState from the database", - ) -}); -pub static BEACON_STATE_READ_OVERHEAD_TIMES: LazyLock> = LazyLock::new(|| { - try_create_histogram( - "store_beacon_state_read_overhead_seconds", - "Overhead on reading a beacon state from the DB (e.g., decoding)", - ) -}); -pub static BEACON_STATE_READ_COUNT: LazyLock> = LazyLock::new(|| { - try_create_int_counter( - "store_beacon_state_read_total", - "Total number of beacon state reads from the DB", - ) -}); -pub static BEACON_STATE_READ_BYTES: LazyLock> = LazyLock::new(|| { - try_create_int_counter( - "store_beacon_state_read_bytes_total", - "Total number of beacon state bytes read from the DB", - ) -}); -pub static BEACON_STATE_WRITE_OVERHEAD_TIMES: LazyLock> = LazyLock::new(|| { - try_create_histogram( - "store_beacon_state_write_overhead_seconds", - "Overhead on writing a beacon state to the DB (e.g., encoding)", + +/* + * HDiffs + */ +pub static BEACON_HDIFF_READ_TIME: LazyLock> = LazyLock::new(|| { + try_create_histogram_vec( + "store_hdiff_read_seconds", + "Time taken to read hdiff bytes from disk", + &["db"], ) }); -pub static BEACON_STATE_WRITE_COUNT: LazyLock> = LazyLock::new(|| { - try_create_int_counter( - "store_beacon_state_write_total", - "Total number of beacon state writes the DB", +pub static BEACON_HDIFF_DECODE_TIME: LazyLock> = LazyLock::new(|| { + try_create_histogram_vec( + "store_hdiff_decode_seconds", + "Time taken to decode hdiff bytes", + &["db"], ) }); -pub static BEACON_STATE_WRITE_BYTES: LazyLock> = LazyLock::new(|| { - try_create_int_counter( - "store_beacon_state_write_bytes_total", - "Total number of beacon state bytes written to the DB", +pub static BEACON_HDIFF_APPLY_TIME: LazyLock> = LazyLock::new(|| { + try_create_histogram_vec( + "store_hdiff_apply_seconds", + "Time taken to apply an hdiff to a buffer", + &["db"], ) }); -pub static BEACON_HDIFF_READ_TIMES: LazyLock> = LazyLock::new(|| { - try_create_histogram( - "store_hdiff_read_seconds", - "Time required to read the hierarchical diff bytes from the database", +pub static BEACON_HDIFF_COMPUTE_TIME: LazyLock> = LazyLock::new(|| { + try_create_histogram_vec( + "store_hdiff_compute_seconds", + "Time taken to compute an hdiff for a state", + &["db"], ) }); -pub static BEACON_HDIFF_DECODE_TIMES: LazyLock> = LazyLock::new(|| { - try_create_histogram( - "store_hdiff_decode_seconds", - "Time required to decode hierarchical diff bytes", +pub static BEACON_HDIFF_BUFFER_LOAD_TIME: LazyLock> = LazyLock::new(|| { + try_create_histogram_vec( + "store_hdiff_buffer_load_seconds", + "Time taken to load an hdiff buffer for a state", + &["db"], ) }); -pub static BEACON_HDIFF_BUFFER_CLONE_TIMES: LazyLock> = LazyLock::new(|| { - try_create_histogram( +pub static BEACON_HDIFF_BUFFER_CLONE_TIME: LazyLock> = LazyLock::new(|| { + try_create_histogram_vec( "store_hdiff_buffer_clone_seconds", - "Time required to clone hierarchical diff buffer bytes", + "Time taken to clone an hdiff buffer from a cache", + &["db"], ) }); +pub static BEACON_HDIFF_BUFFER_LOAD_BEFORE_STORE_TIME: LazyLock> = + LazyLock::new(|| { + try_create_histogram_vec( + 
"store_hdiff_buffer_load_before_store_seconds", + "Time taken to load the hdiff buffer required for the storage of a new state", + &["db"], + ) + }); +// This metric is not split hot/cold because it is recorded in a place where that info is not known. pub static BEACON_HDIFF_BUFFER_APPLY_RESIZES: LazyLock> = LazyLock::new(|| { try_create_histogram_with_buckets( "store_hdiff_buffer_apply_resizes", @@ -209,6 +208,24 @@ pub static BEACON_HDIFF_BUFFER_APPLY_RESIZES: LazyLock> = Lazy Ok(vec![0.0, 1.0, 2.0, 3.0, 4.0, 5.0]) ) }); +// This metric is not split hot/cold because both databases use the same hierarchy config anyway +// and that's all that affects diff sizes. +pub static BEACON_HDIFF_SIZES: LazyLock> = LazyLock::new(|| { + try_create_histogram_vec_with_buckets( + "store_hdiff_sizes", + "Size of hdiffs in bytes by layer (exponent)", + Ok(vec![ + 500_000.0, + 2_000_000.0, + 5_000_000.0, + 10_000_000.0, + 15_000_000.0, + 20_000_000.0, + 50_000_000.0, + ]), + &["exponent"], + ) +}); /* * Beacon Block */ @@ -259,17 +276,20 @@ pub static STORE_BEACON_HISTORIC_STATE_CACHE_SIZE: LazyLock> = "Current count of states in the historic state cache", ) }); -pub static STORE_BEACON_HDIFF_BUFFER_CACHE_SIZE: LazyLock> = LazyLock::new(|| { - try_create_int_gauge( - "store_beacon_hdiff_buffer_cache_size", - "Current count of hdiff buffers in the historic state cache", - ) -}); -pub static STORE_BEACON_HDIFF_BUFFER_CACHE_BYTE_SIZE: LazyLock> = +pub static STORE_BEACON_HDIFF_BUFFER_CACHE_SIZE: LazyLock> = LazyLock::new(|| { - try_create_int_gauge( + try_create_int_gauge_vec( + "store_beacon_hdiff_buffer_cache_size", + "Current count of hdiff buffers cached in memory", + &["db"], + ) + }); +pub static STORE_BEACON_HDIFF_BUFFER_CACHE_BYTE_SIZE: LazyLock> = + LazyLock::new(|| { + try_create_int_gauge_vec( "store_beacon_hdiff_buffer_cache_byte_size", - "Memory consumed by hdiff buffers in the historic state cache", + "Memory consumed by hdiff buffers cached in memory", + &["db"], ) }); pub static STORE_BEACON_STATE_FREEZER_COMPRESS_TIME: LazyLock> = @@ -286,33 +306,6 @@ pub static STORE_BEACON_STATE_FREEZER_DECOMPRESS_TIME: LazyLock> = - LazyLock::new(|| { - try_create_histogram( - "store_beacon_hdiff_buffer_apply_seconds", - "Time taken to apply hdiff buffer to a state buffer", - ) - }); -pub static STORE_BEACON_HDIFF_BUFFER_COMPUTE_TIME: LazyLock> = - LazyLock::new(|| { - try_create_histogram( - "store_beacon_hdiff_buffer_compute_seconds", - "Time taken to compute hdiff buffer to a state buffer", - ) - }); -pub static STORE_BEACON_HDIFF_BUFFER_LOAD_TIME: LazyLock> = LazyLock::new(|| { - try_create_histogram( - "store_beacon_hdiff_buffer_load_seconds", - "Time taken to load an hdiff buffer", - ) -}); -pub static STORE_BEACON_HDIFF_BUFFER_LOAD_FOR_STORE_TIME: LazyLock> = - LazyLock::new(|| { - try_create_histogram( - "store_beacon_hdiff_buffer_load_for_store_seconds", - "Time taken to load an hdiff buffer to store another hdiff", - ) - }); pub static STORE_BEACON_HISTORIC_STATE_CACHE_HIT: LazyLock> = LazyLock::new(|| { try_create_int_counter( @@ -327,18 +320,20 @@ pub static STORE_BEACON_HISTORIC_STATE_CACHE_MISS: LazyLock> "Total count of historic state cache misses for full states", ) }); -pub static STORE_BEACON_HDIFF_BUFFER_CACHE_HIT: LazyLock> = +pub static STORE_BEACON_HDIFF_BUFFER_CACHE_HIT: LazyLock> = LazyLock::new(|| { - try_create_int_counter( + try_create_int_counter_vec( "store_beacon_hdiff_buffer_cache_hit_total", "Total count of hdiff buffer cache hits", + &["db"], ) }); -pub static 
STORE_BEACON_HDIFF_BUFFER_CACHE_MISS: LazyLock> = +pub static STORE_BEACON_HDIFF_BUFFER_CACHE_MISS: LazyLock> = LazyLock::new(|| { - try_create_int_counter( + try_create_int_counter_vec( "store_beacon_hdiff_buffer_cache_miss_total", "Total count of hdiff buffer cache miss", + &["db"], ) }); pub static STORE_BEACON_HDIFF_BUFFER_INTO_STATE_TIME: LazyLock> = diff --git a/beacon_node/store/src/reconstruct.rs b/beacon_node/store/src/reconstruct.rs index 30df552b7be..ade111983b7 100644 --- a/beacon_node/store/src/reconstruct.rs +++ b/beacon_node/store/src/reconstruct.rs @@ -1,6 +1,5 @@ //! Implementation of historic state reconstruction (given complete block history). use crate::hot_cold_store::{HotColdDB, HotColdDBError}; -use crate::metadata::ANCHOR_FOR_ARCHIVE_NODE; use crate::metrics; use crate::{Error, ItemStore}; use itertools::{process_results, Itertools}; @@ -145,10 +144,8 @@ where }); } - self.compare_and_set_anchor_info_with_write( - old_anchor, - ANCHOR_FOR_ARCHIVE_NODE, - )?; + let new_anchor = old_anchor.as_archive_anchor(); + self.compare_and_set_anchor_info_with_write(old_anchor, new_anchor)?; return Ok(()); } else { diff --git a/beacon_node/store/src/state_cache.rs b/beacon_node/store/src/state_cache.rs index 281ecab1525..b6aacbb77a7 100644 --- a/beacon_node/store/src/state_cache.rs +++ b/beacon_node/store/src/state_cache.rs @@ -1,4 +1,8 @@ -use crate::Error; +use crate::hdiff::HDiffBuffer; +use crate::{ + metrics::{self, HOT_METRIC}, + Error, +}; use lru::LruCache; use std::collections::{BTreeMap, HashMap, HashSet}; use std::num::NonZeroUsize; @@ -37,26 +41,53 @@ pub struct StateCache { // the state_root states: LruCache)>, block_map: BlockMap, + hdiff_buffers: HotHDiffBufferCache, max_epoch: Epoch, head_block_root: Hash256, headroom: NonZeroUsize, } +/// Cache of hdiff buffers for hot states. +/// +/// This cache only keeps buffers prior to the finalized state, which are required by the +/// hierarchical state diff scheme to construct newer unfinalized states. +/// +/// The cache always retains the hdiff buffer for the most recent snapshot so that even if the +/// cache capacity is 1, this snapshot never needs to be loaded from disk. +#[derive(Debug)] +pub struct HotHDiffBufferCache { + /// Cache of HDiffBuffers for states *prior* to the `finalized_state`. + /// + /// Maps state_root -> (slot, buffer). + hdiff_buffers: LruCache, +} + #[derive(Debug)] pub enum PutStateOutcome { + /// State is prior to the cache's finalized state (lower slot) and was cached as an HDiffBuffer. + PreFinalizedHDiffBuffer, + /// State is equal to the cache's finalized state and was not inserted. Finalized, + /// State was already present in the cache. Duplicate, - /// Includes deleted states as a result of this insertion + /// State is new to the cache and was inserted. + /// + /// Includes deleted states as a result of this insertion. 
New(Vec), } #[allow(clippy::len_without_is_empty)] impl StateCache { - pub fn new(capacity: NonZeroUsize, headroom: NonZeroUsize) -> Self { + pub fn new( + state_capacity: NonZeroUsize, + headroom: NonZeroUsize, + hdiff_capacity: NonZeroUsize, + ) -> Self { StateCache { finalized_state: None, - states: LruCache::new(capacity), + states: LruCache::new(state_capacity), block_map: BlockMap::default(), + hdiff_buffers: HotHDiffBufferCache::new(hdiff_capacity), max_epoch: Epoch::new(0), head_block_root: Hash256::ZERO, headroom, @@ -71,11 +102,20 @@ impl StateCache { self.states.cap().get() } + pub fn num_hdiff_buffers(&self) -> usize { + self.hdiff_buffers.len() + } + + pub fn hdiff_buffer_mem_usage(&self) -> usize { + self.hdiff_buffers.mem_usage() + } + pub fn update_finalized_state( &mut self, state_root: Hash256, block_root: Hash256, state: BeaconState, + pre_finalized_slots_to_retain: &[Slot], ) -> Result<(), Error> { if state.slot() % E::slots_per_epoch() != 0 { return Err(Error::FinalizedStateUnaligned); @@ -95,9 +135,31 @@ impl StateCache { // Prune block map. let state_roots_to_prune = self.block_map.prune(state.slot()); + // Prune HDiffBuffers that are no longer required by the hdiff grid of the finalized state. + // We need to do this prior to copying in any new hdiff buffers, because the cache + // preferences older slots. + // NOTE: This isn't perfect as it prunes by slot: there could be multiple buffers + // at some slots in the case of long forks without finality. + let new_hdiff_cache = HotHDiffBufferCache::new(self.hdiff_buffers.cap()); + let old_hdiff_cache = std::mem::replace(&mut self.hdiff_buffers, new_hdiff_cache); + for (state_root, (slot, buffer)) in old_hdiff_cache.hdiff_buffers { + if pre_finalized_slots_to_retain.contains(&slot) { + self.hdiff_buffers.put(state_root, slot, buffer); + } + } + // Delete states. for state_root in state_roots_to_prune { - self.states.pop(&state_root); + if let Some((_, state)) = self.states.pop(&state_root) { + // Add the hdiff buffer for this state to the hdiff cache if it is now part of + // the pre-finalized grid. The `put` method will take care of keeping the most + // useful buffers. + let slot = state.slot(); + if pre_finalized_slots_to_retain.contains(&slot) { + let hdiff_buffer = HDiffBuffer::from_state(state); + self.hdiff_buffers.put(state_root, slot, hdiff_buffer); + } + } } // Update finalized state. @@ -136,12 +198,19 @@ impl StateCache { block_root: Hash256, state: &BeaconState, ) -> Result { - if self - .finalized_state - .as_ref() - .is_some_and(|finalized_state| finalized_state.state_root == state_root) - { - return Ok(PutStateOutcome::Finalized); + if let Some(ref finalized_state) = self.finalized_state { + if finalized_state.state_root == state_root { + return Ok(PutStateOutcome::Finalized); + } else if state.slot() <= finalized_state.state.slot() { + // We assume any state being inserted into the cache is grid-aligned (it is the + // caller's responsibility to not feed us garbage) as we don't want to thread the + // hierarchy config through here. So any state received is converted to an + // HDiffBuffer and saved. 
+ let hdiff_buffer = HDiffBuffer::from_state(state.clone()); + self.hdiff_buffers + .put(state_root, state.slot(), hdiff_buffer); + return Ok(PutStateOutcome::PreFinalizedHDiffBuffer); + } } if self.states.peek(&state_root).is_some() { @@ -192,6 +261,37 @@ impl StateCache { self.states.get(&state_root).map(|(_, state)| state.clone()) } + pub fn put_hdiff_buffer(&mut self, state_root: Hash256, slot: Slot, buffer: &HDiffBuffer) { + // Only accept HDiffBuffers prior to finalization. Later states should be stored as proper + // states, not HDiffBuffers. + if let Some(finalized_state) = &self.finalized_state { + if slot >= finalized_state.state.slot() { + return; + } + } + self.hdiff_buffers.put(state_root, slot, buffer.clone()); + } + + pub fn get_hdiff_buffer_by_state_root(&mut self, state_root: Hash256) -> Option { + if let Some(buffer) = self.hdiff_buffers.get(&state_root) { + metrics::inc_counter_vec(&metrics::STORE_BEACON_HDIFF_BUFFER_CACHE_HIT, HOT_METRIC); + let timer = + metrics::start_timer_vec(&metrics::BEACON_HDIFF_BUFFER_CLONE_TIME, HOT_METRIC); + let result = Some(buffer.clone()); + drop(timer); + return result; + } + if let Some(buffer) = self + .get_by_state_root(state_root) + .map(HDiffBuffer::from_state) + { + metrics::inc_counter_vec(&metrics::STORE_BEACON_HDIFF_BUFFER_CACHE_HIT, HOT_METRIC); + return Some(buffer); + } + metrics::inc_counter_vec(&metrics::STORE_BEACON_HDIFF_BUFFER_CACHE_MISS, HOT_METRIC); + None + } + pub fn get_by_block_root( &mut self, block_root: Hash256, @@ -325,3 +425,80 @@ impl BlockMap { self.blocks.remove(block_root) } } + +impl HotHDiffBufferCache { + pub fn new(capacity: NonZeroUsize) -> Self { + Self { + hdiff_buffers: LruCache::new(capacity), + } + } + + pub fn get(&mut self, state_root: &Hash256) -> Option { + self.hdiff_buffers + .get(state_root) + .map(|(_, buffer)| buffer.clone()) + } + + /// Put a value in the cache, making room for it if necessary. + /// + /// If the value was inserted then `true` is returned. + pub fn put(&mut self, state_root: Hash256, slot: Slot, buffer: HDiffBuffer) -> bool { + // If the cache is not full, simply insert the value. + if self.hdiff_buffers.len() != self.hdiff_buffers.cap().get() { + self.hdiff_buffers.put(state_root, (slot, buffer)); + return true; + } + + // If the cache is full, it has room for this new entry if: + // + // - The capacity is greater than 1: we can retain the snapshot and the new entry, or + // - The capacity is 1 and the slot of the new entry is older than the min_slot in the + // cache. This is a simplified way of retaining the snapshot in the cache. We don't need + // to worry about inserting/retaining states older than the snapshot because these are + // pruned on finalization and never reinserted. + let Some(min_slot) = self.hdiff_buffers.iter().map(|(_, (slot, _))| *slot).min() else { + // Unreachable: cache is full so should have >0 entries. + return false; + }; + + if self.hdiff_buffers.cap().get() > 1 || slot < min_slot { + // Remove LRU value. Cache is now at size `cap - 1`. + let Some((removed_state_root, (removed_slot, removed_buffer))) = + self.hdiff_buffers.pop_lru() + else { + // Unreachable: cache is full so should have at least one entry to pop. + return false; + }; + + // Insert new value. Cache size is now at size `cap`. + self.hdiff_buffers.put(state_root, (slot, buffer)); + + // If the removed value had the min slot and we didn't intend to replace it (cap=1) + // then we reinsert it. 
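+            // Illustrative trace (assumed cache contents): with cap = 2 and the cache holding
+            // {snapshot@slot 0, diff@slot 64}, inserting slot 96 may pop the snapshot as LRU;
+            // the reinsertion below then puts it back, evicting diff@64 instead, so the
+            // snapshot is always retained.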
+ if removed_slot == min_slot && slot >= min_slot { + self.hdiff_buffers + .put(removed_state_root, (removed_slot, removed_buffer)); + } + true + } else { + // No room. + false + } + } + + pub fn cap(&self) -> NonZeroUsize { + self.hdiff_buffers.cap() + } + + #[allow(clippy::len_without_is_empty)] + pub fn len(&self) -> usize { + self.hdiff_buffers.len() + } + + pub fn mem_usage(&self) -> usize { + self.hdiff_buffers + .iter() + .map(|(_, (_, buffer))| buffer.size()) + .sum() + } +} diff --git a/book/src/help_bn.md b/book/src/help_bn.md index 35ad020b74f..bd425805187 100644 --- a/book/src/help_bn.md +++ b/book/src/help_bn.md @@ -171,10 +171,10 @@ Options: Specify your custom graffiti to be included in blocks. Defaults to the current version and commit, truncated to fit in 32 bytes. --hdiff-buffer-cache-size - Number of hierarchical diff (hdiff) buffers to cache in memory. Each - buffer is around the size of a BeaconState so you should be cautious - about setting this value too high. This flag is irrelevant for most - nodes, which run with state pruning enabled. [default: 16] + Number of cold hierarchical diff (hdiff) buffers to cache in memory. + Each buffer is around the size of a BeaconState so you should be + cautious about setting this value too high. This flag is irrelevant + for most nodes, which run with state pruning enabled. [default: 16] --hierarchy-exponents Specifies the frequency for storing full state snapshots and hierarchical diffs in the freezer DB. Accepts a comma-separated list @@ -187,6 +187,12 @@ Options: --historic-state-cache-size Specifies how many states from the freezer database should be cached in memory [default: 1] + --hot-hdiff-buffer-cache-size + Number of hot hierarchical diff (hdiff) buffers to cache in memory. + Each buffer is around the size of a BeaconState so you should be + cautious about setting this value too high. Setting this value higher + can reduce the time taken to store new states on disk at the cost of + higher memory usage. [default: 1] --http-address
Set the listen address for the RESTful HTTP API server. --http-allow-origin diff --git a/common/malloc_utils/src/glibc.rs b/common/malloc_utils/src/glibc.rs index 30313d06723..d50117c09e2 100644 --- a/common/malloc_utils/src/glibc.rs +++ b/common/malloc_utils/src/glibc.rs @@ -33,7 +33,7 @@ const M_MMAP_THRESHOLD: c_int = -3; /// https://man7.org/linux/man-pages/man3/mallopt.3.html const ENV_VAR_MMAP_THRESHOLD: &str = "MALLOC_MMAP_THRESHOLD_"; -pub static GLOBAL_LOCK: LazyLock> = LazyLock::new(|| <_>::default()); +pub static GLOBAL_LOCK: LazyLock> = LazyLock::new(Default::default); // Metrics for the malloc. For more information, see: // diff --git a/lighthouse/tests/beacon_node.rs b/lighthouse/tests/beacon_node.rs index ea4716c0103..4162e5d475f 100644 --- a/lighthouse/tests/beacon_node.rs +++ b/lighthouse/tests/beacon_node.rs @@ -1927,22 +1927,43 @@ fn hdiff_buffer_cache_size_flag() { .flag("hdiff-buffer-cache-size", Some("1")) .run_with_zero_port() .with_config(|config| { - assert_eq!(config.store.hdiff_buffer_cache_size.get(), 1); + assert_eq!(config.store.cold_hdiff_buffer_cache_size.get(), 1); }); } #[test] fn hdiff_buffer_cache_size_default() { - use beacon_node::beacon_chain::store::config::DEFAULT_HDIFF_BUFFER_CACHE_SIZE; + use beacon_node::beacon_chain::store::config::DEFAULT_COLD_HDIFF_BUFFER_CACHE_SIZE; CommandLineTest::new() .run_with_zero_port() .with_config(|config| { assert_eq!( - config.store.hdiff_buffer_cache_size, - DEFAULT_HDIFF_BUFFER_CACHE_SIZE + config.store.cold_hdiff_buffer_cache_size, + DEFAULT_COLD_HDIFF_BUFFER_CACHE_SIZE ); }); } #[test] +fn hot_hdiff_buffer_cache_size_default() { + use beacon_node::beacon_chain::store::config::DEFAULT_HOT_HDIFF_BUFFER_CACHE_SIZE; + CommandLineTest::new() + .run_with_zero_port() + .with_config(|config| { + assert_eq!( + config.store.hot_hdiff_buffer_cache_size, + DEFAULT_HOT_HDIFF_BUFFER_CACHE_SIZE + ); + }); +} +#[test] +fn hot_hdiff_buffer_cache_size_flag() { + CommandLineTest::new() + .flag("hot-hdiff-buffer-cache-size", Some("3")) + .run_with_zero_port() + .with_config(|config| { + assert_eq!(config.store.hot_hdiff_buffer_cache_size.get(), 3); + }); +} +#[test] fn auto_compact_db_flag() { CommandLineTest::new() .flag("auto-compact-db", Some("false"))