Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 30 additions & 5 deletions core/src/accounts_hash_verifier.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ use {
hash::Hash,
},
std::{
io::{Error as IoError, Result as IoResult},
sync::{
atomic::{AtomicBool, Ordering},
Arc,
Expand Down Expand Up @@ -71,12 +72,17 @@ impl AccountsHashVerifier {
info!("handling accounts package: {accounts_package:?}");
let enqueued_time = accounts_package.enqueued.elapsed();

let (_, handling_time_us) = measure_us!(Self::process_accounts_package(
let (result, handling_time_us) = measure_us!(Self::process_accounts_package(
accounts_package,
snapshot_package_sender.as_ref(),
&snapshot_config,
&exit,
));
if let Err(err) = result {
error!("Stopping AccountsHashVerifier! Fatal error while processing accounts package: {err}");
exit.store(true, Ordering::Relaxed);
Comment thread
apfitzge marked this conversation as resolved.
break;
}

datapoint_info!(
"accounts_hash_verifier",
Expand Down Expand Up @@ -208,9 +214,9 @@ impl AccountsHashVerifier {
snapshot_package_sender: Option<&Sender<SnapshotPackage>>,
snapshot_config: &SnapshotConfig,
exit: &AtomicBool,
) {
) -> IoResult<()> {
let accounts_hash =
Self::calculate_and_verify_accounts_hash(&accounts_package, snapshot_config);
Self::calculate_and_verify_accounts_hash(&accounts_package, snapshot_config)?;

Self::save_epoch_accounts_hash(&accounts_package, accounts_hash);

Expand All @@ -221,13 +227,15 @@ impl AccountsHashVerifier {
accounts_hash,
exit,
);

Ok(())
}

/// returns calculated accounts hash
fn calculate_and_verify_accounts_hash(
accounts_package: &AccountsPackage,
snapshot_config: &SnapshotConfig,
) -> AccountsHashKind {
) -> IoResult<AccountsHashKind> {
let accounts_hash_calculation_kind = match accounts_package.package_kind {
AccountsPackageKind::AccountsHashVerifier => CalcAccountsHashKind::Full,
AccountsPackageKind::EpochAccountsHash => CalcAccountsHashKind::Full,
Expand Down Expand Up @@ -303,6 +311,23 @@ impl AccountsHashVerifier {
&accounts_hash_for_reserialize,
bank_incremental_snapshot_persistence.as_ref(),
);

// now write the full snapshot slot file after reserializing so this bank snapshot is loadable
let full_snapshot_archive_slot = match accounts_package.package_kind {
AccountsPackageKind::Snapshot(SnapshotKind::IncrementalSnapshot(base_slot)) => {
base_slot
}
_ => accounts_package.slot,
};
snapshot_utils::write_full_snapshot_slot_file(
&snapshot_info.bank_snapshot_dir,
full_snapshot_archive_slot,
)
.map_err(|err| {
IoError::other(format!(
"failed to calculate accounts hash for {accounts_package:?}: {err}"
))
})?;
}

if accounts_package.package_kind
Expand Down Expand Up @@ -340,7 +365,7 @@ impl AccountsHashVerifier {
);
}

accounts_hash_kind
Ok(accounts_hash_kind)
}

fn _calculate_full_accounts_hash(
Expand Down
126 changes: 61 additions & 65 deletions ledger/src/bank_forks_utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -244,20 +244,70 @@ fn bank_forks_from_snapshot(
.map(SnapshotArchiveInfoGetter::slot)
.unwrap_or(0),
);
let latest_bank_snapshot =
snapshot_utils::get_highest_bank_snapshot_post(&snapshot_config.bank_snapshots_dir);

let will_startup_from_snapshot_archives = match process_options.use_snapshot_archives_at_startup
{
UseSnapshotArchivesAtStartup::Always => true,
UseSnapshotArchivesAtStartup::Never => false,
UseSnapshotArchivesAtStartup::WhenNewest => latest_bank_snapshot
.as_ref()
.map(|bank_snapshot| latest_snapshot_archive_slot > bank_snapshot.slot)
.unwrap_or(true),
let fastboot_snapshot = match process_options.use_snapshot_archives_at_startup {
UseSnapshotArchivesAtStartup::Always => None,
UseSnapshotArchivesAtStartup::Never => {
let Some(bank_snapshot) =
snapshot_utils::get_highest_loadable_bank_snapshot(snapshot_config)
else {
return Err(BankForksUtilsError::NoBankSnapshotDirectory {
flag: use_snapshot_archives_at_startup::cli::LONG_ARG.to_string(),
value: UseSnapshotArchivesAtStartup::Never.to_string(),
});
};
// If a newer snapshot archive was downloaded, it is possible that its slot is
// higher than the local state we will load. Did the user intend for this?
if bank_snapshot.slot < latest_snapshot_archive_slot {
warn!(
"Starting up from local state at slot {}, which is *older* than \
the latest snapshot archive at slot {}. If this is not desired, \
change the --{} CLI option to *not* \"{}\" and restart.",
bank_snapshot.slot,
latest_snapshot_archive_slot,
use_snapshot_archives_at_startup::cli::LONG_ARG,
UseSnapshotArchivesAtStartup::Never.to_string(),
);
}
Some(bank_snapshot)
}
UseSnapshotArchivesAtStartup::WhenNewest => {
snapshot_utils::get_highest_loadable_bank_snapshot(snapshot_config)
.filter(|bank_snapshot| bank_snapshot.slot >= latest_snapshot_archive_slot)
}
};

let bank = if will_startup_from_snapshot_archives {
let bank = if let Some(fastboot_snapshot) = fastboot_snapshot {
let (bank, _) = snapshot_bank_utils::bank_from_snapshot_dir(
&account_paths,
&fastboot_snapshot,
genesis_config,
&process_options.runtime_config,
process_options.debug_keys.clone(),
None,
process_options.account_indexes.clone(),
process_options.limit_load_slot_count_from_snapshot,
process_options.shrink_ratio,
process_options.verify_index,
process_options.accounts_db_config.clone(),
accounts_update_notifier,
exit,
)
.map_err(|err| BankForksUtilsError::BankFromSnapshotsDirectory {
source: err,
path: fastboot_snapshot.snapshot_path(),
})?;

// If the node crashes before taking the next bank snapshot, the next startup will attempt
// to load from the same bank snapshot again. And if `shrink` has run, the account storage
// files that are hard linked in bank snapshot will be *different* than what the bank
// snapshot expects. This would cause the node to crash again. To prevent that, purge all
// the bank snapshots here. In the above scenario, this will cause the node to load from a
// snapshot archive next time, which is safe.
snapshot_utils::purge_all_bank_snapshots(&snapshot_config.bank_snapshots_dir);

bank
} else {
// Given that we are going to boot from an archive, the append vecs held in the snapshot dirs for fast-boot should
// be released. They will be released by the account_background_service anyway. But in the case of the account_paths
// using memory-mounted file system, they are not released early enough to give space for the new append-vecs from
Expand Down Expand Up @@ -292,60 +342,6 @@ fn bank_forks_from_snapshot(
.map(|archive| archive.path().display().to_string())
.unwrap_or("none".to_string()),
})?;
bank
} else {
let bank_snapshot =
latest_bank_snapshot.ok_or_else(|| BankForksUtilsError::NoBankSnapshotDirectory {
flag: use_snapshot_archives_at_startup::cli::LONG_ARG.to_string(),
value: UseSnapshotArchivesAtStartup::Never.to_string(),
})?;

// If a newer snapshot archive was downloaded, it is possible that its slot is
// higher than the local bank we will load. Did the user intend for this?
if bank_snapshot.slot < latest_snapshot_archive_slot {
assert_eq!(
process_options.use_snapshot_archives_at_startup,
UseSnapshotArchivesAtStartup::Never,
);
warn!(
"Starting up from local state at slot {}, which is *older* than \
the latest snapshot archive at slot {}. If this is not desired, \
change the --{} CLI option to *not* \"{}\" and restart.",
bank_snapshot.slot,
latest_snapshot_archive_slot,
use_snapshot_archives_at_startup::cli::LONG_ARG,
UseSnapshotArchivesAtStartup::Never.to_string(),
);
}

let (bank, _) = snapshot_bank_utils::bank_from_snapshot_dir(
&account_paths,
&bank_snapshot,
genesis_config,
&process_options.runtime_config,
process_options.debug_keys.clone(),
None,
process_options.account_indexes.clone(),
process_options.limit_load_slot_count_from_snapshot,
process_options.shrink_ratio,
process_options.verify_index,
process_options.accounts_db_config.clone(),
accounts_update_notifier,
exit,
)
.map_err(|err| BankForksUtilsError::BankFromSnapshotsDirectory {
source: err,
path: bank_snapshot.snapshot_path(),
})?;

// If the node crashes before taking the next bank snapshot, the next startup will attempt
// to load from the same bank snapshot again. And if `shrink` has run, the account storage
// files that are hard linked in bank snapshot will be *different* than what the bank
// snapshot expects. This would cause the node to crash again. To prevent that, purge all
// the bank snapshots here. In the above scenario, this will cause the node to load from a
// snapshot archive next time, which is safe.
snapshot_utils::purge_all_bank_snapshots(&snapshot_config.bank_snapshots_dir);

bank
};

Expand Down
99 changes: 99 additions & 0 deletions local-cluster/tests/local_cluster.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5067,6 +5067,105 @@ fn test_boot_from_local_state() {
}
}

/// Ensure fastboot still works when the node crashed while archiving a full snapshot.
///
/// Scenario:
/// 1. Run a node until it has produced at least two full snapshots plus one later
///    bank snapshot POST (waiting for 2 full + 1 incremental keeps this simple).
/// 2. Simulate a crash mid-archive by stopping the node and removing the newest
///    full snapshot archive.
/// 3. Restart the node. Boot must succeed from the *older* full snapshot archive,
///    *not* the latest bank snapshot POST.
/// 4. Wait for a fresh incremental snapshot; this proves the correct snapshot was
///    loaded AND that the accounts hashes needed when making the bank snapshot
///    POST for the new incremental snapshot are present.
#[test]
#[serial]
fn test_boot_from_local_state_missing_archive() {
    solana_logger::setup_with_default(RUST_LOG_FILTER);
    const FULL_SNAPSHOT_INTERVAL: Slot = 20;
    const INCREMENTAL_SNAPSHOT_INTERVAL: Slot = 10;
    // generous upper bound for each snapshot wait below
    const SNAPSHOT_WAIT_TIMEOUT: Duration = Duration::from_secs(5 * 60);

    let snapshot_config = SnapshotValidatorConfig::new(
        FULL_SNAPSHOT_INTERVAL,
        INCREMENTAL_SNAPSHOT_INTERVAL,
        INCREMENTAL_SNAPSHOT_INTERVAL,
        7,
    );

    let mut config = ClusterConfig {
        node_stakes: vec![100 * DEFAULT_NODE_STAKE],
        cluster_lamports: DEFAULT_CLUSTER_LAMPORTS,
        validator_configs: make_identical_validator_configs(&snapshot_config.validator_config, 1),
        ..ClusterConfig::default()
    };
    let mut cluster = LocalCluster::new(&mut config, SocketAddrSpace::Unspecified);

    // we need two full snapshots and an incremental snapshot for this test
    info!("Waiting for validator to create snapshots...");
    for _ in 0..2 {
        LocalCluster::wait_for_next_full_snapshot(
            &cluster,
            &snapshot_config.full_snapshot_archives_dir,
            Some(SNAPSHOT_WAIT_TIMEOUT),
        );
    }
    LocalCluster::wait_for_next_incremental_snapshot(
        &cluster,
        &snapshot_config.full_snapshot_archives_dir,
        &snapshot_config.incremental_snapshot_archives_dir,
        Some(SNAPSHOT_WAIT_TIMEOUT),
    );
    debug!(
        "snapshot archives:\n\tfull: {:?}\n\tincr: {:?}",
        snapshot_utils::get_full_snapshot_archives(
            snapshot_config.full_snapshot_archives_dir.path()
        ),
        snapshot_utils::get_incremental_snapshot_archives(
            snapshot_config.incremental_snapshot_archives_dir.path()
        ),
    );
    info!("Waiting for validator to create snapshots... DONE");

    // now delete the latest full snapshot archive and restart, to simulate a crash while archiving
    // a full snapshot package
    info!("Stopping validator...");
    let pubkey = cluster.get_node_pubkeys()[0];
    let mut node_info = cluster.exit_node(&pubkey);
    info!("Stopping validator... DONE");

    info!("Deleting latest full snapshot archive...");
    let latest_full_snapshot_archive = snapshot_utils::get_highest_full_snapshot_archive_info(
        snapshot_config.full_snapshot_archives_dir.path(),
    )
    .unwrap();
    fs::remove_file(latest_full_snapshot_archive.path()).unwrap();
    info!("Deleting latest full snapshot archive... DONE");

    info!("Restarting validator...");
    // if we set this to `Never`, the validator should not boot
    node_info.config.use_snapshot_archives_at_startup = UseSnapshotArchivesAtStartup::WhenNewest;
    cluster.restart_node(&pubkey, node_info, SocketAddrSpace::Unspecified);
    info!("Restarting validator... DONE");

    // ensure we can create new incremental snapshots, since that is what used to fail
    info!("Waiting for validator to create snapshots...");
    LocalCluster::wait_for_next_incremental_snapshot(
        &cluster,
        &snapshot_config.full_snapshot_archives_dir,
        &snapshot_config.incremental_snapshot_archives_dir,
        Some(SNAPSHOT_WAIT_TIMEOUT),
    );
    info!("Waiting for validator to create snapshots... DONE");
}

// We want to simulate the following:
// /--- 1 --- 3 (duplicate block)
// 0
Expand Down
Loading