From 3d1b6b2b1bd714f58fc62da065a5871d3147e684 Mon Sep 17 00:00:00 2001 From: steviez Date: Thu, 19 Dec 2024 13:48:32 -0600 Subject: [PATCH 1/4] validator: Split PoH speed measurement from check Checking the PoH speed requires a Bank to derive the cluster hash rate. By the time a Bank is available, many services have started up and are competing for CPU time. So, split the PoH speed check. Measure the speed very early on in Validator::new(), and check the value later once a Bank is available --- core/src/validator.rs | 44 ++++++++++++++++++++++++++++++++----------- 1 file changed, 33 insertions(+), 11 deletions(-) diff --git a/core/src/validator.rs b/core/src/validator.rs index f63fc7617a113c..c9bce496f2fe61 100644 --- a/core/src/validator.rs +++ b/core/src/validator.rs @@ -152,6 +152,12 @@ use { tokio::runtime::Runtime as TokioRuntime, }; +// The current number of hashes per second on Solana MNB, Testnet, and Devnet +// If the hash rate on these clusters changes, we might consider updating this +// constant. However, the PoH speed check compares hashes / second so this +// constant does not have to be updated +const POH_SPEED_CHECK_NUM_HASHES: u64 = 10_000_000; + const MAX_COMPLETED_DATA_SETS_IN_CHANNEL: usize = 100_000; const WAIT_FOR_SUPERMAJORITY_THRESHOLD_PERCENT: u64 = 80; // Right now since we reuse the wait for supermajority code, the @@ -623,6 +629,13 @@ impl Validator { let start_time = Instant::now(); + // Run the speed check as early as possible to + let my_poh_hashes_per_second = if config.no_poh_speed_test { + None + } else { + Some(measure_poh_speed(POH_SPEED_CHECK_NUM_HASHES)) + }; + let thread_manager = ThreadManager::new(&config.thread_manager_config)?; // Initialize the global rayon pool first to ensure the value in config // is honored. Otherwise, some code accessing the global pool could @@ -835,8 +848,11 @@ impl Validator { ) .map_err(ValidatorError::Other)?; - if !config.no_poh_speed_test { - check_poh_speed(&bank_forks.read().unwrap().root_bank(), None)?; + if let Some(my_hashes_per_second) = my_poh_hashes_per_second { + check_poh_speed( + &bank_forks.read().unwrap().root_bank(), + my_hashes_per_second, + )?; } let (root_slot, hard_forks) = { @@ -1835,19 +1851,25 @@ fn active_vote_account_exists_in_bank(bank: &Bank, vote_account: &Pubkey) -> boo false } -fn check_poh_speed(bank: &Bank, maybe_hash_samples: Option) -> Result<(), ValidatorError> { +// Compute the PoH speed (hashes / second) by measuring the time required to +// compute `num_hashes` hashes +fn measure_poh_speed(num_hashes: u64) -> u64 { + let hash_time = compute_hash_time(num_hashes); + (num_hashes as f64 / hash_time.as_secs_f64()) as u64 +} + +// Compare a computed PoH speed against the target value derived from a Bank +// and error if the computed PoH speed is less than the target speed +// +// Measurement and comparison are split so that the measurement can occur early +// in validator startup before other services start competing for CPU time +fn check_poh_speed(bank: &Bank, my_hashes_per_second: u64) -> Result<(), ValidatorError> { let Some(hashes_per_tick) = bank.hashes_per_tick() else { warn!("Unable to read hashes per tick from Bank, skipping PoH speed check"); return Ok(()); }; - let ticks_per_slot = bank.ticks_per_slot(); let hashes_per_slot = hashes_per_tick * ticks_per_slot; - let hash_samples = maybe_hash_samples.unwrap_or(hashes_per_slot); - - let hash_time = compute_hash_time(hash_samples); - let my_hashes_per_second = (hash_samples as f64 / hash_time.as_secs_f64()) as u64; - let target_slot_duration = Duration::from_nanos(bank.ns_per_slot as u64); let target_hashes_per_second = (hashes_per_slot as f64 / target_slot_duration.as_secs_f64()) as u64; @@ -3285,7 +3307,7 @@ mod tests { ..GenesisConfig::default() }; let bank = Bank::new_for_tests(&genesis_config); - assert!(check_poh_speed(&bank, Some(10_000)).is_err()); + assert!(check_poh_speed(&bank, measure_poh_speed(10_000)).is_err()); } #[test] @@ -3301,6 +3323,6 @@ mod tests { ..GenesisConfig::default() }; let bank = Bank::new_for_tests(&genesis_config); - check_poh_speed(&bank, Some(10_000)).unwrap(); + check_poh_speed(&bank, measure_poh_speed(10_000)).unwrap(); } } From eb8acf8f6164dd1532cbcee9204eaecaf97a7a83 Mon Sep 17 00:00:00 2001 From: steviez Date: Tue, 4 Feb 2025 12:08:00 -0600 Subject: [PATCH 2/4] finish comment about why speed check is split and run early --- core/src/validator.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/core/src/validator.rs b/core/src/validator.rs index c9bce496f2fe61..1b6c7795a42541 100644 --- a/core/src/validator.rs +++ b/core/src/validator.rs @@ -629,7 +629,8 @@ impl Validator { let start_time = Instant::now(); - // Run the speed check as early as possible to + // Measure the PoH hash rate as early as possible to minimize + // contention with other threads let my_poh_hashes_per_second = if config.no_poh_speed_test { None } else { From 5dc9a809406a6ff4fb9e9a09e48bab7db713576b Mon Sep 17 00:00:00 2001 From: steviez Date: Tue, 4 Feb 2025 12:27:56 -0600 Subject: [PATCH 3/4] Move load genesis higher up - we need it earlier now --- core/src/validator.rs | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/core/src/validator.rs b/core/src/validator.rs index 1b6c7795a42541..b31eeea8f54149 100644 --- a/core/src/validator.rs +++ b/core/src/validator.rs @@ -629,6 +629,14 @@ impl Validator { let start_time = Instant::now(); + if !ledger_path.is_dir() { + return Err(anyhow!( + "ledger directory does not exist or is not accessible: {ledger_path:?}" + )); + } + let genesis_config = load_genesis(config, ledger_path)?; + metrics_config_sanity_check(genesis_config.cluster_type)?; + // Measure the PoH hash rate as early as possible to minimize // contention with other threads let my_poh_hashes_per_second = if config.no_poh_speed_test { @@ -717,14 +725,6 @@ impl Validator { sigverify::init(); info!("Initializing sigverify done."); - if !ledger_path.is_dir() { - return Err(anyhow!( - "ledger directory does not exist or is not accessible: {ledger_path:?}" - )); - } - let genesis_config = load_genesis(config, ledger_path)?; - metrics_config_sanity_check(genesis_config.cluster_type)?; - info!("Cleaning accounts paths.."); *start_progress.write().unwrap() = ValidatorStartProgress::CleaningAccounts; let mut timer = Measure::start("clean_accounts_paths"); From ac73c673c1a1991d43100831304fc4aad8a3befc Mon Sep 17 00:00:00 2001 From: steviez Date: Tue, 4 Feb 2025 12:35:07 -0600 Subject: [PATCH 4/4] Skip speed check for cluster type Development --- core/src/validator.rs | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/core/src/validator.rs b/core/src/validator.rs index b31eeea8f54149..dd16d974ec11fa 100644 --- a/core/src/validator.rs +++ b/core/src/validator.rs @@ -639,11 +639,18 @@ impl Validator { // Measure the PoH hash rate as early as possible to minimize // contention with other threads - let my_poh_hashes_per_second = if config.no_poh_speed_test { - None - } else { - Some(measure_poh_speed(POH_SPEED_CHECK_NUM_HASHES)) - }; + // + // Skip the check for Development clusters as this will be the type for + // tests or for clusters that we don't know what the rate will be + // without a rebuilt Bank + let my_poh_hashes_per_second = + match (genesis_config.cluster_type, !config.no_poh_speed_test) { + (_, false) | (ClusterType::Development, _) => { + info!("Skipping the PoH speed check"); + None + } + (_, true) => Some(measure_poh_speed(POH_SPEED_CHECK_NUM_HASHES)), + }; let thread_manager = ThreadManager::new(&config.thread_manager_config)?; // Initialize the global rayon pool first to ensure the value in config