From 66f68826dddc2647fe07c066d69715f751fdb851 Mon Sep 17 00:00:00 2001 From: cavemanloverboy Date: Mon, 5 Jan 2026 07:55:54 +0000 Subject: [PATCH 01/23] maybe 3.1.5 rebas --- Cargo.lock | 24 + Cargo.toml | 1 + core/Cargo.toml | 5 + core/benches/block_consumer.rs | 323 +++++++++ core/benches/scheduler.rs | 1 + core/src/banking_stage.rs | 10 + core/src/banking_stage/consume_worker.rs | 23 + core/src/banking_stage/decision_maker.rs | 49 +- core/src/banking_stage/scheduler_messages.rs | 3 + .../transaction_scheduler/greedy_scheduler.rs | 19 +- .../prio_graph_scheduler.rs | 19 +- .../transaction_scheduler/scheduler.rs | 4 + .../transaction_scheduler/scheduler_common.rs | 29 +- .../scheduler_controller.rs | 5 + core/src/block_stage/block_consumer.rs | 662 ++++++++++++++++++ core/src/block_stage/devin_scheduler.rs | 220 ++++++ core/src/block_stage/harmonic_block.rs | 30 + core/src/block_stage/mod.rs | 269 +++++++ core/src/block_stage/timer.rs | 102 +++ core/src/bundle_stage.rs | 29 +- core/src/lib.rs | 2 + core/src/proxy/block_engine_stage.rs | 74 +- core/src/replay_stage.rs | 2 + core/src/scheduler_synchronization.rs | 349 +++++++++ core/src/tpu.rs | 25 +- core/tests/block_consumer.rs | 209 ++++++ cost-model/src/cost_tracker.rs | 57 +- entry/src/poh.rs | 5 + jito-protos/protos | 2 +- jito-protos/src/lib.rs | 4 + poh/src/poh_recorder.rs | 67 +- runtime/src/bank.rs | 35 +- svm/Cargo.toml | 1 + svm/src/account_loader.rs | 7 +- svm/src/account_overrides.rs | 72 +- 35 files changed, 2668 insertions(+), 70 deletions(-) create mode 100644 core/benches/block_consumer.rs create mode 100644 core/src/block_stage/block_consumer.rs create mode 100644 core/src/block_stage/devin_scheduler.rs create mode 100644 core/src/block_stage/harmonic_block.rs create mode 100644 core/src/block_stage/mod.rs create mode 100644 core/src/block_stage/timer.rs create mode 100644 core/src/scheduler_synchronization.rs create mode 100644 core/tests/block_consumer.rs diff --git a/Cargo.lock b/Cargo.lock index a6691a5975..87ff811ac6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6567,6 +6567,12 @@ version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "71d301d4193d031abdd79ff7e3dd721168a9572ef3fe51a1517aba235bd8f86e" +[[package]] +name = "saa" +version = "5.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77cb23a1da9bcf98289bea29df468b782ddf2993836d1ebd171c403210b86baa" + [[package]] name = "same-file" version = "1.0.6" @@ -6576,6 +6582,16 @@ dependencies = [ "winapi-util", ] +[[package]] +name = "scc" +version = "3.4.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41314cecf05a9988a3717479e80f132c00f64298489f177c268bd675aef03fcc" +dependencies = [ + "saa", + "sdd", +] + [[package]] name = "schannel" version = "0.1.19" @@ -6608,6 +6624,12 @@ dependencies = [ "untrusted 0.9.0", ] +[[package]] +name = "sdd" +version = "4.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e63d45f3526312c9c90d717aac28d37010e623fbd7ca6f21503e69784e86f40" + [[package]] name = "sec1" version = "0.7.3" @@ -8301,6 +8323,7 @@ dependencies = [ "rolling-file", "rts-alloc", "rustls 0.23.34", + "scc", "serde", "serde_bytes", "serde_json", @@ -11067,6 +11090,7 @@ dependencies = [ "percentage", "qualifier_attr", "rand 0.7.3", + "scc", "serde", "shuttle", "solana-account", diff --git a/Cargo.toml b/Cargo.toml index afc16ea9c2..be261fcb95 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -370,6 +370,7 @@ rpassword = "7.4" rts-alloc = { 
version = "0.2.0" } rustls = { version = "0.23.34", features = ["std"], default-features = false } scopeguard = "1.2.0" +scc = "3.4.8" semver = "1.0.27" seqlock = "0.2.0" serde = { version = "1.0.228", features = ["derive"] } diff --git a/core/Cargo.toml b/core/Cargo.toml index 7d1f4d4df3..e14b8528b3 100644 --- a/core/Cargo.toml +++ b/core/Cargo.toml @@ -86,6 +86,7 @@ rand_chacha = { workspace = true } rayon = { workspace = true } rolling-file = { workspace = true } rustls = { workspace = true } +scc = { workspace = true } serde = { workspace = true } serde_bytes = { workspace = true } slab = { workspace = true } @@ -260,5 +261,9 @@ harness = false name = "scheduler" harness = false +[[bench]] +name = "block_consumer" +harness = false + [lints] workspace = true diff --git a/core/benches/block_consumer.rs b/core/benches/block_consumer.rs new file mode 100644 index 0000000000..ea9f82d88f --- /dev/null +++ b/core/benches/block_consumer.rs @@ -0,0 +1,323 @@ +//! Benchmarks for block execution via BlockConsumer. +//! +//! Measures the performance of executing blocks of transactions using optimistic recording. + +use { + agave_reserved_account_keys::ReservedAccountKeys, + agave_transaction_view::{ + resolved_transaction_view::ResolvedTransactionView, + transaction_view::SanitizedTransactionView, + }, + criterion::{ + criterion_group, criterion_main, AxisScale, BenchmarkId, Criterion, PlotConfiguration, + Throughput, + }, + crossbeam_channel::unbounded, + rand::{prelude::*, thread_rng}, + rayon::prelude::*, + solana_core::{ + banking_stage::{committer::Committer, scheduler_messages::MaxAge}, + block_stage::BlockConsumer, + }, + solana_keypair::Keypair, + solana_ledger::genesis_utils::{create_genesis_config_with_leader, GenesisConfigInfo}, + solana_message::Message, + solana_native_token::LAMPORTS_PER_SOL, + solana_poh::{ + record_channels::{record_channels, RecordReceiver}, + transaction_recorder::TransactionRecorder, + }, + solana_runtime::{ + bank::{test_utils::deposit, Bank}, + bank_forks::BankForks, + prioritization_fee_cache::PrioritizationFeeCache, + }, + solana_runtime_transaction::runtime_transaction::RuntimeTransaction, + solana_signer::Signer, + solana_system_interface::instruction as system_instruction, + solana_transaction::{sanitized::MessageHash, versioned::VersionedTransaction, Transaction}, + std::{sync::Arc, time::Duration}, +}; + +const NUM_TRANSACTIONS: [usize; 5] = [1, 10, 100, 500, 1000]; + +fn make_config() -> ( + GenesisConfigInfo, + Arc, + Arc>, +) { + let leader_keypair = Keypair::new(); + let GenesisConfigInfo { + mut genesis_config, + mint_keypair, + voting_keypair, + .. 
+ } = create_genesis_config_with_leader( + 100 * LAMPORTS_PER_SOL, + &leader_keypair.pubkey(), + LAMPORTS_PER_SOL * 1_000_000, + ); + + // Increase ticks per slot to have more time + genesis_config.ticks_per_slot *= 8; + + // Create bank with proper fork graph setup + let bank = Bank::new_for_benches(&genesis_config); + let bank_forks = BankForks::new_rw_arc(bank); + let bank = bank_forks.read().unwrap().get(0).unwrap(); + + // Set up fork graph for program cache + bank.set_fork_graph_in_program_cache(Arc::downgrade(&bank_forks)); + + ( + GenesisConfigInfo { + genesis_config, + mint_keypair, + voting_keypair, + validator_pubkey: leader_keypair.pubkey(), + }, + bank, + bank_forks, + ) +} + +/// Convert serialized transaction bytes to RuntimeTransaction +fn to_runtime_transaction(serialized: &[u8]) -> RuntimeTransaction> { + let transaction_view = SanitizedTransactionView::try_new_sanitized(serialized, true).unwrap(); + let static_runtime_tx = RuntimeTransaction::>::try_from( + transaction_view, + MessageHash::Compute, + None, + ) + .unwrap(); + RuntimeTransaction::>::try_from( + static_runtime_tx, + None, + &ReservedAccountKeys::empty_key_set(), + ) + .unwrap() +} + +/// Create transactions with random account accesses. +/// Returns serialized bytes that must stay alive for the transaction views. +fn make_transactions( + num_accounts: usize, + num_transactions: usize, + bank: &Bank, + recent_blockhash: solana_hash::Hash, +) -> Vec> { + // Create unique account keys + let accounts: Vec = (0..num_accounts) + .into_par_iter() + .map(|_| Keypair::new()) + .collect(); + + // Fund each account with enough SOL + accounts.par_iter().for_each(|account| { + deposit(bank, &account.pubkey(), 100 * LAMPORTS_PER_SOL).unwrap(); + }); + + // Create transactions and serialize them + (0..num_transactions) + .into_par_iter() + .map(|_i| { + let mut rng = thread_rng(); + // Use 2-10 accounts per transaction + let num_tx_accounts = rng.gen_range(2..=10.min(num_accounts)); + let selected_accounts: Vec<_> = accounts + .choose_multiple(&mut rng, num_tx_accounts) + .collect(); + + // First account is the payer/signer + let payer = selected_accounts[0]; + + // Create transfer instructions to other accounts + let instructions: Vec<_> = selected_accounts + .iter() + .skip(1) + .map(|account| system_instruction::transfer(&payer.pubkey(), &account.pubkey(), 1)) + .collect(); + + let message = Message::new(&instructions, Some(&payer.pubkey())); + let tx = Transaction::new(&[payer], message, recent_blockhash); + + // Serialize to bytes + bincode::serialize(&VersionedTransaction::from(tx)).unwrap() + }) + .collect() +} + +/// Create sequential transfer block (tx1 funds tx2's payer, tx2 funds tx3's payer, etc.) +/// Returns serialized bytes that must stay alive for the transaction views. +fn make_sequential_block( + num_transactions: usize, + bank: &Bank, + recent_blockhash: solana_hash::Hash, +) -> Vec> { + let keypairs: Vec = (0..num_transactions + 1).map(|_| Keypair::new()).collect(); + + // Fund the first keypair from mint + deposit(bank, &keypairs[0].pubkey(), 100 * LAMPORTS_PER_SOL).unwrap(); + + // Create chain of transfers: keypairs[0] -> keypairs[1] -> keypairs[2] -> ... 
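+    // Each transfer funds the next payer, so every transaction conflicts with its
+    // predecessor: the worst case for parallel scheduling.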
+ (0..num_transactions) + .map(|i| { + let from = &keypairs[i]; + let to = &keypairs[i + 1]; + let ix = system_instruction::transfer(&from.pubkey(), &to.pubkey(), LAMPORTS_PER_SOL); + let message = Message::new(&[ix], Some(&from.pubkey())); + let tx = Transaction::new(&[from], message, recent_blockhash); + + // Serialize to bytes + bincode::serialize(&VersionedTransaction::from(tx)).unwrap() + }) + .collect() +} + +fn create_block_consumer(bank: &Bank) -> (BlockConsumer, TransactionRecorder, RecordReceiver) { + let (record_sender, mut record_receiver) = record_channels(false); + let recorder = TransactionRecorder::new(record_sender); + record_receiver.restart(bank.bank_id()); + + let (replay_vote_sender, _replay_vote_receiver) = unbounded(); + + let committer = Committer::new( + None, + replay_vote_sender, + Arc::new(PrioritizationFeeCache::new(0u64)), + ); + + let consumer = BlockConsumer::new(committer, recorder.clone(), None); + + (consumer, recorder, record_receiver) +} + +fn bench_random_blocks(c: &mut Criterion) { + let (genesis_config_info, bank, _bank_forks) = make_config(); + let mut group = c.benchmark_group("block_random_transactions"); + group.plot_config(PlotConfiguration::default().summary_scale(AxisScale::Logarithmic)); + + // Pre-generate transactions for the largest block size + let max_transactions = *NUM_TRANSACTIONS.last().unwrap(); + let num_accounts = max_transactions * 2; // Ensure enough accounts + + // Create serialized transactions (bytes must stay alive) + let serialized_txs = make_transactions( + num_accounts, + max_transactions, + &bank, + genesis_config_info.genesis_config.hash(), + ); + + for num_transactions in NUM_TRANSACTIONS { + group.throughput(Throughput::Elements(num_transactions as u64)); + + let block_bytes = &serialized_txs[..num_transactions]; + let max_ages: Vec = vec![MaxAge::MAX; num_transactions]; + + group.bench_function( + BenchmarkId::new("process_and_record_block_transactions", num_transactions), + |b| { + b.iter_batched( + || { + // Clear signatures before each iteration + bank.clear_signatures(); + // Convert bytes to runtime transactions (zerocopy views) + let transactions: Vec<_> = block_bytes + .iter() + .map(|bytes| to_runtime_transaction(bytes)) + .collect(); + let (consumer, recorder, record_receiver) = create_block_consumer(&bank); + (consumer, recorder, record_receiver, transactions) + }, + |(mut consumer, _recorder, _record_receiver, transactions)| { + let output = consumer.process_and_record_block_transactions( + &bank, + &transactions, + &max_ages, + bank.slot(), + ); + // Ensure the block was processed + assert!(output + .execute_and_commit_transactions_output + .commit_transactions_result + .is_ok()); + }, + criterion::BatchSize::SmallInput, + ); + }, + ); + } + + group.finish(); +} + +fn bench_sequential_blocks(c: &mut Criterion) { + let (genesis_config_info, bank, _bank_forks) = make_config(); + let mut group = c.benchmark_group("block_sequential_transactions"); + group.plot_config(PlotConfiguration::default().summary_scale(AxisScale::Logarithmic)); + + // Pre-generate sequential block for the largest size + let max_transactions = *NUM_TRANSACTIONS.last().unwrap(); + let sequential_bytes = make_sequential_block( + max_transactions, + &bank, + genesis_config_info.genesis_config.hash(), + ); + + for num_transactions in NUM_TRANSACTIONS { + group.throughput(Throughput::Elements(num_transactions as u64)); + + let block_bytes = &sequential_bytes[..num_transactions]; + let max_ages: Vec = vec![MaxAge::MAX; num_transactions]; + + 
group.bench_function( + BenchmarkId::new("process_and_record_block_transactions", num_transactions), + |b| { + b.iter_batched( + || { + // Clear signatures before each iteration + bank.clear_signatures(); + // Convert bytes to runtime transactions (zerocopy views) + let transactions: Vec<_> = block_bytes + .iter() + .map(|bytes| to_runtime_transaction(bytes)) + .collect(); + let (consumer, recorder, record_receiver) = create_block_consumer(&bank); + (consumer, recorder, record_receiver, transactions) + }, + |(mut consumer, _recorder, _record_receiver, transactions)| { + let output = consumer.process_and_record_block_transactions( + &bank, + &transactions, + &max_ages, + bank.slot(), + ); + // Ensure the block was processed + assert!(output + .execute_and_commit_transactions_output + .commit_transactions_result + .is_ok()); + }, + criterion::BatchSize::SmallInput, + ); + }, + ); + } + + group.finish(); +} + +criterion_group!( + name = benches; + config = Criterion::default() + .noise_threshold(0.1) + .warm_up_time(Duration::from_millis(500)) + .measurement_time(Duration::from_secs(5)) + .sample_size(20); + targets = + bench_random_blocks, + bench_sequential_blocks, +); + +criterion_main!(benches); diff --git a/core/benches/scheduler.rs b/core/benches/scheduler.rs index 7c43fab8a0..2905ee8eb4 100644 --- a/core/benches/scheduler.rs +++ b/core/benches/scheduler.rs @@ -218,6 +218,7 @@ fn timing_scheduler>( .schedule( black_box(&mut container), u64::MAX, // no budget + 0, // target_slot - benchmark doesn't care about slot validation bench_env.filter_1, bench_env.filter_2, ) diff --git a/core/src/banking_stage.rs b/core/src/banking_stage.rs index 8f308e0511..2062bf1c5f 100644 --- a/core/src/banking_stage.rs +++ b/core/src/banking_stage.rs @@ -805,6 +805,7 @@ mod tests { super::*, crate::{ banking_trace::{BankingTracer, Channels}, + scheduler_synchronization, validator::SchedulerPacing, }, agave_banking_stage_ingress_types::BankingPacketBatch, @@ -1164,6 +1165,11 @@ mod tests { poh_service, entry_receiver, ) = create_test_recorder(bank.clone(), blockstore, None, None); + + // Force vanilla scheduling for slot 0 (simulates being past delegation period) + scheduler_synchronization::reset_for_tests(); + scheduler_synchronization::force_vanilla_claim(0); + let _banking_stage = BankingStage::new_num_threads( BlockProductionMethod::CentralScheduler, poh_recorder.clone(), @@ -1447,6 +1453,10 @@ mod tests { entry_receiver, ) = create_test_recorder(bank.clone(), blockstore, None, None); + // Force vanilla scheduling for slot 0 (simulates being past delegation period) + scheduler_synchronization::reset_for_tests(); + scheduler_synchronization::force_vanilla_claim(0); + let (replay_vote_sender, _replay_vote_receiver) = unbounded(); let blacklisted_keypair = Keypair::new(); diff --git a/core/src/banking_stage/consume_worker.rs b/core/src/banking_stage/consume_worker.rs index bbe91551ac..88691564f1 100644 --- a/core/src/banking_stage/consume_worker.rs +++ b/core/src/banking_stage/consume_worker.rs @@ -114,6 +114,18 @@ impl ConsumeWorker { let bank = leader_state .working_bank() .expect("active_leader_state_with_timeout should only return an active bank"); + + // Validate that the bank slot matches the slot we scheduled for. + // This prevents executing transactions on the wrong slot due to race conditions. 
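+        // The work is handed back as `CouldNotProcess` so the scheduler can decide
+        // whether to retry it against the new slot or drop it.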
+ if bank.slot() != work.target_slot { + log::info!( + "Slot mismatch: work scheduled for slot {} but current bank is slot {}", + work.target_slot, + bank.slot() + ); + return Ok(ProcessingStatus::CouldNotProcess(work)); + } + self.metrics .count_metrics .num_messages_processed @@ -1771,12 +1783,14 @@ mod tests { ids: vec![id], transactions, max_ages: vec![max_age], + target_slot: bank.slot(), }; consume_sender.send(work).unwrap(); let consumed = consumed_receiver.recv().unwrap(); assert_eq!(consumed.work.batch_id, bid); assert_eq!(consumed.work.ids, vec![id]); assert_eq!(consumed.work.max_ages, vec![max_age]); + assert_eq!(consumed.work.target_slot, bank.slot()); assert_eq!( consumed.retryable_indexes, vec![RetryableIndex::new(0, true)] @@ -1827,12 +1841,14 @@ mod tests { ids: vec![id], transactions, max_ages: vec![max_age], + target_slot: bank.slot(), }; consume_sender.send(work).unwrap(); let consumed = consumed_receiver.recv().unwrap(); assert_eq!(consumed.work.batch_id, bid); assert_eq!(consumed.work.ids, vec![id]); assert_eq!(consumed.work.max_ages, vec![max_age]); + assert_eq!(consumed.work.target_slot, bank.slot()); assert_eq!(consumed.retryable_indexes, Vec::new()); drop(test_frame); @@ -1883,6 +1899,7 @@ mod tests { ids: vec![id1, id2], transactions: txs, max_ages: vec![max_age, max_age], + target_slot: bank.slot(), }) .unwrap(); @@ -1890,6 +1907,7 @@ mod tests { assert_eq!(consumed.work.batch_id, bid); assert_eq!(consumed.work.ids, vec![id1, id2]); assert_eq!(consumed.work.max_ages, vec![max_age, max_age]); + assert_eq!(consumed.work.target_slot, bank.slot()); // id2 succeeds with simd83, or is retryable due to lock conflict without simd83 assert_eq!( @@ -1957,6 +1975,7 @@ mod tests { ids: vec![id1], transactions: txs1, max_ages: vec![max_age], + target_slot: bank.slot(), }) .unwrap(); @@ -1966,18 +1985,21 @@ mod tests { ids: vec![id2], transactions: txs2, max_ages: vec![max_age], + target_slot: bank.slot(), }) .unwrap(); let consumed = consumed_receiver.recv().unwrap(); assert_eq!(consumed.work.batch_id, bid1); assert_eq!(consumed.work.ids, vec![id1]); assert_eq!(consumed.work.max_ages, vec![max_age]); + assert_eq!(consumed.work.target_slot, bank.slot()); assert_eq!(consumed.retryable_indexes, Vec::new()); let consumed = consumed_receiver.recv().unwrap(); assert_eq!(consumed.work.batch_id, bid2); assert_eq!(consumed.work.ids, vec![id2]); assert_eq!(consumed.work.max_ages, vec![max_age]); + assert_eq!(consumed.work.target_slot, bank.slot()); assert_eq!(consumed.retryable_indexes, Vec::new()); drop(test_frame); @@ -2110,6 +2132,7 @@ mod tests { alt_invalidation_slot: bank.slot() + 1, }, ], + target_slot: bank.slot(), }) .unwrap(); diff --git a/core/src/banking_stage/decision_maker.rs b/core/src/banking_stage/decision_maker.rs index be4a69416d..308f12736e 100644 --- a/core/src/banking_stage/decision_maker.rs +++ b/core/src/banking_stage/decision_maker.rs @@ -1,4 +1,6 @@ use { + crate::scheduler_synchronization, + log::debug, solana_clock::{ DEFAULT_TICKS_PER_SLOT, FORWARD_TRANSACTIONS_TO_LEADER_AT_SLOT_OFFSET, HOLD_TRANSACTIONS_SLOT_OFFSET, @@ -7,7 +9,7 @@ use { solana_runtime::bank::Bank, solana_unified_scheduler_pool::{BankingStageMonitor, BankingStageStatus}, std::sync::{ - atomic::{AtomicBool, Ordering::Relaxed}, + atomic::{AtomicBool, Ordering}, Arc, }, }; @@ -69,6 +71,48 @@ impl DecisionMaker { BufferedPacketsDecision::Forward } } + + /// Gate consume decisions based on scheduler synchronization. 
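+    ///
+    /// The delegation period covers the first 15/16 of a slot's ticks; with the
+    /// default 64 ticks per slot, the first 60 ticks are the delegation period
+    /// and the last 4 ticks form the vanilla fallback window.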
+ /// + /// vanilla: consume if we are in fallback period with no external signal. + /// there are no other preconditions + /// block: consume if we are in delegation period. + /// preconditions: there is a bundle (for this slot) to consume + pub fn maybe_consume( + decision: BufferedPacketsDecision, + ) -> BufferedPacketsDecision { + debug!("maybe_consume VANILLA {VANILLA:?} decision {decision:?}"); + let BufferedPacketsDecision::Consume(bank) = decision else { + return decision; + }; + + let current_tick_height = bank.tick_height(); + let max_tick_height = bank.max_tick_height(); + let bank_ticks_per_slot = bank.ticks_per_slot(); + let start_tick = max_tick_height - bank_ticks_per_slot; + let ticks_into_slot = current_tick_height.saturating_sub(start_tick); + let delegation_period_length = bank_ticks_per_slot * 15 / 16; + let in_delegation_period = ticks_into_slot < delegation_period_length; + + debug!("maybe_consume current_tick_height {current_tick_height} max_tick_height {max_tick_height} bank_ticks_per_slot {bank_ticks_per_slot} start_tick {start_tick} ticks_into_slot {ticks_into_slot} delegation_period_length {delegation_period_length} in_delegation_period {in_delegation_period}"); + + let current_slot = bank.slot(); + + // Call the appropriate scheduler function + // vanilla_should_schedule and block_should_schedule are now idempotent - + // multiple threads calling for the same slot will get consistent results + let should_schedule: fn(u64, bool) -> Option = if VANILLA { + scheduler_synchronization::vanilla_should_schedule + } else { + scheduler_synchronization::block_should_schedule + }; + + match should_schedule(current_slot, in_delegation_period) { + Some(true) => BufferedPacketsDecision::Consume(bank), + Some(false) => BufferedPacketsDecision::Hold, + None => BufferedPacketsDecision::Hold, + } + } } impl From<&PohRecorder> for DecisionMaker { @@ -94,7 +138,7 @@ impl DecisionMakerWrapper { impl BankingStageMonitor for DecisionMakerWrapper { fn status(&mut self) -> BankingStageStatus { - if self.is_exited.load(Relaxed) { + if self.is_exited.load(Ordering::Relaxed) { BankingStageStatus::Exited } else if matches!( self.decision_maker.make_consume_or_forward_decision(), @@ -107,6 +151,7 @@ impl BankingStageMonitor for DecisionMakerWrapper { } } + #[cfg(test)] mod tests { use { diff --git a/core/src/banking_stage/scheduler_messages.rs b/core/src/banking_stage/scheduler_messages.rs index 37fdf92d2b..ad95658b2b 100644 --- a/core/src/banking_stage/scheduler_messages.rs +++ b/core/src/banking_stage/scheduler_messages.rs @@ -42,6 +42,9 @@ pub struct ConsumeWork { pub ids: Vec, pub transactions: Vec, pub max_ages: Vec, + /// Harmonic: the slot this work was scheduled for. Workers should NOT + /// execute if the working bank's slot doesn't match this target slot. 
+ pub target_slot: Slot, } /// Message: [Worker -> Scheduler] diff --git a/core/src/banking_stage/transaction_scheduler/greedy_scheduler.rs b/core/src/banking_stage/transaction_scheduler/greedy_scheduler.rs index a854b124f7..04aaf265ac 100644 --- a/core/src/banking_stage/transaction_scheduler/greedy_scheduler.rs +++ b/core/src/banking_stage/transaction_scheduler/greedy_scheduler.rs @@ -83,6 +83,7 @@ impl Scheduler for GreedyScheduler { &mut self, container: &mut S, budget: u64, + target_slot: solana_clock::Slot, _pre_graph_filter: impl Fn(&[&Tx], &mut [bool]), pre_lock_filter: impl Fn(&TransactionState) -> PreLockFilterAction, ) -> Result { @@ -154,7 +155,7 @@ impl Scheduler for GreedyScheduler { .check_locks(transaction_state.transaction()) { self.working_account_set.clear(); - num_sent += self.common.send_batches()?; + num_sent += self.common.send_batches(target_slot)?; } // Now check if the transaction can actually be scheduled. @@ -207,7 +208,7 @@ impl Scheduler for GreedyScheduler { >= self.config.target_transactions_per_batch { self.working_account_set.clear(); - num_sent += self.common.send_batches()?; + num_sent += self.common.send_batches(target_slot)?; } // if the thread is at target_cu_per_thread, remove it from the schedulable threads @@ -226,7 +227,7 @@ impl Scheduler for GreedyScheduler { } self.working_account_set.clear(); - num_sent += self.common.send_batches()?; + num_sent += self.common.send_batches(target_slot)?; let Saturating(num_scheduled) = num_scheduled; assert_eq!( num_scheduled, num_sent, @@ -461,6 +462,7 @@ mod test { scheduler.schedule( &mut container, u64::MAX, // no budget + 0, // target_slot - test doesn't care about slot validation test_pre_graph_filter, test_pre_lock_filter ), @@ -484,6 +486,7 @@ mod test { .schedule( &mut container, u64::MAX, // no budget + 0, // target_slot - test doesn't care about slot validation test_pre_graph_filter, test_pre_lock_filter, ) @@ -509,6 +512,7 @@ mod test { .schedule( &mut container, 0, // zero budget + 0, // target_slot - test doesn't care about slot validation test_pre_graph_filter, test_pre_lock_filter, ) @@ -536,6 +540,7 @@ mod test { .schedule( &mut container, u64::MAX, // no budget + 0, // target_slot - test doesn't care about slot validation test_pre_graph_filter, test_pre_lock_filter, ) @@ -564,6 +569,7 @@ mod test { .schedule( &mut container, u64::MAX, // no budget + 0, // target_slot - test doesn't care about slot validation test_pre_graph_filter, test_pre_lock_filter, ) @@ -592,6 +598,7 @@ mod test { .schedule( &mut container, u64::MAX, // no budget + 0, // target_slot - test doesn't care about slot validation test_pre_graph_filter, test_pre_lock_filter, ) @@ -618,6 +625,7 @@ mod test { .schedule( &mut container, u64::MAX, // no budget + 0, // target_slot - test doesn't care about slot validation test_pre_graph_filter, test_pre_lock_filter, ) @@ -641,6 +649,7 @@ mod test { .schedule( &mut container, u64::MAX, // no budget + 0, // target_slot - test doesn't care about slot validation test_pre_graph_filter, test_pre_lock_filter, ) @@ -685,6 +694,7 @@ mod test { .schedule( &mut container, u64::MAX, // no budget + 0, // target_slot - test doesn't care about slot validation test_pre_graph_filter, test_pre_lock_filter, ) @@ -723,6 +733,7 @@ mod test { .schedule( &mut container, u64::MAX, // no budget + 0, // target_slot - test doesn't care about slot validation test_pre_graph_filter, test_pre_lock_filter, ) @@ -773,6 +784,7 @@ mod test { .schedule( &mut container, u64::MAX, // no budget + 0, // target_slot - test 
doesn't care about slot validation test_pre_graph_filter, test_pre_lock_filter, ) @@ -790,6 +802,7 @@ mod test { .schedule( &mut container, u64::MAX, // no budget + 0, // target_slot - test doesn't care about slot validation test_pre_graph_filter, test_pre_lock_filter, ) diff --git a/core/src/banking_stage/transaction_scheduler/prio_graph_scheduler.rs b/core/src/banking_stage/transaction_scheduler/prio_graph_scheduler.rs index 79da8c5565..6e2d98c566 100644 --- a/core/src/banking_stage/transaction_scheduler/prio_graph_scheduler.rs +++ b/core/src/banking_stage/transaction_scheduler/prio_graph_scheduler.rs @@ -117,6 +117,7 @@ impl Scheduler for PrioGraphScheduler { &mut self, container: &mut S, budget: u64, + target_slot: solana_clock::Slot, pre_graph_filter: impl Fn(&[&Tx], &mut [bool]), pre_lock_filter: impl Fn(&TransactionState) -> PreLockFilterAction, ) -> Result { @@ -290,7 +291,7 @@ impl Scheduler for PrioGraphScheduler { if self.common.batches.transactions()[thread_id].len() >= self.config.target_transactions_per_batch { - num_sent += self.common.send_batch(thread_id)?; + num_sent += self.common.send_batch(thread_id, target_slot)?; } // if the thread is at max_cu_per_thread, remove it from the schedulable threads @@ -313,7 +314,7 @@ impl Scheduler for PrioGraphScheduler { } // Send all non-empty batches - num_sent += self.common.send_batches()?; + num_sent += self.common.send_batches(target_slot)?; // Refresh window budget and do chunked pops window_budget += unblock_this_batch.len(); @@ -326,7 +327,7 @@ impl Scheduler for PrioGraphScheduler { } // Send batches for any remaining transactions - num_sent += self.common.send_batches()?; + num_sent += self.common.send_batches(target_slot)?; // Push unschedulable ids back into the container container.push_ids_into_queue(unschedulable_ids.into_iter()); @@ -614,6 +615,7 @@ mod tests { scheduler.schedule( &mut container, u64::MAX, // no budget + 0, // target_slot - test doesn't care about slot validation test_pre_graph_filter, test_pre_lock_filter ), @@ -634,6 +636,7 @@ mod tests { .schedule( &mut container, u64::MAX, // no budget + 0, // target_slot - test doesn't care about slot validation test_pre_graph_filter, test_pre_lock_filter, ) @@ -656,6 +659,7 @@ mod tests { .schedule( &mut container, 0, // zero budget. 
nothing should be scheduled + 0, // target_slot - test doesn't care about slot validation test_pre_graph_filter, test_pre_lock_filter, ) @@ -678,6 +682,7 @@ mod tests { .schedule( &mut container, u64::MAX, // no budget + 0, // target_slot - test doesn't care about slot validation test_pre_graph_filter, test_pre_lock_filter, ) @@ -701,6 +706,7 @@ mod tests { .schedule( &mut container, u64::MAX, // no budget + 0, // target_slot - test doesn't care about slot validation test_pre_graph_filter, test_pre_lock_filter, ) @@ -729,6 +735,7 @@ mod tests { .schedule( &mut container, u64::MAX, // no budget + 0, // target_slot - test doesn't care about slot validation test_pre_graph_filter, test_pre_lock_filter, ) @@ -776,6 +783,7 @@ mod tests { .schedule( &mut container, u64::MAX, // no budget + 0, // target_slot - test doesn't care about slot validation test_pre_graph_filter, test_pre_lock_filter, ) @@ -791,6 +799,7 @@ mod tests { .schedule( &mut container, u64::MAX, // no budget + 0, // target_slot - test doesn't care about slot validation test_pre_graph_filter, test_pre_lock_filter, ) @@ -810,6 +819,7 @@ mod tests { .schedule( &mut container, u64::MAX, // no budget + 0, // target_slot - test doesn't care about slot validation test_pre_graph_filter, test_pre_lock_filter, ) @@ -839,6 +849,7 @@ mod tests { .schedule( &mut container, u64::MAX, // no budget + 0, // target_slot - test doesn't care about slot validation test_pre_graph_filter, test_pre_lock_filter, ) @@ -899,6 +910,7 @@ mod tests { .schedule( &mut container, u64::MAX, // no budget + 0, // target_slot - test doesn't care about slot validation test_pre_graph_filter, test_pre_lock_filter, ) @@ -916,6 +928,7 @@ mod tests { .schedule( &mut container, u64::MAX, // no budget + 0, // target_slot - test doesn't care about slot validation test_pre_graph_filter, test_pre_lock_filter, ) diff --git a/core/src/banking_stage/transaction_scheduler/scheduler.rs b/core/src/banking_stage/transaction_scheduler/scheduler.rs index 797f75ce67..cd8b855476 100644 --- a/core/src/banking_stage/transaction_scheduler/scheduler.rs +++ b/core/src/banking_stage/transaction_scheduler/scheduler.rs @@ -5,6 +5,7 @@ use { scheduler_common::SchedulingCommon, scheduler_error::SchedulerError, transaction_state::TransactionState, transaction_state_container::StateContainer, }, + solana_clock::Slot, solana_runtime_transaction::transaction_with_meta::TransactionWithMeta, std::num::Saturating, }; @@ -14,10 +15,13 @@ pub(crate) trait Scheduler { /// Schedule transactions from `container`. /// pre-graph and pre-lock filters may be passed to be applied /// before specific actions internally. + /// `target_slot` is the slot being scheduled for - workers will validate + /// they are executing on the correct slot. 
fn schedule>( &mut self, container: &mut S, budget: u64, + target_slot: Slot, pre_graph_filter: impl Fn(&[&Tx], &mut [bool]), pre_lock_filter: impl Fn(&TransactionState) -> PreLockFilterAction, ) -> Result; diff --git a/core/src/banking_stage/transaction_scheduler/scheduler_common.rs b/core/src/banking_stage/transaction_scheduler/scheduler_common.rs index c9b4d59e62..f0c4906e67 100644 --- a/core/src/banking_stage/transaction_scheduler/scheduler_common.rs +++ b/core/src/banking_stage/transaction_scheduler/scheduler_common.rs @@ -13,6 +13,7 @@ use { }, crossbeam_channel::{Receiver, Sender, TryRecvError}, itertools::izip, + solana_clock::Slot, solana_runtime_transaction::transaction_with_meta::TransactionWithMeta, }; @@ -179,7 +180,13 @@ impl SchedulingCommon { /// Send a batch of transactions to the given thread's `ConsumeWork` channel. /// Returns the number of transactions sent. - pub fn send_batch(&mut self, thread_index: usize) -> Result { + /// `target_slot` is the slot this batch is scheduled for - workers will validate + /// they are executing on the correct slot. + pub fn send_batch( + &mut self, + thread_index: usize, + target_slot: Slot, + ) -> Result { if self.batches.ids[thread_index].is_empty() { return Ok(0); } @@ -196,6 +203,7 @@ impl SchedulingCommon { ids, transactions, max_ages, + target_slot, }; self.consume_work_senders[thread_index] .send(work) @@ -206,9 +214,11 @@ impl SchedulingCommon { /// Send all batches of transactions to the worker threads. /// Returns the number of transactions sent. - pub fn send_batches(&mut self) -> Result { + /// `target_slot` is the slot this batch is scheduled for - workers will validate + /// they are executing on the correct slot. + pub fn send_batches(&mut self, target_slot: Slot) -> Result { (0..self.consume_work_senders.len()) - .map(|thread_index| self.send_batch(thread_index)) + .map(|thread_index| self.send_batch(thread_index, target_slot)) .sum() } } @@ -228,6 +238,7 @@ impl SchedulingCommon { ids, transactions, max_ages: _, + target_slot: _, }, retryable_indexes, }) => { @@ -460,7 +471,7 @@ mod tests { let mut common = SchedulingCommon::new(work_senders, finished_work_receiver, 10); pop_and_add_transaction(&mut container, &mut common, 0); - let num_scheduled = common.send_batch(0).unwrap(); + let num_scheduled = common.send_batch(0, 0).unwrap(); assert_eq!(num_scheduled, 1); assert_eq!(work_receivers[0].len(), 1); assert_eq!( @@ -472,7 +483,7 @@ mod tests { &[DUMMY_COST, 0, 0, 0] ); - let num_scheduled = common.send_batch(1).unwrap(); + let num_scheduled = common.send_batch(1, 0).unwrap(); assert_eq!(num_scheduled, 0); assert_eq!(work_receivers[1].len(), 0); // not actually sent since no transactions. @@ -482,7 +493,7 @@ mod tests { pop_and_add_transaction(&mut container, &mut common, 0); pop_and_add_transaction(&mut container, &mut common, 2); - common.send_batches().unwrap(); + common.send_batches(0).unwrap(); assert_eq!(work_receivers[0].len(), 1); assert_eq!(work_receivers[1].len(), 0); assert_eq!(work_receivers[2].len(), 1); @@ -509,7 +520,7 @@ mod tests { // Send a batch. Return completed work. 
pop_and_add_transaction(&mut container, &mut common, 0); - let num_scheduled = common.send_batch(0).unwrap(); + let num_scheduled = common.send_batch(0, 0).unwrap(); let work = work_receivers[0].try_recv().unwrap(); assert_eq!(work.ids.len(), num_scheduled); @@ -531,7 +542,7 @@ mod tests { pop_and_add_transaction(&mut container, &mut common, 0); pop_and_add_transaction(&mut container, &mut common, 0); pop_and_add_transaction(&mut container, &mut common, 0); - let num_scheduled = common.send_batch(0).unwrap(); + let num_scheduled = common.send_batch(0, 0).unwrap(); let work = work_receivers[0].try_recv().unwrap(); assert_eq!(work.ids.len(), num_scheduled); let retryable_indexes = vec![ @@ -565,7 +576,7 @@ mod tests { add_transactions_to_container(&mut container, 2); pop_and_add_transaction(&mut container, &mut common, 0); pop_and_add_transaction(&mut container, &mut common, 0); - let num_scheduled = common.send_batch(0).unwrap(); + let num_scheduled = common.send_batch(0, 0).unwrap(); let work = work_receivers[0].try_recv().unwrap(); assert_eq!(work.ids.len(), num_scheduled); let retryable_indexes = vec![RetryableIndex::new(1, true), RetryableIndex::new(0, true)]; diff --git a/core/src/banking_stage/transaction_scheduler/scheduler_controller.rs b/core/src/banking_stage/transaction_scheduler/scheduler_controller.rs index ca5568b505..1c9a8cd6e1 100644 --- a/core/src/banking_stage/transaction_scheduler/scheduler_controller.rs +++ b/core/src/banking_stage/transaction_scheduler/scheduler_controller.rs @@ -130,6 +130,9 @@ where // bypass sanitization and buffering and immediately drop the packets. let (decision, decision_time_us) = measure_us!(self.decision_maker.make_consume_or_forward_decision()); + // cavey: gate non-vote transaction scheduling behind vanilla scheduler - + // only run after the delegation threshold if no block was received + let decision = DecisionMaker::maybe_consume::(decision); self.timing_metrics.update(|timing_metrics| { timing_metrics.decision_time_us += decision_time_us; }); @@ -214,6 +217,7 @@ where let (scheduling_summary, schedule_time_us) = measure_us!(self.scheduler.schedule( &mut self.container, scheduling_budget, + bank.slot(), |txs, results| { Self::pre_graph_filter(txs, results, bank, MAX_PROCESSING_AGE) }, @@ -666,6 +670,7 @@ mod tests { ids: vec![], transactions: vec![], max_ages: vec![], + target_slot: 0, }, retryable_indexes: vec![], }) diff --git a/core/src/block_stage/block_consumer.rs b/core/src/block_stage/block_consumer.rs new file mode 100644 index 0000000000..b263e91b07 --- /dev/null +++ b/core/src/block_stage/block_consumer.rs @@ -0,0 +1,662 @@ +//! Block consumer processes block transactions and records them to PoH. +//! +//! Unlike banking_stage and bundle_stage, block_stage uses OPTIMISTIC RECORDING: +//! 1. First, record ALL transactions to PoH (broadcasts to cluster immediately) +//! 2. Lock the bank to prevent slot from ending +//! 3. Then execute the transactions using parallel workers with the Scheduler +//! 4. Commit the results +//! +//! This allows the cluster to start replaying the block alongside us. +//! The Scheduler ensures proper transaction ordering based on account locks. +//! Multiple worker threads execute non-conflicting chunks in parallel. 
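+//!
+//! A rough sketch of the intended call pattern, mirroring the `block_consumer`
+//! benchmark (the setup of the bank, committer, and recorder is elided and the
+//! variable names here are illustrative):
+//!
+//! ```ignore
+//! let mut consumer = BlockConsumer::new(committer, transaction_recorder, None);
+//! let output = consumer.process_and_record_block_transactions(
+//!     &bank,
+//!     &transactions, // RuntimeTransaction<ResolvedTransactionView<_>> views
+//!     &max_ages,
+//!     target_slot,
+//! );
+//! assert!(output
+//!     .execute_and_commit_transactions_output
+//!     .commit_transactions_result
+//!     .is_ok());
+//! ```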
+ +use { + super::{DevinScheduler, Timer}, + crate::banking_stage::{ + committer::{CommitTransactionDetails, Committer}, + consumer::{ + ExecuteAndCommitTransactionsOutput, LeaderProcessedTransactionCounts, + ProcessTransactionBatchOutput, + }, + leader_slot_timing_metrics::LeaderExecuteAndCommitTimings, + scheduler_messages::MaxAge, + }, + agave_transaction_view::{ + resolved_transaction_view::ResolvedTransactionView, transaction_data::TransactionData, + }, + itertools::Itertools, + log::{debug, error, info}, + rayon::{prelude::*, ThreadPool, ThreadPoolBuilder}, + solana_account::{AccountSharedData, ReadableAccount}, + solana_clock::{Slot, MAX_PROCESSING_AGE}, + solana_entry::entry::hash_transactions, + solana_measure::measure_us, + solana_poh::{ + poh_recorder::PohRecorderError, + transaction_recorder::{RecordTransactionsTimings, TransactionRecorder}, + }, + solana_runtime::{ + account_saver::collect_accounts_to_store, + bank::{Bank, LoadAndExecuteTransactionsOutput}, + }, + solana_runtime_transaction::{ + runtime_transaction::RuntimeTransaction, transaction_meta::StaticMeta, + transaction_with_meta::TransactionWithMeta, + }, + solana_svm::{ + account_overrides::AccountOverrides, + transaction_error_metrics::TransactionErrorMetrics, + transaction_processor::{ExecutionRecordingConfig, TransactionProcessingConfig}, + }, + solana_svm_transaction::svm_message::SVMMessage, + solana_transaction::sanitized::SanitizedTransaction, + solana_transaction_error::TransactionError, + std::{cmp::max, num::Saturating, ops::Range, sync::mpsc}, +}; + +/// Number of worker threads for parallel execution +const NUM_THREADS: usize = 8; + +/// Verify signatures for all transactions in parallel using rayon. +/// Returns true if all signatures are valid, false on first invalid signature (short-circuits). +fn verify_signatures_parallel( + transactions: &[RuntimeTransaction>], + thread_pool: &ThreadPool, +) -> bool { + thread_pool.install(|| { + transactions.par_iter().all(|tx| { + // Transaction is already sanitized (num_signatures validated) + let signatures = tx.signatures(); + let pubkeys = tx.account_keys(); + let message_data = tx.message_data(); + + signatures + .iter() + .zip(pubkeys.iter()) + .all(|(sig, pk)| sig.verify(pk.as_ref(), message_data)) + }) + }) +} + +/// Result from a worker thread after executing and committing a chunk +type WorkerResult = Result< + ( + usize, // thread index + Range, // transaction range + Vec, // commit details + u64, // start time us + u64, // end time us + ), + TransactionError, +>; + +pub struct BlockConsumer { + committer: Committer, + transaction_recorder: TransactionRecorder, + log_messages_bytes_limit: Option, + scheduler: DevinScheduler, + thread_pool: ThreadPool, +} + +impl BlockConsumer { + pub fn new( + committer: Committer, + transaction_recorder: TransactionRecorder, + log_messages_bytes_limit: Option, + ) -> Self { + let thread_pool = ThreadPoolBuilder::new() + .num_threads(NUM_THREADS) + .thread_name(|i| format!("solBlkExec{i}")) + .build() + .expect("Failed to create block execution thread pool"); + + Self { + committer, + transaction_recorder, + log_messages_bytes_limit, + scheduler: DevinScheduler::new(), + thread_pool, + } + } + + /// Process and record block transactions using OPTIMISTIC RECORDING. + /// + /// Flow: + /// 1. Record ALL transactions to PoH first (broadcasts to cluster) + /// 2. Lock the bank to prevent slot from ending + /// 3. Execute the transactions in parallel using worker threads + /// 4. 
Commit the results after each chunk + /// + /// Uses RuntimeTransaction> for zerocopy transaction parsing. + pub fn process_and_record_block_transactions( + &mut self, + bank: &Bank, + transactions: &[RuntimeTransaction>], + max_ages: &[MaxAge], + intended_slot: Slot, + ) -> ProcessTransactionBatchOutput { + if transactions.is_empty() { + return ProcessTransactionBatchOutput { + cost_model_throttled_transactions_count: 0, + cost_model_us: 0, + execute_and_commit_transactions_output: ExecuteAndCommitTransactionsOutput { + transaction_counts: LeaderProcessedTransactionCounts::default(), + retryable_transaction_indexes: vec![], + commit_transactions_result: Ok(vec![]), + execute_and_commit_timings: Default::default(), + error_counters: Default::default(), + min_prioritization_fees: 0, + max_prioritization_fees: 0, + }, + }; + } + + // Step 0: Verify all signatures in parallel (fails fast on first invalid) + let (sigverify_passed, sigverify_us) = + measure_us!(verify_signatures_parallel(transactions, &self.thread_pool)); + + if !sigverify_passed { + error!( + "Signature verification failed for block at slot {} (took {}us)", + intended_slot, sigverify_us + ); + return ProcessTransactionBatchOutput { + cost_model_throttled_transactions_count: 0, + cost_model_us: 0, + execute_and_commit_transactions_output: ExecuteAndCommitTransactionsOutput { + transaction_counts: LeaderProcessedTransactionCounts::default(), + retryable_transaction_indexes: vec![], + commit_transactions_result: Err( + PohRecorderError::HarmonicBlockInvalidSignature, + ), + execute_and_commit_timings: Default::default(), + error_counters: Default::default(), + min_prioritization_fees: 0, + max_prioritization_fees: 0, + }, + }; + } + + debug!( + "Verified {} signatures in {}us for slot {}", + transactions.len(), + sigverify_us, + intended_slot + ); + + // Filter transactions based on max_age (check reserved keys, ALT expiration) + let pre_results = transactions + .iter() + .zip(max_ages) + .map(|(tx, max_age)| { + if bank.epoch() != max_age.sanitized_epoch { + bank.check_reserved_keys(tx)?; + } + if bank.slot() > max_age.alt_invalidation_slot { + let (_addresses, _deactivation_slot) = + bank.load_addresses_from_ref(tx.message_address_table_lookups())?; + } + Ok(()) + }) + .collect_vec(); + + let mut error_counters = TransactionErrorMetrics::default(); + let check_results = bank.check_transactions( + transactions, + &pre_results, + MAX_PROCESSING_AGE, + &mut error_counters, + ); + + // If all transactions failed checks, return early without recording + let failed_checks: Vec<_> = check_results + .iter() + .enumerate() + .filter_map(|(i, result)| result.as_ref().err().map(|e| (i, e.clone()))) + .collect(); + + if failed_checks.len() == transactions.len() { + let commit_transactions_result = check_results + .into_iter() + .map(|r| match r { + Ok(_) => unreachable!("all transactions failed checks"), + Err(err) => CommitTransactionDetails::NotCommitted(err), + }) + .collect(); + + return ProcessTransactionBatchOutput { + cost_model_throttled_transactions_count: 0, + cost_model_us: 0, + execute_and_commit_transactions_output: ExecuteAndCommitTransactionsOutput { + execute_and_commit_timings: LeaderExecuteAndCommitTimings::default(), + error_counters, + min_prioritization_fees: 0, + max_prioritization_fees: 0, + transaction_counts: LeaderProcessedTransactionCounts::default(), + retryable_transaction_indexes: vec![], + commit_transactions_result: Ok(commit_transactions_result), + }, + }; + } + + // Convert to versioned transactions for 
recording + let versioned_transactions: Vec<_> = transactions + .iter() + .map(|tx| tx.to_versioned_transaction()) + .collect(); + + // Step 1: OPTIMISTICALLY RECORD ALL TRANSACTIONS TO POH FIRST + // This broadcasts the block to the cluster so they can replay alongside us + let mut record_transactions_timings = RecordTransactionsTimings::default(); + + // Hash each transaction individually + let mut hashes = Vec::with_capacity(versioned_transactions.len()); + let mut batches = Vec::with_capacity(versioned_transactions.len()); + for tx in &versioned_transactions { + let batch = vec![tx.clone()]; + let (hash, hash_us) = measure_us!(hash_transactions(&batch)); + record_transactions_timings.hash_us += hash_us; + hashes.push(hash); + batches.push(batch); + } + + // Lock the bank to prevent the slot from ending after we record + // Both locks are needed: freeze_lock prevents bank hash finalization, + // blockhash_queue_lock prevents blockhash queue updates + let freeze_lock = bank.freeze_lock(); + let blockhash_queue_lock = bank.blockhash_queue_lock(); + + // Record all transactions - this is all-or-nothing for the entire block + let (record_result, poh_record_us) = + measure_us!(self + .transaction_recorder + .record(bank.bank_id(), hashes, batches)); + record_transactions_timings.poh_record_us = Saturating(poh_record_us); + + let starting_transaction_index = match record_result { + Ok(starting_index) => { + info!( + "Optimistically recorded block for slot {} with {} transactions", + intended_slot, + transactions.len() + ); + starting_index + } + Err(e) => { + // Recording failed - return early, vanilla scheduler can build fallback block + debug!("Failed to record block for slot {}: {:?}", intended_slot, e); + let error = match e { + solana_poh::record_channels::RecordSenderError::InactiveBankId + | solana_poh::record_channels::RecordSenderError::Shutdown => { + PohRecorderError::MaxHeightReached + } + solana_poh::record_channels::RecordSenderError::Full => { + PohRecorderError::ChannelFull + } + solana_poh::record_channels::RecordSenderError::Disconnected => { + PohRecorderError::ChannelDisconnected + } + }; + return ProcessTransactionBatchOutput { + cost_model_throttled_transactions_count: 0, + cost_model_us: 0, + execute_and_commit_transactions_output: ExecuteAndCommitTransactionsOutput { + transaction_counts: LeaderProcessedTransactionCounts::default(), + retryable_transaction_indexes: vec![], + commit_transactions_result: Err(error), + execute_and_commit_timings: LeaderExecuteAndCommitTimings { + record_transactions_timings, + ..Default::default() + }, + error_counters, + min_prioritization_fees: 0, + max_prioritization_fees: 0, + }, + }; + } + }; + + // Step 2: NOW EXECUTE THE TRANSACTIONS using parallel workers + // Recording succeeded, so we're committed to this block - execute and commit + let execute_and_commit_output = self.execute_and_commit_parallel( + bank, + transactions, + starting_transaction_index, + record_transactions_timings, + ); + + // Add actual executed costs to the cost tracker + if let Ok(ref commit_details) = execute_and_commit_output.commit_transactions_result { + let mut cost_tracker = bank.write_cost_tracker().unwrap(); + cost_tracker.add_executed_transaction_costs( + transactions + .iter() + .zip(commit_details.iter()) + .filter_map(|(tx, detail)| match detail { + CommitTransactionDetails::Committed { + compute_units, + loaded_accounts_data_size, + .. 
+ } => Some((tx, *compute_units, *loaded_accounts_data_size)), + CommitTransactionDetails::NotCommitted(_) => None, + }), + ); + } + + drop(freeze_lock); + drop(blockhash_queue_lock); + + // Restore vote limit after block execution completes + bank.restore_vote_limit(); + + // Comprehensive timing log for profiling + let timings = &execute_and_commit_output.execute_and_commit_timings; + let committed_count = execute_and_commit_output + .transaction_counts + .processed_with_successful_result_count; + info!( + "Block slot={} txns={} committed={} | sigverify={}us hash={}us record={}us execute={}us | total={}us", + intended_slot, + transactions.len(), + committed_count, + sigverify_us, + timings.record_transactions_timings.hash_us.0, + timings.record_transactions_timings.poh_record_us.0, + timings.load_execute_us, + sigverify_us + + timings.record_transactions_timings.hash_us.0 + + timings.record_transactions_timings.poh_record_us.0 + + timings.load_execute_us, + ); + + ProcessTransactionBatchOutput { + cost_model_throttled_transactions_count: 0, + cost_model_us: 0, + execute_and_commit_transactions_output: execute_and_commit_output, + } + } + + /// Execute a single chunk of transactions with retry on AccountInUse errors. + /// AccountInUse errors can occur due to contention with vote processing threads. + fn execute_chunk( + bank: &Bank, + transactions: &[RuntimeTransaction>], + range: Range, + account_overrides: &AccountOverrides, + log_messages_bytes_limit: Option, + transaction_status_sender_enabled: bool, + ) -> LoadAndExecuteTransactionsOutput { + let chunk = &transactions[range]; + + loop { + // Prepare batch for this chunk + let batch = bank.prepare_sanitized_batch(chunk); + + // Execute transactions with account overrides to see state from previous chunks + let output = bank.load_and_execute_transactions( + &batch, + MAX_PROCESSING_AGE, + &mut solana_svm_timings::ExecuteTimings::default(), + &mut TransactionErrorMetrics::default(), + TransactionProcessingConfig { + account_overrides: Some(account_overrides), + check_program_modification_slot: bank.check_program_modification_slot(), + log_messages_bytes_limit, + limit_to_load_programs: true, + recording_config: ExecutionRecordingConfig::new_single_setting( + transaction_status_sender_enabled, + ), + }, + ); + + // If we get AccountInUse errors, retry execution + // Most likely account contention from vote processing threads + if output + .processing_results + .iter() + .any(|r| matches!(r, Err(TransactionError::AccountInUse))) + { + debug!("AccountInUse error detected, retrying chunk execution"); + continue; + } + + return output; + } + } + + /// Execute and commit transactions in parallel after they have been recorded to PoH. + /// Uses multiple worker threads with the Scheduler for chunked parallel execution. 
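+    ///
+    /// Roughly: the scheduler is initialized with the full transaction slice,
+    /// `pop()` yields contiguous, conflict-free ranges that are dispatched
+    /// round-robin to the worker threads, and each completed range is reported
+    /// back via `finish()`, which may unblock further ranges.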
+ fn execute_and_commit_parallel( + &mut self, + bank: &Bank, + transactions: &[RuntimeTransaction>], + starting_transaction_index: Option, + record_transactions_timings: RecordTransactionsTimings, + ) -> ExecuteAndCommitTransactionsOutput { + let mut execute_and_commit_timings = LeaderExecuteAndCommitTimings::default(); + execute_and_commit_timings.record_transactions_timings = record_transactions_timings; + + // Calculate prioritization fees + let min_max = transactions + .iter() + .filter_map(|transaction| { + transaction + .compute_budget_instruction_details() + .sanitize_and_convert_to_compute_budget_limits(&bank.feature_set) + .ok() + .map(|limits| limits.compute_unit_price) + }) + .minmax(); + let (min_prioritization_fees, max_prioritization_fees) = + min_max.into_option().unwrap_or_default(); + + let transaction_status_sender_enabled = self.committer.transaction_status_sender_enabled(); + let log_messages_bytes_limit = self.log_messages_bytes_limit; + + // Thread-safe account overrides shared between all workers (uses scc::HashMap) + let account_overrides = AccountOverrides::default(); + + // Collect all commit details across chunks + let mut all_commit_details: Vec = vec![ + CommitTransactionDetails::NotCommitted(TransactionError::AccountNotFound); + transactions.len() + ]; + let mut execution_error: Option = None; + + // Timer for tracking execution time + let start_time = Timer::new(); + + // Execute in parallel using thread pool + self.thread_pool.in_place_scope(|scope| { + // Channel for workers to send results back + let (finish_tx, finish_rx) = mpsc::channel::(); + + // Channels for sending work to each worker + let (work_senders, work_receivers): (Vec<_>, Vec<_>) = (0..NUM_THREADS) + .map(|_| mpsc::channel::>()) + .unzip(); + + // Spawn worker threads + for (thread_idx, work_rx) in work_receivers.into_iter().enumerate() { + let finish_tx = finish_tx.clone(); + let account_overrides = &account_overrides; + let committer = &self.committer; + let start_time = &start_time; + + scope.spawn(move |_| { + while let Ok(range) = work_rx.recv() { + let worker_start = start_time.elapsed_us(); + + // Execute chunk with shared account overrides (with retry on AccountInUse) + let load_and_execute_output = Self::execute_chunk( + bank, + transactions, + range.clone(), + account_overrides, + log_messages_bytes_limit, + transaction_status_sender_enabled, + ); + + let LoadAndExecuteTransactionsOutput { + processing_results, + processed_counts, + balance_collector, + } = load_and_execute_output; + + // Commit this chunk + let chunk = &transactions[range.clone()]; + let batch = bank.prepare_sanitized_batch(chunk); + + // Cache accounts in account_overrides BEFORE commit so next iterations + // can load cached state instead of using AccountsDB (which may be stale) + // This matches the audited implementation pattern. 
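+                        // Accounts drained to zero lamports are cached as default (empty)
+                        // accounts so that later chunks see them as closed instead of
+                        // re-reading a stale version from AccountsDB.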
+ let (accounts_to_cache, _) = collect_accounts_to_store( + chunk, + &None::>, + &processing_results, + ); + for (pubkey, account) in accounts_to_cache { + if account.lamports() == 0 { + account_overrides + .set_account(pubkey, Some(AccountSharedData::default())); + } else { + account_overrides.set_account(pubkey, Some(account.clone())); + } + } + + let chunk_starting_index = + starting_transaction_index.map(|start| start + range.start); + + let commit_transaction_statuses = + if processed_counts.processed_transactions_count != 0 { + let (_, statuses) = committer.commit_transactions( + &batch, + processing_results, + chunk_starting_index, + bank, + balance_collector, + &mut LeaderExecuteAndCommitTimings::default(), + &processed_counts, + ); + statuses + } else { + processing_results + .into_iter() + .map(|r| match r { + Ok(_) => unreachable!("processed count is 0"), + Err(err) => CommitTransactionDetails::NotCommitted(err), + }) + .collect() + }; + + let worker_end = start_time.elapsed_us(); + + let _ = finish_tx.send(Ok(( + thread_idx, + range, + commit_transaction_statuses, + worker_start, + worker_end, + ))); + } + }); + } + + // Initialize scheduler + self.scheduler.init(transactions); + + let mut next_worker = 0usize; + let mut queue_depth = 0usize; + let mut max_queue_depth = 0usize; + let mut per_thread_count = [0usize; NUM_THREADS]; + let mut per_thread_execution_times: [Vec<(Range, u64, u64)>; NUM_THREADS] = + Default::default(); + + // Main scheduling loop - matches audited implementation + while !self.scheduler.finished { + // Schedule transactions for execution + while let Some(range) = self.scheduler.pop(transactions, bank) { + // Send work to next available worker (round-robin) + if work_senders[next_worker].send(range.clone()).is_ok() { + per_thread_count[next_worker] += range.len(); + queue_depth += 1; + max_queue_depth = max(max_queue_depth, queue_depth); + } + next_worker = (next_worker + 1) % NUM_THREADS; + } + + // Finish any completed transactions (non-blocking) + if let Ok(result) = finish_rx.try_recv() { + queue_depth -= 1; + match result { + Ok((thread_idx, range, commit_details, start_us, end_us)) => { + // Track per-thread execution times + per_thread_execution_times[thread_idx].push(( + range.clone(), + start_us, + end_us, + )); + // Store commit details + for (i, status) in commit_details.into_iter().enumerate() { + all_commit_details[range.start + i] = status; + } + // Mark chunk as finished in scheduler (may unblock more work) + self.scheduler.finish(range, transactions); + } + Err(e) => { + error!("Block execution failed: {:?}", e); + execution_error = Some(e); + break; + } + } + } + } + + // Drop senders to signal workers to exit + drop(work_senders); + + // Update timing metrics + execute_and_commit_timings.load_execute_us = start_time.elapsed_us(); + + // Log per-thread execution times for profiling + for (thread_idx, times) in per_thread_execution_times.iter().enumerate() { + if !times.is_empty() { + debug!( + "Thread {} execution times: {:?}", + thread_idx, + times + .iter() + .map(|(r, s, e)| (r.clone(), e - s)) + .collect::>() + ); + } + } + }); + + // Count successes + let total_processed_count = all_commit_details + .iter() + .filter(|d| matches!(d, CommitTransactionDetails::Committed { .. 
})) + .count() as u64; + + let transaction_counts = LeaderProcessedTransactionCounts { + processed_count: total_processed_count, + processed_with_successful_result_count: total_processed_count, + attempted_processing_count: transactions.len() as u64, + }; + + ExecuteAndCommitTransactionsOutput { + transaction_counts, + retryable_transaction_indexes: vec![], + commit_transactions_result: if execution_error.is_some() { + Ok(all_commit_details) // Still return partial results + } else { + Ok(all_commit_details) + }, + execute_and_commit_timings, + error_counters: TransactionErrorMetrics::default(), + min_prioritization_fees, + max_prioritization_fees, + } + } +} diff --git a/core/src/block_stage/devin_scheduler.rs b/core/src/block_stage/devin_scheduler.rs new file mode 100644 index 0000000000..64f335ad6c --- /dev/null +++ b/core/src/block_stage/devin_scheduler.rs @@ -0,0 +1,220 @@ +use { + // log::*, + ahash::AHashMap, + solana_cost_model::cost_model::CostModel, + solana_runtime::bank::Bank, + solana_runtime_transaction::transaction_with_meta::TransactionWithMeta, + solana_svm_transaction::svm_message::SVMMessage, + std::{collections::VecDeque, ops::Range}, +}; + +pub struct DevinScheduler { + /// Map of the account locks for the popped but unfinished transactions + pub running_locks: AHashMap<[u8; 32], u32>, + /// Map of the account locks for the skipped over but unpopped transactions + pub skipped_locks: AHashMap<[u8; 32], u32>, + /// Indices of transactions skipped over while popping + pub indices: VecDeque, + /// Which transaction in `transactions` to pop next, if available + pub next: usize, + /// How many transactions have been scheduled + pub completed: usize, + /// Flag for when execution is done + pub finished: bool, +} + +impl Default for DevinScheduler { + fn default() -> Self { + Self { + // TODO: tune initial allocations + running_locks: AHashMap::with_capacity(10_000), + skipped_locks: AHashMap::with_capacity(10_000), + indices: VecDeque::with_capacity(100_000), + next: 0, + completed: 0, + finished: false, + } + } +} + +impl std::fmt::Debug for DevinScheduler { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + writeln!(f, "indices: {:?}", self.indices)?; + writeln!(f, "running_locks: {:?}", self.running_locks)?; + writeln!(f, "skipped_locks: {:?}", self.skipped_locks)?; + writeln!(f, "next: {:?}", self.next)?; + writeln!(f, "completed: {:?}", self.completed)?; + writeln!(f, "finished: {:?}", self.finished) + } +} + +impl DevinScheduler { + /// Total transaction slice cost at which to stop batching + const MAX_COST: u64 = 1_400_000; + /// Largest number of transactions to include in a batch + const MAX_RANGE: usize = 64; + + pub fn new() -> Self { + Self::default() + } + + /// Check if a transaction is blocked by a set of locks + #[inline(always)] + fn is_blocked(tx: &impl SVMMessage, locks: &AHashMap<[u8; 32], u32>) -> bool { + for (i, account) in tx.account_keys().iter().enumerate() { + if let Some(&lock) = locks.get(account.as_array()) { + // lock == u32::MAX => Another transaction holds a write lock + // lock != 0 && writable => Another transaction holds a read lock, + // but this transaction needs a write lock + if lock == u32::MAX || (lock != 0 && tx.is_writable(i)) { + return true; + } + } + } + false + } + + /// Add a transaction's account accesses to a set of locks + #[inline(always)] + fn lock_accounts(tx: &impl SVMMessage, locks: &mut AHashMap<[u8; 32], u32>) { + for (i, account) in tx.account_keys().iter().enumerate() { + let write = 
tx.is_writable(i); + locks + .entry(account.to_bytes()) + .and_modify(|v| { + if write { + *v = u32::MAX + } else { + // Use a saturating add here so that adding a read + // lock to a write locked account just leaves the write lock + // in place + *v = v.saturating_add(1) + } + }) + .or_insert(if write { u32::MAX } else { 1 }); + } + } + + /// Remove a transaction's account accesses to a set of locks + #[inline(always)] + fn unlock_accounts(tx: &impl SVMMessage, locks: &mut AHashMap<[u8; 32], u32>) { + for (i, account) in tx.account_keys().iter().enumerate() { + locks.entry(account.to_bytes()).and_modify(|v| { + if tx.is_writable(i) { + *v = 0 + } else { + *v = v.saturating_sub(1) + } + }); + } + } + + /// Initialize the scheduler for a slice of transactions + pub fn init(&mut self, transactions: &[T]) { + // info!("init():\n{self:?}"); + self.indices.clear(); + self.running_locks.clear(); + self.skipped_locks.clear(); + self.completed = 0; + self.next = 0; + self.finished = false; + self.indices.extend(0..transactions.len()); + } + + /// Get the next available range of transactions to schedule + /// + /// In order to avoid searching too deeply, this function just checks + /// the next available range of indices. This function should be called + /// in a tight loop with `done()`. + /// + /// ```rs + /// while !scheduler.finish { + /// if let Some(range) = scheduler.pop() { + /// // execute a transaction + /// running.push(range) + /// } + /// + /// // Execute transaction + /// while let Some(range) = running.pop() { + /// // Mark the transaction as done in the scheduler + /// schduler.finish(range) + /// }; + /// } + /// ``` + pub fn pop( + &mut self, + transactions: &[T], + bank: &Bank, + ) -> Option> { + // info!("pop():\n{self:?}"); + let mut value: Option> = None; + let mut cost = 0; + // Look through at most Self::MAX_RANGE transactions + for &i in self.indices.iter().skip(self.next).take(Self::MAX_RANGE) { + // We can only execute contiguous ranges, so ensure that this + // transaction doesn't skip indices + if let Some(ref range) = value { + if i != range.end { + break; + } + } + // Check if the transaction is blocked by any running transactions + // or any earlier transactions in the bundle that we haven't + // scheduled yet + let tx = &transactions[i]; + if Self::is_blocked(tx, &self.running_locks) + || Self::is_blocked(tx, &self.skipped_locks) + { + // We will return the first time we hit a blocked transaction, + // to keep this function call fast. Whether or not we have found + // a valid range to schedule, this transaction is blocked until + // we get a `finish()` call + self.next += 1; + // This transaction also blocks any future transactions in the + // bundle + Self::lock_accounts(tx, &mut self.skipped_locks); + break; + } + // This transaction can be scheduled. Add it to the set of running + // locks and update the return value + Self::lock_accounts(tx, &mut self.running_locks); + if let Some(ref mut range) = value { + range.end += 1; + } else { + value = Some(i..i + 1); + } + // Check the total cost of this transaction slice. If it gets too + // expensive, return early to avoid having too long running an + // execution on one thread blocking other transactions. 
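+            // Example (hypothetical costs): with MAX_COST = 1_400_000 CUs, a run of
+            // small transfers can fill a batch up to MAX_RANGE = 64 transactions,
+            // while a couple of ~700k-CU transactions end the batch after two.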
+ cost += CostModel::calculate_cost(tx, &bank.feature_set).sum(); + if cost > Self::MAX_COST { + break; + } + } + if let Some(ref range) = value { + // We are scheduling this range, so we need to remove it from the + // indices of remaining transactions + let start = self.indices.iter().position(|&v| v == range.start).unwrap(); + self.indices.drain(start..start + range.len()); + } + value + } + + /// Mark a transaction as finished + pub fn finish(&mut self, range: Range, transactions: &[T]) { + // info!("finish():\n{self:?}"); + // Count of completed transactions + self.completed += range.len(); + // Unlock all of the finished transactions' locks + for i in range { + let tx = &transactions[i]; + Self::unlock_accounts(tx, &mut self.running_locks); + } + // Reset the pop() transaction search, because there may now be newly + // unblocked transactions to schedule + self.skipped_locks.clear(); + self.next = 0; + // Update the finished flag once all transactions are complete + self.finished = self.completed == transactions.len(); + } +} diff --git a/core/src/block_stage/harmonic_block.rs b/core/src/block_stage/harmonic_block.rs new file mode 100644 index 0000000000..a7f60e096a --- /dev/null +++ b/core/src/block_stage/harmonic_block.rs @@ -0,0 +1,30 @@ +use solana_clock::Slot; +use solana_perf::packet::PacketBatch; + +/// A harmonic block contains transactions intended for a specific slot. +#[derive(Clone, Debug)] +pub struct HarmonicBlock { + transactions: PacketBatch, + intended_slot: Slot, +} + +impl HarmonicBlock { + pub fn new(transactions: PacketBatch, intended_slot: Slot) -> Self { + Self { + transactions, + intended_slot, + } + } + + pub fn transactions(&self) -> &PacketBatch { + &self.transactions + } + + pub fn intended_slot(&self) -> Slot { + self.intended_slot + } + + pub fn take(self) -> PacketBatch { + self.transactions + } +} diff --git a/core/src/block_stage/mod.rs b/core/src/block_stage/mod.rs new file mode 100644 index 0000000000..9c9cd9755a --- /dev/null +++ b/core/src/block_stage/mod.rs @@ -0,0 +1,269 @@ +//! The `block_stage` processes blocks, which are transactions intended for a specific slot. +//! Unlike bundles, blocks have no transaction limit and the uuid field contains the intended slot. 
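+//!
+//! Rough data flow as wired up in this patch: `BlockEngineStage` receives blocks over
+//! gRPC and forwards them as `HarmonicBlock`s on a channel; `BlockStage` claims the
+//! slot through `scheduler_synchronization`, translates the packets into runtime
+//! transactions, and hands them to `BlockConsumer`, which executes them in parallel
+//! using `DevinScheduler`.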
+ +mod block_consumer; +mod harmonic_block; +mod devin_scheduler; +mod timer; + +pub use block_consumer::BlockConsumer; +pub use harmonic_block::HarmonicBlock; +pub use devin_scheduler::DevinScheduler; +pub use timer::Timer; + +use { + crate::{ + banking_stage::{ + committer::Committer, + scheduler_messages::MaxAge, + transaction_scheduler::receive_and_buffer::{ + calculate_max_age, translate_to_runtime_view, + }, + }, + scheduler_synchronization, + }, + agave_transaction_view::resolved_transaction_view::ResolvedTransactionView, + crossbeam_channel::{Receiver, RecvTimeoutError}, + log::info, + solana_gossip::cluster_info::ClusterInfo, + solana_ledger::blockstore_processor::TransactionStatusSender, + solana_poh::transaction_recorder::TransactionRecorder, + solana_runtime::{ + bank::Bank, bank_forks::BankForks, prioritization_fee_cache::PrioritizationFeeCache, + vote_sender_types::ReplayVoteSender, + }, + solana_runtime_transaction::runtime_transaction::RuntimeTransaction, + std::{ + sync::{ + atomic::{AtomicBool, Ordering}, + Arc, RwLock, + }, + thread::{self, Builder, JoinHandle}, + time::Duration, + }, +}; + +pub struct BlockStage { + block_thread: JoinHandle<()>, +} + +impl BlockStage { + #[allow(clippy::new_ret_no_self)] + #[allow(clippy::too_many_arguments)] + pub fn new( + cluster_info: &Arc, + bank_forks: Arc>, + transaction_recorder: TransactionRecorder, + block_receiver: Receiver, + transaction_status_sender: Option, + replay_vote_sender: ReplayVoteSender, + log_messages_bytes_limit: Option, + exit: Arc, + prioritization_fee_cache: &Arc, + ) -> Self { + Self::start_block_thread( + cluster_info, + bank_forks, + transaction_recorder, + block_receiver, + transaction_status_sender, + replay_vote_sender, + log_messages_bytes_limit, + exit, + prioritization_fee_cache, + ) + } + + pub fn join(self) -> thread::Result<()> { + self.block_thread.join() + } + + #[allow(clippy::too_many_arguments)] + fn start_block_thread( + cluster_info: &Arc, + bank_forks: Arc>, + transaction_recorder: TransactionRecorder, + block_receiver: Receiver, + transaction_status_sender: Option, + replay_vote_sender: ReplayVoteSender, + log_message_bytes_limit: Option, + exit: Arc, + prioritization_fee_cache: &Arc, + ) -> Self { + let committer = Committer::new( + transaction_status_sender, + replay_vote_sender, + prioritization_fee_cache.clone(), + ); + + let consumer = BlockConsumer::new(committer, transaction_recorder, log_message_bytes_limit); + + let cluster_info = cluster_info.clone(); + let block_thread = Builder::new() + .name("solBlockStgTx".to_string()) + .spawn(move || { + Self::process_loop(bank_forks, block_receiver, consumer, exit, cluster_info); + }) + .unwrap(); + + Self { block_thread } + } + + fn process_loop( + bank_forks: Arc>, + block_receiver: Receiver, + mut consumer: BlockConsumer, + exit: Arc, + _cluster_info: Arc, + ) { + while !exit.load(Ordering::Relaxed) { + match block_receiver.recv_timeout(Duration::from_millis(10)) { + Ok(block_bundle) => { + let (root_bank, working_bank) = { + let bank_forks_guard = bank_forks.read().unwrap(); + ( + bank_forks_guard.root_bank(), + bank_forks_guard.working_bank(), + ) + }; + + let intended_slot = block_bundle.intended_slot(); + let current_slot = working_bank.slot(); + + // Check if this block is for the correct slot + if intended_slot != current_slot { + info!( + "Block intended for slot {} but current slot is {}, dropping", + intended_slot, current_slot + ); + continue; + } + + // Check if we're in the delegation period and can schedule this block + 
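+                    // Example (hypothetical numbers): with ticks_per_slot = 64 the
+                    // delegation period covers the first 64 * 15 / 16 = 60 ticks of the
+                    // slot; the remaining 4 ticks are the vanilla fallback window.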
let current_tick_height = working_bank.tick_height(); + let max_tick_height = working_bank.max_tick_height(); + let ticks_per_slot = working_bank.ticks_per_slot(); + let start_tick = max_tick_height - ticks_per_slot; + let ticks_into_slot = current_tick_height.saturating_sub(start_tick); + let delegation_period_length = ticks_per_slot * 15 / 16; + let in_delegation_period = ticks_into_slot < delegation_period_length; + + // Try to claim this slot for block scheduling + match scheduler_synchronization::block_should_schedule( + current_slot, + in_delegation_period, + ) { + Some(true) => { + // We claimed the slot, proceed with processing + info!("Block stage claimed slot {}", current_slot); + } + Some(false) => { + // Slot was already claimed by someone else + info!( + "Block stage could not claim slot {}, already scheduled", + current_slot + ); + continue; + } + None => { + // Not in delegation period, can't schedule block + info!( + "Block stage cannot schedule for slot {}, not in delegation period", + current_slot + ); + continue; + } + } + + let batch = block_bundle.take(); + + // Translate packets to RuntimeTransaction + // using zerocopy TransactionView instead of bincode deserialization + let (transactions, max_ages) = + Self::translate_packets_to_transactions(&batch, &root_bank, &working_bank); + + // Process blocks + if !transactions.is_empty() { + let output = consumer.process_and_record_block_transactions( + &working_bank, + &transactions, + &max_ages, + intended_slot, + ); + + // Check if recording failed - if so, revert so vanilla can build fallback block + if output + .execute_and_commit_transactions_output + .commit_transactions_result + .is_err() + { + info!( + "Block recording failed for slot {}, reverting to vanilla", + current_slot + ); + scheduler_synchronization::block_failed(current_slot); + } + } + } + Err(RecvTimeoutError::Timeout) => { + // Continue loop + } + Err(RecvTimeoutError::Disconnected) => { + break; + } + } + } + } + + /// Translate packet bytes to RuntimeTransaction using + /// zerocopy parsing instead of bincode deserialization. + fn translate_packets_to_transactions<'a>( + batch: &'a solana_perf::packet::PacketBatch, + root_bank: &Bank, + working_bank: &Bank, + ) -> ( + Vec>>, + Vec, + ) { + let enable_static_instruction_limit = root_bank + .feature_set + .is_active(&agave_feature_set::static_instruction_limit::ID); + let transaction_account_lock_limit = working_bank.get_transaction_account_lock_limit(); + + let mut transactions = Vec::with_capacity(batch.len()); + let mut max_ages = Vec::with_capacity(batch.len()); + + for packet in batch.iter() { + // Skip packets marked for discard or with no data + if packet.meta().discard() { + continue; + } + + let Some(packet_data) = packet.data(..) 
else { + continue; + }; + + // Use translate_to_runtime_view for zerocopy parsing + match translate_to_runtime_view( + packet_data, + working_bank, + root_bank, + enable_static_instruction_limit, + transaction_account_lock_limit, + ) { + Ok((view, deactivation_slot)) => { + let max_age = + calculate_max_age(root_bank.epoch(), deactivation_slot, root_bank.slot()); + transactions.push(view); + max_ages.push(max_age); + } + Err(_) => { + // Skip packets that fail sanitization/translation + continue; + } + } + } + + (transactions, max_ages) + } +} diff --git a/core/src/block_stage/timer.rs b/core/src/block_stage/timer.rs new file mode 100644 index 0000000000..a6a9d3b91f --- /dev/null +++ b/core/src/block_stage/timer.rs @@ -0,0 +1,102 @@ +use std::time::Instant; + +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +use core::arch::x86_64::_rdtsc; + +#[derive(Clone, Copy, Debug)] +pub enum Timer { + RDTSC(u64), + Instant(Instant), +} + +impl Timer { + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + pub fn new() -> Self { + if check_cpu_supports_invariant_tsc() { + Timer::RDTSC(unsafe { _rdtsc() }) + } else { + Timer::Instant(Instant::now()) + } + } + + #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))] + pub fn new() -> Self { + Timer::Instant(Instant::now()) + } + + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + pub fn elapsed_us(&self) -> u64 { + match self { + Timer::RDTSC(rdtsc) => (unsafe { _rdtsc() - rdtsc } / ticks_per_us()), + Timer::Instant(instant) => instant.elapsed().as_micros() as u64, + } + } + + #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))] + pub fn elapsed_us(&self) -> u64 { + match self { + Timer::RDTSC(_) => unreachable!("RDTSC not supported outside x86"), + Timer::Instant(instant) => instant.elapsed().as_micros() as u64, + } + } +} + +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +#[allow(dead_code)] +pub fn memoize_ticks_per_us_and_invariant_tsc_check() { + check_cpu_supports_invariant_tsc(); + ticks_per_us(); +} + +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +pub fn check_cpu_supports_invariant_tsc() -> bool { + use std::sync::OnceLock; + static SUPPORTS_INVARIANT_TSC: OnceLock = OnceLock::new(); + + *SUPPORTS_INVARIANT_TSC.get_or_init(|| { + let Ok(cpuinfo) = std::fs::read_to_string("/proc/cpuinfo") else { + return false; + }; + + let has_constant_tsc = cpuinfo.contains("constant_tsc"); + let has_nonstop_tsc = cpuinfo.contains("nonstop_tsc"); + + has_constant_tsc && has_nonstop_tsc + }) +} + +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +fn ticks_per_us() -> u64 { + use std::{sync::OnceLock, time::Duration}; + + static TICKS_PER_US: OnceLock = OnceLock::new(); + + *TICKS_PER_US.get_or_init(|| { + let warm_up_duration = Duration::from_millis(1000); + let measurement_duration = Duration::from_millis(1000); + + // Warm up + let warm_up_start = Instant::now(); + while warm_up_start.elapsed() < warm_up_duration { + // Spin + } + + let start = Instant::now(); + let start_tsc = unsafe { core::arch::x86_64::_rdtsc() }; + + // Measure + while start.elapsed() < measurement_duration { + // Spin + } + + let end_tsc = unsafe { core::arch::x86_64::_rdtsc() }; + let elapsed_tsc = end_tsc - start_tsc; + + let duration_us = measurement_duration.as_nanos() as u64 / 1000; + let tsc_per_us = elapsed_tsc / duration_us; + + tsc_per_us + }) +} + + diff --git a/core/src/bundle_stage.rs b/core/src/bundle_stage.rs index 11071b37ec..8d7f94b6de 100644 --- a/core/src/bundle_stage.rs +++ 
b/core/src/bundle_stage.rs @@ -48,7 +48,7 @@ use { }; pub mod bundle_account_locker; -mod bundle_consumer; +pub mod bundle_consumer; mod bundle_packet_deserializer; mod bundle_storage; const MAX_BUNDLE_RETRY_DURATION: Duration = Duration::from_millis(40); @@ -576,7 +576,12 @@ impl BundleStage { cluster_info: &Arc, consume_worker_metrics: &ConsumeWorkerMetrics, ) { - match decision_maker.make_consume_or_forward_decision() { + let decision = decision_maker.make_consume_or_forward_decision(); + // cavey: gate bundle scheduling behind vanilla scheduler - bundles only + // run after the delegation threshold if no block was received + let decision = DecisionMaker::maybe_consume::(decision); + + match decision { // BufferedPacketsDecision::Consume means this leader is scheduled to be running at the moment. // Execute, record, and commit as many bundles possible given time, compute, and other constraints. BufferedPacketsDecision::Consume(bank) => { @@ -959,10 +964,13 @@ impl BundleStage { mod tests { use { super::*, - crate::tip_manager::{ - tip_distribution::{JitoTipDistributionConfig, TipDistributionAccount}, - tip_payment::JitoTipPaymentConfig, - TipDistributionAccountConfig, TipManagerConfig, + crate::{ + scheduler_synchronization, + tip_manager::{ + tip_distribution::{JitoTipDistributionConfig, TipDistributionAccount}, + tip_payment::JitoTipPaymentConfig, + TipDistributionAccountConfig, TipManagerConfig, + }, }, crossbeam_channel::{bounded, unbounded}, solana_cluster_type::ClusterType, @@ -1045,6 +1053,9 @@ mod tests { let bank = Bank::new_from_parent(bank, &Pubkey::new_unique(), 1); bank_forks.write().unwrap().insert(bank); + // Force vanilla scheduling for this slot (simulates being past delegation period) + scheduler_synchronization::force_vanilla_claim(1); + let bank = bank_forks.read().unwrap().working_bank(); assert_eq!(bank.slot(), 1); @@ -1276,6 +1287,9 @@ mod tests { let bank = Bank::new_from_parent(bank, &Pubkey::new_unique(), 1); bank_forks.write().unwrap().insert(bank); + // Force vanilla scheduling for this slot (simulates being past delegation period) + scheduler_synchronization::force_vanilla_claim(1); + let bank = bank_forks.read().unwrap().working_bank(); assert_eq!(bank.slot(), 1); @@ -1524,6 +1538,9 @@ mod tests { let bank = Bank::new_from_parent(bank, &Pubkey::new_unique(), 1); bank_forks.write().unwrap().insert(bank); + // Force vanilla scheduling for this slot (simulates being past delegation period) + scheduler_synchronization::force_vanilla_claim(1); + let bank = bank_forks.read().unwrap().working_bank(); assert_eq!(bank.slot(), 1); diff --git a/core/src/lib.rs b/core/src/lib.rs index ecacc588c3..50859fe395 100644 --- a/core/src/lib.rs +++ b/core/src/lib.rs @@ -21,6 +21,7 @@ pub mod admin_rpc_post_init; pub mod banking_simulation; pub mod banking_stage; pub mod banking_trace; +pub mod block_stage; pub mod bundle; mod bundle_sigverify_stage; pub mod bundle_stage; @@ -44,6 +45,7 @@ pub mod replay_stage; pub mod resource_limits; mod result; pub mod sample_performance_service; +pub mod scheduler_synchronization; mod shred_fetch_stage; pub mod sigverify; pub mod sigverify_stage; diff --git a/core/src/proxy/block_engine_stage.rs b/core/src/proxy/block_engine_stage.rs index 98ad452426..63f87a7c76 100644 --- a/core/src/proxy/block_engine_stage.rs +++ b/core/src/proxy/block_engine_stage.rs @@ -7,6 +7,7 @@ use { crate::{ banking_trace::BankingPacketSender, + block_stage::HarmonicBlock, packet_bundle::PacketBundle, proto_packet_to_packet, proxy::{ @@ -128,6 +129,8 @@ impl 
BlockEngineStage { exit: Arc, block_builder_fee_info: &Arc>, shredstream_receiver_address: Arc>>, + // Channel that blocks get piped through. + block_tx: Sender, ) -> Self { let block_builder_fee_info = block_builder_fee_info.clone(); @@ -147,6 +150,7 @@ impl BlockEngineStage { exit, block_builder_fee_info, shredstream_receiver_address, + block_tx, )); }) .unwrap(); @@ -173,6 +177,7 @@ impl BlockEngineStage { exit: Arc, block_builder_fee_info: Arc>, shredstream_receiver_address: Arc>>, + block_tx: Sender, ) { let mut error_count: u64 = 0; @@ -197,6 +202,7 @@ impl BlockEngineStage { &block_builder_fee_info, &shredstream_receiver_address, &local_block_engine_config, + &block_tx, ) .await { @@ -232,6 +238,7 @@ impl BlockEngineStage { block_builder_fee_info: &Arc>, shredstream_receiver_address: &Arc>>, local_block_engine_config: &BlockEngineConfig, + block_tx: &Sender, ) -> crate::proxy::Result<()> { let endpoint = Self::get_endpoint(&local_block_engine_config.block_engine_url)?; if !local_block_engine_config.disable_block_engine_autoconfig { @@ -251,6 +258,7 @@ impl BlockEngineStage { exit, block_builder_fee_info, shredstream_receiver_address, + block_tx, ) .await; } @@ -283,6 +291,7 @@ impl BlockEngineStage { exit, block_builder_fee_info, &Self::CONNECTION_TIMEOUT, + block_tx, ) .await .inspect(|_| { @@ -307,6 +316,7 @@ impl BlockEngineStage { exit: &Arc, block_builder_fee_info: &Arc>, shredstream_receiver_address: &Arc>>, + block_tx: &Sender, ) -> crate::proxy::Result<()> { let candidates = Self::get_ranked_endpoints(&endpoint).await?; @@ -342,6 +352,7 @@ impl BlockEngineStage { exit, block_builder_fee_info, &Self::CONNECTION_TIMEOUT, + block_tx, ) .await { @@ -460,6 +471,7 @@ impl BlockEngineStage { exit: &Arc, block_builder_fee_info: &Arc>, connection_timeout: &Duration, + block_tx: &Sender, ) -> crate::proxy::Result<()> { // Get a copy of configs here in case they have changed at runtime let keypair = cluster_info.keypair().clone(); @@ -521,6 +533,7 @@ impl BlockEngineStage { keypair, cluster_info, &backend_url, + block_tx, ) .await } @@ -703,6 +716,7 @@ impl BlockEngineStage { keypair: Arc, cluster_info: &Arc, block_engine_url: &str, + block_tx: &Sender, ) -> crate::proxy::Result<()> { let subscribe_packets_stream = timeout( *connection_timeout, @@ -715,10 +729,19 @@ impl BlockEngineStage { let subscribe_bundles_stream = timeout( *connection_timeout, - client.subscribe_bundles(block_engine::SubscribeBundlesRequest {}), + client.subscribe_bundles2(block_engine::SubscribeBundlesRequest {}), ) .await - .map_err(|_| ProxyError::MethodTimeout("subscribe_bundles".to_string()))? + .map_err(|_| ProxyError::MethodTimeout("subscribe_bundles2".to_string()))? + .map_err(|e| ProxyError::MethodError(e.to_string()))? + .into_inner(); + + let subscribe_blocks_stream = timeout( + *connection_timeout, + client.subscribe_blocks(block_engine::SubscribeBundlesRequest {}), + ) + .await + .map_err(|_| ProxyError::MethodTimeout("subscribe_blocks".to_string()))? .map_err(|e| ProxyError::MethodError(e.to_string()))? 
.into_inner(); @@ -746,7 +769,11 @@ impl BlockEngineStage { Self::consume_bundle_and_packet_stream( client, - (subscribe_bundles_stream, subscribe_packets_stream), + ( + subscribe_bundles_stream, + subscribe_packets_stream, + subscribe_blocks_stream, + ), bundle_tx, packet_tx, local_config, @@ -761,6 +788,7 @@ impl BlockEngineStage { cluster_info, connection_timeout, block_engine_url, + &block_tx, ) .await } @@ -768,9 +796,10 @@ impl BlockEngineStage { #[allow(clippy::too_many_arguments)] async fn consume_bundle_and_packet_stream( mut client: BlockEngineValidatorClient>, - (mut bundle_stream, mut packet_stream): ( + (mut bundle_stream, mut packet_stream, mut block_stream): ( Streaming, Streaming, + Streaming, ), bundle_tx: &Sender>, packet_tx: &Sender, @@ -786,6 +815,7 @@ impl BlockEngineStage { cluster_info: &Arc, connection_timeout: &Duration, block_engine_url: &str, + block_tx: &Sender, ) -> crate::proxy::Result<()> { const METRICS_TICK: Duration = Duration::from_secs(1); const MAINTENANCE_TICK: Duration = Duration::from_secs(10 * 60); @@ -808,6 +838,9 @@ impl BlockEngineStage { maybe_bundles = bundle_stream.message() => { Self::handle_block_engine_maybe_bundles(maybe_bundles, bundle_tx, &mut block_engine_stats)?; } + maybe_blocks = block_stream.message() => { + Self::handle_block_engine_maybe_blocks(maybe_blocks, block_tx, &mut block_engine_stats)?; + } _ = metrics_and_auth_tick.tick() => { block_engine_stats.report(); block_engine_stats = BlockEngineStageStats::default(); @@ -914,6 +947,39 @@ impl BlockEngineStage { .map_err(|_| ProxyError::PacketForwardError) } + fn handle_block_engine_maybe_blocks( + maybe_blocks_response: Result, Status>, + block_sender: &Sender, + _block_engine_stats: &mut BlockEngineStageStats, + ) -> crate::proxy::Result<()> { + let blocks_response = maybe_blocks_response?.ok_or(ProxyError::GrpcStreamDisconnected)?; + for bundle in blocks_response.bundles { + // Parse uuid as uint64 slot + let intended_slot = bundle.uuid.parse::().map_err(|e| { + ProxyError::MethodError(format!("failed to parse block uuid as slot: {e}")) + })?; + + if let Some(bundle_data) = bundle.bundle { + let packet_batch = PacketBatch::from( + bundle_data + .packets + .into_iter() + .map(proto_packet_to_packet) + .collect::>(), + ); + + // Tag transactions as block-sourced by creating BlockPacketBundle + let block_bundle = + crate::block_stage::HarmonicBlock::new(packet_batch, intended_slot); + + block_sender + .send(block_bundle) + .map_err(|_| ProxyError::PacketForwardError)?; + } + } + Ok(()) + } + fn handle_block_engine_packets( resp: block_engine::SubscribePacketsResponse, packet_tx: &Sender, diff --git a/core/src/replay_stage.rs b/core/src/replay_stage.rs index 557b27370d..d7b0fac040 100644 --- a/core/src/replay_stage.rs +++ b/core/src/replay_stage.rs @@ -2275,6 +2275,8 @@ impl ReplayStage { slot_status_notifier, NewBankOptions { vote_only_bank }, ); + // Limit votes to 4M CUs during our leader slot + tpu_bank.set_proposer_vote_limit(); // make sure parent is frozen for finalized hashes via the above // new()-ing of its child bank banking_tracer.hash_event(parent.slot(), &parent.last_blockhash(), &parent.hash()); diff --git a/core/src/scheduler_synchronization.rs b/core/src/scheduler_synchronization.rs new file mode 100644 index 0000000000..e4963cf05a --- /dev/null +++ b/core/src/scheduler_synchronization.rs @@ -0,0 +1,349 @@ +//! Synchronize whole block and vanilla schedulers. +//! +//! Every slot, there are two stages: delegation and fallback. +//! 
During delegation stage, the block scheduler awaits a whole block. +//! If a whole block is received by the end of the stage, it is scheduled. +//! Otherwise, vanilla scheduling takes over during fallback stage. +//! +//! The state is stored in a single atomic u64: +//! - Top bit (bit 63): 1 = claimed by block, 0 = claimed by vanilla +//! - Lower 63 bits: slot number +//! - Sentinel value (u64::MAX) indicates no slot has been scheduled yet + +use { + log::info, + std::sync::atomic::{AtomicU64, Ordering}, +}; + +/// Top bit indicates block claimed (1) vs vanilla claimed (0) +const BLOCK_CLAIMED_BIT: u64 = 1 << 63; +/// Mask to extract the slot number (lower 63 bits) +const SLOT_MASK: u64 = !BLOCK_CLAIMED_BIT; +/// Sentinel value - all bits set, indicates no slot scheduled yet. +/// Note: get_slot(SENTINEL) = 0x7FFFFFFFFFFFFFFF which is far larger than any real slot. +const SENTINEL: u64 = u64::MAX; + +/// Module private state. Shared with block & vanilla schedulers. +/// Encodes both the slot and who claimed it in a single atomic. +static SCHEDULER_STATE: AtomicU64 = AtomicU64::new(SENTINEL); + +/// Extract the slot number from the combined state value. +#[inline] +fn get_slot(value: u64) -> u64 { + value & SLOT_MASK +} + +/// Check if the state indicates the slot was claimed by block. +#[inline] +fn is_block_claim(value: u64) -> bool { + value & BLOCK_CLAIMED_BIT != 0 +} + +/// Create a state value for a slot claimed by vanilla (top bit clear). +#[inline] +fn vanilla_claim(slot: u64) -> u64 { + slot & SLOT_MASK +} + +/// Create a state value for a slot claimed by block (top bit set). +#[inline] +fn block_claim(slot: u64) -> u64 { + (slot & SLOT_MASK) | BLOCK_CLAIMED_BIT +} + +/// Reset the scheduler synchronization state. Used in tests to ensure +/// a clean slate for each test. +#[cfg(any(test, feature = "dev-context-only-utils"))] +pub fn reset_for_tests() { + SCHEDULER_STATE.store(SENTINEL, Ordering::Release); +} + +/// Force claim a slot for vanilla scheduling. Used in tests to simulate +/// being past the delegation period. +#[cfg(any(test, feature = "dev-context-only-utils"))] +pub fn force_vanilla_claim(slot: u64) { + SCHEDULER_STATE.store(vanilla_claim(slot), Ordering::Release); +} + +/// Returns the last slot that was scheduled (without the block/vanilla flag). +pub fn last_slot_scheduled() -> u64 { + get_slot(SCHEDULER_STATE.load(Ordering::Acquire)) +} + +/// Returns true if the current slot was claimed by block, false if by vanilla. +pub fn is_slot_claimed_by_block() -> bool { + is_block_claim(SCHEDULER_STATE.load(Ordering::Acquire)) +} + +/// If vanilla should schedule, the internal private atomic is +/// updated so that the block scheduler does not schedule. +/// +/// Returns: +/// - None => not yet time to decide (still in delegation period and not yet claimed) +/// - Some(true) => yes, vanilla should schedule (claimed by vanilla) +/// - Some(false) => no, vanilla should not schedule (claimed by block) +pub fn vanilla_should_schedule(current_slot: u64, in_delegation_period: bool) -> Option { + let state = SCHEDULER_STATE.load(Ordering::Acquire); + + // If slot is already claimed for current_slot, check who claimed it + // This must be checked BEFORE in_delegation_period to handle: + // 1. Multiple vanilla threads after one has claimed + // 2. 
Test scenarios using force_vanilla_claim + if state != SENTINEL && get_slot(state) == current_slot { + // Check who claimed it - if vanilla claimed, all vanilla threads can consume + // If block claimed, no vanilla thread should consume + let claimed_by_block = is_block_claim(state); + info!( + "vanilla_should_schedule: slot {} already claimed, by_block={}", + current_slot, claimed_by_block + ); + return Some(!claimed_by_block); + } + + // If still in delegation period and slot not yet claimed, don't try to claim + if in_delegation_period { + return None; + } + + // Try to claim the slot atomically + let new_state = vanilla_claim(current_slot); + let result = SCHEDULER_STATE.fetch_update(Ordering::Release, Ordering::Acquire, |old_state| { + // Handle sentinel value + if old_state == SENTINEL { + return Some(new_state); + } + + let old_slot = get_slot(old_state); + match old_slot.cmp(¤t_slot) { + // Last slot scheduled was in the past => update + std::cmp::Ordering::Less => Some(new_state), + // Something has been scheduled for this slot => no update + std::cmp::Ordering::Equal => None, + // Weird edge case (slot went backwards?) => don't schedule + std::cmp::Ordering::Greater => None, + } + }); + + if result.is_ok() { + info!("vanilla claimed slot {current_slot}"); + return Some(true); + } + + // Failed to claim - slot was claimed while we were trying + // Re-check: if it was claimed by another vanilla thread, we can still consume + let state_now = SCHEDULER_STATE.load(Ordering::Acquire); + if state_now != SENTINEL && get_slot(state_now) == current_slot { + let claimed_by_block = is_block_claim(state_now); + info!( + "vanilla unable to claim {}, but slot is claimed, by_block={}", + current_slot, claimed_by_block + ); + return Some(!claimed_by_block); + } + + info!("vanilla unable to claim slot {current_slot}"); + Some(false) +} + +/// If block should schedule, the internal private atomic is +/// updated so that the vanilla scheduler does not schedule. +/// +/// Returns: +/// - None => not in delegation period, can't schedule block +/// - Some(true) => yes, block should schedule (claimed successfully) +/// - Some(false) => no, block should not schedule (already claimed) +pub fn block_should_schedule(current_slot: u64, in_delegation_period: bool) -> Option { + if !in_delegation_period { + return None; + } + + // Try to claim the slot atomically with block flag set + let new_state = block_claim(current_slot); + let did_claim = SCHEDULER_STATE + .fetch_update(Ordering::Release, Ordering::Acquire, |old_state| { + // Handle sentinel value + if old_state == SENTINEL { + return Some(new_state); + } + + let old_slot = get_slot(old_state); + match old_slot.cmp(¤t_slot) { + // Last slot scheduled was in the past => update + std::cmp::Ordering::Less => Some(new_state), + // Something has been scheduled for this slot => no update + std::cmp::Ordering::Equal => { + info!("unexpectedly hit Equal branch in block_should_schedule"); + None + } + // Weird edge case => don't schedule + std::cmp::Ordering::Greater => None, + } + }) + .is_ok(); + + if did_claim { + info!("block claimed slot {current_slot}"); + } + + Some(did_claim) +} + +/// If block failed, we should revert and give vanilla a chance. +/// This atomically clears the block claim and sets the slot to current_slot - 1 +/// so that vanilla can claim the current slot. 
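+///
+/// A minimal sketch of the revert, using illustrative slot numbers (mirrors
+/// `test_block_failed_reverts` below):
+///
+/// ```rs
+/// block_should_schedule(100, true);    // block claims slot 100
+/// block_failed(100);                   // recording failed, state -> vanilla_claim(99)
+/// vanilla_should_schedule(100, false); // Some(true): vanilla can now build slot 100
+/// ```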
+pub fn block_failed(current_slot: u64) -> Option { + info!("block_failed {current_slot}"); + + // Atomically revert if we're still on the same slot with block claim + let did_revert = SCHEDULER_STATE + .fetch_update(Ordering::Release, Ordering::Acquire, |old_state| { + info!("block_failed fetch_update old_state={old_state}"); + + // Only revert if current slot is claimed by block + if old_state == SENTINEL { + return None; + } + + let old_slot = get_slot(old_state); + if old_slot != current_slot { + // Different slot, don't revert + return None; + } + + if !is_block_claim(old_state) { + // Not claimed by block, don't revert + return None; + } + + // Revert to previous slot (vanilla claim, so vanilla can now claim current_slot) + // Using wrapping_sub to handle slot 0 edge case + let new_state = vanilla_claim(current_slot.wrapping_sub(1)); + info!("block_failed reverting to state={new_state}"); + Some(new_state) + }) + .is_ok(); + + info!("block_failed did_revert={did_revert}"); + + if did_revert { + info!("block reverted in slot {current_slot}"); + } + + Some(did_revert) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_encoding() { + // Test vanilla claim + let v = vanilla_claim(42); + assert_eq!(get_slot(v), 42); + assert!(!is_block_claim(v)); + + // Test block claim + let b = block_claim(42); + assert_eq!(get_slot(b), 42); + assert!(is_block_claim(b)); + + // Test sentinel + assert_eq!(get_slot(SENTINEL), SLOT_MASK); // Very large, not a real slot + assert!(is_block_claim(SENTINEL)); // Top bit is set in u64::MAX + } + + #[test] + fn test_vanilla_claim_after_delegation() { + reset_for_tests(); + + // Not in delegation period, should be able to claim + let result = vanilla_should_schedule(100, false); + assert_eq!(result, Some(true)); + assert_eq!(last_slot_scheduled(), 100); + assert!(!is_slot_claimed_by_block()); + } + + #[test] + fn test_vanilla_during_delegation_unclaimed() { + reset_for_tests(); + + // In delegation period, unclaimed, should return None + let result = vanilla_should_schedule(100, true); + assert_eq!(result, None); + } + + #[test] + fn test_vanilla_during_delegation_claimed_by_vanilla() { + reset_for_tests(); + force_vanilla_claim(100); + + // In delegation period but already claimed by vanilla + let result = vanilla_should_schedule(100, true); + assert_eq!(result, Some(true)); + } + + #[test] + fn test_vanilla_during_delegation_claimed_by_block() { + reset_for_tests(); + SCHEDULER_STATE.store(block_claim(100), Ordering::Release); + + // In delegation period, claimed by block + let result = vanilla_should_schedule(100, true); + assert_eq!(result, Some(false)); + } + + #[test] + fn test_block_claim_during_delegation() { + reset_for_tests(); + + // In delegation period, should be able to claim + let result = block_should_schedule(100, true); + assert_eq!(result, Some(true)); + assert_eq!(last_slot_scheduled(), 100); + assert!(is_slot_claimed_by_block()); + } + + #[test] + fn test_block_outside_delegation() { + reset_for_tests(); + + // Not in delegation period, should return None + let result = block_should_schedule(100, false); + assert_eq!(result, None); + } + + #[test] + fn test_block_failed_reverts() { + reset_for_tests(); + + // Block claims slot 100 + block_should_schedule(100, true); + assert!(is_slot_claimed_by_block()); + + // Block fails, should revert + let result = block_failed(100); + assert_eq!(result, Some(true)); + + // Now vanilla should be able to claim slot 100 + let result = vanilla_should_schedule(100, false); + 
assert_eq!(result, Some(true)); + assert!(!is_slot_claimed_by_block()); + } + + #[test] + fn test_block_failed_wrong_slot() { + reset_for_tests(); + + // Block claims slot 100 + block_should_schedule(100, true); + + // Try to revert slot 99 (wrong slot) + let result = block_failed(99); + assert_eq!(result, Some(false)); + + // Slot 100 should still be claimed by block + assert!(is_slot_claimed_by_block()); + assert_eq!(last_slot_scheduled(), 100); + } +} diff --git a/core/src/tpu.rs b/core/src/tpu.rs index 47b01dfdcd..932bbad75a 100644 --- a/core/src/tpu.rs +++ b/core/src/tpu.rs @@ -138,6 +138,7 @@ pub struct Tpu { fetch_stage_manager: FetchStageManager, bundle_stage: BundleStage, bundle_sigverify_stage: BundleSigverifyStage, + block_stage: crate::block_stage::BlockStage, } impl Tpu { @@ -360,6 +361,7 @@ impl Tpu { let shredstream_receiver_address = Arc::new(ArcSwap::from_pointee(None)); // set by `[BlockEngineStage::connect_auth_and_stream()]` let (unverified_bundle_sender, unverified_bundle_receiver) = bounded(1024); + let (block_sender, block_receiver) = bounded::(1024); let block_engine_stage = BlockEngineStage::new( block_engine_config, unverified_bundle_sender, @@ -369,6 +371,7 @@ impl Tpu { exit.clone(), &block_builder_fee_info, shredstream_receiver_address.clone(), + block_sender, ); let (verified_bundle_sender, verified_bundle_receiver) = bounded(1024); let bundle_sigverify_stage = BundleSigverifyStage::new( @@ -449,19 +452,31 @@ impl Tpu { cluster_info, bank_forks.clone(), poh_recorder, - transaction_recorder, + transaction_recorder.clone(), verified_bundle_receiver, - transaction_status_sender, - replay_vote_sender, + transaction_status_sender.clone(), + replay_vote_sender.clone(), log_messages_bytes_limit, exit.clone(), tip_manager, bundle_account_locker, &block_builder_fee_info, - prioritization_fee_cache, + &prioritization_fee_cache, blacklisted_accounts, ); + let block_stage = crate::block_stage::BlockStage::new( + cluster_info, + bank_forks.clone(), + transaction_recorder, + block_receiver, + transaction_status_sender, + replay_vote_sender, + log_messages_bytes_limit, + exit.clone(), + &prioritization_fee_cache, + ); + let (entry_receiver, tpu_entry_notifier) = if let Some(entry_notification_sender) = entry_notification_sender { let (broadcast_entry_sender, broadcast_entry_receiver) = unbounded(); @@ -521,6 +536,7 @@ impl Tpu { fetch_stage_manager, bundle_stage, bundle_sigverify_stage, + block_stage, } } @@ -547,6 +563,7 @@ impl Tpu { self.tpu_vote_quic_t.join(), self.bundle_stage.join(), self.bundle_sigverify_stage.join(), + self.block_stage.join(), self.relayer_stage.join(), self.block_engine_stage.join(), self.fetch_stage_manager.join(), diff --git a/core/tests/block_consumer.rs b/core/tests/block_consumer.rs new file mode 100644 index 0000000000..2928412073 --- /dev/null +++ b/core/tests/block_consumer.rs @@ -0,0 +1,209 @@ +use { + agave_reserved_account_keys::ReservedAccountKeys, + agave_transaction_view::{ + resolved_transaction_view::ResolvedTransactionView, + transaction_view::SanitizedTransactionView, + }, + crossbeam_channel::unbounded, + solana_core::{banking_stage::committer::Committer, block_stage::BlockConsumer}, + solana_keypair::Keypair, + solana_ledger::genesis_utils::{create_genesis_config_with_leader, GenesisConfigInfo}, + solana_poh::{ + record_channels::{record_channels, RecordReceiver}, + transaction_recorder::TransactionRecorder, + }, + solana_pubkey::Pubkey, + solana_runtime::{ + bank::Bank, bank_forks::BankForks, 
prioritization_fee_cache::PrioritizationFeeCache, + }, + solana_runtime_transaction::runtime_transaction::RuntimeTransaction, + solana_signer::Signer, + solana_system_transaction::transfer, + solana_transaction::{sanitized::MessageHash, versioned::VersionedTransaction}, + std::sync::{Arc, RwLock}, +}; + +struct TestFixture { + genesis_config_info: GenesisConfigInfo, + bank: Arc, + #[allow(dead_code)] + bank_forks: Arc>, + block_consumer: BlockConsumer, + #[allow(dead_code)] + record_receiver: RecordReceiver, +} + +impl TestFixture { + fn new() -> Self { + let leader_keypair = Keypair::new(); + let genesis_config_info = + create_genesis_config_with_leader(10_000, &leader_keypair.pubkey(), 10_000_000); + let (bank, bank_forks) = + Bank::new_with_bank_forks_for_tests(&genesis_config_info.genesis_config); + + let (replay_vote_sender, _replay_vote_receiver) = unbounded(); + let committer = Committer::new( + None, + replay_vote_sender, + Arc::new(PrioritizationFeeCache::new(0u64)), + ); + + let (record_sender, mut record_receiver) = record_channels(false); + record_receiver.restart(bank.bank_id()); + let transaction_recorder = TransactionRecorder::new(record_sender); + + let block_consumer = BlockConsumer::new( + committer, + transaction_recorder, + None, // log_messages_bytes_limit + ); + + TestFixture { + genesis_config_info, + bank, + bank_forks, + block_consumer, + record_receiver, + } + } +} + +/// Helper to convert VersionedTransaction bytes to RuntimeTransaction +fn to_runtime_transaction( + serialized: &[u8], +) -> RuntimeTransaction> { + let transaction_view = + SanitizedTransactionView::try_new_sanitized(serialized, true).unwrap(); + let static_runtime_tx = RuntimeTransaction::>::try_from( + transaction_view, + MessageHash::Compute, + None, + ) + .unwrap(); + RuntimeTransaction::>::try_from( + static_runtime_tx, + None, + &ReservedAccountKeys::empty_key_set(), + ) + .unwrap() +} + +#[test] +fn test_block_consumer_executes_transactions() { + let mut fixture = TestFixture::new(); + let bank = fixture.bank.clone(); + let intended_slot = bank.slot(); + + // Create a test transaction - transfer from mint to a new account + let keypair1 = Keypair::new(); + + // Fund keypair1 from the mint (use small amount since mint has 10_000 lamports) + let transfer_amount = 1_000; + let transfer1 = transfer( + &fixture.genesis_config_info.mint_keypair, + &keypair1.pubkey(), + transfer_amount, + bank.last_blockhash(), + ); + + // Serialize the transaction - needs to stay alive for the transaction view + let serialized = bincode::serialize(&VersionedTransaction::from(transfer1)).unwrap(); + + // Convert to RuntimeTransaction using zerocopy parsing + let transactions = vec![to_runtime_transaction(&serialized)]; + + // Create max_ages for the transactions + let max_ages = vec![ + solana_core::banking_stage::scheduler_messages::MaxAge { + sanitized_epoch: bank.epoch(), + alt_invalidation_slot: bank.slot(), + }; + transactions.len() + ]; + + // Verify initial balance + assert_eq!(bank.get_balance(&keypair1.pubkey()), 0); + + // Process the block transactions (optimistic recording) + let output = fixture + .block_consumer + .process_and_record_block_transactions(&bank, &transactions, &max_ages, intended_slot); + + // Verify that transactions were processed successfully + let result = &output + .execute_and_commit_transactions_output + .commit_transactions_result; + if let Err(e) = result { + panic!("commit_transactions_result failed: {:?}", e); + } + + let commit_result = output + 
.execute_and_commit_transactions_output + .commit_transactions_result + .unwrap(); + assert_eq!(commit_result.len(), 1); + + // Verify the transaction was committed successfully + match &commit_result[0] { + solana_core::banking_stage::committer::CommitTransactionDetails::Committed { result, .. } => { + assert!(result.is_ok(), "Transaction should succeed: {:?}", result); + } + other => panic!("Expected Committed, got {:?}", other), + } + + // Verify that the balance was updated + assert_eq!(bank.get_balance(&keypair1.pubkey()), transfer_amount); +} + +#[test] +fn test_block_consumer_with_empty_transactions() { + let mut fixture = TestFixture::new(); + let bank = fixture.bank.clone(); + let intended_slot = bank.slot(); + + let transactions: Vec>> = vec![]; + let max_ages = vec![]; + + let output = fixture + .block_consumer + .process_and_record_block_transactions(&bank, &transactions, &max_ages, intended_slot); + + // Should return success for empty transactions + assert!(output + .execute_and_commit_transactions_output + .commit_transactions_result + .is_ok()); +} + +#[test] +fn test_block_consumer_with_invalid_transaction() { + let mut fixture = TestFixture::new(); + let bank = fixture.bank.clone(); + let intended_slot = bank.slot(); + + // Create an invalid transaction (insufficient funds) + let keypair = Keypair::new(); + let invalid_transfer = transfer( + &keypair, // No funds + &Pubkey::new_unique(), + 1_000_000, + bank.last_blockhash(), + ); + + let serialized = bincode::serialize(&VersionedTransaction::from(invalid_transfer)).unwrap(); + let transactions = vec![to_runtime_transaction(&serialized)]; + let max_ages = vec![solana_core::banking_stage::scheduler_messages::MaxAge { + sanitized_epoch: bank.epoch(), + alt_invalidation_slot: bank.slot(), + }]; + + let output = fixture + .block_consumer + .process_and_record_block_transactions(&bank, &transactions, &max_ages, intended_slot); + + // Should handle invalid transaction gracefully + assert!(output + .execute_and_commit_transactions_output + .commit_transactions_result + .is_ok()); +} diff --git a/cost-model/src/cost_tracker.rs b/cost-model/src/cost_tracker.rs index 7d5413aed0..80fc87c754 100644 --- a/cost-model/src/cost_tracker.rs +++ b/cost-model/src/cost_tracker.rs @@ -72,7 +72,9 @@ pub struct UpdatedCosts { pub struct CostTracker { account_cost_limit: u64, block_cost_limit: u64, - vote_cost_limit: u64, + pub vote_cost_limit: u64, + /// Original vote cost limit saved before proposer limits applied (0 = not modified) + pub original_vote_cost_limit: u64, cost_by_writable_accounts: HashMap, block_cost: SharedBlockCost, vote_cost: u64, @@ -100,6 +102,7 @@ impl Default for CostTracker { account_cost_limit: MAX_WRITABLE_ACCOUNT_UNITS, block_cost_limit: MAX_BLOCK_UNITS, vote_cost_limit: MAX_VOTE_UNITS, + original_vote_cost_limit: 0, cost_by_writable_accounts: HashMap::with_capacity_and_hasher( WRITABLE_ACCOUNTS_PER_BLOCK, ahash::RandomState::new(), @@ -120,6 +123,7 @@ impl Default for CostTracker { impl CostTracker { pub fn new_from_parent_limits(&self) -> Self { let mut new = Self::default(); + new.original_vote_cost_limit = self.original_vote_cost_limit; new.set_limits( self.account_cost_limit, self.block_cost_limit, @@ -155,6 +159,57 @@ impl CostTracker { self.vote_cost_limit = vote_cost_limit; } + /// Set proposer vote limit (4M CUs) for block execution. + /// Saves current limit to original_vote_cost_limit for restoration. 
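+    ///
+    /// Sketch of the intended pairing: `set_proposer_vote_limit()` saves the current
+    /// limit and caps votes at 4_000_000 CUs; `restore_vote_limit()` puts the saved
+    /// value back once block execution is done.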
+ pub fn set_proposer_vote_limit(&mut self) { + self.original_vote_cost_limit = self.vote_cost_limit; + self.vote_cost_limit = 4_000_000; + } + + /// Restore vote limit after block execution completes. + pub fn restore_vote_limit(&mut self) { + if self.original_vote_cost_limit > 0 { + self.vote_cost_limit = self.original_vote_cost_limit; + self.original_vote_cost_limit = 0; + } + } + + /// Add actual executed transaction costs to the tracker in batch. + /// Used after block execution to track real CUs consumed. + /// Takes iterator of (transaction, compute_units, loaded_accounts_data_size) to add all at once. + pub fn add_executed_transaction_costs<'a, Tx: TransactionWithMeta + 'a>( + &mut self, + executed_txs: impl Iterator, + ) { + for (tx, actual_compute_units, loaded_accounts_data_size) in executed_txs { + self.transaction_count += 1; + self.allocated_accounts_data_size += loaded_accounts_data_size as u64; + self.block_cost.fetch_add(actual_compute_units); + + // Track signature counts + self.transaction_signature_count += tx.num_transaction_signatures(); + self.secp256k1_instruction_signature_count += tx.num_secp256k1_signatures(); + self.ed25519_instruction_signature_count += tx.num_ed25519_signatures(); + self.secp256r1_instruction_signature_count += tx.num_secp256r1_signatures(); + + if tx.is_simple_vote_transaction() { + self.vote_cost = self.vote_cost.saturating_add(actual_compute_units); + } + + // Add per-account costs for writable accounts + let account_keys = tx.account_keys(); + for (i, account_key) in account_keys.iter().enumerate() { + if tx.is_writable(i) { + let account_cost = self + .cost_by_writable_accounts + .entry(*account_key) + .or_insert(0); + *account_cost = account_cost.saturating_add(actual_compute_units); + } + } + } + } + pub fn in_flight_transaction_count(&self) -> usize { self.in_flight_transaction_count.0 } diff --git a/entry/src/poh.rs b/entry/src/poh.rs index 7008a215bb..faa491b82e 100644 --- a/entry/src/poh.rs +++ b/entry/src/poh.rs @@ -52,6 +52,11 @@ impl Poh { self.hashes_per_tick } + /// Returns the remaining hashes until the next tick. 
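+    ///
+    /// As used by `PohRecorder::record`, the last of these hashes is reserved for the
+    /// tick itself, so at most `remaining_hashes() - 1` mixins can be recorded before
+    /// the next tick.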
+ pub fn remaining_hashes(&self) -> u64 { + self.remaining_hashes_until_tick + } + pub fn target_poh_time(&self, target_ns_per_tick: u64) -> Instant { assert!(self.hashes_per_tick > 0); let offset_tick_ns = target_ns_per_tick * self.tick_number; diff --git a/jito-protos/protos b/jito-protos/protos index 46ead86a13..ce5f8900de 160000 --- a/jito-protos/protos +++ b/jito-protos/protos @@ -1 +1 @@ -Subproject commit 46ead86a13a55a0ef2c139db96a8ee93bf7505e3 +Subproject commit ce5f8900deb02ea1daef696f4ea7735f284aa702 diff --git a/jito-protos/src/lib.rs b/jito-protos/src/lib.rs index cf630c53d2..5e935749f1 100644 --- a/jito-protos/src/lib.rs +++ b/jito-protos/src/lib.rs @@ -3,6 +3,10 @@ pub mod proto { tonic::include_proto!("auth"); } + pub mod block { + tonic::include_proto!("block"); + } + pub mod block_engine { tonic::include_proto!("block_engine"); } diff --git a/poh/src/poh_recorder.rs b/poh/src/poh_recorder.rs index b58511cae5..bc0aac136c 100644 --- a/poh/src/poh_recorder.rs +++ b/poh/src/poh_recorder.rs @@ -62,6 +62,9 @@ pub enum PohRecorderError { #[error("channel disconnected")] ChannelDisconnected, + + #[error("harmonic block invalid signature")] + HarmonicBlockInvalidSignature, } pub(crate) type Result = std::result::Result; @@ -305,8 +308,8 @@ impl PohRecorder { pub(crate) fn record( &mut self, bank_id: BankId, - mixins: Vec, - transaction_batches: Vec>, + mut mixins: Vec, + mut transaction_batches: Vec>, ) -> Result { // Entries without transactions are used to track real-time passing in the ledger and // cannot be generated by `record()` @@ -325,6 +328,32 @@ impl PohRecorder { self.metrics.report_metrics_us += report_metrics_us; } + // Ensure that we can fit all the mixins in this slot before recording anything. + // This ensures all-or-nothing recording for blocks. 
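+        // Capacity sketch (hypothetical numbers): with hashes_per_tick = 12_500,
+        // 3 full ticks remaining after the current one, and 5_000 hashes left in the
+        // current tick, at most (5_000 - 1) + 3 * (12_500 - 1) mixins fit; a block
+        // with more mixins than that is rejected before anything is recorded.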
+ let working_bank = self + .working_bank + .as_mut() + .ok_or(PohRecorderError::MaxHeightReached)?; + let poh_lock = self.poh.lock().unwrap(); + let remaining_ticks = working_bank + .bank + .max_tick_height() + // +1 here to ignore the current tick window, which is covered by remaining_hashes + .saturating_sub(self.tick_height() + 1); + // The last hash in each tick is reserved for the tick itself, so subtract 1 per tick + let tick_remaining_hashes = poh_lock.remaining_hashes().saturating_sub(1); + let total_remaining_hashes = tick_remaining_hashes + .saturating_add(remaining_ticks.saturating_mul(poh_lock.hashes_per_tick().saturating_sub(1))); + if mixins.len() as u64 > total_remaining_hashes { + info!( + "Insufficient hashes remaining for all-or-nothing recording: {} mixins > {} remaining hashes", + mixins.len(), + total_remaining_hashes + ); + return Err(PohRecorderError::MaxHeightReached); + } + drop(poh_lock); + loop { let (flush_cache_res, flush_cache_us) = measure_us!(self.flush_cache(false)); self.metrics.flush_cache_no_tick_us += flush_cache_us; @@ -342,17 +371,24 @@ impl PohRecorder { let (mut poh_lock, poh_lock_us) = measure_us!(self.poh.lock().unwrap()); self.metrics.record_lock_contention_us += poh_lock_us; - let (mixed_in, record_mixin_us) = - measure_us!(poh_lock.record_batches(&mixins, &mut self.entries)); + // Process mixins in batches that fit within the current tick + let batch_len = std::cmp::min((poh_lock.remaining_hashes() - 1) as usize, mixins.len()); + let (mixed_in, record_mixin_us) = if batch_len == 0 { + // Don't record mixins on tick boundary, just tick + (false, 0) + } else { + measure_us!(poh_lock.record_batches(&mixins[..batch_len], &mut self.entries)) + }; self.metrics.record_us += record_mixin_us; - let remaining_hashes_in_slot = - poh_lock.remaining_hashes_in_slot(working_bank.bank.ticks_per_slot()); - drop(poh_lock); if mixed_in { - debug_assert_eq!(self.entries.len(), mixins.len()); - for (entry, transactions) in self.entries.drain(..).zip(transaction_batches) { + debug_assert_eq!(self.entries.len(), batch_len); + for (entry, transactions) in self + .entries + .drain(..) + .zip(transaction_batches.drain(..batch_len)) + { let (send_entry_res, send_batches_us) = measure_us!(self.working_bank_sender.send(( working_bank.bank.clone(), @@ -369,9 +405,16 @@ impl PohRecorder { send_entry_res?; } - return Ok(RecordSummary { - remaining_hashes_in_slot, - }); + drop(mixins.drain(..batch_len)); + // We checked available hashes before the loop, so we should always drain all + if mixins.is_empty() { + let poh_lock = self.poh.lock().unwrap(); + let remaining_hashes_in_slot = + poh_lock.remaining_hashes_in_slot(working_bank.bank.ticks_per_slot()); + return Ok(RecordSummary { + remaining_hashes_in_slot, + }); + } } // record() might fail if the next PoH hash needs to be a tick. But that's ok, tick() diff --git a/runtime/src/bank.rs b/runtime/src/bank.rs index 25013136a1..b78448ac93 100644 --- a/runtime/src/bank.rs +++ b/runtime/src/bank.rs @@ -1394,6 +1394,14 @@ impl Bank { epoch_rewards_calculation_cache: parent.epoch_rewards_calculation_cache.clone(), }; + // cavey unset limits. 
if we are proposer again this is set when we set_tpu_bank + let mut cost_tracker = new.cost_tracker.write().unwrap(); + if cost_tracker.original_vote_cost_limit > 0 { + cost_tracker.vote_cost_limit = cost_tracker.original_vote_cost_limit; + cost_tracker.original_vote_cost_limit = 0; + } + drop(cost_tracker); + let (_, ancestors_time_us) = measure_us!({ let mut ancestors = Vec::with_capacity(1 + new.parents().len()); ancestors.push(new.slot()); @@ -1420,6 +1428,8 @@ impl Bank { new.distribute_partitioned_epoch_rewards(); }); + + let (_, cache_preparation_time_us) = measure_us!(new.prepare_program_cache_for_upcoming_feature_set()); @@ -2006,6 +2016,25 @@ impl Bank { self.hash.read().unwrap() } + /// Lock the blockhash queue to prevent slot from ending during parallel execution. + /// This is used alongside freeze_lock for optimistic recording. + pub fn blockhash_queue_lock(&self) -> RwLockReadGuard<'_, BlockhashQueue> { + self.blockhash_queue.read().unwrap() + } + + /// Set proposer vote limit (4M CUs) for our leader slot. + /// Limits vote processing capacity during block execution. + pub fn set_proposer_vote_limit(&self) { + self.write_cost_tracker() + .unwrap() + .set_proposer_vote_limit(); + } + + /// Restore vote limit after block execution completes. + pub fn restore_vote_limit(&self) { + self.write_cost_tracker().unwrap().restore_vote_limit(); + } + pub fn hash(&self) -> Hash { *self.hash.read().unwrap() } @@ -3267,14 +3296,14 @@ impl Bank { } let mut simulation_results = Vec::new(); - let mut account_overrides = AccountOverrides::default(); + let account_overrides = AccountOverrides::default(); // Pre-load all the account state into account overrides for transaction in transactions { let account_keys = transaction.account_keys(); account_overrides.merge(self.get_account_overrides_for_simulation(&account_keys)); for account in transaction.account_keys().iter() { - if !account_overrides.accounts().contains_key(account) { + if account_overrides.get(account).is_none() { if let Some((account_shared_data, _slot)) = self.get_account_shared_data(account) { @@ -3471,7 +3500,7 @@ impl Bank { &self, account_keys: &AccountKeys, ) -> AccountOverrides { - let mut account_overrides = AccountOverrides::default(); + let account_overrides = AccountOverrides::default(); let slot_history_id = sysvar::slot_history::id(); if account_keys.iter().any(|pubkey| *pubkey == slot_history_id) { let current_account = self.get_account_with_fixed_root(&slot_history_id); diff --git a/svm/Cargo.toml b/svm/Cargo.toml index ed58692b3a..160bebb77a 100644 --- a/svm/Cargo.toml +++ b/svm/Cargo.toml @@ -38,6 +38,7 @@ svm-internal = [] [dependencies] ahash = { workspace = true } log = { workspace = true } +scc = { workspace = true } percentage = { workspace = true } qualifier_attr = { workspace = true, optional = true } serde = { workspace = true, features = ["rc"] } diff --git a/svm/src/account_loader.rs b/svm/src/account_loader.rs index af096e0be4..ebeec53507 100644 --- a/svm/src/account_loader.rs +++ b/svm/src/account_loader.rs @@ -186,9 +186,10 @@ impl<'a, CB: TransactionProcessingCallback> AccountLoader<'a, CB> { // Jito added: let's use pre-execution accounts if let Some(overrides) = account_overrides { - for (pubkey, account) in overrides.accounts().iter() { + overrides.accounts().iter_sync(|pubkey, account| { loaded_accounts.insert(*pubkey, account.clone()); - } + true // continue iteration + }); } Self { @@ -1300,7 +1301,7 @@ mod tests { #[test] fn test_overrides() { agave_logger::setup(); - let mut account_overrides 
= AccountOverrides::default(); + let account_overrides = AccountOverrides::default(); let slot_history_id = sysvar::slot_history::id(); let account = AccountSharedData::new(42, 0, &Pubkey::default()); account_overrides.set_slot_history(Some(account)); diff --git a/svm/src/account_overrides.rs b/svm/src/account_overrides.rs index 737965b91d..442cfe697c 100644 --- a/svm/src/account_overrides.rs +++ b/svm/src/account_overrides.rs @@ -1,35 +1,49 @@ use { - solana_account::AccountSharedData, solana_pubkey::Pubkey, solana_sdk_ids::sysvar, - std::collections::HashMap, + scc::HashMap, + solana_account::AccountSharedData, + solana_pubkey::Pubkey, + solana_sdk_ids::sysvar, }; /// Encapsulates overridden accounts, typically used for transaction /// simulations. Account overrides are currently not used when loading the /// durable nonce account or when constructing the instructions sysvar account. -#[derive(Clone, Default, Debug)] +/// +/// Uses scc::HashMap for lock-free concurrent access, allowing multiple +/// threads to read/write account overrides simultaneously. pub struct AccountOverrides { - accounts: HashMap, + accounts: HashMap, } impl AccountOverrides { /// Insert or remove an account with a given pubkey to/from the list of overrides. - pub fn set_account(&mut self, pubkey: &Pubkey, account: Option) { + /// Thread-safe: can be called from multiple threads concurrently. + pub fn set_account(&self, pubkey: &Pubkey, account: Option) { match account { - Some(account) => self.accounts.insert(*pubkey, account), - None => self.accounts.remove(pubkey), + Some(account) => { + let _ = self.accounts.upsert_sync(*pubkey, account); + } + None => { + let _ = self.accounts.remove_sync(pubkey).map(|kv| kv.1); + } }; } /// Sets in the slot history /// /// Note: no checks are performed on the correctness of the contained data - pub fn set_slot_history(&mut self, slot_history: Option) { + pub fn set_slot_history(&self, slot_history: Option) { self.set_account(&sysvar::slot_history::id(), slot_history); } - /// Gets the account if it's found in the list of overrides - pub fn get(&self, pubkey: &Pubkey) -> Option<&AccountSharedData> { - self.accounts.get(pubkey) + /// Gets the account if it's found in the list of overrides. + /// Returns an OccupiedEntry which holds a reference to the value. + pub fn get<'a>( + &'a self, + pubkey: &Pubkey, + ) -> Option> + { + self.accounts.get_sync(pubkey) } pub fn len(&self) -> usize { @@ -40,12 +54,27 @@ impl AccountOverrides { self.accounts.is_empty() } - pub fn accounts(&self) -> &HashMap { + /// Returns a reference to the underlying HashMap. + /// Note: For iteration, use scan() or scan_async() methods on the HashMap. + pub fn accounts(&self) -> &HashMap { &self.accounts } - pub fn merge(&mut self, other: AccountOverrides) { - self.accounts.extend(other.accounts); + /// Merge another AccountOverrides into this one. + /// Thread-safe: can be called while other threads are reading/writing. 
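
// A minimal standalone sketch of why the interior-mutability rewrite matters: because
// set_account()/get() now take &self, one AccountOverrides can be shared across worker
// threads behind an Arc with no outer lock. Crate paths and signatures follow this
// patch; treat them as assumptions if the surrounding code has since moved.
fn shared_overrides_sketch() {
    use {
        solana_account::AccountSharedData,
        solana_pubkey::Pubkey,
        solana_svm::account_overrides::AccountOverrides,
        std::{sync::Arc, thread},
    };

    let overrides = Arc::new(AccountOverrides::default());
    let keys: Vec<Pubkey> = (0..4).map(|_| Pubkey::new_unique()).collect();

    let handles: Vec<_> = keys
        .iter()
        .map(|key| {
            let overrides = Arc::clone(&overrides);
            let key = *key;
            thread::spawn(move || {
                // Each worker installs its own override concurrently through &self.
                overrides.set_account(&key, Some(AccountSharedData::new(1, 0, &Pubkey::default())));
            })
        })
        .collect();
    for handle in handles {
        handle.join().unwrap();
    }

    // Every write is visible without ever needing &mut AccountOverrides.
    assert!(keys.iter().all(|key| overrides.get(key).is_some()));
}
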
+ pub fn merge(&self, other: AccountOverrides) { + other.accounts.iter_sync(|k, v| { + let _ = self.accounts.upsert_sync(*k, v.clone()); + true // continue iteration + }); + } +} + +impl Default for AccountOverrides { + fn default() -> AccountOverrides { + AccountOverrides { + accounts: HashMap::<_, _, ahash::RandomState>::with_hasher(ahash::RandomState::new()), + } } } @@ -58,11 +87,11 @@ mod test { #[test] fn test_set_account() { - let mut accounts = AccountOverrides::default(); + let accounts = AccountOverrides::default(); let data = AccountSharedData::default(); let key = Pubkey::new_unique(); accounts.set_account(&key, Some(data.clone())); - assert_eq!(accounts.get(&key), Some(&data)); + assert_eq!(accounts.get(&key).map(|e| e.get().clone()), Some(data)); accounts.set_account(&key, None); assert!(accounts.get(&key).is_none()); @@ -70,12 +99,17 @@ mod test { #[test] fn test_slot_history() { - let mut accounts = AccountOverrides::default(); + let accounts = AccountOverrides::default(); let data = AccountSharedData::default(); - assert_eq!(accounts.get(&sysvar::slot_history::id()), None); + assert!(accounts.get(&sysvar::slot_history::id()).is_none()); accounts.set_slot_history(Some(data.clone())); - assert_eq!(accounts.get(&sysvar::slot_history::id()), Some(&data)); + assert_eq!( + accounts + .get(&sysvar::slot_history::id()) + .map(|e| e.get().clone()), + Some(data) + ); } } From e8d0c47b086204bfa1036bbbda45915e743741ae Mon Sep 17 00:00:00 2001 From: cavemanloverboy Date: Mon, 5 Jan 2026 11:07:35 +0000 Subject: [PATCH 02/23] send leader window notifications --- core/src/proxy/block_engine_stage.rs | 47 ++++++++++++++++++++++++++++ core/src/replay_stage.rs | 31 ++++++++++++++++++ core/src/tpu.rs | 2 ++ core/src/tvu.rs | 2 ++ core/src/validator.rs | 5 +++ 5 files changed, 87 insertions(+) diff --git a/core/src/proxy/block_engine_stage.rs b/core/src/proxy/block_engine_stage.rs index 63f87a7c76..360bba2e63 100644 --- a/core/src/proxy/block_engine_stage.rs +++ b/core/src/proxy/block_engine_stage.rs @@ -24,6 +24,7 @@ use { block_engine::{ self, block_engine_validator_client::BlockEngineValidatorClient, BlockBuilderFeeInfoRequest, BlockEngineEndpoint, GetBlockEngineEndpointRequest, + SubmitLeaderWindowInfoRequest, }, }, solana_gossip::cluster_info::ClusterInfo, @@ -131,6 +132,8 @@ impl BlockEngineStage { shredstream_receiver_address: Arc>>, // Channel that blocks get piped through. block_tx: Sender, + // Channel for leader window notifications. 
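
// Standalone sketch of the notification channel being threaded through here: replay
// (a synchronous thread) pushes a (start_time, slot) pair with try_send so it can
// never block, and the block-engine task awaits it next to its other streams. The
// channel type matches the patch; the task structure around it is illustrative only.
fn leader_window_channel_sketch() {
    use std::time::SystemTime;

    let (leader_window_sender, mut leader_window_receiver) =
        tokio::sync::mpsc::channel::<(SystemTime, u64)>(128);

    // Replay side: fire-and-forget. A full buffer only costs us one notification.
    let _ = leader_window_sender.try_send((SystemTime::now(), 1234));

    // Block-engine side: one arm of the select! loop shown further down.
    let runtime = tokio::runtime::Runtime::new().unwrap();
    runtime.block_on(async move {
        tokio::select! {
            maybe_leader_window = leader_window_receiver.recv() => {
                if let Some((start, slot)) = maybe_leader_window {
                    println!("leader window for slot {slot} starts at {start:?}");
                }
            }
        }
    });
}
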
+ leader_window_receiver: tokio::sync::mpsc::Receiver<(std::time::SystemTime, u64)>, ) -> Self { let block_builder_fee_info = block_builder_fee_info.clone(); @@ -151,6 +154,7 @@ impl BlockEngineStage { block_builder_fee_info, shredstream_receiver_address, block_tx, + leader_window_receiver, )); }) .unwrap(); @@ -178,6 +182,7 @@ impl BlockEngineStage { block_builder_fee_info: Arc>, shredstream_receiver_address: Arc>>, block_tx: Sender, + mut leader_window_receiver: tokio::sync::mpsc::Receiver<(std::time::SystemTime, u64)>, ) { let mut error_count: u64 = 0; @@ -203,6 +208,7 @@ impl BlockEngineStage { &shredstream_receiver_address, &local_block_engine_config, &block_tx, + &mut leader_window_receiver, ) .await { @@ -239,6 +245,7 @@ impl BlockEngineStage { shredstream_receiver_address: &Arc>>, local_block_engine_config: &BlockEngineConfig, block_tx: &Sender, + leader_window_receiver: &mut tokio::sync::mpsc::Receiver<(std::time::SystemTime, u64)>, ) -> crate::proxy::Result<()> { let endpoint = Self::get_endpoint(&local_block_engine_config.block_engine_url)?; if !local_block_engine_config.disable_block_engine_autoconfig { @@ -259,6 +266,7 @@ impl BlockEngineStage { block_builder_fee_info, shredstream_receiver_address, block_tx, + leader_window_receiver, ) .await; } @@ -292,6 +300,7 @@ impl BlockEngineStage { block_builder_fee_info, &Self::CONNECTION_TIMEOUT, block_tx, + leader_window_receiver, ) .await .inspect(|_| { @@ -317,6 +326,7 @@ impl BlockEngineStage { block_builder_fee_info: &Arc>, shredstream_receiver_address: &Arc>>, block_tx: &Sender, + leader_window_receiver: &mut tokio::sync::mpsc::Receiver<(std::time::SystemTime, u64)>, ) -> crate::proxy::Result<()> { let candidates = Self::get_ranked_endpoints(&endpoint).await?; @@ -353,6 +363,7 @@ impl BlockEngineStage { block_builder_fee_info, &Self::CONNECTION_TIMEOUT, block_tx, + leader_window_receiver, ) .await { @@ -472,6 +483,7 @@ impl BlockEngineStage { block_builder_fee_info: &Arc>, connection_timeout: &Duration, block_tx: &Sender, + leader_window_receiver: &mut tokio::sync::mpsc::Receiver<(std::time::SystemTime, u64)>, ) -> crate::proxy::Result<()> { // Get a copy of configs here in case they have changed at runtime let keypair = cluster_info.keypair().clone(); @@ -534,6 +546,7 @@ impl BlockEngineStage { cluster_info, &backend_url, block_tx, + leader_window_receiver, ) .await } @@ -717,6 +730,7 @@ impl BlockEngineStage { cluster_info: &Arc, block_engine_url: &str, block_tx: &Sender, + leader_window_receiver: &mut tokio::sync::mpsc::Receiver<(std::time::SystemTime, u64)>, ) -> crate::proxy::Result<()> { let subscribe_packets_stream = timeout( *connection_timeout, @@ -789,6 +803,7 @@ impl BlockEngineStage { connection_timeout, block_engine_url, &block_tx, + leader_window_receiver, ) .await } @@ -816,6 +831,7 @@ impl BlockEngineStage { connection_timeout: &Duration, block_engine_url: &str, block_tx: &Sender, + leader_window_receiver: &mut tokio::sync::mpsc::Receiver<(std::time::SystemTime, u64)>, ) -> crate::proxy::Result<()> { const METRICS_TICK: Duration = Duration::from_secs(1); const MAINTENANCE_TICK: Duration = Duration::from_secs(10 * 60); @@ -838,6 +854,9 @@ impl BlockEngineStage { maybe_bundles = bundle_stream.message() => { Self::handle_block_engine_maybe_bundles(maybe_bundles, bundle_tx, &mut block_engine_stats)?; } + maybe_leader_window = leader_window_receiver.recv() => { + Self::handle_leader_window_notification(maybe_leader_window, &mut client).await?; + } maybe_blocks = block_stream.message() => { 
Self::handle_block_engine_maybe_blocks(maybe_blocks, block_tx, &mut block_engine_stats)?; } @@ -980,6 +999,34 @@ impl BlockEngineStage { Ok(()) } + async fn handle_leader_window_notification( + notification: Option<(std::time::SystemTime, u64)>, + client: &mut BlockEngineValidatorClient>, + ) -> crate::proxy::Result<()> { + let Some((time, slot)) = notification else { + return Err(ProxyError::GrpcStreamDisconnected); + }; + + info!("Handling leader window notification ({:?}, {})", time, slot); + + match client + .submit_leader_window_info(SubmitLeaderWindowInfoRequest { + start_timestamp: Some(prost_types::Timestamp::from(time)), + slot, + }) + .await + { + Ok(_) => { + info!("Successfully submitted leader window info for slot {}", slot); + } + Err(e) => { + error!("Failed to submit leader window info: {e}"); + } + } + + Ok(()) + } + fn handle_block_engine_packets( resp: block_engine::SubscribePacketsResponse, packet_tx: &Sender, diff --git a/core/src/replay_stage.rs b/core/src/replay_stage.rs index d7b0fac040..337dffa206 100644 --- a/core/src/replay_stage.rs +++ b/core/src/replay_stage.rs @@ -308,6 +308,7 @@ pub struct ReplaySenders { pub drop_bank_sender: Sender>, pub block_metadata_notifier: Option, pub dumped_slots_sender: Sender>, + pub leader_window_sender: tokio::sync::mpsc::Sender<(std::time::SystemTime, u64)>, } pub struct ReplayReceivers { @@ -614,6 +615,7 @@ impl ReplayStage { drop_bank_sender, block_metadata_notifier, dumped_slots_sender, + leader_window_sender, } = senders; let ReplayReceivers { @@ -1224,6 +1226,7 @@ impl ReplayStage { has_new_vote_been_rooted, &first_alpenglow_slot, &mut is_alpenglow_migration_complete, + &leader_window_sender, ) { Self::log_leader_change( &my_pubkey, @@ -2147,6 +2150,7 @@ impl ReplayStage { has_new_vote_been_rooted: bool, first_alpenglow_slot: &Option, is_alpenglow_migration_complete: &mut bool, + leader_window_sender: &tokio::sync::mpsc::Sender<(std::time::SystemTime, u64)>, ) -> Option { // all the individual calls to poh_recorder.read() are designed to // increase granularity, decrease contention @@ -2286,6 +2290,25 @@ impl ReplayStage { poh_controller, tpu_bank, ); + + // Send leader window notification to block auction house + let window_start_time = std::time::SystemTime::now(); + match leader_window_sender.try_send((window_start_time, poh_slot)) { + Ok(()) => { + info!( + "Sent leader window notification ({:?}, {})", + window_start_time, poh_slot + ); + } + Err(e) => { + error!( + "Failed to send leader window notification. Is consumer thread connected \ + to the block auction house? 
error: {}", + e + ); + } + } + Some(poh_slot) } else { error!("{my_pubkey} No next leader found"); @@ -8793,6 +8816,7 @@ pub(crate) mod tests { let rpc_subscriptions = Some(rpc_subscriptions); + let (leader_window_sender, _) = tokio::sync::mpsc::channel(1); assert!(ReplayStage::maybe_start_leader( my_pubkey, bank_forks, @@ -8808,6 +8832,7 @@ pub(crate) mod tests { has_new_vote_been_rooted, &None, &mut false, + &leader_window_sender, ) .is_none()); } @@ -9471,6 +9496,7 @@ pub(crate) mod tests { poh_recorder.read().unwrap().reached_leader_slot(&my_pubkey), PohLeaderStatus::NotReached ); + let (leader_window_sender, _) = tokio::sync::mpsc::channel(1); assert!(ReplayStage::maybe_start_leader( &my_pubkey, &bank_forks, @@ -9486,6 +9512,7 @@ pub(crate) mod tests { has_new_vote_been_rooted, &None, &mut false, + &leader_window_sender, ) .is_none()); @@ -9516,6 +9543,7 @@ pub(crate) mod tests { has_new_vote_been_rooted, &None, &mut false, + &leader_window_sender, ) .is_some()); wait_for_poh_service(&poh_controller); @@ -9823,6 +9851,7 @@ pub(crate) mod tests { let has_new_vote_been_rooted = true; // We should start leader for the poh slot, however alpenglow migration should not be started + let (leader_window_sender, _) = tokio::sync::mpsc::channel(1); assert!(ReplayStage::maybe_start_leader( &my_pubkey, &bank_forks, @@ -9838,6 +9867,7 @@ pub(crate) mod tests { has_new_vote_been_rooted, &None, &mut is_alpenglow_migration_complete, + &leader_window_sender, ) .is_some()); assert!(!is_alpenglow_migration_complete); @@ -9873,6 +9903,7 @@ pub(crate) mod tests { has_new_vote_been_rooted, &Some(alpenglow_slot), &mut is_alpenglow_migration_complete, + &leader_window_sender, ) .is_none()); assert!(is_alpenglow_migration_complete); diff --git a/core/src/tpu.rs b/core/src/tpu.rs index 932bbad75a..14b02c0486 100644 --- a/core/src/tpu.rs +++ b/core/src/tpu.rs @@ -190,6 +190,7 @@ impl Tpu { relayer_config: Arc>, tip_manager_config: TipManagerConfig, shred_receiver_address: Arc>>, + leader_window_receiver: tokio::sync::mpsc::Receiver<(std::time::SystemTime, u64)>, ) -> Self { let TpuSockets { transactions: transactions_sockets, @@ -372,6 +373,7 @@ impl Tpu { &block_builder_fee_info, shredstream_receiver_address.clone(), block_sender, + leader_window_receiver, ); let (verified_bundle_sender, verified_bundle_receiver) = bounded(1024); let bundle_sigverify_stage = BundleSigverifyStage::new( diff --git a/core/src/tvu.rs b/core/src/tvu.rs index d3982f6db0..7bff32a9f7 100644 --- a/core/src/tvu.rs +++ b/core/src/tvu.rs @@ -176,6 +176,7 @@ impl Tvu { slot_status_notifier: Option, vote_connection_cache: Arc, shred_receiver_addr: Arc>>, + leader_window_sender: tokio::sync::mpsc::Sender<(std::time::SystemTime, u64)>, ) -> Result { let in_wen_restart = wen_restart_repair_slots.is_some(); @@ -317,6 +318,7 @@ impl Tvu { drop_bank_sender, block_metadata_notifier, dumped_slots_sender, + leader_window_sender, }; let replay_receivers = ReplayReceivers { diff --git a/core/src/validator.rs b/core/src/validator.rs index 98d51cd64c..704afe7eda 100644 --- a/core/src/validator.rs +++ b/core/src/validator.rs @@ -1620,6 +1620,9 @@ impl Validator { None }; + // Channel for leader window notifications from replay_stage to block_engine_stage + let (leader_window_sender, leader_window_receiver) = tokio::sync::mpsc::channel(128); + let tvu = Tvu::new( vote_account, authorized_voter_keypairs, @@ -1684,6 +1687,7 @@ impl Validator { slot_status_notifier, vote_connection_cache, config.shred_retransmit_receiver_address.clone(), + 
leader_window_sender, ) .map_err(ValidatorError::Other)?; @@ -1780,6 +1784,7 @@ impl Validator { config.relayer_config.clone(), config.tip_manager_config.clone(), config.shred_receiver_address.clone(), + leader_window_receiver, ); datapoint_info!( From a40d0fafda2a724ea39901452d5f82d8f4b55892 Mon Sep 17 00:00:00 2001 From: cavemanloverboy Date: Mon, 5 Jan 2026 11:08:40 +0000 Subject: [PATCH 03/23] fix vote worker (upstream bug) --- core/src/banking_stage/vote_worker.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/banking_stage/vote_worker.rs b/core/src/banking_stage/vote_worker.rs index 8360a9e461..78e6e2f2bf 100644 --- a/core/src/banking_stage/vote_worker.rs +++ b/core/src/banking_stage/vote_worker.rs @@ -441,7 +441,7 @@ impl VoteWorker { total_transaction_counts .accumulate(&transaction_counts, commit_transactions_result.is_ok()); - let should_bank_still_be_processing_txs = bank.is_complete(); + let should_bank_still_be_processing_txs = !bank.is_complete(); let reached_max_poh_height = match ( commit_transactions_result, should_bank_still_be_processing_txs, From 6870f264a7da0a6fb452804c2d8c3e5a5cbc5b73 Mon Sep 17 00:00:00 2001 From: cavemanloverboy <93507302+cavemanloverboy@users.noreply.github.com> Date: Tue, 6 Jan 2026 22:43:24 -0800 Subject: [PATCH 04/23] use restore_vote_limit Co-authored-by: devinschmitz <94090407+devinschmitz@users.noreply.github.com> --- runtime/src/bank.rs | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/runtime/src/bank.rs b/runtime/src/bank.rs index b78448ac93..e18c7790df 100644 --- a/runtime/src/bank.rs +++ b/runtime/src/bank.rs @@ -1395,12 +1395,7 @@ impl Bank { }; // cavey unset limits. if we are proposer again this is set when we set_tpu_bank - let mut cost_tracker = new.cost_tracker.write().unwrap(); - if cost_tracker.original_vote_cost_limit > 0 { - cost_tracker.vote_cost_limit = cost_tracker.original_vote_cost_limit; - cost_tracker.original_vote_cost_limit = 0; - } - drop(cost_tracker); + new.restore_vote_limit(); let (_, ancestors_time_us) = measure_us!({ let mut ancestors = Vec::with_capacity(1 + new.parents().len()); From 0f948edcd24a0d68c488526bbfd0e2788a6447de Mon Sep 17 00:00:00 2001 From: cavemanloverboy Date: Wed, 7 Jan 2026 08:46:44 +0000 Subject: [PATCH 05/23] fmt/nit reduce diff --- core/src/banking_stage/decision_maker.rs | 5 ++--- runtime/src/bank.rs | 8 ++------ 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/core/src/banking_stage/decision_maker.rs b/core/src/banking_stage/decision_maker.rs index 308f12736e..1df20fbeef 100644 --- a/core/src/banking_stage/decision_maker.rs +++ b/core/src/banking_stage/decision_maker.rs @@ -9,7 +9,7 @@ use { solana_runtime::bank::Bank, solana_unified_scheduler_pool::{BankingStageMonitor, BankingStageStatus}, std::sync::{ - atomic::{AtomicBool, Ordering}, + atomic::{AtomicBool, Ordering::Relaxed}, Arc, }, }; @@ -138,7 +138,7 @@ impl DecisionMakerWrapper { impl BankingStageMonitor for DecisionMakerWrapper { fn status(&mut self) -> BankingStageStatus { - if self.is_exited.load(Ordering::Relaxed) { + if self.is_exited.load(Relaxed) { BankingStageStatus::Exited } else if matches!( self.decision_maker.make_consume_or_forward_decision(), @@ -151,7 +151,6 @@ impl BankingStageMonitor for DecisionMakerWrapper { } } - #[cfg(test)] mod tests { use { diff --git a/runtime/src/bank.rs b/runtime/src/bank.rs index e18c7790df..bf2e2de96c 100644 --- a/runtime/src/bank.rs +++ b/runtime/src/bank.rs @@ -1395,7 +1395,7 @@ impl Bank { }; // cavey unset limits. 
if we are proposer again this is set when we set_tpu_bank - new.restore_vote_limit(); + new.restore_vote_limit(); let (_, ancestors_time_us) = measure_us!({ let mut ancestors = Vec::with_capacity(1 + new.parents().len()); @@ -1423,8 +1423,6 @@ impl Bank { new.distribute_partitioned_epoch_rewards(); }); - - let (_, cache_preparation_time_us) = measure_us!(new.prepare_program_cache_for_upcoming_feature_set()); @@ -2020,9 +2018,7 @@ impl Bank { /// Set proposer vote limit (4M CUs) for our leader slot. /// Limits vote processing capacity during block execution. pub fn set_proposer_vote_limit(&self) { - self.write_cost_tracker() - .unwrap() - .set_proposer_vote_limit(); + self.write_cost_tracker().unwrap().set_proposer_vote_limit(); } /// Restore vote limit after block execution completes. From 3eedbd3910d5ca5c51ffb1c2f89f986abcbaf643 Mon Sep 17 00:00:00 2001 From: cavemanloverboy Date: Wed, 7 Jan 2026 08:47:04 +0000 Subject: [PATCH 06/23] require all transactions pass check_transactions --- core/src/block_stage/block_consumer.rs | 31 +++++++++++++------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/core/src/block_stage/block_consumer.rs b/core/src/block_stage/block_consumer.rs index b263e91b07..4c0773de55 100644 --- a/core/src/block_stage/block_consumer.rs +++ b/core/src/block_stage/block_consumer.rs @@ -208,22 +208,19 @@ impl BlockConsumer { &mut error_counters, ); - // If all transactions failed checks, return early without recording - let failed_checks: Vec<_> = check_results - .iter() + // All transactions should pass check_transactions + if let Some((txn, err)) = check_results + .into_iter() .enumerate() - .filter_map(|(i, result)| result.as_ref().err().map(|e| (i, e.clone()))) - .collect(); - - if failed_checks.len() == transactions.len() { - let commit_transactions_result = check_results - .into_iter() - .map(|r| match r { - Ok(_) => unreachable!("all transactions failed checks"), - Err(err) => CommitTransactionDetails::NotCommitted(err), - }) - .collect(); - + .find(|(_i, r)| r.is_err()) + .map(|(i, r)| (&transactions[i], r.expect_err("filtered for is_err"))) + { + info!( + "block in slot {} has txn {} which failed check_transactions: {:?}", + bank.slot(), + txn.signatures()[0], + &err + ); return ProcessTransactionBatchOutput { cost_model_throttled_transactions_count: 0, cost_model_us: 0, @@ -234,7 +231,9 @@ impl BlockConsumer { max_prioritization_fees: 0, transaction_counts: LeaderProcessedTransactionCounts::default(), retryable_transaction_indexes: vec![], - commit_transactions_result: Ok(commit_transactions_result), + commit_transactions_result: Err( + PohRecorderError::HarmonicBlockInvalidTransaction, + ), }, }; } From a1caa83098420cea7969701ab24a6972e7091cdc Mon Sep 17 00:00:00 2001 From: cavemanloverboy Date: Wed, 7 Jan 2026 08:48:08 +0000 Subject: [PATCH 07/23] minor optimization to poh record preparation (one less allocation + data round trip) --- core/src/block_stage/block_consumer.rs | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/core/src/block_stage/block_consumer.rs b/core/src/block_stage/block_consumer.rs index 4c0773de55..21e89b251d 100644 --- a/core/src/block_stage/block_consumer.rs +++ b/core/src/block_stage/block_consumer.rs @@ -238,21 +238,15 @@ impl BlockConsumer { }; } - // Convert to versioned transactions for recording - let versioned_transactions: Vec<_> = transactions - .iter() - .map(|tx| tx.to_versioned_transaction()) - .collect(); - // Step 1: OPTIMISTICALLY RECORD ALL TRANSACTIONS TO POH FIRST // 
This broadcasts the block to the cluster so they can replay alongside us let mut record_transactions_timings = RecordTransactionsTimings::default(); // Hash each transaction individually - let mut hashes = Vec::with_capacity(versioned_transactions.len()); - let mut batches = Vec::with_capacity(versioned_transactions.len()); - for tx in &versioned_transactions { - let batch = vec![tx.clone()]; + let mut hashes = Vec::with_capacity(transactions.len()); + let mut batches = Vec::with_capacity(transactions.len()); + for tx in transactions.iter() { + let batch = vec![tx.to_versioned_transaction()]; let (hash, hash_us) = measure_us!(hash_transactions(&batch)); record_transactions_timings.hash_us += hash_us; hashes.push(hash); From f35fde80a382c0132f160d0401a013d949dd59c8 Mon Sep 17 00:00:00 2001 From: cavemanloverboy Date: Wed, 7 Jan 2026 08:55:46 +0000 Subject: [PATCH 08/23] simplify block stage, add leader checks --- core/src/block_stage/harmonic_block.rs | 4 -- core/src/block_stage/mod.rs | 94 ++++++++++++-------------- core/src/scheduler_synchronization.rs | 54 +++++++-------- 3 files changed, 68 insertions(+), 84 deletions(-) diff --git a/core/src/block_stage/harmonic_block.rs b/core/src/block_stage/harmonic_block.rs index a7f60e096a..80a9febad0 100644 --- a/core/src/block_stage/harmonic_block.rs +++ b/core/src/block_stage/harmonic_block.rs @@ -23,8 +23,4 @@ impl HarmonicBlock { pub fn intended_slot(&self) -> Slot { self.intended_slot } - - pub fn take(self) -> PacketBatch { - self.transactions - } } diff --git a/core/src/block_stage/mod.rs b/core/src/block_stage/mod.rs index 9c9cd9755a..99cdaafbd4 100644 --- a/core/src/block_stage/mod.rs +++ b/core/src/block_stage/mod.rs @@ -2,15 +2,17 @@ //! Unlike bundles, blocks have no transaction limit and the uuid field contains the intended slot. 
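
// A condensed, standalone sketch of the receive path implemented in this module: a
// block arrives tagged with the slot it was built for, and the stage only proceeds
// when that slot matches the working bank and this validator is its leader. The types
// below are illustrative stand-ins, not the HarmonicBlock/Bank types used here.
struct SketchBlock {
    intended_slot: u64,
}

fn should_process(block: &SketchBlock, working_slot: u64, my_id: &str, leader_id: &str) -> bool {
    // Stale or early block: the auction built it for a different slot.
    if block.intended_slot != working_slot {
        return false;
    }
    // Sanity check: only the slot's leader may turn the block into entries.
    if my_id != leader_id {
        return false;
    }
    // The real stage also warns on an empty block, then races the vanilla scheduler
    // to claim the slot before recording, executing, and committing.
    true
}

fn sketch_block_checks() {
    let block = SketchBlock { intended_slot: 7 };
    assert!(should_process(&block, 7, "me", "me"));
    assert!(!should_process(&block, 8, "me", "me"));
    assert!(!should_process(&block, 7, "me", "someone_else"));
}
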
mod block_consumer; -mod harmonic_block; mod devin_scheduler; +mod harmonic_block; mod timer; pub use block_consumer::BlockConsumer; -pub use harmonic_block::HarmonicBlock; pub use devin_scheduler::DevinScheduler; +pub use harmonic_block::HarmonicBlock; pub use timer::Timer; +use crate::banking_stage::decision_maker::{BufferedPacketsDecision, DecisionMaker}; + use { crate::{ banking_stage::{ @@ -24,7 +26,7 @@ use { }, agave_transaction_view::resolved_transaction_view::ResolvedTransactionView, crossbeam_channel::{Receiver, RecvTimeoutError}, - log::info, + log::{info, warn}, solana_gossip::cluster_info::ClusterInfo, solana_ledger::blockstore_processor::TransactionStatusSender, solana_poh::transaction_recorder::TransactionRecorder, @@ -48,7 +50,6 @@ pub struct BlockStage { } impl BlockStage { - #[allow(clippy::new_ret_no_self)] #[allow(clippy::too_many_arguments)] pub fn new( cluster_info: &Arc, @@ -114,11 +115,11 @@ impl BlockStage { block_receiver: Receiver, mut consumer: BlockConsumer, exit: Arc, - _cluster_info: Arc, + cluster_info: Arc, ) { while !exit.load(Ordering::Relaxed) { match block_receiver.recv_timeout(Duration::from_millis(10)) { - Ok(block_bundle) => { + Ok(block) => { let (root_bank, working_bank) = { let bank_forks_guard = bank_forks.read().unwrap(); ( @@ -127,7 +128,7 @@ impl BlockStage { ) }; - let intended_slot = block_bundle.intended_slot(); + let intended_slot = block.intended_slot(); let current_slot = working_bank.slot(); // Check if this block is for the correct slot @@ -139,51 +140,37 @@ impl BlockStage { continue; } - // Check if we're in the delegation period and can schedule this block - let current_tick_height = working_bank.tick_height(); - let max_tick_height = working_bank.max_tick_height(); - let ticks_per_slot = working_bank.ticks_per_slot(); - let start_tick = max_tick_height - ticks_per_slot; - let ticks_into_slot = current_tick_height.saturating_sub(start_tick); - let delegation_period_length = ticks_per_slot * 15 / 16; - let in_delegation_period = ticks_into_slot < delegation_period_length; + // Sanity check we are leader + if !cluster_info.id().eq(working_bank.collector_id()) { + warn!("received block for which we are not leader"); + continue; + } - // Try to claim this slot for block scheduling - match scheduler_synchronization::block_should_schedule( - current_slot, - in_delegation_period, - ) { - Some(true) => { - // We claimed the slot, proceed with processing - info!("Block stage claimed slot {}", current_slot); - } - Some(false) => { - // Slot was already claimed by someone else - info!( - "Block stage could not claim slot {}, already scheduled", - current_slot - ); - continue; - } - None => { - // Not in delegation period, can't schedule block - info!( - "Block stage cannot schedule for slot {}, not in delegation period", - current_slot - ); - continue; - } + // Sanity check block is not empty. + // Note this is intentionally done after checking this was intended for us + if block.transactions().is_empty() { + warn!( + "received empty block intended for our slot {}", + block.intended_slot() + ); } - let batch = block_bundle.take(); + // Attempt to claim the slot for block stage + let decision = BufferedPacketsDecision::Consume(working_bank); + if let BufferedPacketsDecision::Consume(working_bank) = + DecisionMaker::maybe_consume::(decision) + { + // Claimed! 
Now record, execute, commit - // Translate packets to RuntimeTransaction - // using zerocopy TransactionView instead of bincode deserialization - let (transactions, max_ages) = - Self::translate_packets_to_transactions(&batch, &root_bank, &working_bank); + // Translate packets to RuntimeTransaction + // using zerocopy TransactionView instead of bincode deserialization + let (transactions, max_ages) = Self::translate_packets_to_transactions( + &block.transactions(), + &root_bank, + &working_bank, + ); - // Process blocks - if !transactions.is_empty() { + // Process blocks let output = consumer.process_and_record_block_transactions( &working_bank, &transactions, @@ -192,18 +179,21 @@ impl BlockStage { ); // Check if recording failed - if so, revert so vanilla can build fallback block - if output + if let Err(e) = output .execute_and_commit_transactions_output .commit_transactions_result - .is_err() { info!( - "Block recording failed for slot {}, reverting to vanilla", - current_slot + "Block recording failed for slot {}, reverting to vanilla: {:?}", + current_slot, e ); scheduler_synchronization::block_failed(current_slot); } - } + } else { + // Failed to claim for this slot. + info!("block stage failed to claim slot {}", bank.slot()); + continue; + }; } Err(RecvTimeoutError::Timeout) => { // Continue loop diff --git a/core/src/scheduler_synchronization.rs b/core/src/scheduler_synchronization.rs index e4963cf05a..b27c08d1a7 100644 --- a/core/src/scheduler_synchronization.rs +++ b/core/src/scheduler_synchronization.rs @@ -36,7 +36,7 @@ fn get_slot(value: u64) -> u64 { /// Check if the state indicates the slot was claimed by block. #[inline] fn is_block_claim(value: u64) -> bool { - value & BLOCK_CLAIMED_BIT != 0 + value != SENTINEL && value & BLOCK_CLAIMED_BIT != 0 } /// Create a state value for a slot claimed by vanilla (top bit clear). @@ -51,30 +51,6 @@ fn block_claim(slot: u64) -> u64 { (slot & SLOT_MASK) | BLOCK_CLAIMED_BIT } -/// Reset the scheduler synchronization state. Used in tests to ensure -/// a clean slate for each test. -#[cfg(any(test, feature = "dev-context-only-utils"))] -pub fn reset_for_tests() { - SCHEDULER_STATE.store(SENTINEL, Ordering::Release); -} - -/// Force claim a slot for vanilla scheduling. Used in tests to simulate -/// being past the delegation period. -#[cfg(any(test, feature = "dev-context-only-utils"))] -pub fn force_vanilla_claim(slot: u64) { - SCHEDULER_STATE.store(vanilla_claim(slot), Ordering::Release); -} - -/// Returns the last slot that was scheduled (without the block/vanilla flag). -pub fn last_slot_scheduled() -> u64 { - get_slot(SCHEDULER_STATE.load(Ordering::Acquire)) -} - -/// Returns true if the current slot was claimed by block, false if by vanilla. -pub fn is_slot_claimed_by_block() -> bool { - is_block_claim(SCHEDULER_STATE.load(Ordering::Acquire)) -} - /// If vanilla should schedule, the internal private atomic is /// updated so that the block scheduler does not schedule. 
/// @@ -92,12 +68,12 @@ pub fn vanilla_should_schedule(current_slot: u64, in_delegation_period: bool) -> if state != SENTINEL && get_slot(state) == current_slot { // Check who claimed it - if vanilla claimed, all vanilla threads can consume // If block claimed, no vanilla thread should consume - let claimed_by_block = is_block_claim(state); + let claimed_by_vanilla = is_block_claim(state); info!( "vanilla_should_schedule: slot {} already claimed, by_block={}", - current_slot, claimed_by_block + current_slot, claimed_by_vanilla ); - return Some(!claimed_by_block); + return Some(!claimed_by_vanilla); } // If still in delegation period and slot not yet claimed, don't try to claim @@ -236,6 +212,28 @@ pub fn block_failed(current_slot: u64) -> Option { mod tests { use super::*; + /// Reset the scheduler synchronization state. Used in tests to ensure + /// a clean slate for each test. + fn reset_for_tests() { + SCHEDULER_STATE.store(SENTINEL, Ordering::Release); + } + + /// Force claim a slot for vanilla scheduling. Used in tests to simulate + /// being past the delegation period. + fn force_vanilla_claim(slot: u64) { + SCHEDULER_STATE.store(vanilla_claim(slot), Ordering::Release); + } + + /// Returns the last slot that was scheduled (without the block/vanilla flag). + fn last_slot_scheduled() -> u64 { + get_slot(SCHEDULER_STATE.load(Ordering::Acquire)) + } + + /// Returns true if the current slot was claimed by block, false if by vanilla. + fn is_slot_claimed_by_block() -> bool { + is_block_claim(SCHEDULER_STATE.load(Ordering::Acquire)) + } + #[test] fn test_encoding() { // Test vanilla claim From f63d7fda514605104b9f0f81314cbf4900608d0e Mon Sep 17 00:00:00 2001 From: cavemanloverboy Date: Wed, 7 Jan 2026 09:55:57 +0000 Subject: [PATCH 09/23] sprint poh after receiving block --- core/src/block_stage/block_consumer.rs | 3 +- core/src/block_stage/mod.rs | 2 +- poh/src/poh_recorder.rs | 13 ++- poh/src/poh_service.rs | 116 ++++++++++++++++++++----- poh/src/transaction_recorder.rs | 7 +- 5 files changed, 113 insertions(+), 28 deletions(-) diff --git a/core/src/block_stage/block_consumer.rs b/core/src/block_stage/block_consumer.rs index 21e89b251d..0d0483701e 100644 --- a/core/src/block_stage/block_consumer.rs +++ b/core/src/block_stage/block_consumer.rs @@ -260,10 +260,11 @@ impl BlockConsumer { let blockhash_queue_lock = bank.blockhash_queue_lock(); // Record all transactions - this is all-or-nothing for the entire block + // Pass harmonic=true to trigger PoH speedrun after this block is recorded let (record_result, poh_record_us) = measure_us!(self .transaction_recorder - .record(bank.bank_id(), hashes, batches)); + .record(bank.bank_id(), hashes, batches, true)); record_transactions_timings.poh_record_us = Saturating(poh_record_us); let starting_transaction_index = match record_result { diff --git a/core/src/block_stage/mod.rs b/core/src/block_stage/mod.rs index 99cdaafbd4..a8f3e5409a 100644 --- a/core/src/block_stage/mod.rs +++ b/core/src/block_stage/mod.rs @@ -191,7 +191,7 @@ impl BlockStage { } } else { // Failed to claim for this slot. 
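
// Standalone sketch of the one-word claim protocol defined in scheduler_synchronization.rs
// above: the top bit records *who* claimed and the low bits record *which slot*, and a
// single compare_exchange decides the race between the block and vanilla schedulers.
// The constant values here are illustrative; the real module defines its own layout.
use std::sync::atomic::{AtomicU64, Ordering};

const SENTINEL: u64 = u64::MAX;
const BLOCK_CLAIMED_BIT: u64 = 1 << 63;
const SLOT_MASK: u64 = !BLOCK_CLAIMED_BIT;

static STATE: AtomicU64 = AtomicU64::new(SENTINEL);

/// Try to claim `slot` for the block scheduler; returns true only for the race winner.
fn try_claim_for_block(slot: u64, observed_state: u64) -> bool {
    let claim = (slot & SLOT_MASK) | BLOCK_CLAIMED_BIT;
    STATE
        .compare_exchange(observed_state, claim, Ordering::AcqRel, Ordering::Acquire)
        .is_ok()
}

fn sketch_claim_race() {
    let observed = STATE.load(Ordering::Acquire);
    // First claim for slot 42 wins; a second attempt with the same stale observation loses.
    assert!(try_claim_for_block(42, observed));
    assert!(!try_claim_for_block(42, observed));

    let state = STATE.load(Ordering::Acquire);
    assert_eq!(state & SLOT_MASK, 42); // which slot
    assert_ne!(state & BLOCK_CLAIMED_BIT, 0); // claimed by block, not vanilla
}
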
- info!("block stage failed to claim slot {}", bank.slot()); + info!("block stage failed to claim slot {}", current_slot); continue; }; } diff --git a/poh/src/poh_recorder.rs b/poh/src/poh_recorder.rs index bc0aac136c..8775b7c016 100644 --- a/poh/src/poh_recorder.rs +++ b/poh/src/poh_recorder.rs @@ -65,6 +65,9 @@ pub enum PohRecorderError { #[error("harmonic block invalid signature")] HarmonicBlockInvalidSignature, + + #[error("harmonic block invalid transaction")] + HarmonicBlockInvalidTransaction, } pub(crate) type Result = std::result::Result; @@ -80,6 +83,9 @@ pub struct Record { pub mixins: Vec, pub transaction_batches: Vec>, pub bank_id: BankId, + /// Whether this record is from a harmonic block (block auction house). + /// Used for PoH pacing - when true, we speedrun the rest of the slot. + pub harmonic: bool, } impl Record { @@ -87,11 +93,13 @@ impl Record { mixins: Vec, transaction_batches: Vec>, bank_id: BankId, + harmonic: bool, ) -> Self { Self { mixins, transaction_batches, bank_id, + harmonic, } } } @@ -342,8 +350,9 @@ impl PohRecorder { .saturating_sub(self.tick_height() + 1); // The last hash in each tick is reserved for the tick itself, so subtract 1 per tick let tick_remaining_hashes = poh_lock.remaining_hashes().saturating_sub(1); - let total_remaining_hashes = tick_remaining_hashes - .saturating_add(remaining_ticks.saturating_mul(poh_lock.hashes_per_tick().saturating_sub(1))); + let total_remaining_hashes = tick_remaining_hashes.saturating_add( + remaining_ticks.saturating_mul(poh_lock.hashes_per_tick().saturating_sub(1)), + ); if mixins.len() as u64 > total_remaining_hashes { info!( "Insufficient hashes remaining for all-or-nothing recording: {} mixins > {} remaining hashes", diff --git a/poh/src/poh_service.rs b/poh/src/poh_service.rs index 13dc72eee8..976b220ac2 100644 --- a/poh/src/poh_service.rs +++ b/poh/src/poh_service.rs @@ -37,7 +37,7 @@ pub const DEFAULT_HASHES_PER_BATCH: u64 = pub const DEFAULT_PINNED_CPU_CORE: usize = 0; -const TARGET_SLOT_ADJUSTMENT_NS: u64 = 50_000_000; +const TARGET_SLOT_ADJUSTMENT_NS: u64 = 0; #[derive(Debug)] struct PohTiming { @@ -228,7 +228,14 @@ impl PohService { } if let Some(service_message) = service_message { - Self::handle_service_message(&poh_recorder, service_message, &mut record_receiver); + // Dummy block_received (not used in low-power mode) + let mut block_received = false; + Self::handle_service_message( + &poh_recorder, + service_message, + &mut record_receiver, + &mut block_received, + ); should_shutdown_for_test_producers = Self::should_shutdown_for_test_producers(&poh_recorder); if should_shutdown_for_test_producers { @@ -349,7 +356,14 @@ impl PohService { warn!("exit signal is ignored because PohService is scheduled to exit soon"); } if let Some(service_message) = service_message { - Self::handle_service_message(&poh_recorder, service_message, &mut record_receiver); + // Dummy block_received (not used in short-lived low-power mode) + let mut block_received = false; + Self::handle_service_message( + &poh_recorder, + service_message, + &mut record_receiver, + &mut block_received, + ); should_shutdown_for_test_producers = Self::should_shutdown_for_test_producers(&poh_recorder); if should_shutdown_for_test_producers { @@ -388,6 +402,7 @@ impl PohService { poh: &Arc>, target_ns_per_tick: u64, ticks_per_slot: u64, + block_received: &mut bool, ) -> bool { match next_record.take() { Some(mut record) => { @@ -399,6 +414,13 @@ impl PohService { timing.total_lock_time_ns += lock_time.as_ns(); let mut record_time = 
Measure::start("record"); loop { + // Check if this is a harmonic block record + if record.harmonic { + let slot = poh_recorder_l.bank().map(|b| b.slot()).unwrap_or(0); + info!("PohService received harmonic block for slot {}", slot); + *block_received = true; + } + match poh_recorder_l.record( record.bank_id, record.mixins, @@ -439,7 +461,6 @@ impl PohService { timing.num_hashes += hashes_per_batch; let mut hash_time = Measure::start("hash"); let should_tick = poh_l.hash(hashes_per_batch); - let ideal_time = poh_l.target_poh_time(target_ns_per_tick); hash_time.stop(); // shutdown if another batch would push us over the shutdown threshold. @@ -463,25 +484,31 @@ impl PohService { *next_record = Some(record); break; } - // check to see if we need to wait to catch up to ideal - let wait_start = Instant::now(); - if ideal_time <= wait_start { - // no, keep hashing. We still hold the lock. - continue; - } - // busy wait, polling for new records and after dropping poh lock (reset can occur, for example) - drop(poh_l); - while ideal_time > Instant::now() { - // check to see if a record request has been sent - if let Ok(record) = record_receiver.try_recv() { - // remember the record we just received as the next record to occur - *next_record = Some(record); - break; + // Don't even bother with this busy polling if we are speed + // running the slot + if target_ns_per_tick != 1 { + let ideal_time = poh_l.target_poh_time(target_ns_per_tick); + // check to see if we need to wait to catch up to ideal + let wait_start = Instant::now(); + if ideal_time <= wait_start { + // no, keep hashing. We still hold the lock. + continue; + } + + // busy wait, polling for new records and after dropping poh lock (reset can occur, for example) + drop(poh_l); + while ideal_time > Instant::now() { + // check to see if a record request has been sent + if let Ok(record) = record_receiver.try_recv() { + // remember the record we just received as the next record to occur + *next_record = Some(record); + break; + } } + timing.total_sleep_us += wait_start.elapsed().as_micros() as u64; + break; } - timing.total_sleep_us += wait_start.elapsed().as_micros() as u64; - break; } } }; @@ -495,8 +522,18 @@ impl PohService { hashes_per_batch: u64, mut record_receiver: RecordReceiver, poh_service_receiver: PohServiceMessageReceiver, - target_ns_per_tick: u64, + original_target_ns_per_tick: u64, ) { + // Whether or not we have received a harmonic block for the current slot + let mut block_received = false; + // The target ns per tick when waiting for a block to arrive + let extended_target_ns_per_tick = + Duration::from_millis(430).as_nanos() as u64 / ticks_per_slot; + // The target ns per tick after a block has arrived + let shortened_target_ns_per_tick = 1; + // Current target ns per tick (dynamically adjusted) + let mut target_ns_per_tick = original_target_ns_per_tick; + let poh = poh_recorder.read().unwrap().poh.clone(); let mut timing = PohTiming::new(); let mut next_record = None; @@ -524,7 +561,39 @@ impl PohService { &poh, target_ns_per_tick, ticks_per_slot, + &mut block_received, ); + + // Dynamic adjustment of target_ns_per_tick based on block receipt + target_ns_per_tick = { + let poh_recorder_r = poh_recorder.read().unwrap(); + let max_tick_height = poh_recorder_r + .bank() + .map(|b| b.max_tick_height()) + .unwrap_or(0); + if poh_recorder_r.tick_height() < max_tick_height { + if block_received { + if target_ns_per_tick != shortened_target_ns_per_tick { + info!("PohService shortening target ns per tick"); + } + // We have 
received a block - speedrun the rest of the slot + shortened_target_ns_per_tick + } else { + if target_ns_per_tick != extended_target_ns_per_tick { + info!("PohService extending target ns per tick"); + } + // We are waiting for a block - delay in case the block is held up + extended_target_ns_per_tick + } + } else { + if target_ns_per_tick != original_target_ns_per_tick { + info!("PohService restoring target ns per tick"); + } + // If we aren't leader or we have a block, use the normal slot timing + original_target_ns_per_tick + } + }; + if should_tick { // Lock PohRecorder only for the final hash. record_or_hash will lock PohRecorder for record calls but not for hashing. { @@ -556,6 +625,7 @@ impl PohService { &poh_recorder, service_message, &mut record_receiver, + &mut block_received, ); } } @@ -585,6 +655,7 @@ impl PohService { poh_recorder: &RwLock, mut service_message: PohServiceMessageGuard, record_receiver: &mut RecordReceiver, + block_received: &mut bool, ) { { let mut recorder = poh_recorder.write().unwrap(); @@ -604,6 +675,8 @@ impl PohService { if should_restart { record_receiver.restart(bank_id); } + // Harmonic: reset pacing state for the new slot + *block_received = false; } } } @@ -900,6 +973,7 @@ mod tests { mixins: vec![Hash::new_unique()], transaction_batches: vec![vec![VersionedTransaction::from(test_tx())]], bank_id: bank.bank_id(), + harmonic: false, }) .unwrap(); diff --git a/poh/src/transaction_recorder.rs b/poh/src/transaction_recorder.rs index fd585692fb..46e5305905 100644 --- a/poh/src/transaction_recorder.rs +++ b/poh/src/transaction_recorder.rs @@ -62,7 +62,7 @@ impl TransactionRecorder { record_transactions_timings.hash_us = Saturating(hash_us); let (res, poh_record_us) = - measure_us!(self.record(bank_id, vec![hash], vec![transactions])); + measure_us!(self.record(bank_id, vec![hash], vec![transactions], false)); record_transactions_timings.poh_record_us = Saturating(poh_record_us); match res { @@ -106,9 +106,10 @@ impl TransactionRecorder { bank_id: BankId, mixins: Vec, transaction_batches: Vec>, + harmonic: bool, ) -> Result, RecordSenderError> { self.record_sender - .try_send(Record::new(mixins, transaction_batches, bank_id)) + .try_send(Record::new(mixins, transaction_batches, bank_id, harmonic)) } pub fn record_bundle( @@ -130,7 +131,7 @@ impl TransactionRecorder { batches.push(batch); } - let (res, poh_record_us) = measure_us!(self.record(bank_id, hashes, batches)); + let (res, poh_record_us) = measure_us!(self.record(bank_id, hashes, batches, false)); record_transactions_timings.poh_record_us = Saturating(poh_record_us); match res { From 241a147801e8fafc07a4a78d7e1b1f5b541cb370 Mon Sep 17 00:00:00 2001 From: cavemanloverboy Date: Wed, 7 Jan 2026 16:55:18 +0000 Subject: [PATCH 10/23] restore deadlock prevention measures --- runtime/src/bank.rs | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/runtime/src/bank.rs b/runtime/src/bank.rs index bf2e2de96c..7996704db5 100644 --- a/runtime/src/bank.rs +++ b/runtime/src/bank.rs @@ -2533,7 +2533,14 @@ impl Bank { // BankingStage doesn't release this hash lock until both // record and commit are finished, those transactions will be // committed before this write lock can be obtained here. - let mut hash = self.hash.write().unwrap(); + // NOTE: (mevanoxx) This RwLock is fair, so calling self.hash.write() + // will queue a write lock, potentially interrupting transaction + // execution. Instead, we use try_write() here. 
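
// Standalone sketch of the try_write spin used in the freeze path above: with a fair
// RwLock, calling write() enqueues the writer and then stalls every later reader behind
// it, so code that must not starve readers polls try_write instead. The spin_loop hint
// is a nicety of this sketch, not something the patch adds.
use std::sync::RwLock;

fn set_without_queueing_as_writer(lock: &RwLock<u64>, new_value: u64) {
    let mut guard = loop {
        // Only take the lock when it is actually free; never park as a pending writer.
        if let Ok(guard) = lock.try_write() {
            break guard;
        }
        std::hint::spin_loop();
    };
    *guard = new_value;
}

fn sketch_try_write() {
    let lock = RwLock::new(0u64);
    set_without_queueing_as_writer(&lock, 7);
    assert_eq!(*lock.read().unwrap(), 7);
}
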
+ let mut hash = loop { + if let Ok(lock) = self.hash.try_write() { + break lock; + } + }; if *hash == Hash::default() { // finish up any deferred changes to account state self.distribute_transaction_fee_details(); @@ -2877,7 +2884,15 @@ impl Bank { // Only acquire the write lock for the blockhash queue on block boundaries because // readers can starve this write lock acquisition and ticks would be slowed down too // much if the write lock is acquired for each tick. - let mut w_blockhash_queue = self.blockhash_queue.write().unwrap(); + let mut w_blockhash_queue = loop { + // cavey: block stage holds one read lock throughout execution and individual block + // stage execution threads take a read lock to check for valid blockhashes. this + // was changed from a write() to try_write() because this is a fair rwlock and the + // interleaved r/w requests cause a deadlock + if let Ok(lock) = self.blockhash_queue.try_write() { + break lock; + } + }; #[cfg(feature = "dev-context-only-utils")] let blockhash_override = self From 896f7358462030bd47a8ec177431a8eebccff8ab Mon Sep 17 00:00:00 2001 From: cavemanloverboy Date: Wed, 7 Jan 2026 23:18:57 +0000 Subject: [PATCH 11/23] memoize ticks per us at startup --- core/src/block_stage/block_consumer.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/core/src/block_stage/block_consumer.rs b/core/src/block_stage/block_consumer.rs index 0d0483701e..2befd7b2b3 100644 --- a/core/src/block_stage/block_consumer.rs +++ b/core/src/block_stage/block_consumer.rs @@ -104,6 +104,10 @@ impl BlockConsumer { transaction_recorder: TransactionRecorder, log_messages_bytes_limit: Option, ) -> Self { + // Pre-initialize the timer calibration to avoid 2s stall on first block + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + super::timer::memoize_ticks_per_us_and_invariant_tsc_check(); + let thread_pool = ThreadPoolBuilder::new() .num_threads(NUM_THREADS) .thread_name(|i| format!("solBlkExec{i}")) From dffef5154dd576dd7f70fa1404e56c5adf180a2e Mon Sep 17 00:00:00 2001 From: cavemanloverboy Date: Thu, 8 Jan 2026 02:56:40 +0000 Subject: [PATCH 12/23] simplify vote processing --- .../latest_validator_vote_packet.rs | 3 + core/src/banking_stage/vote_storage.rs | 771 ++++-------------- core/src/banking_stage/vote_worker.rs | 148 ++-- core/src/cluster_info_vote_listener.rs | 24 +- 4 files changed, 251 insertions(+), 695 deletions(-) diff --git a/core/src/banking_stage/latest_validator_vote_packet.rs b/core/src/banking_stage/latest_validator_vote_packet.rs index e904994f9f..d64e286697 100644 --- a/core/src/banking_stage/latest_validator_vote_packet.rs +++ b/core/src/banking_stage/latest_validator_vote_packet.rs @@ -19,6 +19,7 @@ pub enum VoteSource { } /// Holds deserialized vote messages as well as their source, and slot +#[allow(dead_code)] #[derive(Debug)] pub struct LatestValidatorVote { vote_source: VoteSource, @@ -29,6 +30,7 @@ pub struct LatestValidatorVote { timestamp: Option, } +#[allow(dead_code)] impl LatestValidatorVote { pub fn new_from_view( vote: SanitizedTransactionView, @@ -130,6 +132,7 @@ impl LatestValidatorVote { } } +#[allow(dead_code)] #[derive(Debug, Error)] pub enum DeserializedPacketError { #[error("vote transaction failure")] diff --git a/core/src/banking_stage/vote_storage.rs b/core/src/banking_stage/vote_storage.rs index 821619b257..3fdf7fb29f 100644 --- a/core/src/banking_stage/vote_storage.rs +++ b/core/src/banking_stage/vote_storage.rs @@ -1,22 +1,29 @@ use { - super::latest_validator_vote_packet::{LatestValidatorVote, VoteSource}, - 
crate::banking_stage::transaction_scheduler::transaction_state_container::SharedBytes, + super::latest_validator_vote_packet::VoteSource, + crate::banking_stage::transaction_scheduler::transaction_state_container::{ + RuntimeTransactionView, SharedBytes, + }, agave_feature_set as feature_set, agave_transaction_view::transaction_view::SanitizedTransactionView, - ahash::HashMap, - itertools::Itertools, - rand::{thread_rng, Rng}, - solana_account::from_account, - solana_clock::Epoch, - solana_pubkey::Pubkey, - solana_runtime::{bank::Bank, epoch_stakes::VersionedEpochStakes}, - solana_sysvar::{self as sysvar, slot_hashes::SlotHashes}, - std::cmp, + lru::LruCache, + solana_clock::MAX_PROCESSING_AGE, + solana_runtime::bank::Bank, + solana_runtime_transaction::{ + runtime_transaction::RuntimeTransaction, transaction_meta::StaticMeta, + }, + solana_signature::Signature, + solana_svm::transaction_error_metrics::TransactionErrorMetrics, + solana_svm_transaction::svm_transaction::SVMTransaction, + solana_transaction::sanitized::MessageHash, + std::collections::HashSet, }; /// Maximum number of votes a single receive call will accept const MAX_NUM_VOTES_RECEIVE: usize = 10_000; +/// High capacity LRU cache size for vote storage (400k entries) +const VOTE_STORAGE_CAPACITY: usize = 400_000; + #[derive(Default, Debug)] pub(crate) struct VoteBatchInsertionMetrics { pub(crate) num_dropped_gossip: usize, @@ -39,296 +46,121 @@ impl VoteBatchInsertionMetrics { #[derive(Debug)] pub struct VoteStorage { - latest_vote_per_vote_pubkey: HashMap, - num_unprocessed_votes: usize, - cached_epoch_stakes: VersionedEpochStakes, + /// LRU cache storing resolved vote transactions ready for processing + votes: LruCache, deprecate_legacy_vote_ixs: bool, - current_epoch: Epoch, + /// Cached reserved account keys for vote resolution + reserved_account_keys: HashSet, } impl VoteStorage { pub fn new(bank: &Bank) -> Self { Self { - latest_vote_per_vote_pubkey: HashMap::default(), - num_unprocessed_votes: 0, - cached_epoch_stakes: bank.current_epoch_stakes().clone(), - current_epoch: bank.epoch(), + votes: LruCache::new(VOTE_STORAGE_CAPACITY), deprecate_legacy_vote_ixs: bank .feature_set .is_active(&feature_set::deprecate_legacy_vote_ixs::id()), - } - } - - #[cfg(test)] - pub fn new_for_tests(vote_pubkeys_to_stake: &[Pubkey]) -> Self { - use solana_vote::vote_account::VoteAccount; - - let vote_accounts = vote_pubkeys_to_stake - .iter() - .map(|pubkey| (*pubkey, (1u64, VoteAccount::new_random()))) - .collect(); - let epoch_stakes = VersionedEpochStakes::new_for_tests(vote_accounts, 0); - - Self { - latest_vote_per_vote_pubkey: HashMap::default(), - num_unprocessed_votes: 0, - cached_epoch_stakes: epoch_stakes, - current_epoch: 0, - deprecate_legacy_vote_ixs: true, + reserved_account_keys: bank.get_reserved_account_keys().clone(), } } pub fn is_empty(&self) -> bool { - self.len() == 0 + self.votes.is_empty() } pub fn len(&self) -> usize { - self.num_unprocessed_votes + self.votes.len() } pub fn max_receive_size(&self) -> usize { MAX_NUM_VOTES_RECEIVE } + #[allow(unused_variables)] pub(crate) fn insert_batch( &mut self, vote_source: VoteSource, votes: impl Iterator>, ) -> VoteBatchInsertionMetrics { - let should_deprecate_legacy_vote_ixs = self.deprecate_legacy_vote_ixs; - self.insert_batch_with_replenish( - votes.filter_map(|vote| { - LatestValidatorVote::new_from_view( - vote, - vote_source, - should_deprecate_legacy_vote_ixs, - ) - .ok() - }), - false, - ) - } - - // Re-insert re-tryable packets. 
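
// Standalone sketch of the drain-and-retry loop a consumer runs against this storage:
// pop a vote, try to land it, and push anything retryable back so it is attempted again
// on a later bank (the reinsert path right below). The queue and Outcome type are
// illustrative stand-ins for the LRU storage and the real commit result.
use std::collections::VecDeque;

enum Outcome {
    Committed,
    Retryable,
}

fn drain_with_retries(queue: &mut VecDeque<u64>, process: impl Fn(u64) -> Outcome) {
    let mut retryable = Vec::new();
    while let Some(vote) = queue.pop_front() {
        match process(vote) {
            // Committed (or permanently failed) votes are simply dropped.
            Outcome::Committed => {}
            // Equivalent to reinsert_votes(): keep the vote for the next pass.
            Outcome::Retryable => retryable.push(vote),
        }
    }
    for vote in retryable {
        queue.push_back(vote);
    }
}

fn sketch_drain() {
    let mut queue: VecDeque<u64> = (0..4).collect();
    // Pretend odd entries hit a retryable condition such as an account in use.
    drain_with_retries(&mut queue, |vote| {
        if vote % 2 == 1 {
            Outcome::Retryable
        } else {
            Outcome::Committed
        }
    });
    assert_eq!(queue, VecDeque::from(vec![1, 3]));
}
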
- pub(crate) fn reinsert_packets( - &mut self, - packets: impl Iterator>, - ) { - let should_deprecate_legacy_vote_ixs = self.deprecate_legacy_vote_ixs; - self.insert_batch_with_replenish( - packets.filter_map(|packet| { - LatestValidatorVote::new_from_view( - packet, - VoteSource::Tpu, // incorrect, but this bug has been here w/o issue for a long time. - should_deprecate_legacy_vote_ixs, - ) - .ok() - }), - true, - ); + for vote in votes { + if let Some(resolved) = self.try_resolve_vote(vote) { + let sig = *resolved.signature(); + self.votes.push(sig, resolved); + } + } + VoteBatchInsertionMetrics { + num_dropped_gossip: 0, + num_dropped_tpu: 0, + } } - pub fn drain_unprocessed(&mut self, bank: &Bank) -> Vec> { - let slot_hashes = bank - .get_account(&sysvar::slot_hashes::id()) - .and_then(|account| from_account::(&account)); - if slot_hashes.is_none() { - error!( - "Slot hashes sysvar doesn't exist on bank {}. Including all votes without \ - filtering", - bank.slot() - ); + // Re-insert re-tryable votes. + pub(crate) fn reinsert_votes(&mut self, votes: impl Iterator) { + for vote in votes { + let sig = *vote.signature(); + self.votes.push(sig, vote); } - - self.weighted_random_order_by_stake() - .filter_map(|pubkey| { - self.latest_vote_per_vote_pubkey - .get_mut(&pubkey) - .and_then(|latest_vote| { - if !Self::is_valid_for_our_fork(latest_vote, &slot_hashes) { - return None; - } - latest_vote.take_vote().inspect(|_vote| { - self.num_unprocessed_votes -= 1; - }) - }) - }) - .collect_vec() } pub fn clear(&mut self) { - self.latest_vote_per_vote_pubkey - .values_mut() - .for_each(|vote| { - if vote.take_vote().is_some() { - self.num_unprocessed_votes -= 1; - } - }); + self.votes.clear(); } - pub fn cache_epoch_boundary_info(&mut self, bank: &Bank) { - if bank.epoch() <= self.current_epoch { - return; - } - { - self.cached_epoch_stakes = bank.current_epoch_stakes().clone(); - self.current_epoch = bank.epoch(); - self.deprecate_legacy_vote_ixs = bank - .feature_set - .is_active(&feature_set::deprecate_legacy_vote_ixs::id()); - } - - // Evict any now unstaked pubkeys - let mut unstaked_votes = 0; - self.latest_vote_per_vote_pubkey - .retain(|vote_pubkey, vote| { - let is_present = !vote.is_vote_taken(); - let should_evict = self.cached_epoch_stakes.vote_account_stake(vote_pubkey) == 0; - if is_present && should_evict { - unstaked_votes += 1; - } - !should_evict - }); - self.num_unprocessed_votes -= unstaked_votes; - datapoint_info!( - "latest_unprocessed_votes-epoch-boundary", - ("epoch", bank.epoch(), i64), - ("evicted_unstaked_votes", unstaked_votes, i64) - ); + /// Pop the least recently used vote, ready for processing. 
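
// Standalone sketch of the lru::LruCache behaviour the new VoteStorage relies on:
// push() refreshes an existing key's recency, inserting past capacity evicts the least
// recently used entry, and pop_lru() drains oldest-first. Note: recent lru releases take
// a NonZeroUsize capacity, while the call in this patch passes a plain usize.
fn lru_vote_order_sketch() {
    use {lru::LruCache, std::num::NonZeroUsize};

    let mut votes: LruCache<u64, &str> = LruCache::new(NonZeroUsize::new(2).unwrap());

    votes.push(1, "vote from A");
    votes.push(2, "vote from B");
    // Re-pushing key 1 refreshes its recency instead of duplicating it.
    votes.push(1, "newer vote from A");
    // Capacity is 2, so a third key evicts the least recently used entry (key 2).
    votes.push(3, "vote from C");

    assert!(votes.contains(&1));
    assert!(!votes.contains(&2));
    // pop_lru() yields oldest-first, which is the order votes are handed to the worker.
    assert_eq!(votes.pop_lru(), Some((1, "newer vote from A")));
    assert_eq!(votes.pop_lru(), Some((3, "vote from C")));
    assert!(votes.is_empty());
}
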
+ pub fn pop(&mut self) -> Option { + self.votes.pop_lru().map(|(_sig, vote)| vote) } - fn insert_batch_with_replenish( - &mut self, - votes: impl Iterator, - should_replenish_taken_votes: bool, - ) -> VoteBatchInsertionMetrics { - let mut num_dropped_gossip = 0; - let mut num_dropped_tpu = 0; - - for vote in votes { - if self - .cached_epoch_stakes - .vote_account_stake(&vote.vote_pubkey()) - == 0 - { - continue; - } - if let Some(vote) = self.update_latest_vote(vote, should_replenish_taken_votes) { - match vote.source() { - VoteSource::Gossip => num_dropped_gossip += 1, - VoteSource::Tpu => num_dropped_tpu += 1, - } - } - } - - VoteBatchInsertionMetrics { - num_dropped_gossip, - num_dropped_tpu, - } + pub fn cache_epoch_boundary_info(&mut self, bank: &Bank) { + self.deprecate_legacy_vote_ixs = bank + .feature_set + .is_active(&feature_set::deprecate_legacy_vote_ixs::id()); + self.reserved_account_keys = bank.get_reserved_account_keys().clone(); } - /// If this vote causes an unprocessed vote to be removed, returns Some(old_vote) - /// If there is a newer vote processed / waiting to be processed returns Some(vote) - /// Otherwise returns None - fn update_latest_vote( - &mut self, - vote: LatestValidatorVote, - should_replenish_taken_votes: bool, - ) -> Option { - let vote_pubkey = vote.vote_pubkey(); - // Grab write-lock to insert new vote. - match self.latest_vote_per_vote_pubkey.entry(vote_pubkey) { - std::collections::hash_map::Entry::Occupied(mut entry) => { - let latest_vote = entry.get_mut(); - if Self::allow_update(&vote, latest_vote, should_replenish_taken_votes) { - let old_vote = std::mem::replace(latest_vote, vote); - if old_vote.is_vote_taken() { - self.num_unprocessed_votes += 1; - return None; - } else { - return Some(old_vote); - } - } - Some(vote) - } - std::collections::hash_map::Entry::Vacant(entry) => { - entry.insert(vote); - self.num_unprocessed_votes += 1; - None + /// Remove votes that can no longer be processed. 
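
// Standalone sketch of the collect-then-remove prune performed by cavey_clean() below:
// iterating the cache borrows it, so the keys that fail the check are gathered first and
// popped in a second pass. The staleness predicate stands in for bank.check_transactions.
fn prune_stale_sketch() {
    use {lru::LruCache, std::num::NonZeroUsize};

    fn prune_stale(cache: &mut LruCache<u64, u64>, is_stale: impl Fn(&u64) -> bool) {
        let stale_keys: Vec<u64> = cache
            .iter()
            .filter_map(|(key, vote_slot)| is_stale(vote_slot).then_some(*key))
            .collect();
        for key in &stale_keys {
            let _ = cache.pop(key);
        }
    }

    let mut cache = LruCache::new(NonZeroUsize::new(8).unwrap());
    for slot in 0..5u64 {
        cache.put(slot, slot);
    }
    // Anything voting on a slot older than 3 can no longer land; drop it.
    prune_stale(&mut cache, |vote_slot| *vote_slot < 3);
    assert_eq!(cache.len(), 2);
}
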
+ pub fn cavey_clean(&mut self, bank: &Bank) { + let lock_result = [Ok(()); 1]; + let error_counters = &mut TransactionErrorMetrics::default(); + let mut to_remove = vec![]; + + for (sig, vote) in self.votes.iter().rev() { + let check = bank.check_transactions( + core::array::from_ref(vote), + &lock_result, + MAX_PROCESSING_AGE, + error_counters, + ); + if check[0].is_err() { + to_remove.push(*sig); } } - } - - /// Allow votes for later slots or the same slot with later timestamp (refreshed votes) - /// We directly compare as options to prioritize votes for same slot with timestamp as - /// Some > None - fn allow_update( - vote: &LatestValidatorVote, - latest_vote: &LatestValidatorVote, - should_replenish_taken_votes: bool, - ) -> bool { - let slot = vote.slot(); - - match slot.cmp(&latest_vote.slot()) { - cmp::Ordering::Less => return false, - cmp::Ordering::Greater => return true, - cmp::Ordering::Equal => {} - }; - // Slots are equal, now check timestamp - match vote.timestamp().cmp(&latest_vote.timestamp()) { - cmp::Ordering::Less => return false, - cmp::Ordering::Greater => return true, - cmp::Ordering::Equal => {} - }; - - // Timestamps are equal, lastly check if vote was taken previously - // and should be replenished - should_replenish_taken_votes && latest_vote.is_vote_taken() - } - - fn weighted_random_order_by_stake(&self) -> impl Iterator { - // Efraimidis and Spirakis algo for weighted random sample without replacement - let mut pubkey_with_weight: Vec<(f64, Pubkey)> = self - .latest_vote_per_vote_pubkey - .keys() - .filter_map(|&pubkey| { - let stake = self.cached_epoch_stakes.vote_account_stake(&pubkey); - if stake == 0 { - None // Ignore votes from unstaked validators - } else { - Some((thread_rng().gen::().powf(1.0 / (stake as f64)), pubkey)) - } - }) - .collect::>(); - pubkey_with_weight.sort_by(|(w1, _), (w2, _)| w2.partial_cmp(w1).unwrap()); - pubkey_with_weight.into_iter().map(|(_, pubkey)| pubkey) + for sig in &to_remove { + let _ = self.votes.pop(sig); + } } - /// Check if `vote` can land in our fork based on `slot_hashes` - fn is_valid_for_our_fork(vote: &LatestValidatorVote, slot_hashes: &Option) -> bool { - let Some(slot_hashes) = slot_hashes else { - // When slot hashes is not present we do not filter - return true; - }; - slot_hashes - .get(&vote.slot()) - .map(|found_hash| *found_hash == vote.hash()) - .unwrap_or(false) - } + /// Try to resolve a vote packet into a RuntimeTransactionView. 
+ fn try_resolve_vote( + &self, + packet: SanitizedTransactionView, + ) -> Option { + // Build RuntimeTransaction from the view + let view = RuntimeTransaction::>::try_from( + packet, + MessageHash::Compute, + None, + ) + .ok()?; - #[cfg(test)] - pub fn get_latest_vote_slot(&self, pubkey: Pubkey) -> Option { - self.latest_vote_per_vote_pubkey - .get(&pubkey) - .map(|l| l.slot()) - } + // Filter non-vote transactions + if !view.is_simple_vote_transaction() { + return None; + } - #[cfg(test)] - fn get_latest_timestamp(&self, pubkey: Pubkey) -> Option { - self.latest_vote_per_vote_pubkey - .get(&pubkey) - .and_then(|l| l.timestamp()) + // Resolve the transaction (votes do not have LUTs) + RuntimeTransactionView::try_from(view, None, &self.reserved_account_keys).ok() } } @@ -336,17 +168,16 @@ impl VoteStorage { pub(crate) mod tests { use { super::*, + agave_transaction_view::transaction_view::SanitizedTransactionView, solana_clock::UnixTimestamp, - solana_epoch_schedule::MINIMUM_SLOTS_PER_EPOCH, - solana_genesis_config::GenesisConfig, solana_hash::Hash, solana_keypair::Keypair, solana_perf::packet::{BytesPacket, PacketFlags}, - solana_runtime::genesis_utils::{self, ValidatorVoteKeypairs}, + solana_runtime::genesis_utils::ValidatorVoteKeypairs, solana_signer::Signer, solana_vote::vote_transaction::new_tower_sync_transaction, solana_vote_program::vote_state::TowerSync, - std::{error::Error, sync::Arc}, + std::sync::Arc, }; pub(crate) fn packet_from_slots( @@ -373,378 +204,122 @@ pub(crate) mod tests { packet } - fn from_slots( - slots: Vec<(u64, u32)>, - vote_source: VoteSource, - keypairs: &ValidatorVoteKeypairs, - timestamp: Option, - ) -> LatestValidatorVote { - let packet = packet_from_slots(slots, keypairs, timestamp); - LatestValidatorVote::new(packet.as_ref(), vote_source, true).unwrap() - } - fn to_sanitized_view(packet: BytesPacket) -> SanitizedTransactionView { SanitizedTransactionView::try_new_sanitized(Arc::new(packet.buffer().to_vec()), false) .unwrap() } #[test] - fn test_reinsert_packets() -> Result<(), Box> { - let node_keypair = Keypair::new(); - let genesis_config = - genesis_utils::create_genesis_config_with_leader(100, &node_keypair.pubkey(), 200) - .genesis_config; - let (bank, _bank_forks) = Bank::new_with_bank_forks_for_tests(&genesis_config); - let vote_keypair = Keypair::new(); - let mut vote = BytesPacket::from_data( - None, - new_tower_sync_transaction( - TowerSync::default(), - Hash::new_unique(), - &node_keypair, - &vote_keypair, - &vote_keypair, - None, - ), - )?; - vote.meta_mut().flags.set(PacketFlags::SIMPLE_VOTE_TX, true); - - let mut vote_storage = VoteStorage::new_for_tests(&[vote_keypair.pubkey()]); - vote_storage.insert_batch(VoteSource::Tpu, std::iter::once(to_sanitized_view(vote))); - assert_eq!(1, vote_storage.len()); - - // Drain all packets, then re-insert. 
- let packets = vote_storage.drain_unprocessed(&bank); - vote_storage.reinsert_packets(packets.into_iter()); - - // All packets should remain in the transaction storage - assert_eq!(1, vote_storage.len()); - Ok(()) - } - - #[test] - fn test_update_latest_vote() { + fn test_insert_and_pop() { let keypair_a = ValidatorVoteKeypairs::new_rand(); - let keypair_b = ValidatorVoteKeypairs::new_rand(); - let mut vote_storage = VoteStorage::new_for_tests(&[ - keypair_a.vote_keypair.pubkey(), - keypair_b.vote_keypair.pubkey(), - ]); - - let vote_a = from_slots(vec![(0, 2), (1, 1)], VoteSource::Gossip, &keypair_a, None); - let vote_b = from_slots( - vec![(0, 5), (4, 2), (9, 1)], - VoteSource::Gossip, - &keypair_b, - None, - ); - assert!(vote_storage - .update_latest_vote(vote_a, false /* should replenish */) - .is_none()); - assert!(vote_storage - .update_latest_vote(vote_b, false /* should replenish */) - .is_none()); - assert_eq!(2, vote_storage.len()); - - assert_eq!( - Some(1), - vote_storage.get_latest_vote_slot(keypair_a.vote_keypair.pubkey()) - ); - assert_eq!( - Some(9), - vote_storage.get_latest_vote_slot(keypair_b.vote_keypair.pubkey()) - ); - - let vote_a = from_slots( - vec![(0, 5), (1, 4), (3, 3), (10, 1)], - VoteSource::Gossip, - &keypair_a, - None, - ); - let vote_b = from_slots( - vec![(0, 5), (4, 2), (6, 1)], - VoteSource::Gossip, - &keypair_b, - None, - ); - - // Evict previous vote - assert_eq!( - 1, - vote_storage - .update_latest_vote(vote_a, false /* should replenish */) - .unwrap() - .slot() - ); - // Drop current vote - assert_eq!( - 6, - vote_storage - .update_latest_vote(vote_b, false /* should replenish */) - .unwrap() - .slot() - ); - - assert_eq!(2, vote_storage.len()); + let genesis_config = solana_runtime::genesis_utils::create_genesis_config_with_leader( + 100, + &Keypair::new().pubkey(), + 200, + ) + .genesis_config; + let (bank, _bank_forks) = + solana_runtime::bank::Bank::new_with_bank_forks_for_tests(&genesis_config); - // Same votes should be no-ops - let vote_a = from_slots( - vec![(0, 5), (1, 4), (3, 3), (10, 1)], - VoteSource::Gossip, - &keypair_a, - None, - ); - let vote_b = from_slots( - vec![(0, 5), (4, 2), (9, 1)], - VoteSource::Gossip, - &keypair_b, - None, - ); - vote_storage.update_latest_vote(vote_a, false /* should replenish */); - vote_storage.update_latest_vote(vote_b, false /* should replenish */); - - assert_eq!(2, vote_storage.len()); - assert_eq!( - 10, - vote_storage - .get_latest_vote_slot(keypair_a.vote_keypair.pubkey()) - .unwrap() - ); - assert_eq!( - 9, - vote_storage - .get_latest_vote_slot(keypair_b.vote_keypair.pubkey()) - .unwrap() - ); + let mut vote_storage = VoteStorage::new(&bank); + assert!(vote_storage.is_empty()); - // Same votes with timestamps should override - let vote_a = from_slots( - vec![(0, 5), (1, 4), (3, 3), (10, 1)], - VoteSource::Gossip, - &keypair_a, - Some(1), - ); - let vote_b = from_slots( - vec![(0, 5), (4, 2), (9, 1)], - VoteSource::Gossip, - &keypair_b, - Some(2), - ); - vote_storage.update_latest_vote(vote_a, false /* should replenish */); - vote_storage.update_latest_vote(vote_b, false /* should replenish */); + // Insert a vote + let vote = to_sanitized_view(packet_from_slots(vec![(1, 1)], &keypair_a, None)); + vote_storage.insert_batch(VoteSource::Tpu, std::iter::once(vote)); + assert_eq!(vote_storage.len(), 1); - assert_eq!(2, vote_storage.len()); - assert_eq!( - Some(1), - vote_storage.get_latest_timestamp(keypair_a.vote_keypair.pubkey()) - ); - assert_eq!( - Some(2), - 
vote_storage.get_latest_timestamp(keypair_b.vote_keypair.pubkey()) - ); + // Pop the vote + let popped = vote_storage.pop(); + assert!(popped.is_some()); + assert!(vote_storage.is_empty()); + } - // Same votes with bigger timestamps should override - let vote_a = from_slots( - vec![(0, 5), (1, 4), (3, 3), (10, 1)], - VoteSource::Gossip, - &keypair_a, - Some(5), - ); - let vote_b = from_slots( - vec![(0, 5), (4, 2), (9, 1)], - VoteSource::Gossip, - &keypair_b, - Some(6), - ); - vote_storage.update_latest_vote(vote_a, false /* should replenish */); - vote_storage.update_latest_vote(vote_b, false /* should replenish */); + #[test] + fn test_reinsert_votes() { + let keypair_a = ValidatorVoteKeypairs::new_rand(); - assert_eq!(2, vote_storage.len()); - assert_eq!( - Some(5), - vote_storage.get_latest_timestamp(keypair_a.vote_keypair.pubkey()) - ); - assert_eq!( - Some(6), - vote_storage.get_latest_timestamp(keypair_b.vote_keypair.pubkey()) - ); + let genesis_config = solana_runtime::genesis_utils::create_genesis_config_with_leader( + 100, + &Keypair::new().pubkey(), + 200, + ) + .genesis_config; + let (bank, _bank_forks) = + solana_runtime::bank::Bank::new_with_bank_forks_for_tests(&genesis_config); - // Same votes with smaller timestamps should not override - let vote_a = || { - from_slots( - vec![(0, 5), (1, 4), (3, 3), (10, 1)], - VoteSource::Gossip, - &keypair_a, - Some(2), - ) - }; - let vote_b = || { - from_slots( - vec![(0, 5), (4, 2), (9, 1)], - VoteSource::Gossip, - &keypair_b, - Some(3), - ) - }; - vote_storage.update_latest_vote(vote_a(), false /* should replenish */); - vote_storage.update_latest_vote(vote_b(), false /* should replenish */); + let mut vote_storage = VoteStorage::new(&bank); - assert_eq!(2, vote_storage.len()); - assert_eq!( - Some(5), - vote_storage.get_latest_timestamp(keypair_a.vote_keypair.pubkey()) - ); - assert_eq!( - Some(6), - vote_storage.get_latest_timestamp(keypair_b.vote_keypair.pubkey()) - ); - - // Drain all latest votes - for packet in vote_storage.latest_vote_per_vote_pubkey.values_mut() { - packet.take_vote().inspect(|_vote| { - vote_storage.num_unprocessed_votes -= 1; - }); - } - assert_eq!(0, vote_storage.len()); + // Insert a vote + let vote = to_sanitized_view(packet_from_slots(vec![(1, 1)], &keypair_a, None)); + vote_storage.insert_batch(VoteSource::Tpu, std::iter::once(vote)); + assert_eq!(vote_storage.len(), 1); - // Same votes with same timestamps should not replenish without flag - vote_storage.update_latest_vote(vote_a(), false /* should replenish */); - vote_storage.update_latest_vote(vote_b(), false /* should replenish */); - assert_eq!(0, vote_storage.len()); + // Pop and reinsert + let popped = vote_storage.pop().unwrap(); + assert!(vote_storage.is_empty()); - // Same votes with same timestamps should replenish with the flag - vote_storage.update_latest_vote(vote_a(), true /* should replenish */); - vote_storage.update_latest_vote(vote_b(), true /* should replenish */); - assert_eq!(0, vote_storage.len()); + vote_storage.reinsert_votes(std::iter::once(popped)); + assert_eq!(vote_storage.len(), 1); } #[test] fn test_clear() { let keypair_a = ValidatorVoteKeypairs::new_rand(); let keypair_b = ValidatorVoteKeypairs::new_rand(); - let keypair_c = ValidatorVoteKeypairs::new_rand(); - let keypair_d = ValidatorVoteKeypairs::new_rand(); - let mut vote_storage = VoteStorage::new_for_tests(&[ - keypair_a.vote_keypair.pubkey(), - keypair_b.vote_keypair.pubkey(), - keypair_c.vote_keypair.pubkey(), - keypair_d.vote_keypair.pubkey(), - ]); - - let 
vote_a = from_slots(vec![(1, 1)], VoteSource::Gossip, &keypair_a, None); - let vote_b = from_slots(vec![(2, 1)], VoteSource::Tpu, &keypair_b, None); - let vote_c = from_slots(vec![(3, 1)], VoteSource::Tpu, &keypair_c, None); - let vote_d = from_slots(vec![(4, 1)], VoteSource::Gossip, &keypair_d, None); - - vote_storage.update_latest_vote(vote_a, false /* should replenish */); - vote_storage.update_latest_vote(vote_b, false /* should replenish */); - vote_storage.update_latest_vote(vote_c, false /* should replenish */); - vote_storage.update_latest_vote(vote_d, false /* should replenish */); - assert_eq!(4, vote_storage.len()); - vote_storage.clear(); - assert_eq!(0, vote_storage.len()); + let genesis_config = solana_runtime::genesis_utils::create_genesis_config_with_leader( + 100, + &Keypair::new().pubkey(), + 200, + ) + .genesis_config; + let (bank, _bank_forks) = + solana_runtime::bank::Bank::new_with_bank_forks_for_tests(&genesis_config); - assert_eq!( - Some(1), - vote_storage.get_latest_vote_slot(keypair_a.vote_keypair.pubkey()) - ); - assert_eq!( - Some(2), - vote_storage.get_latest_vote_slot(keypair_b.vote_keypair.pubkey()) - ); - assert_eq!( - Some(3), - vote_storage.get_latest_vote_slot(keypair_c.vote_keypair.pubkey()) - ); - assert_eq!( - Some(4), - vote_storage.get_latest_vote_slot(keypair_d.vote_keypair.pubkey()) - ); + let mut vote_storage = VoteStorage::new(&bank); + + let vote_a = to_sanitized_view(packet_from_slots(vec![(1, 1)], &keypair_a, None)); + let vote_b = to_sanitized_view(packet_from_slots(vec![(2, 1)], &keypair_b, None)); + + vote_storage.insert_batch(VoteSource::Tpu, vec![vote_a, vote_b].into_iter()); + assert_eq!(vote_storage.len(), 2); + + vote_storage.clear(); + assert!(vote_storage.is_empty()); } #[test] - fn test_insert_batch_unstaked() { + fn test_lru_eviction() { + let genesis_config = solana_runtime::genesis_utils::create_genesis_config_with_leader( + 100, + &Keypair::new().pubkey(), + 200, + ) + .genesis_config; + let (bank, _bank_forks) = + solana_runtime::bank::Bank::new_with_bank_forks_for_tests(&genesis_config); + + // Create a small LRU for testing eviction + let mut vote_storage = VoteStorage { + votes: LruCache::new(2), + deprecate_legacy_vote_ixs: true, + reserved_account_keys: bank.get_reserved_account_keys().clone(), + }; + let keypair_a = ValidatorVoteKeypairs::new_rand(); let keypair_b = ValidatorVoteKeypairs::new_rand(); let keypair_c = ValidatorVoteKeypairs::new_rand(); - let keypair_d = ValidatorVoteKeypairs::new_rand(); - - let vote_b_slot = 2; - let vote_c_slot = 3; - let vote_a = packet_from_slots(vec![(1, 1)], &keypair_a, None); - let vote_b = packet_from_slots(vec![(vote_b_slot, 1)], &keypair_b, None); - let vote_c = packet_from_slots(vec![(vote_c_slot, 1)], &keypair_c, None); - let vote_d = packet_from_slots(vec![(4, 1)], &keypair_d, None); - let votes = || { - vec![ - to_sanitized_view(vote_a.clone()), - to_sanitized_view(vote_b.clone()), - to_sanitized_view(vote_c.clone()), - to_sanitized_view(vote_d.clone()), - ] - }; - let bank_0 = Bank::new_for_tests(&GenesisConfig::default()); - let mut vote_storage = VoteStorage::new(&bank_0); + let vote_a = to_sanitized_view(packet_from_slots(vec![(1, 1)], &keypair_a, None)); + let vote_b = to_sanitized_view(packet_from_slots(vec![(2, 1)], &keypair_b, None)); + let vote_c = to_sanitized_view(packet_from_slots(vec![(3, 1)], &keypair_c, None)); - // Insert batch should filter out all votes as they are unstaked - vote_storage.insert_batch(VoteSource::Tpu, votes().into_iter()); - 
assert!(vote_storage.is_empty()); + // Insert 3 votes into capacity-2 LRU + vote_storage.insert_batch(VoteSource::Tpu, vec![vote_a, vote_b, vote_c].into_iter()); - // Bank in same epoch should not update stakes - let config = - genesis_utils::create_genesis_config_with_vote_accounts(100, &[&keypair_a], vec![200]) - .genesis_config; - let bank_0 = Bank::new_for_tests(&config); - let bank = Bank::new_from_parent( - Arc::new(bank_0), - &Pubkey::new_unique(), - MINIMUM_SLOTS_PER_EPOCH - 1, - ); - assert_eq!(bank.epoch(), 0); - vote_storage.cache_epoch_boundary_info(&bank); - vote_storage.insert_batch(VoteSource::Tpu, votes().into_iter()); - assert!(vote_storage.is_empty()); - - // Bank in next epoch should update stakes - let config = - genesis_utils::create_genesis_config_with_vote_accounts(100, &[&keypair_b], vec![200]) - .genesis_config; - let bank_0 = Bank::new_for_tests(&config); - let bank = Bank::new_from_parent( - Arc::new(bank_0), - &Pubkey::new_unique(), - MINIMUM_SLOTS_PER_EPOCH, - ); - assert_eq!(bank.epoch(), 1); - vote_storage.cache_epoch_boundary_info(&bank); - vote_storage.insert_batch(VoteSource::Gossip, votes().into_iter()); - assert_eq!(vote_storage.len(), 1); - assert_eq!( - vote_storage.get_latest_vote_slot(keypair_b.vote_keypair.pubkey()), - Some(vote_b_slot) - ); - - // Previously unstaked votes are removed - let config = - genesis_utils::create_genesis_config_with_vote_accounts(100, &[&keypair_c], vec![200]) - .genesis_config; - let bank_0 = Bank::new_for_tests(&config); - let bank = Bank::new_from_parent( - Arc::new(bank_0), - &Pubkey::new_unique(), - 3 * MINIMUM_SLOTS_PER_EPOCH, - ); - assert_eq!(bank.epoch(), 2); - vote_storage.cache_epoch_boundary_info(&bank); - assert_eq!(vote_storage.len(), 0); - vote_storage.insert_batch(VoteSource::Tpu, votes().into_iter()); - assert_eq!(vote_storage.len(), 1); - assert_eq!( - vote_storage.get_latest_vote_slot(keypair_c.vote_keypair.pubkey()), - Some(vote_c_slot) - ); + // Should only have 2 votes (oldest evicted) + assert_eq!(vote_storage.len(), 2); } } diff --git a/core/src/banking_stage/vote_worker.rs b/core/src/banking_stage/vote_worker.rs index 78e6e2f2bf..2c1e778579 100644 --- a/core/src/banking_stage/vote_worker.rs +++ b/core/src/banking_stage/vote_worker.rs @@ -13,33 +13,23 @@ use { crate::{ banking_stage::{ consumer::{ExecuteAndCommitTransactionsOutput, ProcessTransactionBatchOutput}, - transaction_scheduler::transaction_state_container::{ - RuntimeTransactionView, SharedBytes, - }, + transaction_scheduler::transaction_state_container::RuntimeTransactionView, }, bundle_stage::bundle_account_locker::BundleAccountLocker, }, - agave_transaction_view::{ - transaction_version::TransactionVersion, transaction_view::SanitizedTransactionView, - }, arrayvec::ArrayVec, crossbeam_channel::RecvTimeoutError, - itertools::Itertools, solana_accounts_db::account_locks::validate_account_locks, solana_clock::FORWARD_TRANSACTIONS_TO_LEADER_AT_SLOT_OFFSET, solana_measure::{measure::Measure, measure_us}, solana_poh::poh_recorder::PohRecorderError, solana_runtime::{bank::Bank, bank_forks::BankForks}, - solana_runtime_transaction::{ - runtime_transaction::RuntimeTransaction, transaction_meta::StaticMeta, - transaction_with_meta::TransactionWithMeta, - }, + solana_runtime_transaction::transaction_with_meta::TransactionWithMeta, solana_svm::{ account_loader::TransactionCheckResult, transaction_error_metrics::TransactionErrorMetrics, }, solana_svm_transaction::svm_message::SVMMessage, solana_time_utils::timestamp, - 
solana_transaction::sanitized::MessageHash, solana_transaction_error::TransactionError, std::{ sync::{ @@ -170,15 +160,19 @@ impl VoteWorker { // load all accounts from address loader; let current_bank = self.bank_forks.read().unwrap().working_bank(); self.storage.cache_epoch_boundary_info(¤t_bank); - self.storage.clear(); + self.storage.cavey_clean(¤t_bank); } BufferedPacketsDecision::ForwardAndHold => { // get current working bank from bank_forks, use it to sanitize transaction and // load all accounts from address loader; let current_bank = self.bank_forks.read().unwrap().working_bank(); self.storage.cache_epoch_boundary_info(¤t_bank); + self.storage.cavey_clean(¤t_bank); + } + BufferedPacketsDecision::Hold => { + let current_bank = self.bank_forks.read().unwrap().working_bank(); + self.storage.cavey_clean(¤t_bank); } - BufferedPacketsDecision::Hold => {} } } @@ -242,58 +236,76 @@ impl VoteWorker { slot_metrics_tracker: &mut LeaderSlotMetricsTracker, reservation_cb: &impl Fn(&Bank) -> u64, ) -> bool { - // Based on the stake distribution present in the supplied bank, drain the unprocessed votes - // from each validator using a weighted random ordering. Votes from validators with - // 0 stake are ignored. - let all_vote_packets = self.storage.drain_unprocessed(bank); + // Simplified vote processing: pop votes from the LRU cache and process + // them in batches. let mut reached_end_of_slot = false; let mut error_counters: TransactionErrorMetrics = TransactionErrorMetrics::default(); - let mut resolved_txs = ArrayVec::<_, UNPROCESSED_BUFFER_STEP_SIZE>::new(); - for chunk in Itertools::chunks(all_vote_packets.into_iter(), UNPROCESSED_BUFFER_STEP_SIZE) - .into_iter() - { - debug_assert!(resolved_txs.is_empty()); + let mut votes_batch = + ArrayVec::::new(); + let mut retry_votes = Vec::new(); + let mut num_votes_processed = 0_usize; + const MAX_VOTES_PER_SLOT: usize = 1000; - // Short circuit if we've reached the end of slot. - if reached_end_of_slot { - self.storage.reinsert_packets(chunk.into_iter()); + debug!( + "Processing vote packets, slot: {}, outstanding: {}", + bank.slot(), + self.storage.len() + ); - continue; + while !self.storage.is_empty() + && !reached_end_of_slot + && num_votes_processed < MAX_VOTES_PER_SLOT + { + votes_batch.clear(); + + // Fill up the batch from the LRU (votes are already resolved) + while !votes_batch.is_full() { + if let Some(vote) = self.storage.pop() { + // Validate against current bank + if validate_vote_for_processing(bank, &vote, &mut error_counters) { + num_votes_processed += 1; + votes_batch.push(vote); + } else { + retry_votes.push(vote); + } + } else { + break; + } } - // Sanitize & resolve our chunk. - for packet in chunk.into_iter() { - if let Some(tx) = - consume_scan_should_process_packet(bank, packet, &mut error_counters) - { - resolved_txs.push(tx); - } + if votes_batch.is_empty() { + break; } if let Some(retryable_vote_indices) = self.do_process_packets( bank, &mut reached_end_of_slot, - &resolved_txs, + &votes_batch, banking_stage_stats, consumed_buffered_packets_count, rebuffered_packet_count, slot_metrics_tracker, reservation_cb, ) { - self.storage.reinsert_packets( - Self::extract_retryable(&mut resolved_txs, retryable_vote_indices) - .map(|tx| tx.into_inner_transaction().into_view()), - ); + self.storage.reinsert_votes(Self::extract_retryable( + &mut votes_batch, + retryable_vote_indices, + )); } else { - self.storage.reinsert_packets( - resolved_txs - .drain(..) 
- .map(|tx| tx.into_inner_transaction().into_view()), - ); + self.storage.reinsert_votes(votes_batch.drain(..)); } } + debug!( + "Done processing votes, slot: {}, outstanding: {}", + bank.slot(), + self.storage.len() + ); + + // Reinsert votes that failed validation for retry + self.storage.reinsert_votes(retry_votes.drain(..)); + reached_end_of_slot } @@ -533,47 +545,27 @@ impl VoteWorker { } } -fn consume_scan_should_process_packet( +/// Validate a pre-resolved vote transaction against the current bank. +fn validate_vote_for_processing( bank: &Bank, - packet: SanitizedTransactionView, + vote: &RuntimeTransactionView, error_counters: &mut TransactionErrorMetrics, -) -> Option { - // Construct the RuntimeTransaction. - let Ok(view) = RuntimeTransaction::>::try_from( - packet, - MessageHash::Compute, - None, - ) else { - return None; - }; - - // Filter invalid votes (should never be triggered). - if !view.is_simple_vote_transaction() { - return None; - } - - // Resolve the transaction (votes do not have LUTs). - debug_assert!(!matches!(view.version(), TransactionVersion::V0)); - let Ok(view) = RuntimeTransactionView::try_from(view, None, bank.get_reserved_account_keys()) - else { - return None; - }; - +) -> bool { // Check the number of locks and whether there are duplicates if validate_account_locks( - view.account_keys(), + vote.account_keys(), bank.get_transaction_account_lock_limit(), ) .is_err() { - return None; + return false; } - if Consumer::check_fee_payer_unlocked(bank, &view, error_counters).is_err() { - return None; + if Consumer::check_fee_payer_unlocked(bank, vote, error_counters).is_err() { + return false; } - Some(view) + true } fn has_reached_end_of_slot(reached_max_poh_height: bool, bank: &Bank) -> bool { @@ -585,13 +577,19 @@ mod tests { use { super::*, crate::banking_stage::{ - tests::create_slow_genesis_config, vote_storage::tests::packet_from_slots, + tests::create_slow_genesis_config, + transaction_scheduler::transaction_state_container::SharedBytes, + vote_storage::tests::packet_from_slots, }, + agave_transaction_view::transaction_view::SanitizedTransactionView, solana_ledger::genesis_utils::GenesisConfigInfo, solana_perf::packet::BytesPacket, solana_runtime::genesis_utils::ValidatorVoteKeypairs, - solana_runtime_transaction::transaction_meta::StaticMeta, + solana_runtime_transaction::{ + runtime_transaction::RuntimeTransaction, transaction_meta::StaticMeta, + }, solana_svm::account_loader::CheckedTransactionDetails, + solana_transaction::sanitized::MessageHash, std::collections::HashSet, }; diff --git a/core/src/cluster_info_vote_listener.rs b/core/src/cluster_info_vote_listener.rs index f5ad3b8994..d3e49163af 100644 --- a/core/src/cluster_info_vote_listener.rs +++ b/core/src/cluster_info_vote_listener.rs @@ -468,22 +468,10 @@ impl ClusterInfoVoteListener { .entry(*vote_pubkey) .or_insert(0); - let root = root_bank.slot(); let mut is_new_vote = false; let vote_slots = vote.slots(); - let accumulate_intermediate_votes = - if let Some(hash) = bank_hash_cache.hash(last_vote_slot, &mut slots_dumped) { - // Only accumulate intermediates if we have replayed the same version being voted on, as - // otherwise we cannot verify the ancestry or the hashes. - // Note: this can only be performed on full tower votes, until deprecate_legacy_vote_ixs feature - // is active we must check the transaction type. 
- hash == last_vote_hash && vote.is_full_tower_vote() - } else { - // If we have not frozen the bank do not accumulate intermediate slots as we cannot verify - // the hashes - false - }; + let accumulate_intermediate_votes = true; let mut get_hash = |slot: Slot| { (slot == last_vote_slot) .then_some(last_vote_hash) @@ -491,7 +479,7 @@ impl ClusterInfoVoteListener { }; // If slot is before the root, ignore it. Iterates from most recent vote slot to oldest. - for slot in vote_slots.iter().filter(|slot| **slot > root).rev() { + for slot in vote_slots.iter().rev() { let slot = *slot; // if we don't have stake information, ignore it @@ -573,14 +561,6 @@ impl ClusterInfoVoteListener { is_new_vote = is_new; } - if slot < *latest_vote_slot { - // Important that we filter after the `last_vote_slot` check, as even if this vote - // is old, we still need to track optimistic confirmations. - // However it is fine to filter the rest of the slots for the propagated check tracking below, - // as the propagated check is able to roll up votes for descendants unlike optimistic confirmation. - continue; - } - diff.entry(slot) .or_default() .entry(*vote_pubkey) From 9499ef330af1e0615c089026c8bfecc94ad6f40e Mon Sep 17 00:00:00 2001 From: cavemanloverboy Date: Thu, 8 Jan 2026 02:56:55 +0000 Subject: [PATCH 13/23] gossip: fix duplicate shred handler buffer pruning --- gossip/src/duplicate_shred_handler.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gossip/src/duplicate_shred_handler.rs b/gossip/src/duplicate_shred_handler.rs index 36e0733d38..29081b8812 100644 --- a/gossip/src/duplicate_shred_handler.rs +++ b/gossip/src/duplicate_shred_handler.rs @@ -187,7 +187,7 @@ impl DuplicateShredHandler { } }); } - if self.buffer.len() < BUFFER_CAPACITY { + if self.buffer.len() <= BUFFER_CAPACITY { return; } // Lookup stake for each entry. From 646cc03529032c44229090cffa83f5f92c845c51 Mon Sep 17 00:00:00 2001 From: cavemanloverboy Date: Thu, 8 Jan 2026 03:16:12 +0000 Subject: [PATCH 14/23] harmonic opticast slot time regulator --- entry/src/poh.rs | 18 ++++++++++++++++++ poh/src/poh_recorder.rs | 25 +++++++++++++++++++++++++ runtime/src/bank.rs | 37 ++++++++++++++++++++++++++++++++++++- 3 files changed, 79 insertions(+), 1 deletion(-) diff --git a/entry/src/poh.rs b/entry/src/poh.rs index faa491b82e..cdb68f00fa 100644 --- a/entry/src/poh.rs +++ b/entry/src/poh.rs @@ -159,6 +159,24 @@ impl Poh { .wrapping_mul(self.hashes_per_tick) .wrapping_add(self.remaining_hashes_until_tick) } + + /// Set the slot start time for consistent timing across consecutive leader slots. + /// When we're leader in back-to-back slots, we use the previous slot's expected end time + /// rather than the current wall clock time to maintain consistent 400ms slot boundaries. + #[inline] + pub fn cavey_set_start_time(&mut self, start_time: Instant) { + info!( + "CAVEY DEBUG: set start time. old slot_start_time {:?}; new start_time {:?}", + self.slot_start_time, start_time + ); + self.slot_start_time = start_time; + } + + /// Get the current slot start time. + #[inline] + pub fn cavey_start_time(&self) -> Instant { + self.slot_start_time + } } pub fn compute_hash_time(hashes_sample_size: u64) -> Duration { diff --git a/poh/src/poh_recorder.rs b/poh/src/poh_recorder.rs index 8775b7c016..44dd5a3665 100644 --- a/poh/src/poh_recorder.rs +++ b/poh/src/poh_recorder.rs @@ -503,6 +503,20 @@ impl PohRecorder { leader_first_tick_height, next_leader_slot, ))); + + // Check if we were leader in the previous slot (consecutive leader slots). 
+ // If so, use the parent's expected end time as our start time for consistent slot pacing. + // This ensures that back-to-back leader slots maintain consistent 400ms timing + // rather than drifting based on actual processing time. + let parent = working_bank.bank.parent(); + let parent_was_our_leader_prev_slot = parent.as_ref().is_some_and(|p| { + p.collector_id() == working_bank.bank.collector_id() + && p.slot() + 1 == working_bank.bank.slot() + }); + if parent_was_our_leader_prev_slot { + self.cavey_set_start_time(parent.as_ref().unwrap().cavey_next_time.0); + } + self.working_bank = Some(working_bank); // TODO: adjust the working_bank.start time based on number of ticks @@ -944,6 +958,17 @@ impl PohRecorder { pub fn clear_bank_for_test(&mut self) { self.clear_bank(true); } + + /// Set the slot start time for consistent timing across consecutive leader slots. + /// Wrapper that delegates to the underlying Poh instance. + pub fn cavey_set_start_time(&self, start_time: Instant) { + self.poh.lock().unwrap().cavey_set_start_time(start_time); + } + + /// Get the current slot start time from the underlying Poh instance. + pub fn cavey_start_time(&self) -> Instant { + self.poh.lock().unwrap().cavey_start_time() + } } #[allow(clippy::type_complexity)] diff --git a/runtime/src/bank.rs b/runtime/src/bank.rs index 7996704db5..5c3243226c 100644 --- a/runtime/src/bank.rs +++ b/runtime/src/bank.rs @@ -182,7 +182,7 @@ use { }, Arc, LockResult, Mutex, RwLock, RwLockReadGuard, RwLockWriteGuard, Weak, }, - time::{Duration, Instant}, + time::{Duration, Instant, SystemTime}, vec, }, }; @@ -594,6 +594,7 @@ impl PartialEq for Bank { block_id, bank_hash_stats: _, epoch_rewards_calculation_cache: _, + cavey_next_time: _, // Ignore new fields explicitly if they do not impact PartialEq. // Adding ".." will remove compile-time checks that if a new field // is added to the struct, this PartialEq is accordingly updated. @@ -941,6 +942,11 @@ pub struct Bank { /// This is used to avoid recalculating the same epoch rewards at epoch boundary. /// The hashmap is keyed by parent_hash. epoch_rewards_calculation_cache: Arc>>>, + + /// The expected start time of the next slot when we're leader in consecutive slots. + /// Used to maintain consistent slot timing across back-to-back leader slots. + /// Tuple of (Instant for internal pacing, SystemTime for external communication). + pub cavey_next_time: (Instant, SystemTime), } #[derive(Debug)] @@ -1145,6 +1151,10 @@ impl Bank { block_id: RwLock::new(None), bank_hash_stats: AtomicBankHashStats::default(), epoch_rewards_calculation_cache: Arc::new(Mutex::new(HashMap::default())), + cavey_next_time: ( + Instant::now() + Duration::from_millis(400), + SystemTime::now() + Duration::from_millis(400), + ), }; bank.transaction_processor = @@ -1392,6 +1402,27 @@ impl Bank { block_id: RwLock::new(None), bank_hash_stats: AtomicBankHashStats::default(), epoch_rewards_calculation_cache: parent.epoch_rewards_calculation_cache.clone(), + // For consecutive leader slots (same collector, consecutive slot numbers), + // chain the timing from the parent's expected end time to maintain consistent + // 400ms slot boundaries. Otherwise, start fresh from now. 
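+            // Illustration: if the parent's cavey_next_time is (T, W), this bank's becomes
+            // (T + 400ms, W + 400ms), so consecutive leader slots are expected to start
+            // exactly 400ms apart regardless of how long each slot actually took to produce.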
+ cavey_next_time: { + let is_consecutive_leader = + *parent.collector_id() == *collector_id && parent.slot() + 1 == slot; + if is_consecutive_leader { + // Chain from parent's expected end time + let parent_next = parent.cavey_next_time; + ( + parent_next.0 + Duration::from_millis(400), + parent_next.1 + Duration::from_millis(400), + ) + } else { + // First slot in leader window or not a leader slot - start fresh + ( + Instant::now() + Duration::from_millis(400), + SystemTime::now() + Duration::from_millis(400), + ) + } + }, }; // cavey unset limits. if we are proposer again this is set when we set_tpu_bank @@ -1888,6 +1919,10 @@ impl Bank { block_id: RwLock::new(None), bank_hash_stats: AtomicBankHashStats::new(&fields.bank_hash_stats), epoch_rewards_calculation_cache: Arc::new(Mutex::new(HashMap::default())), + cavey_next_time: ( + Instant::now() + Duration::from_millis(400), + SystemTime::now() + Duration::from_millis(400), + ), }; // Sanity assertions between bank snapshot and genesis config From 270efb9fb4073cbc09e11de721948897a01e0b5d Mon Sep 17 00:00:00 2001 From: cavemanloverboy Date: Thu, 8 Jan 2026 03:17:03 +0000 Subject: [PATCH 15/23] remove noisy log --- core/src/scheduler_synchronization.rs | 4 ---- 1 file changed, 4 deletions(-) diff --git a/core/src/scheduler_synchronization.rs b/core/src/scheduler_synchronization.rs index b27c08d1a7..a1082a692f 100644 --- a/core/src/scheduler_synchronization.rs +++ b/core/src/scheduler_synchronization.rs @@ -69,10 +69,6 @@ pub fn vanilla_should_schedule(current_slot: u64, in_delegation_period: bool) -> // Check who claimed it - if vanilla claimed, all vanilla threads can consume // If block claimed, no vanilla thread should consume let claimed_by_vanilla = is_block_claim(state); - info!( - "vanilla_should_schedule: slot {} already claimed, by_block={}", - current_slot, claimed_by_vanilla - ); return Some(!claimed_by_vanilla); } From 4a249fdcd7f9ecc1c78d4dd3a45f957cc5e67306 Mon Sep 17 00:00:00 2001 From: cavemanloverboy Date: Thu, 8 Jan 2026 11:56:28 +0000 Subject: [PATCH 16/23] remove stupid branch --- core/src/block_stage/block_consumer.rs | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/core/src/block_stage/block_consumer.rs b/core/src/block_stage/block_consumer.rs index 2befd7b2b3..2c6ab29d50 100644 --- a/core/src/block_stage/block_consumer.rs +++ b/core/src/block_stage/block_consumer.rs @@ -646,11 +646,7 @@ impl BlockConsumer { ExecuteAndCommitTransactionsOutput { transaction_counts, retryable_transaction_indexes: vec![], - commit_transactions_result: if execution_error.is_some() { - Ok(all_commit_details) // Still return partial results - } else { - Ok(all_commit_details) - }, + commit_transactions_result: Ok(all_commit_details), execute_and_commit_timings, error_counters: TransactionErrorMetrics::default(), min_prioritization_fees, From 592cd3e84107139e7c1127f9f08b97b7dfc7f196 Mon Sep 17 00:00:00 2001 From: cavemanloverboy Date: Thu, 8 Jan 2026 19:39:19 +0000 Subject: [PATCH 17/23] add future subscribe_bundles todo --- core/src/proxy/block_engine_stage.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/core/src/proxy/block_engine_stage.rs b/core/src/proxy/block_engine_stage.rs index 360bba2e63..d1586bb01e 100644 --- a/core/src/proxy/block_engine_stage.rs +++ b/core/src/proxy/block_engine_stage.rs @@ -741,6 +741,7 @@ impl BlockEngineStage { .map_err(|e| ProxyError::MethodError(e.to_string()))? 
.into_inner(); + // Harmonic TODO: revert to subscribe_bundles once 3.0 clients are deprecated let subscribe_bundles_stream = timeout( *connection_timeout, client.subscribe_bundles2(block_engine::SubscribeBundlesRequest {}), @@ -1017,7 +1018,10 @@ impl BlockEngineStage { .await { Ok(_) => { - info!("Successfully submitted leader window info for slot {}", slot); + info!( + "Successfully submitted leader window info for slot {}", + slot + ); } Err(e) => { error!("Failed to submit leader window info: {e}"); From b3ed3aac79d39194ea9ac05a610ba78ae08f3c74 Mon Sep 17 00:00:00 2001 From: cavemanloverboy Date: Thu, 8 Jan 2026 19:41:16 +0000 Subject: [PATCH 18/23] add block engine stats --- core/src/proxy/block_engine_stage.rs | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/core/src/proxy/block_engine_stage.rs b/core/src/proxy/block_engine_stage.rs index d1586bb01e..3b4edb2663 100644 --- a/core/src/proxy/block_engine_stage.rs +++ b/core/src/proxy/block_engine_stage.rs @@ -65,6 +65,7 @@ struct BlockEngineStageStats { num_bundle_packets: u64, num_packets: u64, num_empty_packets: u64, + num_blocks: u64, } impl BlockEngineStageStats { @@ -74,7 +75,8 @@ impl BlockEngineStageStats { ("num_bundles", self.num_bundles, i64), ("num_bundle_packets", self.num_bundle_packets, i64), ("num_packets", self.num_packets, i64), - ("num_empty_packets", self.num_empty_packets, i64) + ("num_empty_packets", self.num_empty_packets, i64), + ("num_blocks", self.num_blocks, i64) ); } } @@ -970,7 +972,7 @@ impl BlockEngineStage { fn handle_block_engine_maybe_blocks( maybe_blocks_response: Result, Status>, block_sender: &Sender, - _block_engine_stats: &mut BlockEngineStageStats, + block_engine_stats: &mut BlockEngineStageStats, ) -> crate::proxy::Result<()> { let blocks_response = maybe_blocks_response?.ok_or(ProxyError::GrpcStreamDisconnected)?; for bundle in blocks_response.bundles { @@ -995,6 +997,8 @@ impl BlockEngineStage { block_sender .send(block_bundle) .map_err(|_| ProxyError::PacketForwardError)?; + + block_engine_stats.num_blocks += 1; } } Ok(()) From 4de8849b6f041af037dec1ebf520e3458cfa2a99 Mon Sep 17 00:00:00 2001 From: cavemanloverboy Date: Thu, 8 Jan 2026 19:43:30 +0000 Subject: [PATCH 19/23] remove unnecessary indirection in BlockStage::new --- core/src/block_stage/mod.rs | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/core/src/block_stage/mod.rs b/core/src/block_stage/mod.rs index a8f3e5409a..dc98de8e00 100644 --- a/core/src/block_stage/mod.rs +++ b/core/src/block_stage/mod.rs @@ -62,17 +62,24 @@ impl BlockStage { exit: Arc, prioritization_fee_cache: &Arc, ) -> Self { - Self::start_block_thread( - cluster_info, - bank_forks, - transaction_recorder, - block_receiver, + let committer = Committer::new( transaction_status_sender, replay_vote_sender, - log_messages_bytes_limit, - exit, - prioritization_fee_cache, - ) + prioritization_fee_cache.clone(), + ); + + let consumer = + BlockConsumer::new(committer, transaction_recorder, log_messages_bytes_limit); + + let cluster_info = Arc::clone(&cluster_info) + let block_thread = Builder::new() + .name("solBlockStgTx".to_string()) + .spawn(move || { + Self::process_loop(bank_forks, block_receiver, consumer, exit, cluster_info); + }) + .unwrap(); + + Self { block_thread } } pub fn join(self) -> thread::Result<()> { From a28a413d9ecbc06a3a55ddfb1dc6b11e6cabe086 Mon Sep 17 00:00:00 2001 From: cavemanloverboy Date: Thu, 8 Jan 2026 19:46:26 +0000 Subject: [PATCH 20/23] remove 3 logs in block_failed --- 
core/src/scheduler_synchronization.rs | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/core/src/scheduler_synchronization.rs b/core/src/scheduler_synchronization.rs index a1082a692f..ac859111f9 100644 --- a/core/src/scheduler_synchronization.rs +++ b/core/src/scheduler_synchronization.rs @@ -164,14 +164,10 @@ pub fn block_should_schedule(current_slot: u64, in_delegation_period: bool) -> O /// This atomically clears the block claim and sets the slot to current_slot - 1 /// so that vanilla can claim the current slot. pub fn block_failed(current_slot: u64) -> Option { - info!("block_failed {current_slot}"); - // Atomically revert if we're still on the same slot with block claim let did_revert = SCHEDULER_STATE .fetch_update(Ordering::Release, Ordering::Acquire, |old_state| { - info!("block_failed fetch_update old_state={old_state}"); - - // Only revert if current slot is claimed by block + // Only revert if currnt slot is claimed by block if old_state == SENTINEL { return None; } @@ -190,16 +186,11 @@ pub fn block_failed(current_slot: u64) -> Option { // Revert to previous slot (vanilla claim, so vanilla can now claim current_slot) // Using wrapping_sub to handle slot 0 edge case let new_state = vanilla_claim(current_slot.wrapping_sub(1)); - info!("block_failed reverting to state={new_state}"); Some(new_state) }) .is_ok(); - info!("block_failed did_revert={did_revert}"); - - if did_revert { - info!("block reverted in slot {current_slot}"); - } + info!("block_failed did_revert={did_revert} in slot={current_slot}"); Some(did_revert) } From 165b647ec39f620c8adb87f82e88b24bb53c4a79 Mon Sep 17 00:00:00 2001 From: cavemanloverboy Date: Thu, 8 Jan 2026 19:53:58 +0000 Subject: [PATCH 21/23] fix leader window notification timestamp, now matches poh pacing start time --- core/src/replay_stage.rs | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/core/src/replay_stage.rs b/core/src/replay_stage.rs index 337dffa206..945400f5d1 100644 --- a/core/src/replay_stage.rs +++ b/core/src/replay_stage.rs @@ -2285,6 +2285,8 @@ impl ReplayStage { // new()-ing of its child bank banking_tracer.hash_event(parent.slot(), &parent.last_blockhash(), &parent.hash()); + // Capture collector_id before tpu_bank is moved + let tpu_bank_collector_id = *tpu_bank.collector_id(); update_bank_forks_and_poh_recorder_for_new_tpu_bank( bank_forks, poh_controller, @@ -2292,7 +2294,19 @@ impl ReplayStage { ); // Send leader window notification to block auction house - let window_start_time = std::time::SystemTime::now(); + // Use cavey_next_time for consistent slot timing across consecutive leader slots. + // This must be synced with PoH start time pacing: + // - PoH uses parent.cavey_next_time.0 (Instant) for internal pacing + // - Notification uses parent.cavey_next_time.1 (SystemTime) for external communication + // Both come from the same tuple, ensuring they represent the same logical time point. 
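+            // Note: when chaining, the SystemTime is additionally capped at the current wall
+            // clock (see the min() below), so the notification never reports a window start
+            // later than the moment it is sent.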
+ let parent_was_our_leader_prev_slot = + parent.collector_id() == &tpu_bank_collector_id && parent_slot + 1 == poh_slot; + let window_start_time = if parent_was_our_leader_prev_slot { + // Use parent's expected start time (synced with PoH which uses .0 component) + parent.cavey_next_time.1.min(SystemTime::now()) + } else { + SystemTime::now() + }; match leader_window_sender.try_send((window_start_time, poh_slot)) { Ok(()) => { info!( From 651031de547fe186316f3c1082479fdfd2a7ce30 Mon Sep 17 00:00:00 2001 From: cavemanloverboy Date: Thu, 8 Jan 2026 19:56:09 +0000 Subject: [PATCH 22/23] minor fixes --- core/src/block_stage/mod.rs | 2 +- core/src/replay_stage.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/core/src/block_stage/mod.rs b/core/src/block_stage/mod.rs index dc98de8e00..b6f869a1e2 100644 --- a/core/src/block_stage/mod.rs +++ b/core/src/block_stage/mod.rs @@ -71,7 +71,7 @@ impl BlockStage { let consumer = BlockConsumer::new(committer, transaction_recorder, log_messages_bytes_limit); - let cluster_info = Arc::clone(&cluster_info) + let cluster_info = Arc::clone(&cluster_info); let block_thread = Builder::new() .name("solBlockStgTx".to_string()) .spawn(move || { diff --git a/core/src/replay_stage.rs b/core/src/replay_stage.rs index 945400f5d1..e900d38c28 100644 --- a/core/src/replay_stage.rs +++ b/core/src/replay_stage.rs @@ -89,7 +89,7 @@ use { Arc, RwLock, }, thread::{self, Builder, JoinHandle}, - time::{Duration, Instant}, + time::{Duration, Instant, SystemTime}, }, }; From ed491c57f0fce0e9d33fc7e6d724c3bb7b16976d Mon Sep 17 00:00:00 2001 From: cavemanloverboy Date: Thu, 8 Jan 2026 19:56:56 +0000 Subject: [PATCH 23/23] remove unused start_block_thread fn --- core/src/block_stage/mod.rs | 31 ------------------------------- 1 file changed, 31 deletions(-) diff --git a/core/src/block_stage/mod.rs b/core/src/block_stage/mod.rs index b6f869a1e2..666d1583b9 100644 --- a/core/src/block_stage/mod.rs +++ b/core/src/block_stage/mod.rs @@ -86,37 +86,6 @@ impl BlockStage { self.block_thread.join() } - #[allow(clippy::too_many_arguments)] - fn start_block_thread( - cluster_info: &Arc, - bank_forks: Arc>, - transaction_recorder: TransactionRecorder, - block_receiver: Receiver, - transaction_status_sender: Option, - replay_vote_sender: ReplayVoteSender, - log_message_bytes_limit: Option, - exit: Arc, - prioritization_fee_cache: &Arc, - ) -> Self { - let committer = Committer::new( - transaction_status_sender, - replay_vote_sender, - prioritization_fee_cache.clone(), - ); - - let consumer = BlockConsumer::new(committer, transaction_recorder, log_message_bytes_limit); - - let cluster_info = cluster_info.clone(); - let block_thread = Builder::new() - .name("solBlockStgTx".to_string()) - .spawn(move || { - Self::process_loop(bank_forks, block_receiver, consumer, exit, cluster_info); - }) - .unwrap(); - - Self { block_thread } - } - fn process_loop( bank_forks: Arc>, block_receiver: Receiver,