From 27404c033253447f2ad05e4e2240f660062e4f71 Mon Sep 17 00:00:00 2001 From: Georgios Konstantopoulos Date: Fri, 16 Jan 2026 18:06:59 +0000 Subject: [PATCH 1/3] perf: add heavy benchmarks for persistence, cache, and state root Based on #eng-perf Slack discussions identifying key bottlenecks: - update_history_indices: 26% of persist time - write_trie_updates: 25.4% - write_trie_changesets: 24.2% - Execution cache contention under high throughput New benchmarks: - execution_cache: cache hit rates, contention, TIP-20 patterns - heavy_persistence: accumulated blocks, history indices, state root - heavy_root: parallel vs sync at scale, large storage tries Includes runner script and optimization opportunities doc. --- crates/engine/tree/Cargo.toml | 8 + crates/engine/tree/benches/execution_cache.rs | 371 ++++++++++++++++++ .../engine/tree/benches/heavy_persistence.rs | 308 +++++++++++++++ crates/trie/parallel/Cargo.toml | 4 + crates/trie/parallel/benches/heavy_root.rs | 330 ++++++++++++++++ docs/perf/OPTIMIZATION_OPPORTUNITIES.md | 170 ++++++++ scripts/bench-heavy.sh | 72 ++++ 7 files changed, 1263 insertions(+) create mode 100644 crates/engine/tree/benches/execution_cache.rs create mode 100644 crates/engine/tree/benches/heavy_persistence.rs create mode 100644 crates/trie/parallel/benches/heavy_root.rs create mode 100644 docs/perf/OPTIMIZATION_OPPORTUNITIES.md create mode 100755 scripts/bench-heavy.sh diff --git a/crates/engine/tree/Cargo.toml b/crates/engine/tree/Cargo.toml index 006233c1908..4c3b91daefe 100644 --- a/crates/engine/tree/Cargo.toml +++ b/crates/engine/tree/Cargo.toml @@ -115,6 +115,14 @@ harness = false name = "state_root_task" harness = false +[[bench]] +name = "heavy_persistence" +harness = false + +[[bench]] +name = "execution_cache" +harness = false + [features] test-utils = [ "reth-chain-state/test-utils", diff --git a/crates/engine/tree/benches/execution_cache.rs b/crates/engine/tree/benches/execution_cache.rs new file mode 100644 index 
00000000000..11d593d3b57 --- /dev/null +++ b/crates/engine/tree/benches/execution_cache.rs @@ -0,0 +1,371 @@ +//! Heavy benchmarks for execution cache performance. +//! +//! Based on #eng-perf discussions about: +//! - moka vs mini-moka cache hit rates +//! - Cache contention under high throughput +//! - Pre-warming effectiveness +//! - 4GB fixed_cache allocation overhead +//! +//! Run with: cargo bench -p reth-engine-tree --bench execution_cache + +#![allow(missing_docs)] + +use alloy_primitives::{Address, B256, Bytes, U256}; +use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion, Throughput}; +use mini_moka::sync::CacheBuilder; +use rand::Rng; +use reth_primitives_traits::{Account, Bytecode}; +use revm_primitives::map::DefaultHashBuilder; +use std::{sync::Arc, thread, time::Duration}; + +type Cache = mini_moka::sync::Cache; + +/// Cache configuration matching production settings +struct CacheConfig { + account_cache_size: u64, + storage_cache_size: u64, + code_cache_size: u64, +} + +impl Default for CacheConfig { + fn default() -> Self { + Self { + account_cache_size: 1_000_000, + storage_cache_size: 10_000_000, + code_cache_size: 100_000, + } + } +} + +fn create_caches(config: &CacheConfig) -> (Cache>, Cache<(B256, B256), U256>, Cache>) { + let account_cache = CacheBuilder::new(config.account_cache_size) + .time_to_idle(Duration::from_secs(300)) + .build_with_hasher(DefaultHashBuilder::default()); + + let storage_cache = CacheBuilder::new(config.storage_cache_size) + .time_to_idle(Duration::from_secs(300)) + .build_with_hasher(DefaultHashBuilder::default()); + + let code_cache = CacheBuilder::new(config.code_cache_size) + .time_to_idle(Duration::from_secs(300)) + .build_with_hasher(DefaultHashBuilder::default()); + + (account_cache, storage_cache, code_cache) +} + +/// Benchmark: Cache lookup performance under varying hit rates +fn bench_cache_hit_rates(c: &mut Criterion) { + let mut group = c.benchmark_group("cache/hit_rates"); + 
group.sample_size(50); + + let config = CacheConfig::default(); + let hit_rates = [0.45, 0.78, 0.90, 0.95]; // 45% baseline, 78% with Half-Path, optimized + + for hit_rate in hit_rates { + let id = format!("hit_rate_{:.0}pct", hit_rate * 100.0); + group.throughput(Throughput::Elements(10000)); + + group.bench_function(BenchmarkId::new("storage_lookups", &id), |b| { + b.iter_with_setup( + || { + let (_, storage_cache, _) = create_caches(&config); + let mut rng = rand::rng(); + + // Pre-populate cache with some entries + let cached_keys: Vec<(B256, B256)> = (0..10000) + .map(|_| (B256::random(), B256::random())) + .collect(); + + for (addr, slot) in &cached_keys { + storage_cache.insert((*addr, *slot), U256::from(rng.random::())); + } + + // Create lookup keys - mix of cached and uncached + let num_cached = (10000.0 * hit_rate) as usize; + let mut lookup_keys: Vec<(B256, B256)> = cached_keys[..num_cached].to_vec(); + lookup_keys.extend((0..(10000 - num_cached)).map(|_| (B256::random(), B256::random()))); + + // Shuffle + use rand::seq::SliceRandom; + lookup_keys.shuffle(&mut rng); + + (storage_cache, lookup_keys) + }, + |(cache, keys)| { + let mut hits = 0u64; + let mut misses = 0u64; + for (addr, slot) in keys { + if cache.get(&(addr, slot)).is_some() { + hits += 1; + } else { + misses += 1; + } + } + (hits, misses) + }, + ); + }); + } + + group.finish(); +} + +/// Benchmark: Concurrent cache access (simulating rayon parallel execution) +fn bench_cache_contention(c: &mut Criterion) { + let mut group = c.benchmark_group("cache/contention"); + group.sample_size(20); + + let thread_counts = [1, 4, 8, 16, 32]; // Match rayon thread pool sizes + + for num_threads in thread_counts { + let id = format!("threads_{}", num_threads); + group.throughput(Throughput::Elements(100000)); + + group.bench_function(BenchmarkId::new("concurrent_storage_access", &id), |b| { + b.iter_with_setup( + || { + let config = CacheConfig::default(); + let (_, storage_cache, _) = 
create_caches(&config); + let storage_cache = Arc::new(storage_cache); + + // Pre-populate with hot data + let mut rng = rand::rng(); + for _ in 0..50000 { + let key = (B256::random(), B256::random()); + storage_cache.insert(key, U256::from(rng.random::())); + } + + storage_cache + }, + |cache| { + let handles: Vec<_> = (0..num_threads) + .map(|_| { + let cache = Arc::clone(&cache); + thread::spawn(move || { + let mut rng = rand::rng(); + let ops_per_thread = 100000 / num_threads; + let mut hits = 0u64; + + for _ in 0..ops_per_thread { + let key = (B256::random(), B256::random()); + // 70% reads, 30% writes + if rng.random_bool(0.7) { + if cache.get(&key).is_some() { + hits += 1; + } + } else { + cache.insert(key, U256::from(rng.random::())); + } + } + hits + }) + }) + .collect(); + + let total_hits: u64 = handles.into_iter().map(|h| h.join().unwrap()).sum(); + total_hits + }, + ); + }); + } + + group.finish(); +} + +/// Benchmark: Cache insertion burst (simulating BundleState merge after block) +fn bench_cache_burst_insert(c: &mut Criterion) { + let mut group = c.benchmark_group("cache/burst_insert"); + group.sample_size(20); + + // Sizes from Slack discussions: + // - Normal block: ~2000 storage changes + // - Heavy block: ~10000 storage changes + // - Megablock: ~50000 storage changes + let burst_sizes = [2000, 10000, 50000]; + + for size in burst_sizes { + let id = format!("entries_{}", size); + group.throughput(Throughput::Elements(size as u64)); + + group.bench_function(BenchmarkId::new("storage_burst", &id), |b| { + b.iter_with_setup( + || { + let config = CacheConfig::default(); + let (_, storage_cache, _) = create_caches(&config); + + let mut rng = rand::rng(); + let entries: Vec<((B256, B256), U256)> = (0..size) + .map(|_| { + ((B256::random(), B256::random()), U256::from(rng.random::())) + }) + .collect(); + + (storage_cache, entries) + }, + |(cache, entries)| { + for (key, value) in entries { + cache.insert(key, value); + } + }, + ); + }); + } + + 
group.finish(); +} + +/// Benchmark: Simulating TIP-20 token transfer patterns (Tempo-specific) +/// These trigger mostly cache misses per Slack discussion +fn bench_tip20_cache_pattern(c: &mut Criterion) { + let mut group = c.benchmark_group("cache/tip20_pattern"); + group.sample_size(20); + + // TIP-20 transfers access unique accounts rarely seen before + let transfer_counts = [100, 500, 1000]; + + for num_transfers in transfer_counts { + let id = format!("transfers_{}", num_transfers); + group.throughput(Throughput::Elements(num_transfers as u64)); + + group.bench_function(BenchmarkId::new("unique_account_access", &id), |b| { + b.iter_with_setup( + || { + let config = CacheConfig::default(); + let (account_cache, storage_cache, _) = create_caches(&config); + + // TIP-20: Each transfer accesses sender, recipient, fee token contract + // Most are unique addresses → cache misses + let mut rng = rand::rng(); + let transfer_accounts: Vec<(B256, B256, B256)> = (0..num_transfers) + .map(|_| (B256::random(), B256::random(), B256::random())) + .collect(); + + (account_cache, storage_cache, transfer_accounts) + }, + |(account_cache, storage_cache, transfers)| { + let mut account_misses = 0u64; + let mut storage_misses = 0u64; + + for (sender, recipient, fee_contract) in transfers { + // Check sender account + if account_cache.get(&sender).is_none() { + account_misses += 1; + // Simulate DB lookup and cache population + account_cache.insert(sender, Some(Account::default())); + } + + // Check recipient + if account_cache.get(&recipient).is_none() { + account_misses += 1; + account_cache.insert(recipient, Some(Account::default())); + } + + // Check fee contract storage (balance slot) + let balance_slot = B256::ZERO; + if storage_cache.get(&(fee_contract, balance_slot)).is_none() { + storage_misses += 1; + storage_cache.insert((fee_contract, balance_slot), U256::from(1000)); + } + } + + (account_misses, storage_misses) + }, + ); + }); + } + + group.finish(); +} + +/// 
Benchmark: Pre-warming effectiveness +fn bench_prewarm_effectiveness(c: &mut Criterion) { + let mut group = c.benchmark_group("cache/prewarm"); + group.sample_size(20); + + let block_sizes = [500, 2000, 5000]; + + for num_accounts in block_sizes { + let id = format!("accounts_{}", num_accounts); + + // Without pre-warming + group.bench_function(BenchmarkId::new("cold_execution", &id), |b| { + b.iter_with_setup( + || { + let config = CacheConfig::default(); + let (account_cache, storage_cache, _) = create_caches(&config); + + let mut rng = rand::rng(); + let accounts: Vec = (0..num_accounts).map(|_| B256::random()).collect(); + + (account_cache, storage_cache, accounts) + }, + |(account_cache, storage_cache, accounts)| { + let mut misses = 0u64; + for addr in &accounts { + if account_cache.get(addr).is_none() { + misses += 1; + // Simulate expensive DB lookup + std::hint::black_box(0u64); + account_cache.insert(*addr, Some(Account::default())); + } + // Access storage + let slot = B256::ZERO; + if storage_cache.get(&(*addr, slot)).is_none() { + std::hint::black_box(0u64); + storage_cache.insert((*addr, slot), U256::ZERO); + } + } + misses + }, + ); + }); + + // With pre-warming + group.bench_function(BenchmarkId::new("warm_execution", &id), |b| { + b.iter_with_setup( + || { + let config = CacheConfig::default(); + let (account_cache, storage_cache, _) = create_caches(&config); + + let mut rng = rand::rng(); + let accounts: Vec = (0..num_accounts).map(|_| B256::random()).collect(); + + // Pre-warm the cache + for addr in &accounts { + account_cache.insert(*addr, Some(Account::default())); + storage_cache.insert((*addr, B256::ZERO), U256::ZERO); + } + + (account_cache, storage_cache, accounts) + }, + |(account_cache, storage_cache, accounts)| { + let mut hits = 0u64; + for addr in &accounts { + if account_cache.get(addr).is_some() { + hits += 1; + } + let slot = B256::ZERO; + if storage_cache.get(&(*addr, slot)).is_some() { + hits += 1; + } + } + hits + }, + ); + }); 
+ } + + group.finish(); +} + +criterion_group!( + name = execution_cache; + config = Criterion::default().significance_level(0.05).sample_size(20); + targets = + bench_cache_hit_rates, + bench_cache_contention, + bench_cache_burst_insert, + bench_tip20_cache_pattern, + bench_prewarm_effectiveness +); +criterion_main!(execution_cache); diff --git a/crates/engine/tree/benches/heavy_persistence.rs b/crates/engine/tree/benches/heavy_persistence.rs new file mode 100644 index 00000000000..632998af786 --- /dev/null +++ b/crates/engine/tree/benches/heavy_persistence.rs @@ -0,0 +1,308 @@ +//! Heavy benchmarks targeting persistence bottlenecks identified in #eng-perf. +//! +//! Key bottlenecks from profiling (Jan 2026): +//! - update_history_indices: 26.0% of persist time +//! - write_trie_updates: 25.4% +//! - write_trie_changesets: 24.2% +//! - write_state: 13.8% +//! - write_hashed_state: 10.6% +//! +//! Run with: cargo bench -p reth-engine-tree --bench heavy_persistence + +#![allow(missing_docs)] + +use alloy_primitives::{Address, B256, U256}; +use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion, Throughput}; +use proptest::test_runner::TestRunner; +use rand::Rng; +use reth_chainspec::ChainSpec; +use reth_db_common::init::init_genesis; +use reth_primitives_traits::Account as RethAccount; +use reth_provider::{ + test_utils::create_test_provider_factory_with_chain_spec, + HistoryWriter, StateWriter, TrieWriter, +}; +use reth_trie::{HashedPostState, HashedStorage, StateRoot}; +use reth_trie_db::DatabaseStateRoot; +use revm_primitives::HashMap; +use std::sync::Arc; + +/// Benchmark parameters simulating realistic block sizes +#[derive(Debug, Clone)] +struct PersistenceParams { + /// Number of accounts modified per block + accounts_per_block: usize, + /// Storage slots modified per account + storage_slots_per_account: usize, + /// Number of blocks to accumulate before persistence + blocks_accumulated: usize, +} + +impl PersistenceParams { + fn 
total_state_changes(&self) -> usize { + self.accounts_per_block * self.blocks_accumulated + } + + fn total_storage_changes(&self) -> usize { + self.total_state_changes() * self.storage_slots_per_account + } +} + +/// Generate realistic state changes simulating high-TPS block execution +fn generate_state_changes(params: &PersistenceParams) -> Vec<(HashedPostState, Vec)> { + let mut runner = TestRunner::deterministic(); + let mut rng = runner.rng().clone(); + let mut blocks = Vec::with_capacity(params.blocks_accumulated); + + for _ in 0..params.blocks_accumulated { + let mut hashed_state = HashedPostState::default(); + let mut account_addresses = Vec::with_capacity(params.accounts_per_block); + + for _ in 0..params.accounts_per_block { + let address = Address::random_with(&mut rng); + let hashed_address = alloy_primitives::keccak256(address); + account_addresses.push(hashed_address); + + let account = RethAccount { + balance: U256::from(rng.random::()), + nonce: rng.random::(), + bytecode_hash: if rng.random_bool(0.1) { Some(B256::random()) } else { None }, + }; + + hashed_state = hashed_state.with_accounts(std::iter::once((hashed_address, Some(account)))); + + let storage: HashMap = (0..params.storage_slots_per_account) + .map(|_| (B256::random_with(&mut rng), U256::from(rng.random::()))) + .collect(); + + hashed_state = hashed_state.with_storages(std::iter::once(( + hashed_address, + HashedStorage::from_iter(false, storage), + ))); + } + + blocks.push((hashed_state, account_addresses)); + } + + blocks +} + +/// Benchmark: write_hashed_state performance with varying state sizes +/// Targets the 10.6% bottleneck +fn bench_write_hashed_state(c: &mut Criterion) { + let mut group = c.benchmark_group("persistence/write_hashed_state"); + group.sample_size(10); + + // Scenarios from Slack discussions: + // - Normal block: ~500 accounts, ~10 slots each + // - Heavy DeFi block: ~2000 accounts, ~50 slots each + // - Megablock (1.5 GGas): ~5000 accounts, ~100 slots each + let 
scenarios = vec![ + PersistenceParams { accounts_per_block: 500, storage_slots_per_account: 10, blocks_accumulated: 1 }, + PersistenceParams { accounts_per_block: 2000, storage_slots_per_account: 50, blocks_accumulated: 1 }, + PersistenceParams { accounts_per_block: 5000, storage_slots_per_account: 100, blocks_accumulated: 1 }, + ]; + + for params in scenarios { + let id = format!( + "accounts_{}_slots_{}", + params.accounts_per_block, params.storage_slots_per_account + ); + group.throughput(Throughput::Elements(params.total_storage_changes() as u64)); + + group.bench_function(BenchmarkId::new("single_block", &id), |b| { + b.iter_with_setup( + || { + let factory = create_test_provider_factory_with_chain_spec(Arc::new(ChainSpec::default())); + let _ = init_genesis(&factory).unwrap(); + let blocks = generate_state_changes(&params); + (factory, blocks) + }, + |(factory, blocks)| { + let provider_rw = factory.provider_rw().unwrap(); + for (hashed_state, _) in blocks { + provider_rw.write_hashed_state(&hashed_state.into_sorted()).unwrap(); + } + provider_rw.commit().unwrap(); + }, + ); + }); + } + + group.finish(); +} + +/// Benchmark: Accumulated block persistence (back-to-back scenario) +/// This simulates the O(N²) overlay merge problem identified in Slack +fn bench_accumulated_persistence(c: &mut Criterion) { + let mut group = c.benchmark_group("persistence/accumulated_blocks"); + group.sample_size(10); + + // Simulate the "backpressure flywheel" problem: + // Higher throughput → more blocks accumulate → longer persist time + let scenarios = vec![ + // Normal: ~75 blocks accumulated (from Slack discussion) + PersistenceParams { accounts_per_block: 200, storage_slots_per_account: 20, blocks_accumulated: 75 }, + // Heavy backpressure: ~250 blocks accumulated + PersistenceParams { accounts_per_block: 200, storage_slots_per_account: 20, blocks_accumulated: 250 }, + ]; + + for params in scenarios { + let id = format!("blocks_{}_accounts_{}", params.blocks_accumulated, 
params.accounts_per_block); + group.throughput(Throughput::Elements(params.blocks_accumulated as u64)); + + group.bench_function(BenchmarkId::new("overlay_merge", &id), |b| { + b.iter_with_setup( + || { + let factory = create_test_provider_factory_with_chain_spec(Arc::new(ChainSpec::default())); + let _ = init_genesis(&factory).unwrap(); + let blocks = generate_state_changes(&params); + (factory, blocks) + }, + |(factory, blocks)| { + let provider_rw = factory.provider_rw().unwrap(); + + // Simulate merging all overlays - this is where O(N²) happens + let mut merged = HashedPostState::default(); + for (hashed_state, _) in blocks { + merged.extend(hashed_state); + } + + provider_rw.write_hashed_state(&merged.into_sorted()).unwrap(); + provider_rw.commit().unwrap(); + }, + ); + }); + } + + group.finish(); +} + +/// Benchmark: State root calculation after persistence +/// Targets the 25.4% write_trie_updates + 24.2% write_trie_changesets +fn bench_state_root_after_persist(c: &mut Criterion) { + let mut group = c.benchmark_group("persistence/state_root"); + group.sample_size(10); + + let scenarios = vec![ + PersistenceParams { accounts_per_block: 1000, storage_slots_per_account: 20, blocks_accumulated: 1 }, + PersistenceParams { accounts_per_block: 5000, storage_slots_per_account: 50, blocks_accumulated: 1 }, + ]; + + for params in scenarios { + let id = format!("accounts_{}_slots_{}", params.accounts_per_block, params.storage_slots_per_account); + + group.bench_function(BenchmarkId::new("full_root_calculation", &id), |b| { + b.iter_with_setup( + || { + let factory = create_test_provider_factory_with_chain_spec(Arc::new(ChainSpec::default())); + let _ = init_genesis(&factory).unwrap(); + let blocks = generate_state_changes(&params); + + // Pre-populate state + { + let provider_rw = factory.provider_rw().unwrap(); + for (hashed_state, _) in &blocks { + provider_rw.write_hashed_state(&hashed_state.clone().into_sorted()).unwrap(); + } + provider_rw.commit().unwrap(); + } + + 
(factory, blocks) + }, + |(factory, _blocks)| { + let provider = factory.provider().unwrap(); + let (root, updates) = StateRoot::from_tx(provider.tx_ref()) + .root_with_updates() + .unwrap(); + + // Write trie updates - this is where 25.4% of time goes + let provider_rw = factory.provider_rw().unwrap(); + provider_rw.write_trie_updates(updates).unwrap(); + provider_rw.commit().unwrap(); + + root + }, + ); + }); + } + + group.finish(); +} + +/// Benchmark: History indices update performance +/// Targets the 26.0% update_history_indices bottleneck +fn bench_history_indices(c: &mut Criterion) { + let mut group = c.benchmark_group("persistence/history_indices"); + group.sample_size(10); + + // From Slack: The fix is to derive transitions from in-memory ExecutionOutcome + // instead of scanning AccountChangeSets/StorageChangeSets tables + let scenarios = vec![ + PersistenceParams { accounts_per_block: 500, storage_slots_per_account: 10, blocks_accumulated: 10 }, + PersistenceParams { accounts_per_block: 1000, storage_slots_per_account: 20, blocks_accumulated: 50 }, + ]; + + for params in scenarios { + let id = format!( + "blocks_{}_accounts_{}_slots_{}", + params.blocks_accumulated, params.accounts_per_block, params.storage_slots_per_account + ); + group.throughput(Throughput::Elements(params.total_state_changes() as u64)); + + group.bench_function(BenchmarkId::new("insert_indices", &id), |b| { + b.iter_with_setup( + || { + let factory = create_test_provider_factory_with_chain_spec(Arc::new(ChainSpec::default())); + let _ = init_genesis(&factory).unwrap(); + + // Build history index data structure (simulating in-memory derivation) + // Use Address type for account transitions as required by HistoryWriter + let mut account_transitions: std::collections::BTreeMap<Address, Vec<u64>> = + std::collections::BTreeMap::new(); + let mut storage_transitions: std::collections::BTreeMap<(Address, B256), Vec<u64>> = + std::collections::BTreeMap::new(); + + let mut rng = rand::rng(); + for block_idx in 
0..params.blocks_accumulated { + let block_number = block_idx as u64 + 1; + for _ in 0..params.accounts_per_block { + let address = Address::random_with(&mut rng); + account_transitions.entry(address).or_default().push(block_number); + // Add some storage transitions + for i in 0..params.storage_slots_per_account { + let slot = B256::from(U256::from(i)); + storage_transitions.entry((address, slot)).or_default().push(block_number); + } + } + } + + (factory, account_transitions, storage_transitions) + }, + |(factory, account_transitions, storage_transitions)| { + let provider_rw = factory.provider_rw().unwrap(); + + // This simulates the optimized path: insert_account_history_index + // and insert_storage_history_index from in-memory data + provider_rw.insert_account_history_index(account_transitions).unwrap(); + provider_rw.insert_storage_history_index(storage_transitions).unwrap(); + + provider_rw.commit().unwrap(); + }, + ); + }); + } + + group.finish(); +} + +criterion_group!( + name = heavy_persistence; + config = Criterion::default().significance_level(0.05).sample_size(10); + targets = + bench_write_hashed_state, + bench_accumulated_persistence, + bench_state_root_after_persist, + bench_history_indices +); +criterion_main!(heavy_persistence); diff --git a/crates/trie/parallel/Cargo.toml b/crates/trie/parallel/Cargo.toml index 9fb882b44a5..fad1f145a2f 100644 --- a/crates/trie/parallel/Cargo.toml +++ b/crates/trie/parallel/Cargo.toml @@ -69,3 +69,7 @@ test-utils = [ [[bench]] name = "root" harness = false + +[[bench]] +name = "heavy_root" +harness = false diff --git a/crates/trie/parallel/benches/heavy_root.rs b/crates/trie/parallel/benches/heavy_root.rs new file mode 100644 index 00000000000..27158977ee4 --- /dev/null +++ b/crates/trie/parallel/benches/heavy_root.rs @@ -0,0 +1,330 @@ +//! Heavy benchmarks for parallel state root calculation. +//! +//! Based on #eng-perf profiling showing: +//! 
- State root calculation is 50-80% of validation time for 3s blocks +//! - Parallel vs sync root has significant delta at scale +//! - Sparse trie updates vs full recalculation trade-offs +//! +//! Run with: cargo bench -p reth-trie-parallel --bench heavy_root + +#![allow(missing_docs, unreachable_pub)] + +use alloy_primitives::{B256, U256}; +use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion, Throughput}; +use proptest::{prelude::*, strategy::ValueTree, test_runner::TestRunner}; +use proptest_arbitrary_interop::arb; +use reth_primitives_traits::Account; +use reth_provider::{ + providers::OverlayStateProviderFactory, test_utils::create_test_provider_factory, StateWriter, + TrieWriter, +}; +use reth_trie::{ + hashed_cursor::HashedPostStateCursorFactory, HashedPostState, HashedStorage, StateRoot, + TrieInput, +}; +use reth_trie_db::{ChangesetCache, DatabaseHashedCursorFactory, DatabaseStateRoot}; +use reth_trie_parallel::root::ParallelStateRoot; +use std::collections::HashMap; + +/// Benchmark parameters for megablock scenarios +#[derive(Debug, Clone)] +struct StateRootParams { + /// Total accounts in database + db_accounts: usize, + /// Storage slots per account + storage_per_account: usize, + /// Percentage of accounts updated + update_percentage: f64, +} + +impl StateRootParams { + fn updated_accounts(&self) -> usize { + (self.db_accounts as f64 * self.update_percentage) as usize + } +} + +fn generate_heavy_test_data(params: &StateRootParams) -> (HashedPostState, HashedPostState) { + let mut runner = TestRunner::deterministic(); + + let db_state = proptest::collection::hash_map( + any::(), + ( + arb::().prop_filter("non empty account", |a| !a.is_empty()), + proptest::collection::hash_map( + any::(), + any::().prop_filter("non zero value", |v| !v.is_zero()), + params.storage_per_account, + ), + ), + params.db_accounts, + ) + .new_tree(&mut runner) + .unwrap() + .current(); + + let keys = db_state.keys().copied().collect::>(); + let num_updates 
= params.updated_accounts(); + let keys_to_update = proptest::sample::subsequence(keys, num_updates) + .new_tree(&mut runner) + .unwrap() + .current(); + + let updated_storages = keys_to_update + .into_iter() + .map(|address| { + let (_, storage) = db_state.get(&address).unwrap(); + let slots = storage.keys().copied().collect::<Vec<_>>(); + let slots_to_update = proptest::sample::subsequence(slots, params.storage_per_account / 2) + .new_tree(&mut runner) + .unwrap() + .current(); + ( + address, + slots_to_update + .into_iter() + .map(|slot| (slot, any::<U256>().new_tree(&mut runner).unwrap().current())) + .collect::<HashMap<_, _>>(), + ) + }) + .collect::<HashMap<_, _>>(); + + ( + HashedPostState::default() + .with_accounts( + db_state.iter().map(|(address, (account, _))| (*address, Some(*account))), + ) + .with_storages(db_state.into_iter().map(|(address, (_, storage))| { + (address, HashedStorage::from_iter(false, storage)) + })), + HashedPostState::default().with_storages( + updated_storages + .into_iter() + .map(|(address, storage)| (address, HashedStorage::from_iter(false, storage))), + ), + ) +} + +/// Benchmark: Sync vs Parallel state root at various scales +fn bench_sync_vs_parallel(c: &mut Criterion) { + let mut group = c.benchmark_group("state_root/sync_vs_parallel"); + group.sample_size(10); + + // Scenarios based on Slack discussions: + // - Normal block: 3000 accounts + // - Heavy block: 10000 accounts + // - Megablock (1.5 GGas): 30000+ accounts + let scenarios = vec![ + StateRootParams { db_accounts: 3000, storage_per_account: 100, update_percentage: 0.5 }, + StateRootParams { db_accounts: 10000, storage_per_account: 100, update_percentage: 0.3 }, + StateRootParams { db_accounts: 30000, storage_per_account: 50, update_percentage: 0.2 }, + ]; + + for params in scenarios { + let (db_state, updated_state) = generate_heavy_test_data(&params); + let provider_factory = create_test_provider_factory(); + + // Setup: write initial state + { + let provider_rw = provider_factory.provider_rw().unwrap(); + 
provider_rw.write_hashed_state(&db_state.into_sorted()).unwrap(); + let (_, updates) = StateRoot::from_tx(provider_rw.tx_ref()).root_with_updates().unwrap(); + provider_rw.write_trie_updates(updates).unwrap(); + provider_rw.commit().unwrap(); + } + + let id = format!( + "db_{}_updated_{}", + params.db_accounts, + params.updated_accounts() + ); + + let changeset_cache = ChangesetCache::new(); + let factory = OverlayStateProviderFactory::new(provider_factory.clone(), changeset_cache); + + // Sync state root + group.bench_function(BenchmarkId::new("sync", &id), |b| { + b.iter_with_setup( + || { + let sorted_state = updated_state.clone().into_sorted(); + let prefix_sets = updated_state.construct_prefix_sets().freeze(); + let provider = provider_factory.provider().unwrap(); + (provider, sorted_state, prefix_sets) + }, + |(provider, sorted_state, prefix_sets)| { + let hashed_cursor_factory = HashedPostStateCursorFactory::new( + DatabaseHashedCursorFactory::new(provider.tx_ref()), + &sorted_state, + ); + StateRoot::from_tx(provider.tx_ref()) + .with_hashed_cursor_factory(hashed_cursor_factory) + .with_prefix_sets(prefix_sets) + .root() + }, + ); + }); + + // Parallel state root + group.bench_function(BenchmarkId::new("parallel", &id), |b| { + b.iter_with_setup( + || { + let trie_input = TrieInput::from_state(updated_state.clone()); + ParallelStateRoot::new(factory.clone(), trie_input.prefix_sets.freeze()) + }, + |calculator| calculator.incremental_root(), + ); + }); + } + + group.finish(); +} + +/// Benchmark: Incremental updates (sparse trie) at scale +fn bench_incremental_updates(c: &mut Criterion) { + let mut group = c.benchmark_group("state_root/incremental"); + group.sample_size(10); + + // Test repeated incremental updates (simulating back-to-back blocks) + let num_updates_sequence = [5, 10, 25, 50]; + + for num_updates in num_updates_sequence { + let params = StateRootParams { + db_accounts: 5000, + storage_per_account: 50, + update_percentage: 0.1, + }; + + let id 
= format!("sequential_updates_{}", num_updates); + group.throughput(Throughput::Elements(num_updates as u64)); + + group.bench_function(BenchmarkId::new("sparse_trie", &id), |b| { + b.iter_with_setup( + || { + let (db_state, _) = generate_heavy_test_data(&params); + let provider_factory = create_test_provider_factory(); + + { + let provider_rw = provider_factory.provider_rw().unwrap(); + provider_rw.write_hashed_state(&db_state.into_sorted()).unwrap(); + let (_, updates) = + StateRoot::from_tx(provider_rw.tx_ref()).root_with_updates().unwrap(); + provider_rw.write_trie_updates(updates).unwrap(); + provider_rw.commit().unwrap(); + } + + // Generate sequence of updates + let updates: Vec<HashedPostState> = (0..num_updates) + .map(|_| { + let (_, update) = generate_heavy_test_data(&StateRootParams { + db_accounts: 500, + storage_per_account: 20, + update_percentage: 1.0, + }); + update + }) + .collect(); + + (provider_factory, updates) + }, + |(provider_factory, updates)| { + let changeset_cache = ChangesetCache::new(); + let factory = OverlayStateProviderFactory::new(provider_factory, changeset_cache); + + let mut roots = Vec::with_capacity(updates.len()); + for update in updates { + let trie_input = TrieInput::from_state(update); + let calculator = + ParallelStateRoot::new(factory.clone(), trie_input.prefix_sets.freeze()); + roots.push(calculator.incremental_root().unwrap()); + } + roots + }, + ); + }); + } + + group.finish(); +} + +/// Benchmark: Large storage trie updates (contract-heavy blocks) +fn bench_large_storage_tries(c: &mut Criterion) { + let mut group = c.benchmark_group("state_root/large_storage"); + group.sample_size(10); + + // Simulate contracts with large storage (DEX, AMM, etc.) 
+ let storage_sizes = [1000, 5000, 10000]; + + for storage_size in storage_sizes { + let id = format!("slots_{}", storage_size); + group.throughput(Throughput::Elements(storage_size as u64)); + + group.bench_function(BenchmarkId::new("single_contract", &id), |b| { + b.iter_with_setup( + || { + let mut runner = TestRunner::deterministic(); + let contract_address = any::().new_tree(&mut runner).unwrap().current(); + + let storage: HashMap = proptest::collection::hash_map( + any::(), + any::().prop_filter("non zero", |v| !v.is_zero()), + storage_size, + ) + .new_tree(&mut runner) + .unwrap() + .current(); + + let db_state = HashedPostState::default() + .with_accounts(std::iter::once((contract_address, Some(Account::default())))) + .with_storages(std::iter::once(( + contract_address, + HashedStorage::from_iter(false, storage.clone()), + ))); + + let provider_factory = create_test_provider_factory(); + { + let provider_rw = provider_factory.provider_rw().unwrap(); + provider_rw.write_hashed_state(&db_state.into_sorted()).unwrap(); + let (_, updates) = + StateRoot::from_tx(provider_rw.tx_ref()).root_with_updates().unwrap(); + provider_rw.write_trie_updates(updates).unwrap(); + provider_rw.commit().unwrap(); + } + + // Update half the storage + let update_storage: HashMap = storage + .into_iter() + .take(storage_size / 2) + .map(|(k, _)| (k, U256::from(999))) + .collect(); + + let update_state = HashedPostState::default().with_storages(std::iter::once(( + contract_address, + HashedStorage::from_iter(false, update_storage), + ))); + + (provider_factory, update_state) + }, + |(provider_factory, update_state)| { + let changeset_cache = ChangesetCache::new(); + let factory = OverlayStateProviderFactory::new(provider_factory, changeset_cache); + + let trie_input = TrieInput::from_state(update_state); + let calculator = + ParallelStateRoot::new(factory, trie_input.prefix_sets.freeze()); + calculator.incremental_root() + }, + ); + }); + } + + group.finish(); +} + 
+criterion_group!( + name = heavy_root; + config = Criterion::default().significance_level(0.05).sample_size(10); + targets = + bench_sync_vs_parallel, + bench_incremental_updates, + bench_large_storage_tries +); +criterion_main!(heavy_root); diff --git a/docs/perf/OPTIMIZATION_OPPORTUNITIES.md b/docs/perf/OPTIMIZATION_OPPORTUNITIES.md new file mode 100644 index 00000000000..5c7354854e4 --- /dev/null +++ b/docs/perf/OPTIMIZATION_OPPORTUNITIES.md @@ -0,0 +1,170 @@ +# Reth/Tempo Performance Optimization Opportunities + +Based on #eng-perf and #ai-agent Slack channel analysis (Jan 2026). + +## Current Bottleneck Analysis + +From profiling sessions, the persistence phase breakdown is: + +| Component | % Time | Priority | +|-----------|--------|----------| +| `update_history_indices` | 26.0% | **Critical** | +| `write_trie_updates` | 25.4% | High | +| `write_trie_changesets` | 24.2% | High | +| `write_state` | 13.8% | Medium | +| `write_hashed_state` | 10.6% | Medium | + +## Critical Optimizations + +### 1. History Indices from In-Memory Data (26% improvement potential) + +**Problem**: Currently scans `AccountChangeSets`/`StorageChangeSets` tables via DB cursors. 
+ +**Solution**: Derive transitions directly from in-memory `ExecutionOutcome`: + +```rust +// Before the loop, accumulate transitions: +let mut account_transitions: BTreeMap> = BTreeMap::new(); +let mut storage_transitions: BTreeMap<(Address, B256), Vec> = BTreeMap::new(); + +// Inside the per-block loop, extract from execution_outcome.bundle.reverts +for (block_idx, block_reverts) in execution_output.bundle.reverts.iter().enumerate() { + let block_number = execution_output.first_block() + block_idx as u64; + for (address, account_revert) in block_reverts { + account_transitions.entry(*address).or_default().push(block_number); + for storage_key in account_revert.storage.keys() { + let key = B256::new(storage_key.to_be_bytes()); + storage_transitions.entry((*address, key)).or_default().push(block_number); + } + } +} + +// After loop, replace update_history_indices(range) with: +self.insert_account_history_index(account_transitions)?; +self.insert_storage_history_index(storage_transitions)?; +``` + +**Benchmark**: `cargo bench -p reth-engine-tree --bench heavy_persistence -- history_indices` + +### 2. Batch Trie Writes Across Blocks (~50% improvement for b2b) + +**Problem**: `write_trie_changesets` and `write_trie_updates_sorted` called per-block. + +**Solution**: Accumulate overlay across blocks, write once at end: + +```rust +let mut trie_overlay = TrieUpdatesSorted::default(); + +for block in blocks { + self.write_trie_changesets(block_number, &trie_data.trie_updates, Some(&trie_overlay))?; + trie_overlay.extend(&trie_data.trie_updates); +} +// Single write at end +self.write_trie_updates_sorted(&trie_overlay)?; +``` + +**Benchmark**: `cargo bench -p reth-engine-tree --bench heavy_persistence -- accumulated` + +### 3. O(N²) Overlay Merge Fix (PR #20774) + +**Problem**: `wait_cloned()` iterates through ALL ancestors for each block → O(N²) complexity. 
+ +**Results achieved**: +- p50: -8.40% +- p90: -42.46% +- p99: -60.30% +- Gas/Second: +73.34% + +**Benchmark**: `cargo bench -p reth-engine-tree --bench heavy_persistence -- overlay_merge` + +## High Priority Optimizations + +### 4. MDBX Configuration Tuning + +**Problem**: msync taking seconds on new reth boxes (99% of samples in `mdbx_txn_commit_ex`). + +**Potential solutions**: +- Expose `txn_dp_limit` - forces dirty pages to spill during transaction +- Expose `sync_bytes`/`sync_period` - triggers intermediate flushes +- Lower `spill_max_denominator` for more aggressive spilling + +```rust +// Already in libmdbx-rs but not exposed to CLI +mdbx_env_set_syncbytes(env, 100*1024*1024); // Flush every 100MB +mdbx_env_set_syncperiod(env, 16384); // Or every ~0.25s +``` + +### 5. Execution Cache Improvements + +**Problem**: moka cache contention under high throughput, expensive cache misses. + +**Findings**: +- TIP-20 transfers trigger mostly cache misses (unique accounts) +- fixed_cache (4GB) allocation overhead when cache misses +- Pre-warming effectiveness varies + +**Benchmark**: `cargo bench -p reth-engine-tree --bench execution_cache` + +**Key metrics to track**: +- Cache hit rate (baseline 45%, target 78%+ like Half-Path) +- Contention under 8-32 threads +- Burst insert latency + +### 6. Parallel State Root Scaling + +**Problem**: State root calculation is 50-80% of validation time for large blocks. + +**Benchmark scenarios**: +- Normal block: 3,000 accounts +- Heavy block: 10,000 accounts +- Megablock (1.5 GGas): 30,000+ accounts + +**Benchmark**: `cargo bench -p reth-trie-parallel --bench heavy_root` + +## Medium Priority Optimizations + +### 7. Pre-sort Storage Tries Once + +Currently sorted every call at multiple locations: +```rust +let mut storage_updates = trie_updates.storage_tries_ref().iter().collect::>(); +storage_updates.sort_unstable_by(|a, b| a.0.cmp(b.0)); +``` + +Could use `BTreeMap` internally or pre-sorted Vec in `TrieUpdatesSorted`. + +### 8. 
Remove Expensive HashMap Clone + +Recent change introduced a clone showing up in profiles (3s block profile). + +## Benchmarking Commands + +```bash +# Run all heavy benchmarks on an idle box +./scripts/bench-heavy.sh ./results-$(date +%Y%m%d) + +# Run specific benchmark groups +cargo bench -p reth-engine-tree --bench execution_cache +cargo bench -p reth-engine-tree --bench heavy_persistence +cargo bench -p reth-trie-parallel --bench heavy_root + +# Compare against baseline +cargo bench -p reth-engine-tree --bench execution_cache -- --baseline heavy-cache + +# Profile with samply +samply record -- cargo bench -p reth-engine-tree --bench execution_cache -- cache/contention +``` + +## Profiling Resources + +- [How to benchmark and profile Reth](https://www.notion.so/How-to-benchmark-and-profile-Reth-21532f2c34848058a2f6efc5f852603d) +- [Perf onboarding doc](https://docs.google.com/document/d/1pgbWk6wjd3p3oGy2SC2mWiGAvzlcIoiB-8g20fm6Acc) +- Firefox Profiler compare: https://profiler.firefox.com/compare/ + +## Related PRs + +- [#20774](https://github.com/paradigmxyz/reth/pull/20774) - Overlay reuse optimization +- [#20616](https://github.com/paradigmxyz/reth/pull/20616) - Subscribe to persisted block +- [#20520](https://github.com/paradigmxyz/reth/pull/20520) - fixed_cache execution cache +- [#20405](https://github.com/paradigmxyz/reth/pull/20405) - Defer transaction pool notifications +- [#20398](https://github.com/paradigmxyz/reth/pull/20398) - Use RwLock for transaction pool listeners diff --git a/scripts/bench-heavy.sh b/scripts/bench-heavy.sh new file mode 100755 index 00000000000..d4e086018c6 --- /dev/null +++ b/scripts/bench-heavy.sh @@ -0,0 +1,72 @@ +#!/bin/bash +# Heavy Performance Benchmarks for Reth/Tempo +# +# Based on #eng-perf Slack discussions identifying key bottlenecks: +# - update_history_indices: 26% of persist time +# - write_trie_updates: 25.4% +# - write_trie_changesets: 24.2% +# - Execution cache contention under high throughput +# +# Run this on 
an idle reth box (check with: reth-box-status) +# +# Usage: ./scripts/bench-heavy.sh [output_dir] + +set -euo pipefail + +OUTPUT_DIR="${1:-./benchmark-results}" +mkdir -p "$OUTPUT_DIR" + +echo "=============================================" +echo "Heavy Performance Benchmarks for Reth/Tempo" +echo "=============================================" +echo "Output directory: $OUTPUT_DIR" +echo "Started at: $(date -u +%Y-%m-%dT%H:%M:%SZ)" +echo "" + +# Build with optimizations +echo "[1/5] Building benchmarks with release profile..." +cargo build --release --benches -p reth-engine-tree -p reth-trie-parallel -p reth-trie-sparse + +# Run execution cache benchmarks +echo "" +echo "[2/5] Running execution cache benchmarks..." +echo " - Cache hit rate simulations (45%, 78%, 90%, 95%)" +echo " - Concurrent access contention (1-32 threads)" +echo " - Burst insert patterns (2K-50K entries)" +echo " - TIP-20 transfer patterns" +echo " - Pre-warming effectiveness" +cargo bench -p reth-engine-tree --bench execution_cache -- --save-baseline heavy-cache 2>&1 | tee "$OUTPUT_DIR/execution_cache.log" + +# Run heavy persistence benchmarks +echo "" +echo "[3/5] Running heavy persistence benchmarks..." +echo " - write_hashed_state with 500-5000 accounts" +echo " - Accumulated block persistence (75-250 blocks)" +echo " - State root calculation after persistence" +echo " - History indices insertion" +cargo bench -p reth-engine-tree --bench heavy_persistence -- --save-baseline heavy-persist 2>&1 | tee "$OUTPUT_DIR/heavy_persistence.log" + +# Run parallel state root benchmarks +echo "" +echo "[4/5] Running parallel state root benchmarks..." 
+echo " - Sync vs Parallel root (3K-30K accounts)" +echo " - Incremental updates (5-50 sequential)" +echo " - Large storage tries (1K-10K slots)" +cargo bench -p reth-trie-parallel --bench heavy_root -- --save-baseline heavy-root 2>&1 | tee "$OUTPUT_DIR/heavy_root.log" + +# Run existing state root task benchmarks +echo "" +echo "[5/5] Running state root task benchmarks..." +cargo bench -p reth-engine-tree --bench state_root_task -- --save-baseline state-root 2>&1 | tee "$OUTPUT_DIR/state_root_task.log" + +echo "" +echo "=============================================" +echo "Benchmarks completed at: $(date -u +%Y-%m-%dT%H:%M:%SZ)" +echo "Results saved to: $OUTPUT_DIR" +echo "" +echo "To compare baselines later:" +echo " cargo bench -p reth-engine-tree --bench execution_cache -- --baseline heavy-cache" +echo "" +echo "To generate HTML reports:" +echo " open target/criterion/*/report/index.html" +echo "=============================================" From 6a2467d04a26b7f82ad0d396a0c93262a0583e40 Mon Sep 17 00:00:00 2001 From: Georgios Konstantopoulos Date: Fri, 16 Jan 2026 18:22:55 +0000 Subject: [PATCH 2/3] perf(storage): batch trie updates across blocks in save_blocks Previously, `write_trie_updates_sorted` was called once per block in the save_blocks loop. This opened/closed cursors N times for N blocks. This change accumulates trie updates across all blocks using `extend_ref` and writes them in a single batch at the end. This reduces: - Cursor open/close overhead from N to 1 - MDBX transaction overhead For back-to-back block processing with 75-250 accumulated blocks (per #eng-perf profiling), this significantly reduces the ~25% of persist time spent in write_trie_updates. Expected improvement: ~50% reduction in write_trie_updates for b2b scenarios. 
--- .../provider/src/providers/database/provider.rs | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/crates/storage/provider/src/providers/database/provider.rs b/crates/storage/provider/src/providers/database/provider.rs index af644a47a9b..6154336f6f9 100644 --- a/crates/storage/provider/src/providers/database/provider.rs +++ b/crates/storage/provider/src/providers/database/provider.rs @@ -522,6 +522,10 @@ impl DatabaseProvider = None; + for (i, block) in blocks.iter().enumerate() { let recovered_block = block.recovered_block(); @@ -556,12 +560,23 @@ impl DatabaseProvider acc.extend_ref(&trie_data.trie_updates), + None => accumulated_trie_updates = Some((*trie_data.trie_updates).clone()), + } timings.write_trie_updates += start.elapsed(); } } + // Write all accumulated trie updates in a single batch + if let Some(trie_updates) = &accumulated_trie_updates { + let start = Instant::now(); + self.write_trie_updates_sorted(trie_updates)?; + timings.write_trie_updates += start.elapsed(); + } + // Full mode: update history indices if save_mode.with_state() { let start = Instant::now(); From cc80f6746366a9496e0300d82d94bad54ff080b5 Mon Sep 17 00:00:00 2001 From: Georgios Konstantopoulos Date: Fri, 16 Jan 2026 18:29:59 +0000 Subject: [PATCH 3/3] chore: remove unrelated benchmark/docs changes per review feedback Amp-Thread-ID: https://ampcode.com/threads/T-019bc811-0850-7320-902c-52e64a671eb5 Co-authored-by: Amp --- crates/engine/tree/Cargo.toml | 8 - crates/engine/tree/benches/execution_cache.rs | 371 ------------------ .../engine/tree/benches/heavy_persistence.rs | 308 --------------- crates/trie/parallel/Cargo.toml | 4 - crates/trie/parallel/benches/heavy_root.rs | 330 ---------------- docs/perf/OPTIMIZATION_OPPORTUNITIES.md | 170 -------- scripts/bench-heavy.sh | 72 ---- 7 files changed, 1263 deletions(-) delete mode 100644 crates/engine/tree/benches/execution_cache.rs delete mode 100644 crates/engine/tree/benches/heavy_persistence.rs 
delete mode 100644 crates/trie/parallel/benches/heavy_root.rs delete mode 100644 docs/perf/OPTIMIZATION_OPPORTUNITIES.md delete mode 100755 scripts/bench-heavy.sh diff --git a/crates/engine/tree/Cargo.toml b/crates/engine/tree/Cargo.toml index 4c3b91daefe..006233c1908 100644 --- a/crates/engine/tree/Cargo.toml +++ b/crates/engine/tree/Cargo.toml @@ -115,14 +115,6 @@ harness = false name = "state_root_task" harness = false -[[bench]] -name = "heavy_persistence" -harness = false - -[[bench]] -name = "execution_cache" -harness = false - [features] test-utils = [ "reth-chain-state/test-utils", diff --git a/crates/engine/tree/benches/execution_cache.rs b/crates/engine/tree/benches/execution_cache.rs deleted file mode 100644 index 11d593d3b57..00000000000 --- a/crates/engine/tree/benches/execution_cache.rs +++ /dev/null @@ -1,371 +0,0 @@ -//! Heavy benchmarks for execution cache performance. -//! -//! Based on #eng-perf discussions about: -//! - moka vs mini-moka cache hit rates -//! - Cache contention under high throughput -//! - Pre-warming effectiveness -//! - 4GB fixed_cache allocation overhead -//! -//! 
Run with: cargo bench -p reth-engine-tree --bench execution_cache - -#![allow(missing_docs)] - -use alloy_primitives::{Address, B256, Bytes, U256}; -use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion, Throughput}; -use mini_moka::sync::CacheBuilder; -use rand::Rng; -use reth_primitives_traits::{Account, Bytecode}; -use revm_primitives::map::DefaultHashBuilder; -use std::{sync::Arc, thread, time::Duration}; - -type Cache = mini_moka::sync::Cache; - -/// Cache configuration matching production settings -struct CacheConfig { - account_cache_size: u64, - storage_cache_size: u64, - code_cache_size: u64, -} - -impl Default for CacheConfig { - fn default() -> Self { - Self { - account_cache_size: 1_000_000, - storage_cache_size: 10_000_000, - code_cache_size: 100_000, - } - } -} - -fn create_caches(config: &CacheConfig) -> (Cache>, Cache<(B256, B256), U256>, Cache>) { - let account_cache = CacheBuilder::new(config.account_cache_size) - .time_to_idle(Duration::from_secs(300)) - .build_with_hasher(DefaultHashBuilder::default()); - - let storage_cache = CacheBuilder::new(config.storage_cache_size) - .time_to_idle(Duration::from_secs(300)) - .build_with_hasher(DefaultHashBuilder::default()); - - let code_cache = CacheBuilder::new(config.code_cache_size) - .time_to_idle(Duration::from_secs(300)) - .build_with_hasher(DefaultHashBuilder::default()); - - (account_cache, storage_cache, code_cache) -} - -/// Benchmark: Cache lookup performance under varying hit rates -fn bench_cache_hit_rates(c: &mut Criterion) { - let mut group = c.benchmark_group("cache/hit_rates"); - group.sample_size(50); - - let config = CacheConfig::default(); - let hit_rates = [0.45, 0.78, 0.90, 0.95]; // 45% baseline, 78% with Half-Path, optimized - - for hit_rate in hit_rates { - let id = format!("hit_rate_{:.0}pct", hit_rate * 100.0); - group.throughput(Throughput::Elements(10000)); - - group.bench_function(BenchmarkId::new("storage_lookups", &id), |b| { - b.iter_with_setup( - || { - 
let (_, storage_cache, _) = create_caches(&config); - let mut rng = rand::rng(); - - // Pre-populate cache with some entries - let cached_keys: Vec<(B256, B256)> = (0..10000) - .map(|_| (B256::random(), B256::random())) - .collect(); - - for (addr, slot) in &cached_keys { - storage_cache.insert((*addr, *slot), U256::from(rng.random::())); - } - - // Create lookup keys - mix of cached and uncached - let num_cached = (10000.0 * hit_rate) as usize; - let mut lookup_keys: Vec<(B256, B256)> = cached_keys[..num_cached].to_vec(); - lookup_keys.extend((0..(10000 - num_cached)).map(|_| (B256::random(), B256::random()))); - - // Shuffle - use rand::seq::SliceRandom; - lookup_keys.shuffle(&mut rng); - - (storage_cache, lookup_keys) - }, - |(cache, keys)| { - let mut hits = 0u64; - let mut misses = 0u64; - for (addr, slot) in keys { - if cache.get(&(addr, slot)).is_some() { - hits += 1; - } else { - misses += 1; - } - } - (hits, misses) - }, - ); - }); - } - - group.finish(); -} - -/// Benchmark: Concurrent cache access (simulating rayon parallel execution) -fn bench_cache_contention(c: &mut Criterion) { - let mut group = c.benchmark_group("cache/contention"); - group.sample_size(20); - - let thread_counts = [1, 4, 8, 16, 32]; // Match rayon thread pool sizes - - for num_threads in thread_counts { - let id = format!("threads_{}", num_threads); - group.throughput(Throughput::Elements(100000)); - - group.bench_function(BenchmarkId::new("concurrent_storage_access", &id), |b| { - b.iter_with_setup( - || { - let config = CacheConfig::default(); - let (_, storage_cache, _) = create_caches(&config); - let storage_cache = Arc::new(storage_cache); - - // Pre-populate with hot data - let mut rng = rand::rng(); - for _ in 0..50000 { - let key = (B256::random(), B256::random()); - storage_cache.insert(key, U256::from(rng.random::())); - } - - storage_cache - }, - |cache| { - let handles: Vec<_> = (0..num_threads) - .map(|_| { - let cache = Arc::clone(&cache); - thread::spawn(move || { - 
let mut rng = rand::rng(); - let ops_per_thread = 100000 / num_threads; - let mut hits = 0u64; - - for _ in 0..ops_per_thread { - let key = (B256::random(), B256::random()); - // 70% reads, 30% writes - if rng.random_bool(0.7) { - if cache.get(&key).is_some() { - hits += 1; - } - } else { - cache.insert(key, U256::from(rng.random::())); - } - } - hits - }) - }) - .collect(); - - let total_hits: u64 = handles.into_iter().map(|h| h.join().unwrap()).sum(); - total_hits - }, - ); - }); - } - - group.finish(); -} - -/// Benchmark: Cache insertion burst (simulating BundleState merge after block) -fn bench_cache_burst_insert(c: &mut Criterion) { - let mut group = c.benchmark_group("cache/burst_insert"); - group.sample_size(20); - - // Sizes from Slack discussions: - // - Normal block: ~2000 storage changes - // - Heavy block: ~10000 storage changes - // - Megablock: ~50000 storage changes - let burst_sizes = [2000, 10000, 50000]; - - for size in burst_sizes { - let id = format!("entries_{}", size); - group.throughput(Throughput::Elements(size as u64)); - - group.bench_function(BenchmarkId::new("storage_burst", &id), |b| { - b.iter_with_setup( - || { - let config = CacheConfig::default(); - let (_, storage_cache, _) = create_caches(&config); - - let mut rng = rand::rng(); - let entries: Vec<((B256, B256), U256)> = (0..size) - .map(|_| { - ((B256::random(), B256::random()), U256::from(rng.random::())) - }) - .collect(); - - (storage_cache, entries) - }, - |(cache, entries)| { - for (key, value) in entries { - cache.insert(key, value); - } - }, - ); - }); - } - - group.finish(); -} - -/// Benchmark: Simulating TIP-20 token transfer patterns (Tempo-specific) -/// These trigger mostly cache misses per Slack discussion -fn bench_tip20_cache_pattern(c: &mut Criterion) { - let mut group = c.benchmark_group("cache/tip20_pattern"); - group.sample_size(20); - - // TIP-20 transfers access unique accounts rarely seen before - let transfer_counts = [100, 500, 1000]; - - for 
num_transfers in transfer_counts { - let id = format!("transfers_{}", num_transfers); - group.throughput(Throughput::Elements(num_transfers as u64)); - - group.bench_function(BenchmarkId::new("unique_account_access", &id), |b| { - b.iter_with_setup( - || { - let config = CacheConfig::default(); - let (account_cache, storage_cache, _) = create_caches(&config); - - // TIP-20: Each transfer accesses sender, recipient, fee token contract - // Most are unique addresses → cache misses - let mut rng = rand::rng(); - let transfer_accounts: Vec<(B256, B256, B256)> = (0..num_transfers) - .map(|_| (B256::random(), B256::random(), B256::random())) - .collect(); - - (account_cache, storage_cache, transfer_accounts) - }, - |(account_cache, storage_cache, transfers)| { - let mut account_misses = 0u64; - let mut storage_misses = 0u64; - - for (sender, recipient, fee_contract) in transfers { - // Check sender account - if account_cache.get(&sender).is_none() { - account_misses += 1; - // Simulate DB lookup and cache population - account_cache.insert(sender, Some(Account::default())); - } - - // Check recipient - if account_cache.get(&recipient).is_none() { - account_misses += 1; - account_cache.insert(recipient, Some(Account::default())); - } - - // Check fee contract storage (balance slot) - let balance_slot = B256::ZERO; - if storage_cache.get(&(fee_contract, balance_slot)).is_none() { - storage_misses += 1; - storage_cache.insert((fee_contract, balance_slot), U256::from(1000)); - } - } - - (account_misses, storage_misses) - }, - ); - }); - } - - group.finish(); -} - -/// Benchmark: Pre-warming effectiveness -fn bench_prewarm_effectiveness(c: &mut Criterion) { - let mut group = c.benchmark_group("cache/prewarm"); - group.sample_size(20); - - let block_sizes = [500, 2000, 5000]; - - for num_accounts in block_sizes { - let id = format!("accounts_{}", num_accounts); - - // Without pre-warming - group.bench_function(BenchmarkId::new("cold_execution", &id), |b| { - b.iter_with_setup( 
- || { - let config = CacheConfig::default(); - let (account_cache, storage_cache, _) = create_caches(&config); - - let mut rng = rand::rng(); - let accounts: Vec = (0..num_accounts).map(|_| B256::random()).collect(); - - (account_cache, storage_cache, accounts) - }, - |(account_cache, storage_cache, accounts)| { - let mut misses = 0u64; - for addr in &accounts { - if account_cache.get(addr).is_none() { - misses += 1; - // Simulate expensive DB lookup - std::hint::black_box(0u64); - account_cache.insert(*addr, Some(Account::default())); - } - // Access storage - let slot = B256::ZERO; - if storage_cache.get(&(*addr, slot)).is_none() { - std::hint::black_box(0u64); - storage_cache.insert((*addr, slot), U256::ZERO); - } - } - misses - }, - ); - }); - - // With pre-warming - group.bench_function(BenchmarkId::new("warm_execution", &id), |b| { - b.iter_with_setup( - || { - let config = CacheConfig::default(); - let (account_cache, storage_cache, _) = create_caches(&config); - - let mut rng = rand::rng(); - let accounts: Vec = (0..num_accounts).map(|_| B256::random()).collect(); - - // Pre-warm the cache - for addr in &accounts { - account_cache.insert(*addr, Some(Account::default())); - storage_cache.insert((*addr, B256::ZERO), U256::ZERO); - } - - (account_cache, storage_cache, accounts) - }, - |(account_cache, storage_cache, accounts)| { - let mut hits = 0u64; - for addr in &accounts { - if account_cache.get(addr).is_some() { - hits += 1; - } - let slot = B256::ZERO; - if storage_cache.get(&(*addr, slot)).is_some() { - hits += 1; - } - } - hits - }, - ); - }); - } - - group.finish(); -} - -criterion_group!( - name = execution_cache; - config = Criterion::default().significance_level(0.05).sample_size(20); - targets = - bench_cache_hit_rates, - bench_cache_contention, - bench_cache_burst_insert, - bench_tip20_cache_pattern, - bench_prewarm_effectiveness -); -criterion_main!(execution_cache); diff --git a/crates/engine/tree/benches/heavy_persistence.rs 
b/crates/engine/tree/benches/heavy_persistence.rs deleted file mode 100644 index 632998af786..00000000000 --- a/crates/engine/tree/benches/heavy_persistence.rs +++ /dev/null @@ -1,308 +0,0 @@ -//! Heavy benchmarks targeting persistence bottlenecks identified in #eng-perf. -//! -//! Key bottlenecks from profiling (Jan 2026): -//! - update_history_indices: 26.0% of persist time -//! - write_trie_updates: 25.4% -//! - write_trie_changesets: 24.2% -//! - write_state: 13.8% -//! - write_hashed_state: 10.6% -//! -//! Run with: cargo bench -p reth-engine-tree --bench heavy_persistence - -#![allow(missing_docs)] - -use alloy_primitives::{Address, B256, U256}; -use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion, Throughput}; -use proptest::test_runner::TestRunner; -use rand::Rng; -use reth_chainspec::ChainSpec; -use reth_db_common::init::init_genesis; -use reth_primitives_traits::Account as RethAccount; -use reth_provider::{ - test_utils::create_test_provider_factory_with_chain_spec, - HistoryWriter, StateWriter, TrieWriter, -}; -use reth_trie::{HashedPostState, HashedStorage, StateRoot}; -use reth_trie_db::DatabaseStateRoot; -use revm_primitives::HashMap; -use std::sync::Arc; - -/// Benchmark parameters simulating realistic block sizes -#[derive(Debug, Clone)] -struct PersistenceParams { - /// Number of accounts modified per block - accounts_per_block: usize, - /// Storage slots modified per account - storage_slots_per_account: usize, - /// Number of blocks to accumulate before persistence - blocks_accumulated: usize, -} - -impl PersistenceParams { - fn total_state_changes(&self) -> usize { - self.accounts_per_block * self.blocks_accumulated - } - - fn total_storage_changes(&self) -> usize { - self.total_state_changes() * self.storage_slots_per_account - } -} - -/// Generate realistic state changes simulating high-TPS block execution -fn generate_state_changes(params: &PersistenceParams) -> Vec<(HashedPostState, Vec)> { - let mut runner = 
TestRunner::deterministic(); - let mut rng = runner.rng().clone(); - let mut blocks = Vec::with_capacity(params.blocks_accumulated); - - for _ in 0..params.blocks_accumulated { - let mut hashed_state = HashedPostState::default(); - let mut account_addresses = Vec::with_capacity(params.accounts_per_block); - - for _ in 0..params.accounts_per_block { - let address = Address::random_with(&mut rng); - let hashed_address = alloy_primitives::keccak256(address); - account_addresses.push(hashed_address); - - let account = RethAccount { - balance: U256::from(rng.random::()), - nonce: rng.random::(), - bytecode_hash: if rng.random_bool(0.1) { Some(B256::random()) } else { None }, - }; - - hashed_state = hashed_state.with_accounts(std::iter::once((hashed_address, Some(account)))); - - let storage: HashMap = (0..params.storage_slots_per_account) - .map(|_| (B256::random_with(&mut rng), U256::from(rng.random::()))) - .collect(); - - hashed_state = hashed_state.with_storages(std::iter::once(( - hashed_address, - HashedStorage::from_iter(false, storage), - ))); - } - - blocks.push((hashed_state, account_addresses)); - } - - blocks -} - -/// Benchmark: write_hashed_state performance with varying state sizes -/// Targets the 10.6% bottleneck -fn bench_write_hashed_state(c: &mut Criterion) { - let mut group = c.benchmark_group("persistence/write_hashed_state"); - group.sample_size(10); - - // Scenarios from Slack discussions: - // - Normal block: ~500 accounts, ~10 slots each - // - Heavy DeFi block: ~2000 accounts, ~50 slots each - // - Megablock (1.5 GGas): ~5000 accounts, ~100 slots each - let scenarios = vec![ - PersistenceParams { accounts_per_block: 500, storage_slots_per_account: 10, blocks_accumulated: 1 }, - PersistenceParams { accounts_per_block: 2000, storage_slots_per_account: 50, blocks_accumulated: 1 }, - PersistenceParams { accounts_per_block: 5000, storage_slots_per_account: 100, blocks_accumulated: 1 }, - ]; - - for params in scenarios { - let id = format!( - 
"accounts_{}_slots_{}", - params.accounts_per_block, params.storage_slots_per_account - ); - group.throughput(Throughput::Elements(params.total_storage_changes() as u64)); - - group.bench_function(BenchmarkId::new("single_block", &id), |b| { - b.iter_with_setup( - || { - let factory = create_test_provider_factory_with_chain_spec(Arc::new(ChainSpec::default())); - let _ = init_genesis(&factory).unwrap(); - let blocks = generate_state_changes(¶ms); - (factory, blocks) - }, - |(factory, blocks)| { - let provider_rw = factory.provider_rw().unwrap(); - for (hashed_state, _) in blocks { - provider_rw.write_hashed_state(&hashed_state.into_sorted()).unwrap(); - } - provider_rw.commit().unwrap(); - }, - ); - }); - } - - group.finish(); -} - -/// Benchmark: Accumulated block persistence (back-to-back scenario) -/// This simulates the O(N²) overlay merge problem identified in Slack -fn bench_accumulated_persistence(c: &mut Criterion) { - let mut group = c.benchmark_group("persistence/accumulated_blocks"); - group.sample_size(10); - - // Simulate the "backpressure flywheel" problem: - // Higher throughput → more blocks accumulate → longer persist time - let scenarios = vec![ - // Normal: ~75 blocks accumulated (from Slack discussion) - PersistenceParams { accounts_per_block: 200, storage_slots_per_account: 20, blocks_accumulated: 75 }, - // Heavy backpressure: ~250 blocks accumulated - PersistenceParams { accounts_per_block: 200, storage_slots_per_account: 20, blocks_accumulated: 250 }, - ]; - - for params in scenarios { - let id = format!("blocks_{}_accounts_{}", params.blocks_accumulated, params.accounts_per_block); - group.throughput(Throughput::Elements(params.blocks_accumulated as u64)); - - group.bench_function(BenchmarkId::new("overlay_merge", &id), |b| { - b.iter_with_setup( - || { - let factory = create_test_provider_factory_with_chain_spec(Arc::new(ChainSpec::default())); - let _ = init_genesis(&factory).unwrap(); - let blocks = generate_state_changes(¶ms); - 
(factory, blocks) - }, - |(factory, blocks)| { - let provider_rw = factory.provider_rw().unwrap(); - - // Simulate merging all overlays - this is where O(N²) happens - let mut merged = HashedPostState::default(); - for (hashed_state, _) in blocks { - merged.extend(hashed_state); - } - - provider_rw.write_hashed_state(&merged.into_sorted()).unwrap(); - provider_rw.commit().unwrap(); - }, - ); - }); - } - - group.finish(); -} - -/// Benchmark: State root calculation after persistence -/// Targets the 25.4% write_trie_updates + 24.2% write_trie_changesets -fn bench_state_root_after_persist(c: &mut Criterion) { - let mut group = c.benchmark_group("persistence/state_root"); - group.sample_size(10); - - let scenarios = vec![ - PersistenceParams { accounts_per_block: 1000, storage_slots_per_account: 20, blocks_accumulated: 1 }, - PersistenceParams { accounts_per_block: 5000, storage_slots_per_account: 50, blocks_accumulated: 1 }, - ]; - - for params in scenarios { - let id = format!("accounts_{}_slots_{}", params.accounts_per_block, params.storage_slots_per_account); - - group.bench_function(BenchmarkId::new("full_root_calculation", &id), |b| { - b.iter_with_setup( - || { - let factory = create_test_provider_factory_with_chain_spec(Arc::new(ChainSpec::default())); - let _ = init_genesis(&factory).unwrap(); - let blocks = generate_state_changes(¶ms); - - // Pre-populate state - { - let provider_rw = factory.provider_rw().unwrap(); - for (hashed_state, _) in &blocks { - provider_rw.write_hashed_state(&hashed_state.clone().into_sorted()).unwrap(); - } - provider_rw.commit().unwrap(); - } - - (factory, blocks) - }, - |(factory, _blocks)| { - let provider = factory.provider().unwrap(); - let (root, updates) = StateRoot::from_tx(provider.tx_ref()) - .root_with_updates() - .unwrap(); - - // Write trie updates - this is where 25.4% of time goes - let provider_rw = factory.provider_rw().unwrap(); - provider_rw.write_trie_updates(updates).unwrap(); - provider_rw.commit().unwrap(); 
- - root - }, - ); - }); - } - - group.finish(); -} - -/// Benchmark: History indices update performance -/// Targets the 26.0% update_history_indices bottleneck -fn bench_history_indices(c: &mut Criterion) { - let mut group = c.benchmark_group("persistence/history_indices"); - group.sample_size(10); - - // From Slack: The fix is to derive transitions from in-memory ExecutionOutcome - // instead of scanning AccountChangeSets/StorageChangeSets tables - let scenarios = vec![ - PersistenceParams { accounts_per_block: 500, storage_slots_per_account: 10, blocks_accumulated: 10 }, - PersistenceParams { accounts_per_block: 1000, storage_slots_per_account: 20, blocks_accumulated: 50 }, - ]; - - for params in scenarios { - let id = format!( - "blocks_{}_accounts_{}_slots_{}", - params.blocks_accumulated, params.accounts_per_block, params.storage_slots_per_account - ); - group.throughput(Throughput::Elements(params.total_state_changes() as u64)); - - group.bench_function(BenchmarkId::new("insert_indices", &id), |b| { - b.iter_with_setup( - || { - let factory = create_test_provider_factory_with_chain_spec(Arc::new(ChainSpec::default())); - let _ = init_genesis(&factory).unwrap(); - - // Build history index data structure (simulating in-memory derivation) - // Use Address type for account transitions as required by HistoryWriter - let mut account_transitions: std::collections::BTreeMap> = - std::collections::BTreeMap::new(); - let mut storage_transitions: std::collections::BTreeMap<(Address, B256), Vec> = - std::collections::BTreeMap::new(); - - let mut rng = rand::rng(); - for block_idx in 0..params.blocks_accumulated { - let block_number = block_idx as u64 + 1; - for _ in 0..params.accounts_per_block { - let address = Address::random_with(&mut rng); - account_transitions.entry(address).or_default().push(block_number); - // Add some storage transitions - for i in 0..params.storage_slots_per_account { - let slot = B256::from(U256::from(i)); - 
storage_transitions.entry((address, slot)).or_default().push(block_number); - } - } - } - - (factory, account_transitions, storage_transitions) - }, - |(factory, account_transitions, storage_transitions)| { - let provider_rw = factory.provider_rw().unwrap(); - - // This simulates the optimized path: insert_account_history_index - // and insert_storage_history_index from in-memory data - provider_rw.insert_account_history_index(account_transitions).unwrap(); - provider_rw.insert_storage_history_index(storage_transitions).unwrap(); - - provider_rw.commit().unwrap(); - }, - ); - }); - } - - group.finish(); -} - -criterion_group!( - name = heavy_persistence; - config = Criterion::default().significance_level(0.05).sample_size(10); - targets = - bench_write_hashed_state, - bench_accumulated_persistence, - bench_state_root_after_persist, - bench_history_indices -); -criterion_main!(heavy_persistence); diff --git a/crates/trie/parallel/Cargo.toml b/crates/trie/parallel/Cargo.toml index fad1f145a2f..9fb882b44a5 100644 --- a/crates/trie/parallel/Cargo.toml +++ b/crates/trie/parallel/Cargo.toml @@ -69,7 +69,3 @@ test-utils = [ [[bench]] name = "root" harness = false - -[[bench]] -name = "heavy_root" -harness = false diff --git a/crates/trie/parallel/benches/heavy_root.rs b/crates/trie/parallel/benches/heavy_root.rs deleted file mode 100644 index 27158977ee4..00000000000 --- a/crates/trie/parallel/benches/heavy_root.rs +++ /dev/null @@ -1,330 +0,0 @@ -//! Heavy benchmarks for parallel state root calculation. -//! -//! Based on #eng-perf profiling showing: -//! - State root calculation is 50-80% of validation time for 3s blocks -//! - Parallel vs sync root has significant delta at scale -//! - Sparse trie updates vs full recalculation trade-offs -//! -//! 
Run with: cargo bench -p reth-trie-parallel --bench heavy_root - -#![allow(missing_docs, unreachable_pub)] - -use alloy_primitives::{B256, U256}; -use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion, Throughput}; -use proptest::{prelude::*, strategy::ValueTree, test_runner::TestRunner}; -use proptest_arbitrary_interop::arb; -use reth_primitives_traits::Account; -use reth_provider::{ - providers::OverlayStateProviderFactory, test_utils::create_test_provider_factory, StateWriter, - TrieWriter, -}; -use reth_trie::{ - hashed_cursor::HashedPostStateCursorFactory, HashedPostState, HashedStorage, StateRoot, - TrieInput, -}; -use reth_trie_db::{ChangesetCache, DatabaseHashedCursorFactory, DatabaseStateRoot}; -use reth_trie_parallel::root::ParallelStateRoot; -use std::collections::HashMap; - -/// Benchmark parameters for megablock scenarios -#[derive(Debug, Clone)] -struct StateRootParams { - /// Total accounts in database - db_accounts: usize, - /// Storage slots per account - storage_per_account: usize, - /// Percentage of accounts updated - update_percentage: f64, -} - -impl StateRootParams { - fn updated_accounts(&self) -> usize { - (self.db_accounts as f64 * self.update_percentage) as usize - } -} - -fn generate_heavy_test_data(params: &StateRootParams) -> (HashedPostState, HashedPostState) { - let mut runner = TestRunner::deterministic(); - - let db_state = proptest::collection::hash_map( - any::(), - ( - arb::().prop_filter("non empty account", |a| !a.is_empty()), - proptest::collection::hash_map( - any::(), - any::().prop_filter("non zero value", |v| !v.is_zero()), - params.storage_per_account, - ), - ), - params.db_accounts, - ) - .new_tree(&mut runner) - .unwrap() - .current(); - - let keys = db_state.keys().copied().collect::>(); - let num_updates = params.updated_accounts(); - let keys_to_update = proptest::sample::subsequence(keys, num_updates) - .new_tree(&mut runner) - .unwrap() - .current(); - - let updated_storages = keys_to_update - 
.into_iter() - .map(|address| { - let (_, storage) = db_state.get(&address).unwrap(); - let slots = storage.keys().copied().collect::>(); - let slots_to_update = proptest::sample::subsequence(slots, params.storage_per_account / 2) - .new_tree(&mut runner) - .unwrap() - .current(); - ( - address, - slots_to_update - .into_iter() - .map(|slot| (slot, any::().new_tree(&mut runner).unwrap().current())) - .collect::>(), - ) - }) - .collect::>(); - - ( - HashedPostState::default() - .with_accounts( - db_state.iter().map(|(address, (account, _))| (*address, Some(*account))), - ) - .with_storages(db_state.into_iter().map(|(address, (_, storage))| { - (address, HashedStorage::from_iter(false, storage)) - })), - HashedPostState::default().with_storages( - updated_storages - .into_iter() - .map(|(address, storage)| (address, HashedStorage::from_iter(false, storage))), - ), - ) -} - -/// Benchmark: Sync vs Parallel state root at various scales -fn bench_sync_vs_parallel(c: &mut Criterion) { - let mut group = c.benchmark_group("state_root/sync_vs_parallel"); - group.sample_size(10); - - // Scenarios based on Slack discussions: - // - Normal block: 3000 accounts - // - Heavy block: 10000 accounts - // - Megablock (1.5 GGas): 30000+ accounts - let scenarios = vec![ - StateRootParams { db_accounts: 3000, storage_per_account: 100, update_percentage: 0.5 }, - StateRootParams { db_accounts: 10000, storage_per_account: 100, update_percentage: 0.3 }, - StateRootParams { db_accounts: 30000, storage_per_account: 50, update_percentage: 0.2 }, - ]; - - for params in scenarios { - let (db_state, updated_state) = generate_heavy_test_data(¶ms); - let provider_factory = create_test_provider_factory(); - - // Setup: write initial state - { - let provider_rw = provider_factory.provider_rw().unwrap(); - provider_rw.write_hashed_state(&db_state.into_sorted()).unwrap(); - let (_, updates) = StateRoot::from_tx(provider_rw.tx_ref()).root_with_updates().unwrap(); - 
provider_rw.write_trie_updates(updates).unwrap(); - provider_rw.commit().unwrap(); - } - - let id = format!( - "db_{}_updated_{}", - params.db_accounts, - params.updated_accounts() - ); - - let changeset_cache = ChangesetCache::new(); - let factory = OverlayStateProviderFactory::new(provider_factory.clone(), changeset_cache); - - // Sync state root - group.bench_function(BenchmarkId::new("sync", &id), |b| { - b.iter_with_setup( - || { - let sorted_state = updated_state.clone().into_sorted(); - let prefix_sets = updated_state.construct_prefix_sets().freeze(); - let provider = provider_factory.provider().unwrap(); - (provider, sorted_state, prefix_sets) - }, - |(provider, sorted_state, prefix_sets)| { - let hashed_cursor_factory = HashedPostStateCursorFactory::new( - DatabaseHashedCursorFactory::new(provider.tx_ref()), - &sorted_state, - ); - StateRoot::from_tx(provider.tx_ref()) - .with_hashed_cursor_factory(hashed_cursor_factory) - .with_prefix_sets(prefix_sets) - .root() - }, - ); - }); - - // Parallel state root - group.bench_function(BenchmarkId::new("parallel", &id), |b| { - b.iter_with_setup( - || { - let trie_input = TrieInput::from_state(updated_state.clone()); - ParallelStateRoot::new(factory.clone(), trie_input.prefix_sets.freeze()) - }, - |calculator| calculator.incremental_root(), - ); - }); - } - - group.finish(); -} - -/// Benchmark: Incremental updates (sparse trie) at scale -fn bench_incremental_updates(c: &mut Criterion) { - let mut group = c.benchmark_group("state_root/incremental"); - group.sample_size(10); - - // Test repeated incremental updates (simulating back-to-back blocks) - let num_updates_sequence = [5, 10, 25, 50]; - - for num_updates in num_updates_sequence { - let params = StateRootParams { - db_accounts: 5000, - storage_per_account: 50, - update_percentage: 0.1, - }; - - let id = format!("sequential_updates_{}", num_updates); - group.throughput(Throughput::Elements(num_updates as u64)); - - 
group.bench_function(BenchmarkId::new("sparse_trie", &id), |b| { - b.iter_with_setup( - || { - let (db_state, _) = generate_heavy_test_data(¶ms); - let provider_factory = create_test_provider_factory(); - - { - let provider_rw = provider_factory.provider_rw().unwrap(); - provider_rw.write_hashed_state(&db_state.into_sorted()).unwrap(); - let (_, updates) = - StateRoot::from_tx(provider_rw.tx_ref()).root_with_updates().unwrap(); - provider_rw.write_trie_updates(updates).unwrap(); - provider_rw.commit().unwrap(); - } - - // Generate sequence of updates - let updates: Vec = (0..num_updates) - .map(|_| { - let (_, update) = generate_heavy_test_data(&StateRootParams { - db_accounts: 500, - storage_per_account: 20, - update_percentage: 1.0, - }); - update - }) - .collect(); - - (provider_factory, updates) - }, - |(provider_factory, updates)| { - let changeset_cache = ChangesetCache::new(); - let factory = OverlayStateProviderFactory::new(provider_factory, changeset_cache); - - let mut roots = Vec::with_capacity(updates.len()); - for update in updates { - let trie_input = TrieInput::from_state(update); - let calculator = - ParallelStateRoot::new(factory.clone(), trie_input.prefix_sets.freeze()); - roots.push(calculator.incremental_root().unwrap()); - } - roots - }, - ); - }); - } - - group.finish(); -} - -/// Benchmark: Large storage trie updates (contract-heavy blocks) -fn bench_large_storage_tries(c: &mut Criterion) { - let mut group = c.benchmark_group("state_root/large_storage"); - group.sample_size(10); - - // Simulate contracts with large storage (DEX, AMM, etc.) 
- let storage_sizes = [1000, 5000, 10000]; - - for storage_size in storage_sizes { - let id = format!("slots_{}", storage_size); - group.throughput(Throughput::Elements(storage_size as u64)); - - group.bench_function(BenchmarkId::new("single_contract", &id), |b| { - b.iter_with_setup( - || { - let mut runner = TestRunner::deterministic(); - let contract_address = any::().new_tree(&mut runner).unwrap().current(); - - let storage: HashMap = proptest::collection::hash_map( - any::(), - any::().prop_filter("non zero", |v| !v.is_zero()), - storage_size, - ) - .new_tree(&mut runner) - .unwrap() - .current(); - - let db_state = HashedPostState::default() - .with_accounts(std::iter::once((contract_address, Some(Account::default())))) - .with_storages(std::iter::once(( - contract_address, - HashedStorage::from_iter(false, storage.clone()), - ))); - - let provider_factory = create_test_provider_factory(); - { - let provider_rw = provider_factory.provider_rw().unwrap(); - provider_rw.write_hashed_state(&db_state.into_sorted()).unwrap(); - let (_, updates) = - StateRoot::from_tx(provider_rw.tx_ref()).root_with_updates().unwrap(); - provider_rw.write_trie_updates(updates).unwrap(); - provider_rw.commit().unwrap(); - } - - // Update half the storage - let update_storage: HashMap = storage - .into_iter() - .take(storage_size / 2) - .map(|(k, _)| (k, U256::from(999))) - .collect(); - - let update_state = HashedPostState::default().with_storages(std::iter::once(( - contract_address, - HashedStorage::from_iter(false, update_storage), - ))); - - (provider_factory, update_state) - }, - |(provider_factory, update_state)| { - let changeset_cache = ChangesetCache::new(); - let factory = OverlayStateProviderFactory::new(provider_factory, changeset_cache); - - let trie_input = TrieInput::from_state(update_state); - let calculator = - ParallelStateRoot::new(factory, trie_input.prefix_sets.freeze()); - calculator.incremental_root() - }, - ); - }); - } - - group.finish(); -} - 
-criterion_group!( - name = heavy_root; - config = Criterion::default().significance_level(0.05).sample_size(10); - targets = - bench_sync_vs_parallel, - bench_incremental_updates, - bench_large_storage_tries -); -criterion_main!(heavy_root); diff --git a/docs/perf/OPTIMIZATION_OPPORTUNITIES.md b/docs/perf/OPTIMIZATION_OPPORTUNITIES.md deleted file mode 100644 index 5c7354854e4..00000000000 --- a/docs/perf/OPTIMIZATION_OPPORTUNITIES.md +++ /dev/null @@ -1,170 +0,0 @@ -# Reth/Tempo Performance Optimization Opportunities - -Based on #eng-perf and #ai-agent Slack channel analysis (Jan 2026). - -## Current Bottleneck Analysis - -From profiling sessions, the persistence phase breakdown is: - -| Component | % Time | Priority | -|-----------|--------|----------| -| `update_history_indices` | 26.0% | **Critical** | -| `write_trie_updates` | 25.4% | High | -| `write_trie_changesets` | 24.2% | High | -| `write_state` | 13.8% | Medium | -| `write_hashed_state` | 10.6% | Medium | - -## Critical Optimizations - -### 1. History Indices from In-Memory Data (26% improvement potential) - -**Problem**: Currently scans `AccountChangeSets`/`StorageChangeSets` tables via DB cursors. 
- -**Solution**: Derive transitions directly from in-memory `ExecutionOutcome`: - -```rust -// Before the loop, accumulate transitions: -let mut account_transitions: BTreeMap> = BTreeMap::new(); -let mut storage_transitions: BTreeMap<(Address, B256), Vec> = BTreeMap::new(); - -// Inside the per-block loop, extract from execution_outcome.bundle.reverts -for (block_idx, block_reverts) in execution_output.bundle.reverts.iter().enumerate() { - let block_number = execution_output.first_block() + block_idx as u64; - for (address, account_revert) in block_reverts { - account_transitions.entry(*address).or_default().push(block_number); - for storage_key in account_revert.storage.keys() { - let key = B256::new(storage_key.to_be_bytes()); - storage_transitions.entry((*address, key)).or_default().push(block_number); - } - } -} - -// After loop, replace update_history_indices(range) with: -self.insert_account_history_index(account_transitions)?; -self.insert_storage_history_index(storage_transitions)?; -``` - -**Benchmark**: `cargo bench -p reth-engine-tree --bench heavy_persistence -- history_indices` - -### 2. Batch Trie Writes Across Blocks (~50% improvement for b2b) - -**Problem**: `write_trie_changesets` and `write_trie_updates_sorted` called per-block. - -**Solution**: Accumulate overlay across blocks, write once at end: - -```rust -let mut trie_overlay = TrieUpdatesSorted::default(); - -for block in blocks { - self.write_trie_changesets(block_number, &trie_data.trie_updates, Some(&trie_overlay))?; - trie_overlay.extend(&trie_data.trie_updates); -} -// Single write at end -self.write_trie_updates_sorted(&trie_overlay)?; -``` - -**Benchmark**: `cargo bench -p reth-engine-tree --bench heavy_persistence -- accumulated` - -### 3. O(N²) Overlay Merge Fix (PR #20774) - -**Problem**: `wait_cloned()` iterates through ALL ancestors for each block → O(N²) complexity. 
- -**Results achieved**: -- p50: -8.40% -- p90: -42.46% -- p99: -60.30% -- Gas/Second: +73.34% - -**Benchmark**: `cargo bench -p reth-engine-tree --bench heavy_persistence -- overlay_merge` - -## High Priority Optimizations - -### 4. MDBX Configuration Tuning - -**Problem**: msync taking seconds on new reth boxes (99% of samples in `mdbx_txn_commit_ex`). - -**Potential solutions**: -- Expose `txn_dp_limit` - forces dirty pages to spill during transaction -- Expose `sync_bytes`/`sync_period` - triggers intermediate flushes -- Lower `spill_max_denominator` for more aggressive spilling - -```rust -// Already in libmdbx-rs but not exposed to CLI -mdbx_env_set_syncbytes(env, 100*1024*1024); // Flush every 100MB -mdbx_env_set_syncperiod(env, 16384); // Or every ~0.25s -``` - -### 5. Execution Cache Improvements - -**Problem**: moka cache contention under high throughput, expensive cache misses. - -**Findings**: -- TIP-20 transfers trigger mostly cache misses (unique accounts) -- fixed_cache (4GB) allocation overhead when cache misses -- Pre-warming effectiveness varies - -**Benchmark**: `cargo bench -p reth-engine-tree --bench execution_cache` - -**Key metrics to track**: -- Cache hit rate (baseline 45%, target 78%+ like Half-Path) -- Contention under 8-32 threads -- Burst insert latency - -### 6. Parallel State Root Scaling - -**Problem**: State root calculation is 50-80% of validation time for large blocks. - -**Benchmark scenarios**: -- Normal block: 3,000 accounts -- Heavy block: 10,000 accounts -- Megablock (1.5 GGas): 30,000+ accounts - -**Benchmark**: `cargo bench -p reth-trie-parallel --bench heavy_root` - -## Medium Priority Optimizations - -### 7. Pre-sort Storage Tries Once - -Currently sorted every call at multiple locations: -```rust -let mut storage_updates = trie_updates.storage_tries_ref().iter().collect::>(); -storage_updates.sort_unstable_by(|a, b| a.0.cmp(b.0)); -``` - -Could use `BTreeMap` internally or pre-sorted Vec in `TrieUpdatesSorted`. - -### 8. 
Remove Expensive HashMap Clone - -Recent change introduced a clone showing up in profiles (3s block profile). - -## Benchmarking Commands - -```bash -# Run all heavy benchmarks on an idle box -./scripts/bench-heavy.sh ./results-$(date +%Y%m%d) - -# Run specific benchmark groups -cargo bench -p reth-engine-tree --bench execution_cache -cargo bench -p reth-engine-tree --bench heavy_persistence -cargo bench -p reth-trie-parallel --bench heavy_root - -# Compare against baseline -cargo bench -p reth-engine-tree --bench execution_cache -- --baseline heavy-cache - -# Profile with samply -samply record -- cargo bench -p reth-engine-tree --bench execution_cache -- cache/contention -``` - -## Profiling Resources - -- [How to benchmark and profile Reth](https://www.notion.so/How-to-benchmark-and-profile-Reth-21532f2c34848058a2f6efc5f852603d) -- [Perf onboarding doc](https://docs.google.com/document/d/1pgbWk6wjd3p3oGy2SC2mWiGAvzlcIoiB-8g20fm6Acc) -- Firefox Profiler compare: https://profiler.firefox.com/compare/ - -## Related PRs - -- [#20774](https://github.com/paradigmxyz/reth/pull/20774) - Overlay reuse optimization -- [#20616](https://github.com/paradigmxyz/reth/pull/20616) - Subscribe to persisted block -- [#20520](https://github.com/paradigmxyz/reth/pull/20520) - fixed_cache execution cache -- [#20405](https://github.com/paradigmxyz/reth/pull/20405) - Defer transaction pool notifications -- [#20398](https://github.com/paradigmxyz/reth/pull/20398) - Use RwLock for transaction pool listeners diff --git a/scripts/bench-heavy.sh b/scripts/bench-heavy.sh deleted file mode 100755 index d4e086018c6..00000000000 --- a/scripts/bench-heavy.sh +++ /dev/null @@ -1,72 +0,0 @@ -#!/bin/bash -# Heavy Performance Benchmarks for Reth/Tempo -# -# Based on #eng-perf Slack discussions identifying key bottlenecks: -# - update_history_indices: 26% of persist time -# - write_trie_updates: 25.4% -# - write_trie_changesets: 24.2% -# - Execution cache contention under high throughput -# -# Run 
this on an idle reth box (check with: reth-box-status) -# -# Usage: ./scripts/bench-heavy.sh [output_dir] - -set -euo pipefail - -OUTPUT_DIR="${1:-./benchmark-results}" -mkdir -p "$OUTPUT_DIR" - -echo "=============================================" -echo "Heavy Performance Benchmarks for Reth/Tempo" -echo "=============================================" -echo "Output directory: $OUTPUT_DIR" -echo "Started at: $(date -u +%Y-%m-%dT%H:%M:%SZ)" -echo "" - -# Build with optimizations -echo "[1/5] Building benchmarks with release profile..." -cargo build --release --benches -p reth-engine-tree -p reth-trie-parallel -p reth-trie-sparse - -# Run execution cache benchmarks -echo "" -echo "[2/5] Running execution cache benchmarks..." -echo " - Cache hit rate simulations (45%, 78%, 90%, 95%)" -echo " - Concurrent access contention (1-32 threads)" -echo " - Burst insert patterns (2K-50K entries)" -echo " - TIP-20 transfer patterns" -echo " - Pre-warming effectiveness" -cargo bench -p reth-engine-tree --bench execution_cache -- --save-baseline heavy-cache 2>&1 | tee "$OUTPUT_DIR/execution_cache.log" - -# Run heavy persistence benchmarks -echo "" -echo "[3/5] Running heavy persistence benchmarks..." -echo " - write_hashed_state with 500-5000 accounts" -echo " - Accumulated block persistence (75-250 blocks)" -echo " - State root calculation after persistence" -echo " - History indices insertion" -cargo bench -p reth-engine-tree --bench heavy_persistence -- --save-baseline heavy-persist 2>&1 | tee "$OUTPUT_DIR/heavy_persistence.log" - -# Run parallel state root benchmarks -echo "" -echo "[4/5] Running parallel state root benchmarks..." 
-echo " - Sync vs Parallel root (3K-30K accounts)" -echo " - Incremental updates (5-50 sequential)" -echo " - Large storage tries (1K-10K slots)" -cargo bench -p reth-trie-parallel --bench heavy_root -- --save-baseline heavy-root 2>&1 | tee "$OUTPUT_DIR/heavy_root.log" - -# Run existing state root task benchmarks -echo "" -echo "[5/5] Running state root task benchmarks..." -cargo bench -p reth-engine-tree --bench state_root_task -- --save-baseline state-root 2>&1 | tee "$OUTPUT_DIR/state_root_task.log" - -echo "" -echo "=============================================" -echo "Benchmarks completed at: $(date -u +%Y-%m-%dT%H:%M:%SZ)" -echo "Results saved to: $OUTPUT_DIR" -echo "" -echo "To compare baselines later:" -echo " cargo bench -p reth-engine-tree --bench execution_cache -- --baseline heavy-cache" -echo "" -echo "To generate HTML reports:" -echo " open target/criterion/*/report/index.html" -echo "============================================="