diff --git a/.github/scripts/check-hive-results.sh b/.github/scripts/check-hive-results.sh index 85b05d46136..aac7b200bb6 100755 --- a/.github/scripts/check-hive-results.sh +++ b/.github/scripts/check-hive-results.sh @@ -63,6 +63,7 @@ KNOWN_FLAKY_TESTS=( "Invalid Missing Ancestor Syncing ReOrg, Timestamp, EmptyTxs=False, CanonicalReOrg=False, Invalid P8" "Invalid Missing Ancestor Syncing ReOrg, Timestamp, EmptyTxs=False, CanonicalReOrg=True, Invalid P8" "Invalid Missing Ancestor Syncing ReOrg, Transaction Value, EmptyTxs=False, CanonicalReOrg=False, Invalid P9" + "Invalid Missing Ancestor Syncing ReOrg, Transaction Nonce, EmptyTxs=False, CanonicalReOrg=True, Invalid P9" ) # Build a jq filter that excludes known-flaky tests. diff --git a/CHANGELOG.md b/CHANGELOG.md index 522dcdc9877..58fef1f6a3f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,10 @@ ## Perf +### 2026-03-03 + +- Add bloom filter to skip trie seeks for non-existent storage slots [#6288](https://github.com/lambdaclass/ethrex/pull/6288) + ### 2026-03-02 - SIMD-accelerate trie nibble operations for block execution [#6286](https://github.com/lambdaclass/ethrex/pull/6286) diff --git a/crates/storage/bloom.rs b/crates/storage/bloom.rs new file mode 100644 index 00000000000..d90abf7dd5e --- /dev/null +++ b/crates/storage/bloom.rs @@ -0,0 +1,137 @@ +use std::fmt; +use std::sync::OnceLock; +use std::sync::atomic::{AtomicBool, Ordering}; + +use ethrex_common::{Address, H256}; +use fastbloom::AtomicBloomFilter; +use rustc_hash::FxBuildHasher; + +const FALSE_POSITIVE_RATE: f64 = 0.01; + +/// Bloom filter that tracks which (address, storage_key) pairs have non-zero +/// storage values. Used to skip expensive trie lookups for slots that were +/// never written to. +/// +/// The filter is allocated lazily on first `insert()` to avoid ~240MB of +/// upfront memory when the bloom is never used (e.g., dev mode, testnets). 
+pub struct StorageBloomFilter {
+    /// Backing filter; stays unallocated until the first `insert()`.
+    filter: OnceLock<AtomicBloomFilter<FxBuildHasher>>,
+    /// Expected number of items, used to size the filter when it is allocated.
+    capacity: usize,
+    /// While `false`, `might_contain` is a pass-through that always returns `true`.
+    enabled: AtomicBool,
+}
+
+impl fmt::Debug for StorageBloomFilter {
+    /// Prints the filter's configuration and state rather than its bit array.
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.debug_struct("StorageBloomFilter")
+            .field("capacity", &self.capacity)
+            .field("enabled", &self.enabled.load(Ordering::Acquire))
+            .field("allocated", &self.filter.get().is_some())
+            .finish()
+    }
+}
+
+impl StorageBloomFilter {
+    /// Creates a disabled filter sized for `capacity` expected items.
+    ///
+    /// Nothing is allocated here; the backing filter is built on the first
+    /// `insert()`.
+    pub fn new(capacity: usize) -> Self {
+        Self {
+            filter: OnceLock::new(),
+            capacity,
+            enabled: AtomicBool::new(false),
+        }
+    }
+
+    /// Activate the bloom filter after it has been populated.
+    /// Before this is called, `might_contain` always returns `true` (pass-through).
+    ///
+    /// # Precondition
+    ///
+    /// The filter MUST have been fully populated (via `insert`) for ALL
+    /// (address, storage_key) pairs that exist in the trie before this is
+    /// called. This includes genesis slots, snap-synced data, and all slots
+    /// written during block processing. Calling `enable()` prematurely will
+    /// cause false negatives that silently corrupt storage reads.
+    // Only the tests call this so far: store.rs still needs a backfill step
+    // before the filter can be enabled, hence the lint exemption.
+    #[allow(dead_code)]
+    pub fn enable(&self) {
+        self.enabled.store(true, Ordering::Release);
+    }
+
+    /// Record that a non-zero value exists at (address, key).
+    ///
+    /// Called unconditionally on every non-zero storage write, even while the
+    /// filter is disabled. This is intentional warm-up: the filter is populated
+    /// in the background so it is ready when `enable()` is eventually called.
+    pub fn insert(&self, address: Address, key: H256) {
+        let bloom_key = Self::make_key(address, key);
+        self.filter().insert(&bloom_key);
+    }
+
+    /// Returns `true` if the slot *might* contain a non-zero value.
+    /// Returns `false` if the slot was definitely never written.
+    /// When the filter is not yet enabled, always returns `true` (pass-through).
+    ///
+    /// Never allocates: if no `insert()` has happened yet, the backing filter
+    /// does not exist and every lookup on an enabled filter is a definite miss.
+    pub fn might_contain(&self, address: Address, key: H256) -> bool {
+        if !self.enabled.load(Ordering::Acquire) {
+            return true;
+        }
+        // Peek at the filter instead of going through `filter()`, so that a
+        // read never pays for building the (potentially very large) filter.
+        self.filter
+            .get()
+            .is_some_and(|filter| filter.contains(&Self::make_key(address, key)))
+    }
+
+    /// Returns the backing filter, allocating it on first use.
+    ///
+    /// Intended for the write path only; lookups use `OnceLock::get` so they
+    /// never trigger the allocation.
+    fn filter(&self) -> &AtomicBloomFilter<FxBuildHasher> {
+        self.filter.get_or_init(|| {
+            AtomicBloomFilter::with_false_pos(FALSE_POSITIVE_RATE)
+                .hasher(FxBuildHasher)
+                .expected_items(self.capacity)
+        })
+    }
+
+    /// Builds the 52-byte bloom key: the 20 address bytes followed by the
+    /// 32 storage-key bytes.
+    fn make_key(address: Address, key: H256) -> [u8; 52] {
+        let mut buf = [0u8; 52];
+        buf[..20].copy_from_slice(address.as_bytes());
+        buf[20..].copy_from_slice(key.as_bytes());
+        buf
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn addr(n: u8) -> Address {
+        Address::from([n; 20])
+    }
+
+    fn key(n: u8) -> H256 {
+        H256::from([n; 32])
+    }
+
+    #[test]
+    fn disabled_is_pass_through() {
+        let bloom = StorageBloomFilter::new(1000);
+        // Before enable, might_contain always returns true
+        assert!(bloom.might_contain(addr(1), key(1)));
+        assert!(bloom.might_contain(addr(99), key(255)));
+    }
+
+    #[test]
+    fn no_false_negatives_after_enable() {
+        let bloom = StorageBloomFilter::new(1000);
+        bloom.insert(addr(1), key(10));
+        bloom.insert(addr(2), key(20));
+        bloom.enable();
+        // Inserted keys must always return true
+        assert!(bloom.might_contain(addr(1), key(10)));
+        assert!(bloom.might_contain(addr(2), key(20)));
+    }
+
+    #[test]
+    fn rejects_unknown_after_enable() {
+        let bloom = StorageBloomFilter::new(1000);
+        bloom.insert(addr(1), key(10));
+        bloom.enable();
+        // A never-inserted key should return false (with high probability)
+        assert!(!bloom.might_contain(addr(99), key(99)));
+    }
+
+    #[test]
+    fn enabled_lookup_does_not_allocate() {
+        let bloom = StorageBloomFilter::new(1000);
+        bloom.enable();
+        // Nothing was inserted, so every lookup is a definite miss...
+        assert!(!bloom.might_contain(addr(1), key(1)));
+        // ...and answering it must not have built the backing filter.
+        assert!(bloom.filter.get().is_none());
+    }
+
+    #[test]
+    fn make_key_distinctness() {
+        // Different (address, key) pairs must produce different bloom keys
+        let k1 = StorageBloomFilter::make_key(addr(1), key(2));
+        let k2 = StorageBloomFilter::make_key(addr(2), key(1));
+        assert_ne!(k1, k2);
+    }
+}
diff --git a/crates/storage/lib.rs b/crates/storage/lib.rs index ee45d5b41a0..2384b339043 100644 --- 
a/crates/storage/lib.rs +++ b/crates/storage/lib.rs @@ -66,6 +66,7 @@ pub mod api; pub mod backend; +mod bloom; pub mod error; mod layering; pub mod rlp; diff --git a/crates/storage/store.rs b/crates/storage/store.rs index 9ceae218ca1..72bd3a42c94 100644 --- a/crates/storage/store.rs +++ b/crates/storage/store.rs @@ -1,5 +1,6 @@ #[cfg(feature = "rocksdb")] use crate::backend::rocksdb::RocksDBBackend; +use crate::bloom::StorageBloomFilter; use crate::{ STORE_METADATA_FILENAME, STORE_SCHEMA_VERSION, api::{ @@ -187,6 +188,10 @@ pub struct Store { /// Uses FxHashMap for efficient lookups, much smaller than code cache. code_metadata_cache: Arc>>, + /// Bloom filter tracking (address, storage_key) pairs with non-zero values. + /// Used to skip trie lookups for storage slots that were never written. + storage_bloom: Arc, + background_threads: Arc, } @@ -1164,6 +1169,10 @@ impl Store { /// CAUTION: This method writes directly to the underlying database, bypassing any caching layer. /// For updating the state after block execution, use [`Self::store_block_updates`]. + /// + /// NOTE: This method does not update the storage bloom filter. Slots written + /// through this path (e.g., snap sync) will be invisible to `might_contain` + /// after `enable()`. A backfill step is needed before enabling the bloom. 
pub async fn write_storage_trie_nodes_batch( &self, storage_trie_nodes: StorageUpdates, @@ -1494,6 +1503,7 @@ impl Store { last_computed_flatkeyvalue: Arc::new(RwLock::new(last_written)), account_code_cache: Arc::new(Mutex::new(CodeCache::default())), code_metadata_cache: Arc::new(Mutex::new(rustc_hash::FxHashMap::default())), + storage_bloom: Arc::new(StorageBloomFilter::new(200_000_000)), background_threads: Default::default(), }; let backend_clone = store.backend.clone(); @@ -1740,6 +1750,7 @@ impl Store { if storage_value.is_zero() { storage_trie.remove(&hashed_key)?; } else { + self.storage_bloom.insert(update.address, *storage_key); storage_trie.insert(hashed_key, storage_value.encode_to_vec())?; } } @@ -1831,6 +1842,7 @@ impl Store { if storage_value.is_zero() { storage_trie.remove(&hashed_key)?; } else { + self.storage_bloom.insert(update.address, *storage_key); storage_trie.insert(hashed_key, storage_value.encode_to_vec())?; } } @@ -1881,6 +1893,8 @@ impl Store { if !storage_value.is_zero() { let hashed_key = hash_key(&H256(storage_key.to_big_endian())); storage_trie.insert(hashed_key, storage_value.encode_to_vec())?; + // TODO: call storage_bloom.insert(address, storage_key) here when + // bloom is wired up, otherwise genesis-only slots become false negatives. } } @@ -2118,6 +2132,15 @@ impl Store { address: Address, storage_key: H256, ) -> Result, StoreError> { + // Fast path: if the bloom filter says this slot was never written, skip the trie. + // NOTE: The bloom only tracks writes during the current process lifetime. + // For historical state_root queries (RPC), a slot that was non-zero in older + // states but later zeroed won't be in the filter. When the bloom is enabled, + // this check may need to be limited to latest-state lookups only. + if !self.storage_bloom.might_contain(address, storage_key) { + return Ok(None); + } + let account_hash = hash_address_fixed(&address); // Pre-acquire shared resources once for both trie opens