From 11f31dcd27f6380a274efa7f0c78eedf38cfbae6 Mon Sep 17 00:00:00 2001 From: David Palm Date: Sat, 21 Mar 2020 00:43:59 +0100 Subject: [PATCH 01/18] Add a command to rebuild the accounts bloom --- Cargo.lock | 5 +++++ Cargo.toml | 5 +++++ parity/blockchain.rs | 46 +++++++++++++++++++++++++++++++++++++++- parity/cli/mod.rs | 5 +++++ parity/configuration.rs | 17 ++++++++++----- parity/db/mod.rs | 2 +- parity/db/rocksdb/mod.rs | 1 + parity/lib.rs | 5 +++-- 8 files changed, 77 insertions(+), 9 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 743e8f8c7b5..d5e9017cac8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3176,6 +3176,7 @@ checksum = "93f5bb2e8e8dec81642920ccff6b61f1eb94fa3020c5a325c9851ff604152409" name = "open-ethereum" version = "2.8.0" dependencies = [ + "account-state", "ansi_term", "atty", "blooms-db", @@ -3191,6 +3192,7 @@ dependencies = [ "ethcore", "ethcore-accounts", "ethcore-blockchain", + "ethcore-bloom-journal", "ethcore-call-contract", "ethcore-db", "ethcore-io", @@ -3232,6 +3234,7 @@ dependencies = [ "parity-util-mem", "parity-version", "parking_lot 0.10.0", + "patricia-trie-ethereum", "pretty_assertions", "regex", "registrar", @@ -3245,10 +3248,12 @@ dependencies = [ "serde_json", "snapshot", "spec", + "state-db", "tempdir", "term_size", "textwrap 0.9.0", "toml", + "trie-db", "verification", "winapi 0.3.8", ] diff --git a/Cargo.toml b/Cargo.toml index 37388ad3e5b..d8f2d2eecd4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,6 +7,7 @@ license = "GPL-3.0" authors = ["Open Ethereum developers", "Parity Technologies "] [dependencies] +account-state = {path = "ethcore/account-state" } ansi_term = "0.11" atty = "0.2.8" blooms-db = { path = "util/blooms-db" } @@ -21,6 +22,7 @@ engine = { path = "ethcore/engine" } ethabi = { version = "9.0.1", optional = true } ethcore = { path = "ethcore", features = ["parity"] } ethcore-accounts = { path = "accounts", optional = true } +ethcore-bloom-journal = { path = "util/bloom" } ethcore-blockchain = { path = 
"ethcore/blockchain" } ethcore-call-contract = { path = "ethcore/call-contract", optional = true } ethcore-db = { path = "ethcore/db" } @@ -61,6 +63,7 @@ parity-updater = { path = "updater" } parity-util-mem = { version = "0.5.1", features = ["jemalloc-global"] } parity-version = { path = "util/version" } parking_lot = "0.10.0" +patricia-trie-ethereum = { path = "util/patricia-trie-ethereum" } regex = "1.0" registrar = { path = "util/registrar" } rlp = "0.4.0" @@ -72,9 +75,11 @@ serde_derive = "1.0" serde_json = "1.0" snapshot = { path = "ethcore/snapshot" } spec = { path = "ethcore/spec" } +state-db = { path = "ethcore/state-db" } term_size = "0.3" textwrap = "0.9" toml = "0.5.6" +trie-db = "0.20.0" # todo[dvdplm] maybe not? verification = { path = "ethcore/verification" } [build-dependencies] diff --git a/parity/blockchain.rs b/parity/blockchain.rs index 78730a89f29..179883653dc 100644 --- a/parity/blockchain.rs +++ b/parity/blockchain.rs @@ -55,7 +55,8 @@ pub enum BlockchainCmd { Import(ImportBlockchain), Export(ExportBlockchain), ExportState(ExportState), - Reset(ResetBlockchain) + Reset(ResetBlockchain), + RebuildAccountsBloom(RebuildAccountsBloom), } #[derive(Debug, PartialEq)] @@ -79,6 +80,15 @@ pub struct KillBlockchain { pub pruning: Pruning, } +#[derive(Debug, PartialEq)] +/// Rebuild the accounts existence-test bloom filter. 
+pub struct RebuildAccountsBloom { + pub spec: SpecType, + pub dirs: Directories, + pub pruning: Pruning, + pub compaction: DatabaseCompactionProfile, +} + #[derive(Debug, PartialEq)] pub struct ImportBlockchain { pub spec: SpecType, @@ -152,6 +162,7 @@ pub fn execute(cmd: BlockchainCmd) -> Result<(), String> { BlockchainCmd::Export(export_cmd) => execute_export(export_cmd), BlockchainCmd::ExportState(export_cmd) => execute_export_state(export_cmd), BlockchainCmd::Reset(reset_cmd) => execute_reset(reset_cmd), + BlockchainCmd::RebuildAccountsBloom(cmd) => rebuild_accounts_bloom(cmd), } } @@ -674,6 +685,39 @@ pub fn kill_db(cmd: KillBlockchain) -> Result<(), String> { Ok(()) } +pub fn rebuild_accounts_bloom(cmd: RebuildAccountsBloom) -> Result<(), String> { + use super::rebuild_accounts_bloom::rebuild_accounts_bloom; + info!("Rebuilding accounts bloom"); + + // load spec file + let spec = cmd.spec.spec(&cmd.dirs.cache)?; + + // load genesis hash + let genesis_hash = spec.genesis_header().hash(); + + // database paths + let db_dirs = cmd.dirs.database(genesis_hash, None, spec.data_dir.clone()); + + // user defaults path + let user_defaults_path = db_dirs.user_defaults_path(); + + // load user defaults + let user_defaults = UserDefaults::load(&user_defaults_path)?; + + // select pruning algorithm + let algorithm = cmd.pruning.to_algorithm(&user_defaults); + + let db_path = db_dirs.client_path(algorithm); + + let compaction = super::db::compaction_profile( + &cmd.compaction, + &db_path, + ); + + rebuild_accounts_bloom(&db_path, compaction).map_err(|e| e.to_string() )?; + Ok(()) +} + #[cfg(test)] mod test { use super::DataFormat; diff --git a/parity/cli/mod.rs b/parity/cli/mod.rs index 6f08a074722..fa864331219 100644 --- a/parity/cli/mod.rs +++ b/parity/cli/mod.rs @@ -229,6 +229,10 @@ usage! { "Number of blocks to revert", } + CMD cmd_db_rebuild_accounts_bloom { + "Rebuild the accounts bloom filter. 
Iterate over all accounts in the state db and add its address to the bloom filter. Can take a very long time for big databases.", + } + } CMD cmd_export_hardcoded_sync @@ -1692,6 +1696,7 @@ mod tests { cmd_db: false, cmd_db_kill: false, cmd_db_reset: false, + cmd_db_regenerate_bloom: false, cmd_export_hardcoded_sync: false, // Arguments diff --git a/parity/configuration.rs b/parity/configuration.rs index eabe46cef94..ad8c945adb9 100644 --- a/parity/configuration.rs +++ b/parity/configuration.rs @@ -47,7 +47,7 @@ use secretstore::{NodeSecretKey, Configuration as SecretStoreConfiguration, Cont use updater::{UpdatePolicy, UpdateFilter, ReleaseTrack}; use run::RunCmd; use types::data_format::DataFormat; -use blockchain::{BlockchainCmd, ImportBlockchain, ExportBlockchain, KillBlockchain, ExportState, ResetBlockchain}; +use blockchain::{BlockchainCmd, ImportBlockchain, ExportBlockchain, KillBlockchain, ExportState, ResetBlockchain, RebuildAccountsBloom}; use export_hardcoded_sync::ExportHsyncCmd; use presale::ImportWallet; use account::{AccountCmd, NewAccount, ListAccounts, ImportAccounts, ImportFromGethAccounts}; @@ -194,16 +194,23 @@ impl Configuration { })) } else if self.args.cmd_db && self.args.cmd_db_kill { Cmd::Blockchain(BlockchainCmd::Kill(KillBlockchain { - spec: spec, - dirs: dirs, - pruning: pruning, + spec, + dirs, + pruning, + })) + } else if self.args.cmd_db && self.args.cmd_db_rebuild_accounts_bloom { + Cmd::Blockchain(BlockchainCmd::RebuildAccountsBloom(RebuildAccountsBloom { + spec, + dirs, + pruning, + compaction, })) } else if self.args.cmd_account { let account_cmd = if self.args.cmd_account_new { let new_acc = NewAccount { iterations: key_iterations, path: dirs.keys, - spec: spec, + spec, password_file: self.accounts_config()?.password_files.first().map(|x| x.to_owned()), }; AccountCmd::New(new_acc) diff --git a/parity/db/mod.rs b/parity/db/mod.rs index 9edc444a96f..378ab8b35a2 100644 --- a/parity/db/mod.rs +++ b/parity/db/mod.rs @@ -19,4 +19,4 @@ 
#[path="rocksdb/mod.rs"] mod impls; -pub use self::impls::{open_db_light, restoration_db_handler, migrate}; +pub use self::impls::{open_db_light, restoration_db_handler, migrate, compaction_profile}; diff --git a/parity/db/rocksdb/mod.rs b/parity/db/rocksdb/mod.rs index f1818b735bc..be277c0bb13 100644 --- a/parity/db/rocksdb/mod.rs +++ b/parity/db/rocksdb/mod.rs @@ -38,6 +38,7 @@ mod migration; mod helpers; pub use self::migration::migrate; +pub use self::helpers::compaction_profile; struct AppDB { key_value: Arc, diff --git a/parity/lib.rs b/parity/lib.rs index 82599dc5c34..a16e7ba7df8 100644 --- a/parity/lib.rs +++ b/parity/lib.rs @@ -104,14 +104,16 @@ mod blockchain; mod cache; mod cli; mod configuration; -mod export_hardcoded_sync; +mod db; mod deprecated; +mod export_hardcoded_sync; mod helpers; mod informant; mod light_helpers; mod modules; mod params; mod presale; +mod rebuild_accounts_bloom; mod rpc; mod rpc_apis; mod run; @@ -120,7 +122,6 @@ mod signer; mod snapshot_cmd; mod upgrade; mod user_defaults; -mod db; use std::fs::File; use std::io::BufReader; From 7e810f1df1ffa4d304fa7ba3bcb8fb4824eee887 Mon Sep 17 00:00:00 2001 From: David Palm Date: Mon, 23 Mar 2020 11:42:11 +0100 Subject: [PATCH 02/18] Fix a test --- parity/cli/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parity/cli/mod.rs b/parity/cli/mod.rs index fa864331219..e121b6814ac 100644 --- a/parity/cli/mod.rs +++ b/parity/cli/mod.rs @@ -1696,7 +1696,7 @@ mod tests { cmd_db: false, cmd_db_kill: false, cmd_db_reset: false, - cmd_db_regenerate_bloom: false, + cmd_db_rebuild_accounts_bloom: false, cmd_export_hardcoded_sync: false, // Arguments From 2fa6afa6b47e841597521218b3bb4876d93e65db Mon Sep 17 00:00:00 2001 From: David Palm Date: Mon, 23 Mar 2020 11:49:50 +0100 Subject: [PATCH 03/18] Add missing file --- parity/rebuild_accounts_bloom.rs | 222 +++++++++++++++++++++++++++++++ 1 file changed, 222 insertions(+) create mode 100644 parity/rebuild_accounts_bloom.rs diff --git 
a/parity/rebuild_accounts_bloom.rs b/parity/rebuild_accounts_bloom.rs new file mode 100644 index 00000000000..0d010cfeb39 --- /dev/null +++ b/parity/rebuild_accounts_bloom.rs @@ -0,0 +1,222 @@ +// Copyright 2015-2020 Parity Technologies (UK) Ltd. +// This file is part of Open Ethereum. + +// Open Ethereum is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// Open Ethereum is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with Open Ethereum. If not, see . + +//! Resize the accounts bloom filter for modern times +//! todo[dvdplm] document the choice of parameters etc + + +extern crate kvdb_rocksdb; +extern crate state_db; +extern crate patricia_trie_ethereum as ethtrie; +extern crate account_state; +extern crate ethcore_bloom_journal as accounts_bloom; +extern crate trie_db; + +use std::{ + path::Path, + sync::Arc, +}; + +use ethcore_db::{COL_EXTRA, COL_HEADERS, COL_STATE, COL_ACCOUNT_BLOOM}; +use ethereum_types::{H256, U256}; +use journaldb; +use kvdb::DBTransaction; +use self::{ + account_state::account::Account as StateAccount, + accounts_bloom::Bloom, // todo[dvdplm] rename this crate + ethtrie::TrieDB, + kvdb_rocksdb::{Database, DatabaseConfig}, + state_db::{ACCOUNT_BLOOM_SPACE, DEFAULT_ACCOUNT_PRESET, StateDB}, + trie_db::Trie, +}; +use types::{ + errors::EthcoreError as Error, + views::{HeaderView, ViewRlp}, +}; +use rlp::{RlpStream, Rlp}; +use self::kvdb_rocksdb::CompactionProfile; + +pub fn rebuild_accounts_bloom>( + path: P, + compaction: CompactionProfile +) -> Result<(), Error> { + let db_config = 
DatabaseConfig { + compaction, + columns: ethcore_db::NUM_COLUMNS, + ..Default::default() + }; + let path_str = path.as_ref().to_string_lossy(); + let db = Arc::new(Database::open(&db_config, &path_str)?); + + generate_bloom(db)?; + Ok(()) +} + +pub fn backup_bloom(bloom_backup_path: &Path, source: Arc) -> Result<(), Error> { + let num_keys = source.num_keys(COL_ACCOUNT_BLOOM)? / 2; + if num_keys == 0 { + warn!("No bloom in the DB to back up"); + return Ok(()) + } + + let mut bloom_backup = std::fs::File::create(bloom_backup_path) + .map_err(|_| format!("Cannot write to file given: {}", bloom_backup_path.display()))?; + + info!("Saving old bloom to '{}'", bloom_backup_path.display()); + let mut stream = RlpStream::new(); + stream.begin_unbounded_list(); + for (n, (k, v)) in source.iter(COL_ACCOUNT_BLOOM).enumerate() { + stream + .begin_list(2) + .append(&k.to_vec()) + .append(&v.to_vec()); + if n > 0 && n % 50_000 == 0 { + info!(" Bloom entries processed: {}", n); + } + } + stream.finalize_unbounded_list(); + + use std::io::Write; + let written = bloom_backup.write(&stream.out())?; + info!("Saved old bloom to '{}' ({} bytes, {} keys)", bloom_backup_path.display(), written, num_keys); + Ok(()) +} + +fn restore_bloom(bloom_backup_path: &Path, db: Arc) -> Result<(), Error> { + let mut bloom_backup = std::fs::File::open(bloom_backup_path)?; + info!("Restoring bloom from '{}'", bloom_backup_path.display()); + let num_keys = db.num_keys(COL_ACCOUNT_BLOOM)? / 2; + if num_keys != 0 { + warn!("Will not overwrite existing bloom! 
({} items found in the DB)", num_keys); + return Err(format!("Blooms DB column is not empty").into()) + } + let mut buf = Vec::with_capacity(10_000_000); + use std::io::Read; + let bytes_read = bloom_backup.read_to_end(&mut buf)?; + let rlp = Rlp::new(&buf); + info!("{} bloom key/values and {} bytes read from disk", rlp.item_count()?, bytes_read); + + let mut batch = DBTransaction::with_capacity(rlp.item_count()?); + for (n, kv_rlp) in rlp.iter().enumerate() { + let kv: Vec> = kv_rlp.as_list()?; + assert_eq!(kv.len(), 2); + batch.put(COL_ACCOUNT_BLOOM, &kv[0], &kv[1]); + if n > 0 && n % 10_000 == 0 { + info!(" Bloom entries prepared for restoration: {}", n); + } + } + db.write(batch)?; + db.flush()?; + info!("Bloom restored ({} bytes)", bytes_read); + Ok(()) +} + +fn clear_bloom(db: Arc) -> Result<(), Error> { + let num_keys = db.num_keys(COL_ACCOUNT_BLOOM)? / 2; + info!("Clearing out old accounts bloom ({} keys)", num_keys); + let mut batch = DBTransaction::with_capacity(num_keys as usize); + for (n, (k,_)) in db.iter(COL_ACCOUNT_BLOOM).enumerate() { + batch.delete(COL_ACCOUNT_BLOOM, &k); + if n > 0 && n % 10_000 == 0 { + info!(" Bloom entries queued for deletion: {}", n); + } + } + let deletions = batch.ops.len(); + db.write(batch)?; + db.flush().map_err(|e| Error::StdIo(e))?; + info!("Deleted {} old bloom items from the DB", deletions); + Ok(()) +} + +/// Rebuild the account bloom. +fn generate_bloom(source: Arc) -> Result<(), Error> { + info!(target: "migration", "Account bloom rebuild started"); + let best_block_hash = match source.get(COL_EXTRA, b"best")? { + None => { + warn!(target: "migration", "No best block hash, skipping"); + return Ok(()); + }, + Some(hash) => hash, + }; + let best_block_header = match source.get(COL_HEADERS, &best_block_hash)? 
{ + // no best block, nothing to do + None => { + warn!(target: "migration", "No best block header, skipping"); + return Ok(()) + }, + Some(x) => x, + }; + + // todo[dvdplm]: need a param `--to=…` for the user to save the old bloom somewhere. + use std::time::{SystemTime, UNIX_EPOCH}; + let bloom_backup_path_str = format!("./bloom-backup-{:?}.bin", SystemTime::now().duration_since(UNIX_EPOCH).expect("system clock error").as_secs()); + let bloom_backup_path = std::path::Path::new(&bloom_backup_path_str); + backup_bloom(bloom_backup_path, source.clone())?; + clear_bloom(source.clone())?; + + // todo[dvdplm]: need a restore command for this + // let test_path = std::path::Path::new("./bloom-backup-1584359135.bin"); + // restore_bloom(test_path, source.clone())?; + // info!("STOP"); + // return Ok(()); + + let mut empty_accounts = 0u64; + let mut non_empty_accounts = 0u64; + + let mut bloom = { + let mut bloom = Bloom::new(ACCOUNT_BLOOM_SPACE, DEFAULT_ACCOUNT_PRESET); + let state_db = journaldb::new( + source.clone(), + // It does not matter which `journaldb::Algorithm` is used since + // there will be no writes to the state column. + journaldb::Algorithm::OverlayRecent, + COL_STATE); + + let db = state_db.as_hash_db(); + let view = ViewRlp::new(&best_block_header, "", 1); + let state_root = HeaderView::new(view).state_root(); + let account_trie = TrieDB::new(&db, &state_root)?; + // Don't insert empty accounts into the bloom + let empty_account_rlp = StateAccount::new_basic(U256::zero(), U256::zero()).rlp(); + let start = std::time::Instant::now(); + let mut batch_start = std::time::Instant::now(); + for (n, (account_key, account_data)) in account_trie.iter()?.filter_map(Result::ok).enumerate() { + if n > 0 && n % 50_000 == 0 { + info!(" Accounts processed: {} in {:?}. 
Bloom saturation: {}", n, batch_start.elapsed(), bloom.saturation()); + batch_start = std::time::Instant::now(); + } + if account_data != empty_account_rlp { + non_empty_accounts += 1; + let account_key_hash = H256::from_slice(&account_key); + bloom.set(account_key_hash); + } else { + empty_accounts += 1; + } + } + info!("Finished iterating over the accounts in: {:?}. Bloom saturation: {}", start.elapsed(), bloom.saturation()); + bloom + }; + + let bloom_journal = bloom.drain_journal(); + info!(target: "migration", "Generated {} bloom entries; the DB has {} empty accounts and {} non-empty accounts", bloom_journal.entries.len(), empty_accounts, non_empty_accounts); + info!(target: "migration", "New bloom has {} k_bits (aka 'hash functions')", bloom_journal.hash_functions); + let mut batch = DBTransaction::new(); + StateDB::commit_bloom(&mut batch, bloom_journal)?; + source.write(batch)?; + source.flush()?; + info!(target: "migration", "Finished bloom update"); + Ok(()) +} From 0902fe5b3b0f3c1b5f0defdedd9840da7c038a86 Mon Sep 17 00:00:00 2001 From: David Palm Date: Mon, 23 Mar 2020 14:48:29 +0100 Subject: [PATCH 04/18] Pass in path to backup from the commandline --- parity/blockchain.rs | 7 ++- parity/cli/mod.rs | 6 ++- parity/cli/usage.rs | 28 +++++------ parity/configuration.rs | 1 + parity/rebuild_accounts_bloom.rs | 84 +++++++++++++++++++------------- 5 files changed, 77 insertions(+), 49 deletions(-) diff --git a/parity/blockchain.rs b/parity/blockchain.rs index 179883653dc..803fe1f6cd3 100644 --- a/parity/blockchain.rs +++ b/parity/blockchain.rs @@ -87,6 +87,7 @@ pub struct RebuildAccountsBloom { pub dirs: Directories, pub pruning: Pruning, pub compaction: DatabaseCompactionProfile, + pub backup_path: Option, } #[derive(Debug, PartialEq)] @@ -714,7 +715,11 @@ pub fn rebuild_accounts_bloom(cmd: RebuildAccountsBloom) -> Result<(), String> { &db_path, ); - rebuild_accounts_bloom(&db_path, compaction).map_err(|e| e.to_string() )?; + rebuild_accounts_bloom( + 
&db_path, + compaction, + cmd.backup_path, + ).map_err(|e| e.to_string() )?; Ok(()) } diff --git a/parity/cli/mod.rs b/parity/cli/mod.rs index e121b6814ac..cbc5e1097ea 100644 --- a/parity/cli/mod.rs +++ b/parity/cli/mod.rs @@ -195,7 +195,7 @@ usage! { "Restore the database of the given --chain (default: mainnet) from a snapshot file", ARG arg_restore_file: (Option) = None, - "[FILE]", + "", "Path to the file to restore from", } @@ -231,6 +231,10 @@ usage! { CMD cmd_db_rebuild_accounts_bloom { "Rebuild the accounts bloom filter. Iterate over all accounts in the state db and add its address to the bloom filter. Can take a very long time for big databases.", + // todo[dvdplm]: how to make this optional? I doesn't seem possible atm. + ARG arg_db_rebuild_accounts_bloom_backup_path: (Option) = None, + "--backup-path=", + "Path to accounts bloom backup", } } diff --git a/parity/cli/usage.rs b/parity/cli/usage.rs index 26b5fa728d8..a7320b46ce4 100644 --- a/parity/cli/usage.rs +++ b/parity/cli/usage.rs @@ -679,21 +679,21 @@ macro_rules! 
usage { // Subcommand arguments $( raw_args.$subc_arg = return_if_parse_error!(if_option!( + $($subc_arg_type_tt)+, + THEN { + if_option_vec!( $($subc_arg_type_tt)+, - THEN { - if_option_vec!( - $($subc_arg_type_tt)+, - THEN { values_t!(submatches, stringify!($subc_arg), inner_option_vec_type!($($subc_arg_type_tt)+)) } - ELSE { value_t!(submatches, stringify!($subc_arg), inner_option_type!($($subc_arg_type_tt)+)) } - ) - } - ELSE { - if_vec!( - $($subc_arg_type_tt)+, - THEN { values_t!(submatches, stringify!($subc_arg), inner_vec_type!($($subc_arg_type_tt)+)) } - ELSE { value_t!(submatches, stringify!($subc_arg), $($subc_arg_type_tt)+) } - ) - } + THEN { values_t!(submatches, stringify!($subc_arg), inner_option_vec_type!($($subc_arg_type_tt)+)) } + ELSE { value_t!(submatches, stringify!($subc_arg), inner_option_type!($($subc_arg_type_tt)+)) } + ) + } + ELSE { + if_vec!( + $($subc_arg_type_tt)+, + THEN { values_t!(submatches, stringify!($subc_arg), inner_vec_type!($($subc_arg_type_tt)+)) } + ELSE { value_t!(submatches, stringify!($subc_arg), $($subc_arg_type_tt)+) } + ) + } )); )* // Sub-subcommands diff --git a/parity/configuration.rs b/parity/configuration.rs index ad8c945adb9..d6bdc8fd67e 100644 --- a/parity/configuration.rs +++ b/parity/configuration.rs @@ -204,6 +204,7 @@ impl Configuration { dirs, pruning, compaction, + backup_path: self.args.arg_db_rebuild_accounts_bloom_backup_path, })) } else if self.args.cmd_account { let account_cmd = if self.args.cmd_account_new { diff --git a/parity/rebuild_accounts_bloom.rs b/parity/rebuild_accounts_bloom.rs index 0d010cfeb39..5c774c4dd90 100644 --- a/parity/rebuild_accounts_bloom.rs +++ b/parity/rebuild_accounts_bloom.rs @@ -50,22 +50,63 @@ use rlp::{RlpStream, Rlp}; use self::kvdb_rocksdb::CompactionProfile; pub fn rebuild_accounts_bloom>( - path: P, - compaction: CompactionProfile + db_path: P, + compaction: CompactionProfile, + backup_path: Option, ) -> Result<(), Error> { let db_config = DatabaseConfig { 
compaction, columns: ethcore_db::NUM_COLUMNS, ..Default::default() }; - let path_str = path.as_ref().to_string_lossy(); - let db = Arc::new(Database::open(&db_config, &path_str)?); + let db_path_str = db_path.as_ref().to_string_lossy(); + let db = Arc::new(Database::open(&db_config, &db_path_str)?); - generate_bloom(db)?; + let state_root = if let Some(state_root) = load_state_root(db.clone())? { + state_root + } else { + info!("Nothing to do."); + return Ok(()) + }; + + // todo[dvdplm] I can't make the `--backup-path` optional with the `usage!` + // macro so having `Option` here is pretty useless – it must be + // specified. For the time being we'll always make a backup. + if let Some(backup_path) = backup_path { + let backup_path = Path::new(&backup_path); + backup_bloom(&backup_path, db.clone())?; + } + + generate_bloom(db, state_root)?; Ok(()) } -pub fn backup_bloom(bloom_backup_path: &Path, source: Arc) -> Result<(), Error> { +fn load_state_root(db: Arc) -> Result, Error> { + let best_block_hash = match db.get(COL_EXTRA, b"best")? { + None => { + warn!(target: "migration", "No best block hash, skipping"); + return Ok(None); + }, + Some(hash) => hash, + }; + let best_block_header = match db.get(COL_HEADERS, &best_block_hash)? { + // no best block, nothing to do + None => { + warn!(target: "migration", "No best block header, skipping"); + return Ok(None) + }, + Some(x) => x, + }; + let view = ViewRlp::new(&best_block_header, "", 1); + let state_root = HeaderView::new(view).state_root(); + Ok(Some(state_root)) +} + +// todo[dvdplm]: using `~/path/` does not work – expand `~` to home dir. +fn backup_bloom>( + bloom_backup_path: &P, + source: Arc +) -> Result<(), Error> { let num_keys = source.num_keys(COL_ACCOUNT_BLOOM)? 
/ 2; if num_keys == 0 { warn!("No bloom in the DB to back up"); @@ -73,9 +114,9 @@ pub fn backup_bloom(bloom_backup_path: &Path, source: Arc) -> Result<( } let mut bloom_backup = std::fs::File::create(bloom_backup_path) - .map_err(|_| format!("Cannot write to file given: {}", bloom_backup_path.display()))?; + .map_err(|_| format!("Cannot write to file at path: {}", bloom_backup_path.as_ref().display()))?; - info!("Saving old bloom to '{}'", bloom_backup_path.display()); + info!("Saving old bloom to '{}'", bloom_backup_path.as_ref().display()); let mut stream = RlpStream::new(); stream.begin_unbounded_list(); for (n, (k, v)) in source.iter(COL_ACCOUNT_BLOOM).enumerate() { @@ -91,7 +132,7 @@ pub fn backup_bloom(bloom_backup_path: &Path, source: Arc) -> Result<( use std::io::Write; let written = bloom_backup.write(&stream.out())?; - info!("Saved old bloom to '{}' ({} bytes, {} keys)", bloom_backup_path.display(), written, num_keys); + info!("Saved old bloom to '{}' ({} bytes, {} keys)", bloom_backup_path.as_ref().display(), written, num_keys); Ok(()) } @@ -142,29 +183,8 @@ fn clear_bloom(db: Arc) -> Result<(), Error> { } /// Rebuild the account bloom. -fn generate_bloom(source: Arc) -> Result<(), Error> { +fn generate_bloom(source: Arc, state_root: H256) -> Result<(), Error> { info!(target: "migration", "Account bloom rebuild started"); - let best_block_hash = match source.get(COL_EXTRA, b"best")? { - None => { - warn!(target: "migration", "No best block hash, skipping"); - return Ok(()); - }, - Some(hash) => hash, - }; - let best_block_header = match source.get(COL_HEADERS, &best_block_hash)? { - // no best block, nothing to do - None => { - warn!(target: "migration", "No best block header, skipping"); - return Ok(()) - }, - Some(x) => x, - }; - - // todo[dvdplm]: need a param `--to=…` for the user to save the old bloom somewhere. 
- use std::time::{SystemTime, UNIX_EPOCH}; - let bloom_backup_path_str = format!("./bloom-backup-{:?}.bin", SystemTime::now().duration_since(UNIX_EPOCH).expect("system clock error").as_secs()); - let bloom_backup_path = std::path::Path::new(&bloom_backup_path_str); - backup_bloom(bloom_backup_path, source.clone())?; clear_bloom(source.clone())?; // todo[dvdplm]: need a restore command for this @@ -186,8 +206,6 @@ fn generate_bloom(source: Arc) -> Result<(), Error> { COL_STATE); let db = state_db.as_hash_db(); - let view = ViewRlp::new(&best_block_header, "", 1); - let state_root = HeaderView::new(view).state_root(); let account_trie = TrieDB::new(&db, &state_root)?; // Don't insert empty accounts into the bloom let empty_account_rlp = StateAccount::new_basic(U256::zero(), U256::zero()).rlp(); From 24380b7774f003ee0db6cd3e76e18fd4170e0340 Mon Sep 17 00:00:00 2001 From: David Palm Date: Mon, 23 Mar 2020 17:08:43 +0100 Subject: [PATCH 05/18] fix tests --- parity/cli/mod.rs | 1 + parity/rebuild_accounts_bloom.rs | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/parity/cli/mod.rs b/parity/cli/mod.rs index cbc5e1097ea..60fac2d3677 100644 --- a/parity/cli/mod.rs +++ b/parity/cli/mod.rs @@ -1713,6 +1713,7 @@ mod tests { arg_export_state_format: None, arg_snapshot_file: None, arg_restore_file: None, + arg_db_rebuild_accounts_bloom_backup_path: None, arg_tools_hash_file: None, arg_enable_signing_queue: false, diff --git a/parity/rebuild_accounts_bloom.rs b/parity/rebuild_accounts_bloom.rs index 5c774c4dd90..84aa0a699db 100644 --- a/parity/rebuild_accounts_bloom.rs +++ b/parity/rebuild_accounts_bloom.rs @@ -85,7 +85,7 @@ fn load_state_root(db: Arc) -> Result, Error> { let best_block_hash = match db.get(COL_EXTRA, b"best")? 
{ None => { warn!(target: "migration", "No best block hash, skipping"); - return Ok(None); + return Err(Error::Msg("No best block hash in the DB.".to_owned())); }, Some(hash) => hash, }; From b8afdd453bc59e276f3be8a9c92cc1dd2eb422fd Mon Sep 17 00:00:00 2001 From: David Palm Date: Mon, 23 Mar 2020 18:10:40 +0100 Subject: [PATCH 06/18] misc polish --- parity/rebuild_accounts_bloom.rs | 14 +++++--------- util/dir/src/helpers.rs | 2 ++ 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/parity/rebuild_accounts_bloom.rs b/parity/rebuild_accounts_bloom.rs index 84aa0a699db..50a7874ca2d 100644 --- a/parity/rebuild_accounts_bloom.rs +++ b/parity/rebuild_accounts_bloom.rs @@ -62,17 +62,13 @@ pub fn rebuild_accounts_bloom>( let db_path_str = db_path.as_ref().to_string_lossy(); let db = Arc::new(Database::open(&db_config, &db_path_str)?); - let state_root = if let Some(state_root) = load_state_root(db.clone())? { - state_root - } else { - info!("Nothing to do."); - return Ok(()) - }; + let state_root = load_state_root(db.clone())?; // todo[dvdplm] I can't make the `--backup-path` optional with the `usage!` // macro so having `Option` here is pretty useless – it must be // specified. For the time being we'll always make a backup. if let Some(backup_path) = backup_path { + let backup_path = dir::helpers::replace_home("", &backup_path); let backup_path = Path::new(&backup_path); backup_bloom(&backup_path, db.clone())?; } @@ -81,7 +77,7 @@ pub fn rebuild_accounts_bloom>( Ok(()) } -fn load_state_root(db: Arc) -> Result, Error> { +fn load_state_root(db: Arc) -> Result { let best_block_hash = match db.get(COL_EXTRA, b"best")? 
{ None => { warn!(target: "migration", "No best block hash, skipping"); @@ -93,13 +89,13 @@ fn load_state_root(db: Arc) -> Result, Error> { // no best block, nothing to do None => { warn!(target: "migration", "No best block header, skipping"); - return Ok(None) + return Err(Error::Msg("No best block header in the DB.".to_owned())); }, Some(x) => x, }; let view = ViewRlp::new(&best_block_header, "", 1); let state_root = HeaderView::new(view).state_root(); - Ok(Some(state_root)) + Ok(state_root) } // todo[dvdplm]: using `~/path/` does not work – expand `~` to home dir. diff --git a/util/dir/src/helpers.rs b/util/dir/src/helpers.rs index 7cdf3565090..ec0dd845430 100644 --- a/util/dir/src/helpers.rs +++ b/util/dir/src/helpers.rs @@ -23,6 +23,8 @@ pub fn replace_home(base: &str, arg: &str) -> String { // We use an `if` so that we don't need to call `home_dir()` if not necessary. let r = if arg.contains("$HOME") { arg.replace("$HOME", home_dir().expect("$HOME isn't defined").to_str().unwrap()) + } else if arg.contains("~") { + arg.replace("~", home_dir().expect("$HOME isn't defined").to_str().unwrap()) } else { arg.to_owned() }; From eabe6b0119e334085c749fbf536610a590d7e29e Mon Sep 17 00:00:00 2001 From: David Palm Date: Mon, 23 Mar 2020 21:03:16 +0100 Subject: [PATCH 07/18] Add restore-accounts-bloom command --- parity/blockchain.rs | 49 +++++++++++++++++++++++--------- parity/cli/mod.rs | 9 ++++++ parity/configuration.rs | 10 ++++++- parity/rebuild_accounts_bloom.rs | 40 +++++++++++++++++++------- 4 files changed, 82 insertions(+), 26 deletions(-) diff --git a/parity/blockchain.rs b/parity/blockchain.rs index 803fe1f6cd3..ea09b9c7ce9 100644 --- a/parity/blockchain.rs +++ b/parity/blockchain.rs @@ -57,6 +57,7 @@ pub enum BlockchainCmd { ExportState(ExportState), Reset(ResetBlockchain), RebuildAccountsBloom(RebuildAccountsBloom), + RestoreAccountsBloom(RestoreAccountsBloom), } #[derive(Debug, PartialEq)] @@ -90,6 +91,16 @@ pub struct RebuildAccountsBloom { pub 
backup_path: Option, } +#[derive(Debug, PartialEq)] +/// Restore the accounts existence-test bloom filter from a backup on disk +pub struct RestoreAccountsBloom { + pub spec: SpecType, + pub dirs: Directories, + pub pruning: Pruning, + pub compaction: DatabaseCompactionProfile, + pub backup_path: String, +} + #[derive(Debug, PartialEq)] pub struct ImportBlockchain { pub spec: SpecType, @@ -164,6 +175,7 @@ pub fn execute(cmd: BlockchainCmd) -> Result<(), String> { BlockchainCmd::ExportState(export_cmd) => execute_export_state(export_cmd), BlockchainCmd::Reset(reset_cmd) => execute_reset(reset_cmd), BlockchainCmd::RebuildAccountsBloom(cmd) => rebuild_accounts_bloom(cmd), + BlockchainCmd::RestoreAccountsBloom(cmd) => restore_accounts_bloom(cmd), } } @@ -689,27 +701,13 @@ pub fn kill_db(cmd: KillBlockchain) -> Result<(), String> { pub fn rebuild_accounts_bloom(cmd: RebuildAccountsBloom) -> Result<(), String> { use super::rebuild_accounts_bloom::rebuild_accounts_bloom; info!("Rebuilding accounts bloom"); - - // load spec file let spec = cmd.spec.spec(&cmd.dirs.cache)?; - - // load genesis hash let genesis_hash = spec.genesis_header().hash(); - - // database paths let db_dirs = cmd.dirs.database(genesis_hash, None, spec.data_dir.clone()); - - // user defaults path let user_defaults_path = db_dirs.user_defaults_path(); - - // load user defaults let user_defaults = UserDefaults::load(&user_defaults_path)?; - - // select pruning algorithm let algorithm = cmd.pruning.to_algorithm(&user_defaults); - let db_path = db_dirs.client_path(algorithm); - let compaction = super::db::compaction_profile( &cmd.compaction, &db_path, @@ -723,6 +721,29 @@ pub fn rebuild_accounts_bloom(cmd: RebuildAccountsBloom) -> Result<(), String> { Ok(()) } +pub fn restore_accounts_bloom(cmd: RestoreAccountsBloom) -> Result<(), String> { + use super::rebuild_accounts_bloom::restore_accounts_bloom; + info!("Restoring accounts bloom from backup at {}", cmd.backup_path); + let spec = 
cmd.spec.spec(&cmd.dirs.cache)?; + let genesis_hash = spec.genesis_header().hash(); + let db_dirs = cmd.dirs.database(genesis_hash, None, spec.data_dir.clone()); + let user_defaults_path = db_dirs.user_defaults_path(); + let user_defaults = UserDefaults::load(&user_defaults_path)?; + let algorithm = cmd.pruning.to_algorithm(&user_defaults); + let db_path = db_dirs.client_path(algorithm); + let compaction = super::db::compaction_profile( + &cmd.compaction, + &db_path, + ); + + restore_accounts_bloom( + &db_path, + compaction, + cmd.backup_path, + ).map_err(|e| e.to_string() )?; + Ok(()) +} + #[cfg(test)] mod test { use super::DataFormat; diff --git a/parity/cli/mod.rs b/parity/cli/mod.rs index 60fac2d3677..f306f82dd02 100644 --- a/parity/cli/mod.rs +++ b/parity/cli/mod.rs @@ -237,6 +237,13 @@ usage! { "Path to accounts bloom backup", } + CMD cmd_db_restore_accounts_bloom { + "Restore the accounts bloom filter from a backup file on disk. Destructive.", + ARG arg_db_restore_accounts_bloom_backup_path: (String) = "", + "--backup-path=", + "Path to accounts bloom backup file", + } + } CMD cmd_export_hardcoded_sync @@ -1701,6 +1708,7 @@ mod tests { cmd_db_kill: false, cmd_db_reset: false, cmd_db_rebuild_accounts_bloom: false, + cmd_db_restore_accounts_bloom: false, cmd_export_hardcoded_sync: false, // Arguments @@ -1714,6 +1722,7 @@ mod tests { arg_snapshot_file: None, arg_restore_file: None, arg_db_rebuild_accounts_bloom_backup_path: None, + arg_db_restore_accounts_bloom_backup_path: "", arg_tools_hash_file: None, arg_enable_signing_queue: false, diff --git a/parity/configuration.rs b/parity/configuration.rs index d6bdc8fd67e..a462733fc02 100644 --- a/parity/configuration.rs +++ b/parity/configuration.rs @@ -47,7 +47,7 @@ use secretstore::{NodeSecretKey, Configuration as SecretStoreConfiguration, Cont use updater::{UpdatePolicy, UpdateFilter, ReleaseTrack}; use run::RunCmd; use types::data_format::DataFormat; -use blockchain::{BlockchainCmd, ImportBlockchain, 
ExportBlockchain, KillBlockchain, ExportState, ResetBlockchain, RebuildAccountsBloom}; +use blockchain::{BlockchainCmd, ImportBlockchain, ExportBlockchain, KillBlockchain, ExportState, ResetBlockchain, RebuildAccountsBloom, RestoreAccountsBloom}; use export_hardcoded_sync::ExportHsyncCmd; use presale::ImportWallet; use account::{AccountCmd, NewAccount, ListAccounts, ImportAccounts, ImportFromGethAccounts}; @@ -206,6 +206,14 @@ impl Configuration { compaction, backup_path: self.args.arg_db_rebuild_accounts_bloom_backup_path, })) + } else if self.args.cmd_db && self.args.cmd_db_restore_accounts_bloom { + Cmd::Blockchain(BlockchainCmd::RestoreAccountsBloom(RestoreAccountsBloom { + spec, + dirs, + pruning, + compaction, + backup_path: self.args.arg_db_restore_accounts_bloom_backup_path, + })) } else if self.args.cmd_account { let account_cmd = if self.args.cmd_account_new { let new_acc = NewAccount { diff --git a/parity/rebuild_accounts_bloom.rs b/parity/rebuild_accounts_bloom.rs index 50a7874ca2d..e8cf6a118bc 100644 --- a/parity/rebuild_accounts_bloom.rs +++ b/parity/rebuild_accounts_bloom.rs @@ -77,6 +77,24 @@ pub fn rebuild_accounts_bloom>( Ok(()) } +pub fn restore_accounts_bloom>( + db_path: P, + compaction: CompactionProfile, + backup_path: String, +) -> Result<(), Error> { + let db_config = DatabaseConfig { + compaction, + columns: ethcore_db::NUM_COLUMNS, + ..Default::default() + }; + let db_path_str = db_path.as_ref().to_string_lossy(); + let db = Arc::new(Database::open(&db_config, &db_path_str)?); + + let backup_path = dir::helpers::replace_home("", &backup_path); + restore_bloom(&backup_path, db.clone())?; + Ok(()) +} + fn load_state_root(db: Arc) -> Result { let best_block_hash = match db.get(COL_EXTRA, b"best")? { None => { @@ -98,7 +116,6 @@ fn load_state_root(db: Arc) -> Result { Ok(state_root) } -// todo[dvdplm]: using `~/path/` does not work – expand `~` to home dir. 
fn backup_bloom>( bloom_backup_path: &P, source: Arc @@ -132,21 +149,21 @@ fn backup_bloom>( Ok(()) } -fn restore_bloom(bloom_backup_path: &Path, db: Arc) -> Result<(), Error> { +fn restore_bloom>( + bloom_backup_path: &P, + db: Arc +) -> Result<(), Error> { let mut bloom_backup = std::fs::File::open(bloom_backup_path)?; - info!("Restoring bloom from '{}'", bloom_backup_path.display()); - let num_keys = db.num_keys(COL_ACCOUNT_BLOOM)? / 2; - if num_keys != 0 { - warn!("Will not overwrite existing bloom! ({} items found in the DB)", num_keys); - return Err(format!("Blooms DB column is not empty").into()) - } + info!("Restoring bloom from '{}'", bloom_backup_path.as_ref().display()); let mut buf = Vec::with_capacity(10_000_000); use std::io::Read; + // todo[dvdplm]: this is a little terrible – what's the better way? let bytes_read = bloom_backup.read_to_end(&mut buf)?; let rlp = Rlp::new(&buf); - info!("{} bloom key/values and {} bytes read from disk", rlp.item_count()?, bytes_read); + let item_count = rlp.item_count()?; + info!("{} bloom key/values and {} bytes read from disk", item_count, bytes_read); - let mut batch = DBTransaction::with_capacity(rlp.item_count()?); + let mut batch = DBTransaction::with_capacity(item_count); for (n, kv_rlp) in rlp.iter().enumerate() { let kv: Vec> = kv_rlp.as_list()?; assert_eq!(kv.len(), 2); @@ -155,9 +172,10 @@ fn restore_bloom(bloom_backup_path: &Path, db: Arc) -> Result<(), Erro info!(" Bloom entries prepared for restoration: {}", n); } } + clear_bloom(db.clone())?; db.write(batch)?; db.flush()?; - info!("Bloom restored ({} bytes)", bytes_read); + info!("Bloom restored (wrote {} entries, {} bytes)", item_count, bytes_read); Ok(()) } From 47130aab8118e3859ecaa0bfdcb7cc5ddb91ef1b Mon Sep 17 00:00:00 2001 From: David Palm Date: Mon, 23 Mar 2020 21:27:20 +0100 Subject: [PATCH 08/18] Fix test --- parity/cli/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parity/cli/mod.rs b/parity/cli/mod.rs index 
f306f82dd02..466d6b37e76 100644 --- a/parity/cli/mod.rs +++ b/parity/cli/mod.rs @@ -1722,7 +1722,7 @@ mod tests { arg_snapshot_file: None, arg_restore_file: None, arg_db_rebuild_accounts_bloom_backup_path: None, - arg_db_restore_accounts_bloom_backup_path: "", + arg_db_restore_accounts_bloom_backup_path: "".to_string(), arg_tools_hash_file: None, arg_enable_signing_queue: false, From dd691d467a88fa47f79e1244c2843a49e8ef0650 Mon Sep 17 00:00:00 2001 From: David Palm Date: Mon, 23 Mar 2020 21:37:41 +0100 Subject: [PATCH 09/18] Include best block number in logs --- parity/rebuild_accounts_bloom.rs | 43 ++++++++++++++++---------------- 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/parity/rebuild_accounts_bloom.rs b/parity/rebuild_accounts_bloom.rs index e8cf6a118bc..480077e9449 100644 --- a/parity/rebuild_accounts_bloom.rs +++ b/parity/rebuild_accounts_bloom.rs @@ -38,16 +38,16 @@ use self::{ account_state::account::Account as StateAccount, accounts_bloom::Bloom, // todo[dvdplm] rename this crate ethtrie::TrieDB, - kvdb_rocksdb::{Database, DatabaseConfig}, + kvdb_rocksdb::{CompactionProfile, Database, DatabaseConfig}, state_db::{ACCOUNT_BLOOM_SPACE, DEFAULT_ACCOUNT_PRESET, StateDB}, trie_db::Trie, }; use types::{ + BlockNumber, errors::EthcoreError as Error, views::{HeaderView, ViewRlp}, }; use rlp::{RlpStream, Rlp}; -use self::kvdb_rocksdb::CompactionProfile; pub fn rebuild_accounts_bloom>( db_path: P, @@ -62,7 +62,7 @@ pub fn rebuild_accounts_bloom>( let db_path_str = db_path.as_ref().to_string_lossy(); let db = Arc::new(Database::open(&db_config, &db_path_str)?); - let state_root = load_state_root(db.clone())?; + let (state_root, best_block) = load_state_root(db.clone())?; // todo[dvdplm] I can't make the `--backup-path` optional with the `usage!` // macro so having `Option` here is pretty useless – it must be @@ -70,10 +70,10 @@ pub fn rebuild_accounts_bloom>( if let Some(backup_path) = backup_path { let backup_path = dir::helpers::replace_home("",
&backup_path); let backup_path = Path::new(&backup_path); - backup_bloom(&backup_path, db.clone())?; + backup_bloom(&backup_path, db.clone(), best_block)?; } - generate_bloom(db, state_root)?; + generate_bloom(db, state_root, best_block)?; Ok(()) } @@ -95,7 +95,7 @@ pub fn restore_accounts_bloom>( Ok(()) } -fn load_state_root(db: Arc) -> Result { +fn load_state_root(db: Arc) -> Result<(H256, BlockNumber), Error> { let best_block_hash = match db.get(COL_EXTRA, b"best")? { None => { warn!(target: "migration", "No best block hash, skipping"); @@ -112,13 +112,16 @@ fn load_state_root(db: Arc) -> Result { Some(x) => x, }; let view = ViewRlp::new(&best_block_header, "", 1); - let state_root = HeaderView::new(view).state_root(); - Ok(state_root) + let header = HeaderView::new(view); + let best_block_nr = header.number(); + let state_root = header.state_root(); + Ok((state_root, best_block_nr)) } fn backup_bloom>( bloom_backup_path: &P, - source: Arc + source: Arc, + best_block: BlockNumber, ) -> Result<(), Error> { let num_keys = source.num_keys(COL_ACCOUNT_BLOOM)? 
/ 2; if num_keys == 0 { @@ -129,7 +132,7 @@ fn backup_bloom>( let mut bloom_backup = std::fs::File::create(bloom_backup_path) .map_err(|_| format!("Cannot write to file at path: {}", bloom_backup_path.as_ref().display()))?; - info!("Saving old bloom to '{}'", bloom_backup_path.as_ref().display()); + info!("Saving old bloom as of block #{} to '{}'", best_block, bloom_backup_path.as_ref().display()); let mut stream = RlpStream::new(); stream.begin_unbounded_list(); for (n, (k, v)) in source.iter(COL_ACCOUNT_BLOOM).enumerate() { @@ -145,7 +148,7 @@ fn backup_bloom>( use std::io::Write; let written = bloom_backup.write(&stream.out())?; - info!("Saved old bloom to '{}' ({} bytes, {} keys)", bloom_backup_path.as_ref().display(), written, num_keys); + info!("Saved old bloom as of block#{} to '{}' ({} bytes, {} keys)", best_block, bloom_backup_path.as_ref().display(), written, num_keys); Ok(()) } @@ -197,16 +200,14 @@ fn clear_bloom(db: Arc) -> Result<(), Error> { } /// Rebuild the account bloom. -fn generate_bloom(source: Arc, state_root: H256) -> Result<(), Error> { - info!(target: "migration", "Account bloom rebuild started"); +fn generate_bloom( + source: Arc, + state_root: H256, + best_block: BlockNumber, +) -> Result<(), Error> { + info!(target: "migration", "Account bloom rebuild started for chain at #{}", best_block); clear_bloom(source.clone())?; - // todo[dvdplm]: need a restore command for this - // let test_path = std::path::Path::new("./bloom-backup-1584359135.bin"); - // restore_bloom(test_path, source.clone())?; - // info!("STOP"); - // return Ok(()); - let mut empty_accounts = 0u64; let mut non_empty_accounts = 0u64; @@ -238,7 +239,7 @@ fn generate_bloom(source: Arc, state_root: H256) -> Result<(), Error> empty_accounts += 1; } } - info!("Finished iterating over the accounts in: {:?}. Bloom saturation: {}", start.elapsed(), bloom.saturation()); + info!("Finished iterating over the accounts as of block #{} in: {:?}. 
Bloom saturation: {}", best_block, start.elapsed(), bloom.saturation()); bloom }; @@ -249,6 +250,6 @@ fn generate_bloom(source: Arc, state_root: H256) -> Result<(), Error> StateDB::commit_bloom(&mut batch, bloom_journal)?; source.write(batch)?; source.flush()?; - info!(target: "migration", "Finished bloom update"); + info!(target: "migration", "Finished bloom update for chain at #{}", best_block); Ok(()) } From 53b93048550eaa072c9a3d5ba91f00a9ae621a99 Mon Sep 17 00:00:00 2001 From: David Palm Date: Thu, 26 Mar 2020 00:00:41 +0100 Subject: [PATCH 10/18] Add --accounts arg to bloom rebuild Build bloom using the false positive rate (0.01) and the desired size Store the bloom filter size in the db Ensure compute_bitmap_size returns a size rounded up to nearest multiple of 8 so we can use it to find the right u64-boundaries --- ethcore/state-db/src/lib.rs | 73 ++++++++++++++++++++------------ parity/blockchain.rs | 2 + parity/cli/mod.rs | 10 +++-- parity/configuration.rs | 1 + parity/rebuild_accounts_bloom.rs | 23 +++++++--- util/bloom/src/lib.rs | 41 ++++++++++++------ 6 files changed, 101 insertions(+), 49 deletions(-) diff --git a/ethcore/state-db/src/lib.rs b/ethcore/state-db/src/lib.rs index 60548da352c..aa5c55c0030 100644 --- a/ethcore/state-db/src/lib.rs +++ b/ethcore/state-db/src/lib.rs @@ -24,7 +24,7 @@ use ethereum_types::{Address, H256}; use hash_db::HashDB; use keccak_hash::keccak; use kvdb::{DBTransaction, DBValue, KeyValueDB}; -use log::trace; +use log::{debug, trace}; use lru_cache::LruCache; use parking_lot::Mutex; @@ -39,15 +39,20 @@ use memory_cache::MemoryLruCache; /// Value used to initialize bloom bitmap size. /// /// Bitmap size is the size in bytes (not bits) that will be allocated in memory. -pub const ACCOUNT_BLOOM_SPACE: usize = 1048576; +// todo[dvdplm] deprecate this one +const LEGACY_ACCOUNTS_BLOOM_ITEM_COUNT: u64 = 1048576; + /// Value used to initialize bloom items count. 
/// /// Items count is an estimation of the maximum number of items to store. -pub const DEFAULT_ACCOUNT_PRESET: usize = 1000000; - -/// Key for a value storing amount of hashes -pub const ACCOUNT_BLOOM_HASHCOUNT_KEY: &'static [u8] = b"account_hash_count"; +pub const ACCOUNTS_BLOOM_ITEM_COUNT: u64 = 100_000_000; +/// False positive rate for the accounts bloom filter: 1 in 100. +pub const ACCOUNTS_BLOOM_FP_RATE: f64 = 0.01; +/// Key storing the number of hash functions used in the accounts bloom. +pub const ACCOUNTS_BLOOM_HASHCOUNT_KEY: &'static [u8] = b"account_hash_count"; +/// Key storing number of items the accounts bloom was built to contain. +pub const ACCOUNTS_BLOOM_ITEM_COUNT_KEY: &'static [u8] = b"accounts_bloom_item_count"; const STATE_CACHE_BLOCKS: usize = 12; @@ -160,23 +165,37 @@ impl StateDB { } /// Loads accounts bloom from the database - /// This bloom is used to handle request for the non-existent account fast + /// This bloom is used to quickly handle requests for non-existent accounts. 
pub fn load_bloom(db: &dyn KeyValueDB) -> Bloom { - let hash_count_entry = db.get(COL_ACCOUNT_BLOOM, ACCOUNT_BLOOM_HASHCOUNT_KEY) - .expect("Low-level database error"); - - let hash_count_bytes = match hash_count_entry { - Some(bytes) => bytes, - None => return Bloom::new(ACCOUNT_BLOOM_SPACE, DEFAULT_ACCOUNT_PRESET), + let bloom_hash_functions = db.get(COL_ACCOUNT_BLOOM, ACCOUNTS_BLOOM_HASHCOUNT_KEY) + .expect("Low-level database error") + .and_then(|bytes| { + assert_eq!(bytes.len(), 1); + Some(bytes[0] as u32) + }); + let bloom_hash_functions = match bloom_hash_functions { + Some(nr) => nr, + None => return Bloom::new_for_fp_rate(ACCOUNTS_BLOOM_ITEM_COUNT, ACCOUNTS_BLOOM_FP_RATE), }; - assert_eq!(hash_count_bytes.len(), 1); - let hash_count = hash_count_bytes[0]; - - let mut bloom_parts = vec![0u64; ACCOUNT_BLOOM_SPACE / 8]; - for i in 0..ACCOUNT_BLOOM_SPACE / 8 { - let key: [u8; 8] = (i as u64).to_le_bytes(); - bloom_parts[i] = db.get(COL_ACCOUNT_BLOOM, &key).expect("low-level database error") + let item_count = db.get(COL_ACCOUNT_BLOOM, ACCOUNTS_BLOOM_ITEM_COUNT_KEY) + .expect("Low-level database error") + .and_then(|bytes| { + assert_eq!(bytes.len(), 8, "Expected a u64"); + let mut buf = [0u8; 8]; + buf.copy_from_slice(&*bytes); + trace!(target: "accounts_bloom", "DB had a value under 'accounts_bloom_item_count': {:?} (as u64: {})", &bytes, u64::from_le_bytes(buf)); + Some(u64::from_le_bytes(buf)) + }) + // Assume this is an old bloom + .unwrap_or(LEGACY_ACCOUNTS_BLOOM_ITEM_COUNT); + debug!(target: "accounts_bloom", "Accounts bloom is sized for {} entries", item_count); + + let bitmap_size = Bloom::compute_bitmap_size(item_count, ACCOUNTS_BLOOM_FP_RATE); + let mut bloom_parts = vec![0u64; (bitmap_size / 8) as usize]; + for i in 0..bitmap_size / 8 { + let key: [u8; 8] = i.to_le_bytes(); + bloom_parts[i as usize] = db.get(COL_ACCOUNT_BLOOM, &key).expect("low-level database error") .map(|val| { assert_eq!(val.len(), 8, "low-level database error"); let mut buff = 
[0u8; 8]; @@ -186,15 +205,15 @@ impl StateDB { .unwrap_or(0u64); } - let bloom = Bloom::from_parts(&bloom_parts, hash_count as u32); - trace!(target: "account_bloom", "Bloom is {:?} full, hash functions count = {:?}", bloom.saturation(), hash_count); + let bloom = Bloom::from_parts(&bloom_parts, bloom_hash_functions); + debug!(target: "accounts_bloom", "Bloom saturation: {:?}, hash functions: {:?}, bitmap size: {} bits", bloom.saturation(), bloom_hash_functions, bloom.number_of_bits()); bloom } /// Commit blooms journal to the database transaction pub fn commit_bloom(batch: &mut DBTransaction, journal: BloomJournal) -> io::Result<()> { assert!(journal.hash_functions <= 255); - batch.put(COL_ACCOUNT_BLOOM, ACCOUNT_BLOOM_HASHCOUNT_KEY, &[journal.hash_functions as u8]); + batch.put(COL_ACCOUNT_BLOOM, ACCOUNTS_BLOOM_HASHCOUNT_KEY, &[journal.hash_functions as u8]); for (bloom_part_index, bloom_part_value) in journal.entries { let key: [u8; 8] = (bloom_part_index as u64).to_le_bytes(); @@ -463,15 +482,15 @@ impl account_state::Backend for StateDB { } fn note_non_null_account(&self, address: &Address) { - trace!(target: "account_bloom", "Note account bloom: {:?}", address); + trace!(target: "accounts_bloom", "Note account bloom: {:?}", address); let mut bloom = self.account_bloom.lock(); - bloom.set(keccak(address).as_bytes()); + bloom.set(keccak(address)); } fn is_known_null(&self, address: &Address) -> bool { - trace!(target: "account_bloom", "Check account bloom: {:?}", address); + trace!(target: "accounts_bloom", "Check account bloom: {:?}", address); let bloom = self.account_bloom.lock(); - let is_null = !bloom.check(keccak(address).as_bytes()); + let is_null = !bloom.check(keccak(address)); is_null } } diff --git a/parity/blockchain.rs b/parity/blockchain.rs index ea09b9c7ce9..2ac6287ed1f 100644 --- a/parity/blockchain.rs +++ b/parity/blockchain.rs @@ -89,6 +89,7 @@ pub struct RebuildAccountsBloom { pub pruning: Pruning, pub compaction: DatabaseCompactionProfile, 
pub backup_path: Option, + pub account_count: u64, } #[derive(Debug, PartialEq)] @@ -717,6 +718,7 @@ pub fn rebuild_accounts_bloom(cmd: RebuildAccountsBloom) -> Result<(), String> { &db_path, compaction, cmd.backup_path, + cmd.account_count, ).map_err(|e| e.to_string() )?; Ok(()) } diff --git a/parity/cli/mod.rs b/parity/cli/mod.rs index 466d6b37e76..4e11cfbdec9 100644 --- a/parity/cli/mod.rs +++ b/parity/cli/mod.rs @@ -230,15 +230,17 @@ usage! { } CMD cmd_db_rebuild_accounts_bloom { - "Rebuild the accounts bloom filter. Iterate over all accounts in the state db and add its address to the bloom filter. Can take a very long time for big databases.", + "Rebuild the accounts bloom filter. Iterate over all accounts in the state db and add its address to the bloom filter. Can take a very long time for big databases. The old bloom is backed up before starting the rebuilding process. To restore from a backup, use `db restore-accounts-bloom`, but be aware that you can only restore a bloom from backup if the target db is at the same block as the backup file.", // todo[dvdplm]: how to make this optional? I doesn't seem possible atm. ARG arg_db_rebuild_accounts_bloom_backup_path: (Option) = None, - "--backup-path=", - "Path to accounts bloom backup", + "--backup-path=", "Path to accounts bloom backup", + + ARG arg_db_rebuild_accounts_bloom_account_count: (u64) = 100_000_000u64, + "--accounts=", "The number of accounts the bloom should handle", } CMD cmd_db_restore_accounts_bloom { - "Restore the accounts bloom filter from a backup file on disk. Destructive.", + "Restore the accounts bloom filter from a backup file on disk. This is a destructive operation. 
Make sure that the chain you are restoring to is at the same block as the backup or verification errors are nigh inevitable.", ARG arg_db_restore_accounts_bloom_backup_path: (String) = "", "--backup-path=", "Path to accounts bloom backup file", diff --git a/parity/configuration.rs b/parity/configuration.rs index a462733fc02..91436e1df2e 100644 --- a/parity/configuration.rs +++ b/parity/configuration.rs @@ -205,6 +205,7 @@ impl Configuration { pruning, compaction, backup_path: self.args.arg_db_rebuild_accounts_bloom_backup_path, + account_count: self.args.arg_db_rebuild_accounts_bloom_account_count, })) } else if self.args.cmd_db && self.args.cmd_db_restore_accounts_bloom { Cmd::Blockchain(BlockchainCmd::RestoreAccountsBloom(RestoreAccountsBloom { diff --git a/parity/rebuild_accounts_bloom.rs b/parity/rebuild_accounts_bloom.rs index 480077e9449..c37a3077de7 100644 --- a/parity/rebuild_accounts_bloom.rs +++ b/parity/rebuild_accounts_bloom.rs @@ -39,7 +39,7 @@ use self::{ accounts_bloom::Bloom, // todo[dvdplm] rename this crate ethtrie::TrieDB, kvdb_rocksdb::{CompactionProfile, Database, DatabaseConfig}, - state_db::{ACCOUNT_BLOOM_SPACE, DEFAULT_ACCOUNT_PRESET, StateDB}, + state_db::StateDB, trie_db::Trie, }; use types::{ @@ -48,11 +48,13 @@ use types::{ views::{HeaderView, ViewRlp}, }; use rlp::{RlpStream, Rlp}; +use self::state_db::ACCOUNTS_BLOOM_ITEM_COUNT_KEY; pub fn rebuild_accounts_bloom>( db_path: P, compaction: CompactionProfile, backup_path: Option, + account_count: u64, ) -> Result<(), Error> { let db_config = DatabaseConfig { compaction, @@ -73,7 +75,7 @@ pub fn rebuild_accounts_bloom>( backup_bloom(&backup_path, db.clone(), best_block)?; } - generate_bloom(db, state_root, best_block)?; + generate_bloom(db, account_count, state_root, best_block)?; Ok(()) } @@ -188,7 +190,7 @@ fn clear_bloom(db: Arc) -> Result<(), Error> { let mut batch = DBTransaction::with_capacity(num_keys as usize); for (n, (k,_)) in db.iter(COL_ACCOUNT_BLOOM).enumerate() { 
batch.delete(COL_ACCOUNT_BLOOM, &k); - if n > 0 && n % 10_000 == 0 { + if n > 0 && n % 50_000 == 0 { info!(" Bloom entries queued for deletion: {}", n); } } @@ -202,17 +204,24 @@ fn clear_bloom(db: Arc) -> Result<(), Error> { /// Rebuild the account bloom. fn generate_bloom( source: Arc, + account_count: u64, state_root: H256, best_block: BlockNumber, ) -> Result<(), Error> { - info!(target: "migration", "Account bloom rebuild started for chain at #{}", best_block); + let num_keys = source.num_keys(COL_STATE)? / 2; + info!(target: "migration", "Account bloom rebuild started for chain at #{}. There are {} accounts in the DB", best_block, num_keys); + if account_count <= num_keys { + warn!("Rebuilding the bloom with space for {} accounts when the DB contains {} keys is not a good idea: the bloom filter will be saturated right away.", + account_count, num_keys + ); + } clear_bloom(source.clone())?; let mut empty_accounts = 0u64; let mut non_empty_accounts = 0u64; let mut bloom = { - let mut bloom = Bloom::new(ACCOUNT_BLOOM_SPACE, DEFAULT_ACCOUNT_PRESET); + let mut bloom = Bloom::new_for_fp_rate(account_count, 0.01); let state_db = journaldb::new( source.clone(), // It does not matter which `journaldb::Algorithm` is used since @@ -245,9 +254,11 @@ fn generate_bloom( let bloom_journal = bloom.drain_journal(); info!(target: "migration", "Generated {} bloom entries; the DB has {} empty accounts and {} non-empty accounts", bloom_journal.entries.len(), empty_accounts, non_empty_accounts); - info!(target: "migration", "New bloom has {} k_bits (aka 'hash functions')", bloom_journal.hash_functions); + info!(target: "migration", "New bloom has {} k_bits (aka 'hash functions') and a bitmap size of {} bits", bloom_journal.hash_functions, bloom.number_of_bits()); let mut batch = DBTransaction::new(); StateDB::commit_bloom(&mut batch, bloom_journal)?; + // Write the size of the bloom we just built to the db so we can load&rebuild the bloom at startup + batch.put(COL_ACCOUNT_BLOOM, 
ACCOUNTS_BLOOM_ITEM_COUNT_KEY, &account_count.to_le_bytes()); source.write(batch)?; source.flush()?; info!(target: "migration", "Finished bloom update for chain at #{}", best_block); diff --git a/util/bloom/src/lib.rs b/util/bloom/src/lib.rs index 35aab538b3b..133eff7c79e 100644 --- a/util/bloom/src/lib.rs +++ b/util/bloom/src/lib.rs @@ -17,6 +17,7 @@ use std::{cmp, mem, f64}; use std::hash::{Hash, Hasher}; use std::collections::HashSet; +use std::f64::consts::LN_2; use siphasher::sip::SipHasher; /// BitVec structure with journalling @@ -38,7 +39,7 @@ impl BitVecJournal { pub fn from_parts(parts: &[u64]) -> BitVecJournal { BitVecJournal { - elems: parts.to_vec(), + elems: parts.to_vec(), // todo[dvdplm] looks like a clone we could get rid of journal: HashSet::new(), } } @@ -78,9 +79,9 @@ impl Bloom { /// Create a new bloom filter structure. /// bitmap_size is the size in bytes (not bits) that will be allocated in memory /// items_count is an estimation of the maximum number of items to store. - pub fn new(bitmap_size: usize, items_count: usize) -> Bloom { + pub fn new(bitmap_size: u64, items_count: u64) -> Bloom { assert!(bitmap_size > 0 && items_count > 0); - let bitmap_bits = (bitmap_size as u64) * 8u64; + let bitmap_bits = bitmap_size * 8; let k_num = Bloom::optimal_k_num(bitmap_bits, items_count); let bitmap = BitVecJournal::new(bitmap_bits as usize); Bloom { @@ -91,6 +92,9 @@ impl Bloom { } /// Initializes bloom filter from saved state + // todo[dvdplm] we should not need to pass in `k_num` here – it's a + // deterministic function of the item count in the bloom so we should be + // able to store one of the two only. pub fn from_parts(parts: &[u64], k_num: u32) -> Bloom { let bitmap_size = parts.len() * 8; let bitmap_bits = (bitmap_size as u64) * 8u64; @@ -105,20 +109,20 @@ impl Bloom { /// Create a new bloom filter structure. /// items_count is an estimation of the maximum number of items to store. 
/// fp_p is the wanted rate of false positives, in ]0.0, 1.0[ - pub fn new_for_fp_rate(items_count: usize, fp_p: f64) -> Bloom { + pub fn new_for_fp_rate(items_count: u64, fp_p: f64) -> Bloom { let bitmap_size = Bloom::compute_bitmap_size(items_count, fp_p); Bloom::new(bitmap_size, items_count) } - /// Compute a recommended bitmap size for items_count items + /// Compute a recommended Bloom bitmap size in bytes for `items_count` items /// and a fp_p rate of false positives. - /// fp_p obviously has to be within the ]0.0, 1.0[ range. - pub fn compute_bitmap_size(items_count: usize, fp_p: f64) -> usize { + /// `fp_p` obviously has to be within the ]0.0, 1.0[ range. + pub fn compute_bitmap_size(items_count: u64, fp_p: f64) -> u64 { assert!(items_count > 0); assert!(fp_p > 0.0 && fp_p < 1.0); - let log2 = f64::consts::LN_2; - let log2_2 = log2 * log2; - ((items_count as f64) * f64::ln(fp_p) / (-8.0 * log2_2)).ceil() as usize + let bitmap_size = ((items_count as f64) * f64::ln(fp_p) / (-8.0 * LN_2 * LN_2)).ceil() as u64; + // Round up to nearest multiple of 8 because we need to use this to index u64s + ((bitmap_size + 7) / 8) * 8 } /// Records the presence of an item. @@ -157,10 +161,12 @@ impl Bloom { self.k_num } - fn optimal_k_num(bitmap_bits: u64, items_count: usize) -> u32 { + /// The optimal number of hash functions for a given bitmap size and item + /// count is calculated as the bits-per-item * ln(2). 
+ fn optimal_k_num(bitmap_bits: u64, items_count: u64) -> u32 { let m = bitmap_bits as f64; let n = items_count as f64; - let k_num = (m / n * f64::ln(2.0f64)).ceil() as u32; + let k_num = (m / n * LN_2).ceil() as u32; cmp::max(k_num, 1) } @@ -237,6 +243,17 @@ mod tests { assert!(full >= 0.0039f64 && full <= 0.004f64); } + #[test] + fn test_compute_bitmap_size() { + use std::f64::consts::LN_2; + let bitmap_size = Bloom::compute_bitmap_size(10_000_000, 0.01); + // ~12Mbytes + let expected_size_in_bits = (-(10_000_000 as f64 * f64::ln(0.01)) / ( LN_2 * LN_2)).ceil() as u64; + assert_eq!(bitmap_size, expected_size_in_bits / 8); + let bloom = Bloom::new( bitmap_size,10_000_000); + assert_eq!(bloom.number_of_hash_functions(), 7); + } + #[test] fn hash_backward_compatibility_for_new() { let ss = vec!["you", "should", "not", "break", "hash", "backward", "compatibility"]; From a5f1f15c3349d305f05d76102141f198c7d7f05c Mon Sep 17 00:00:00 2001 From: David Palm Date: Thu, 26 Mar 2020 00:13:27 +0100 Subject: [PATCH 11/18] fix tests --- parity/cli/mod.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/parity/cli/mod.rs b/parity/cli/mod.rs index 4e11cfbdec9..dc926f0d6e8 100644 --- a/parity/cli/mod.rs +++ b/parity/cli/mod.rs @@ -1724,6 +1724,7 @@ mod tests { arg_snapshot_file: None, arg_restore_file: None, arg_db_rebuild_accounts_bloom_backup_path: None, + arg_db_rebuild_accounts_bloom_account_count: 123u64, arg_db_restore_accounts_bloom_backup_path: "".to_string(), arg_tools_hash_file: None, From 4c9f2505d5df112f2e650287b95fc67866317eef Mon Sep 17 00:00:00 2001 From: David Palm Date: Thu, 26 Mar 2020 09:45:37 +0100 Subject: [PATCH 12/18] Ensure bloom size is written to the db on creation --- ethcore/state-db/src/lib.rs | 7 ++++++- parity/cli/mod.rs | 2 +- util/bloom/src/lib.rs | 4 +++- 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/ethcore/state-db/src/lib.rs b/ethcore/state-db/src/lib.rs index aa5c55c0030..1962aeb4b96 100644 --- a/ethcore/state-db/src/lib.rs 
+++ b/ethcore/state-db/src/lib.rs @@ -175,7 +175,12 @@ impl StateDB { }); let bloom_hash_functions = match bloom_hash_functions { Some(nr) => nr, - None => return Bloom::new_for_fp_rate(ACCOUNTS_BLOOM_ITEM_COUNT, ACCOUNTS_BLOOM_FP_RATE), + None => { + let mut batch = DBTransaction::new(); + batch.put(COL_ACCOUNT_BLOOM, ACCOUNTS_BLOOM_ITEM_COUNT_KEY, &ACCOUNTS_BLOOM_ITEM_COUNT.to_le_bytes()); + db.write(batch).expect("Low-level database error"); + return Bloom::new_for_fp_rate(ACCOUNTS_BLOOM_ITEM_COUNT, ACCOUNTS_BLOOM_FP_RATE) + }, }; let item_count = db.get(COL_ACCOUNT_BLOOM, ACCOUNTS_BLOOM_ITEM_COUNT_KEY) diff --git a/parity/cli/mod.rs b/parity/cli/mod.rs index dc926f0d6e8..566389faf16 100644 --- a/parity/cli/mod.rs +++ b/parity/cli/mod.rs @@ -1724,7 +1724,7 @@ mod tests { arg_snapshot_file: None, arg_restore_file: None, arg_db_rebuild_accounts_bloom_backup_path: None, - arg_db_rebuild_accounts_bloom_account_count: 123u64, + arg_db_rebuild_accounts_bloom_account_count: 100000000u64, arg_db_restore_accounts_bloom_backup_path: "".to_string(), arg_tools_hash_file: None, diff --git a/util/bloom/src/lib.rs b/util/bloom/src/lib.rs index 133eff7c79e..23ddf16af82 100644 --- a/util/bloom/src/lib.rs +++ b/util/bloom/src/lib.rs @@ -249,7 +249,9 @@ mod tests { let bitmap_size = Bloom::compute_bitmap_size(10_000_000, 0.01); // ~12Mbytes let expected_size_in_bits = (-(10_000_000 as f64 * f64::ln(0.01)) / ( LN_2 * LN_2)).ceil() as u64; - assert_eq!(bitmap_size, expected_size_in_bits / 8); + // rounded up to nearest multiple of 8 + let expected_size_in_bytes = (((expected_size_in_bits / 8) + 7) / 8) * 8; + assert_eq!(bitmap_size, expected_size_in_bytes); let bloom = Bloom::new( bitmap_size,10_000_000); assert_eq!(bloom.number_of_hash_functions(), 7); } From 4f7f4f3cd59b1cbbf63da3fe64d84fa43edc7909 Mon Sep 17 00:00:00 2001 From: David Palm Date: Thu, 26 Mar 2020 15:39:12 +0100 Subject: [PATCH 13/18] resolve some todos docs don't use stored k_num (wip) --- Cargo.toml | 2 +- 
ethcore/state-db/src/lib.rs | 5 +++-- parity/rebuild_accounts_bloom.rs | 14 +++++++----- util/bloom/src/lib.rs | 38 +++++++++++++++----------------- 4 files changed, 31 insertions(+), 28 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index d8f2d2eecd4..34c2ddfde5c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -79,7 +79,7 @@ state-db = { path = "ethcore/state-db" } term_size = "0.3" textwrap = "0.9" toml = "0.5.6" -trie-db = "0.20.0" # todo[dvdplm] maybe not? +trie-db = "0.20.0" verification = { path = "ethcore/verification" } [build-dependencies] diff --git a/ethcore/state-db/src/lib.rs b/ethcore/state-db/src/lib.rs index 1962aeb4b96..675b9da7a11 100644 --- a/ethcore/state-db/src/lib.rs +++ b/ethcore/state-db/src/lib.rs @@ -167,6 +167,7 @@ impl StateDB { /// Loads accounts bloom from the database /// This bloom is used to quickly handle requests for non-existent accounts. pub fn load_bloom(db: &dyn KeyValueDB) -> Bloom { + // todo[dvdplm] Now this isn't needed anymore – how to handle new DBs? and legacy DBs? 
let bloom_hash_functions = db.get(COL_ACCOUNT_BLOOM, ACCOUNTS_BLOOM_HASHCOUNT_KEY) .expect("Low-level database error") .and_then(|bytes| { @@ -210,8 +211,8 @@ impl StateDB { .unwrap_or(0u64); } - let bloom = Bloom::from_parts(&bloom_parts, bloom_hash_functions); - debug!(target: "accounts_bloom", "Bloom saturation: {:?}, hash functions: {:?}, bitmap size: {} bits", bloom.saturation(), bloom_hash_functions, bloom.number_of_bits()); + let bloom = Bloom::from_parts(&bloom_parts, item_count); + debug!(target: "accounts_bloom", "Bloom saturation: {:?}, hash functions: {:?}, bitmap size: {} bits", bloom.saturation(), bloom.number_of_hash_functions(), bloom.number_of_bits()); bloom } diff --git a/parity/rebuild_accounts_bloom.rs b/parity/rebuild_accounts_bloom.rs index c37a3077de7..f64dc4a25b8 100644 --- a/parity/rebuild_accounts_bloom.rs +++ b/parity/rebuild_accounts_bloom.rs @@ -14,9 +14,13 @@ // You should have received a copy of the GNU General Public License // along with Open Ethereum. If not, see . -//! Resize the accounts bloom filter for modern times -//! todo[dvdplm] document the choice of parameters etc - +//! Resize the accounts bloom filter for modern times. ! The accounts bloom +//! filter provides a way to check if a given account (`Address`) exists or not +//! without touching the database. The filter cannot be resized with less than a +//! complete rebuild, i.e. iterate over all accounts in the state database and +//! mark each account in the bloom bitmap. At the time of writing the number of +//! ethereum accounts is ~85M and increasing. This module implements backing up, +//! clearing, rebuilding and restoring the accounts bloom filter. 
extern crate kvdb_rocksdb; extern crate state_db; @@ -75,7 +79,7 @@ pub fn rebuild_accounts_bloom>( backup_bloom(&backup_path, db.clone(), best_block)?; } - generate_bloom(db, account_count, state_root, best_block)?; + rebuild_bloom(db, account_count, state_root, best_block)?; Ok(()) } @@ -202,7 +206,7 @@ fn clear_bloom(db: Arc) -> Result<(), Error> { } /// Rebuild the account bloom. -fn generate_bloom( +fn rebuild_bloom( source: Arc, account_count: u64, state_root: H256, diff --git a/util/bloom/src/lib.rs b/util/bloom/src/lib.rs index 23ddf16af82..8fe03893034 100644 --- a/util/bloom/src/lib.rs +++ b/util/bloom/src/lib.rs @@ -77,12 +77,12 @@ pub struct Bloom { impl Bloom { /// Create a new bloom filter structure. - /// bitmap_size is the size in bytes (not bits) that will be allocated in memory - /// items_count is an estimation of the maximum number of items to store. - pub fn new(bitmap_size: u64, items_count: u64) -> Bloom { - assert!(bitmap_size > 0 && items_count > 0); + /// `bitmap_size` is the size in bytes (not bits) that will be allocated in memory + /// `items_count` is an estimation of the maximum number of items to store. + pub fn new(bitmap_size: u64, item_count: u64) -> Bloom { + assert!(bitmap_size > 0 && item_count > 0); let bitmap_bits = bitmap_size * 8; - let k_num = Bloom::optimal_k_num(bitmap_bits, items_count); + let k_num = Bloom::optimal_k_num(bitmap_bits, item_count); let bitmap = BitVecJournal::new(bitmap_bits as usize); Bloom { bitmap, @@ -92,13 +92,11 @@ impl Bloom { } /// Initializes bloom filter from saved state - // todo[dvdplm] we should not need to pass in `k_num` here – it's a - // deterministic function of the item count in the bloom so we should be - // able to store one of the two only. 
- pub fn from_parts(parts: &[u64], k_num: u32) -> Bloom { + pub fn from_parts(parts: &[u64], item_count: u64) -> Bloom { let bitmap_size = parts.len() * 8; let bitmap_bits = (bitmap_size as u64) * 8u64; let bitmap = BitVecJournal::from_parts(parts); + let k_num = Self::optimal_k_num(bitmap_bits, item_count); Bloom { bitmap, bitmap_bits, @@ -107,20 +105,20 @@ impl Bloom { } /// Create a new bloom filter structure. - /// items_count is an estimation of the maximum number of items to store. - /// fp_p is the wanted rate of false positives, in ]0.0, 1.0[ - pub fn new_for_fp_rate(items_count: u64, fp_p: f64) -> Bloom { - let bitmap_size = Bloom::compute_bitmap_size(items_count, fp_p); - Bloom::new(bitmap_size, items_count) + /// `item_count` is an estimation of the maximum number of items to store. + /// `fp_p` is the desired false positives rate, in ]0.0, 1.0[ + pub fn new_for_fp_rate(item_count: u64, fp_p: f64) -> Bloom { + let bitmap_size = Bloom::compute_bitmap_size(item_count, fp_p); + Bloom::new(bitmap_size, item_count) } /// Compute a recommended Bloom bitmap size in bytes for `items_count` items /// and a fp_p rate of false positives. /// `fp_p` obviously has to be within the ]0.0, 1.0[ range. - pub fn compute_bitmap_size(items_count: u64, fp_p: f64) -> u64 { - assert!(items_count > 0); + pub fn compute_bitmap_size(item_count: u64, fp_p: f64) -> u64 { + assert!(item_count > 0); assert!(fp_p > 0.0 && fp_p < 1.0); - let bitmap_size = ((items_count as f64) * f64::ln(fp_p) / (-8.0 * LN_2 * LN_2)).ceil() as u64; + let bitmap_size = ((item_count as f64) * f64::ln(fp_p) / (-8.0 * LN_2 * LN_2)).ceil() as u64; // Round up to nearest multiple of 8 because we need to use this to index u64s ((bitmap_size + 7) / 8) * 8 } @@ -162,10 +160,10 @@ impl Bloom { } /// The optimal number of hash functions for a given bitmap size and item - /// count is calculated as the bits-per-item * ln(2). 
- fn optimal_k_num(bitmap_bits: u64, items_count: u64) -> u32 { + /// count is calculated as `bits-per-item * ln(2)`. + fn optimal_k_num(bitmap_bits: u64, item_count: u64) -> u32 { let m = bitmap_bits as f64; - let n = items_count as f64; + let n = item_count as f64; let k_num = (m / n * LN_2).ceil() as u32; cmp::max(k_num, 1) } From 1508c94eea7b6916b858f19726e5acef7330e3f9 Mon Sep 17 00:00:00 2001 From: David Palm Date: Fri, 27 Mar 2020 23:09:40 +0100 Subject: [PATCH 14/18] Move the parts when restoring a bloom from the db (less cloning) --- ethcore/state-db/src/lib.rs | 4 ++-- util/bloom/src/lib.rs | 20 ++++++++++++-------- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/ethcore/state-db/src/lib.rs b/ethcore/state-db/src/lib.rs index 675b9da7a11..5da35d64022 100644 --- a/ethcore/state-db/src/lib.rs +++ b/ethcore/state-db/src/lib.rs @@ -203,7 +203,7 @@ impl StateDB { let key: [u8; 8] = i.to_le_bytes(); bloom_parts[i as usize] = db.get(COL_ACCOUNT_BLOOM, &key).expect("low-level database error") .map(|val| { - assert_eq!(val.len(), 8, "low-level database error"); + assert_eq!(val.len(), 8, "Expected a u64"); let mut buff = [0u8; 8]; buff.copy_from_slice(&*val); u64::from_le_bytes(buff) @@ -211,7 +211,7 @@ impl StateDB { .unwrap_or(0u64); } - let bloom = Bloom::from_parts(&bloom_parts, item_count); + let bloom = Bloom::from_parts(bloom_parts, item_count); debug!(target: "accounts_bloom", "Bloom saturation: {:?}, hash functions: {:?}, bitmap size: {} bits", bloom.saturation(), bloom.number_of_hash_functions(), bloom.number_of_bits()); bloom } diff --git a/util/bloom/src/lib.rs b/util/bloom/src/lib.rs index 8fe03893034..5f0daaac6ae 100644 --- a/util/bloom/src/lib.rs +++ b/util/bloom/src/lib.rs @@ -37,9 +37,9 @@ impl BitVecJournal { } } - pub fn from_parts(parts: &[u64]) -> BitVecJournal { + pub fn from_parts(parts: Vec) -> BitVecJournal { BitVecJournal { - elems: parts.to_vec(), // todo[dvdplm] looks like a clone we could get rid of + elems: parts, 
journal: HashSet::new(), } } @@ -92,7 +92,7 @@ impl Bloom { } /// Initializes bloom filter from saved state - pub fn from_parts(parts: &[u64], item_count: u64) -> Bloom { + pub fn from_parts(parts: Vec, item_count: u64) -> Bloom { let bitmap_size = parts.len() * 8; let bitmap_bits = (bitmap_size as u64) * 8u64; let bitmap = BitVecJournal::from_parts(parts); @@ -222,22 +222,26 @@ mod tests { #[test] fn journalling() { + // Set up bloom a with 512 bits and 120 estimated items stored; we'll get a `k` of 3… let initial = vec![0u64; 8]; - let mut bloom = Bloom::from_parts(&initial, 3); + let mut bloom = Bloom::from_parts(initial, 120); + // …which will cause this particular key… bloom.set(&vec![5u8, 4]); let drain = bloom.drain_journal(); - + // …to set one bit in two different entries. assert_eq!(2, drain.entries.len()) } #[test] fn saturation() { + // Set up bloom a with 512 bits and 120 estimated items stored; we'll get a `k` of 3… let initial = vec![0u64; 8]; - let mut bloom = Bloom::from_parts(&initial, 3); + let mut bloom = Bloom::from_parts(initial, 120); + // …which will cause this particular key to set one bit in two different entries. 
bloom.set(&vec![5u8, 4]); let full = bloom.saturation(); - // 2/8/64 = 0.00390625 + // 2 bits touched, over 8 entries where each entry has 64 bits, so 2/8/64 = 0.00390625 assert!(full >= 0.0039f64 && full <= 0.004f64); } @@ -272,7 +276,7 @@ mod tests { fn hash_backward_compatibility_for_from_parts() { let stored_state = vec![2094615114573771027u64, 244675582389208413u64]; let k_num = 12; - let bloom = Bloom::from_parts(&stored_state, k_num); + let bloom = Bloom::from_parts(stored_state, k_num); let ss = vec!["you", "should", "not", "break", "hash", "backward", "compatibility"]; let tt = vec!["this", "doesnot", "exist"]; From 22c8847d2e6c671ebd8202435cc43036137156c2 Mon Sep 17 00:00:00 2001 From: David Palm Date: Sat, 28 Mar 2020 22:57:01 +0100 Subject: [PATCH 15/18] Sort out legacy dbs Un-pubify bloom code cleanup --- ethcore/state-db/src/lib.rs | 108 +++++++++++++++++++----------------- util/bloom/src/lib.rs | 32 ++++++----- 2 files changed, 75 insertions(+), 65 deletions(-) diff --git a/ethcore/state-db/src/lib.rs b/ethcore/state-db/src/lib.rs index 5da35d64022..8e4032b5975 100644 --- a/ethcore/state-db/src/lib.rs +++ b/ethcore/state-db/src/lib.rs @@ -24,7 +24,7 @@ use ethereum_types::{Address, H256}; use hash_db::HashDB; use keccak_hash::keccak; use kvdb::{DBTransaction, DBValue, KeyValueDB}; -use log::{debug, trace}; +use log::{debug, trace, warn}; use lru_cache::LruCache; use parking_lot::Mutex; @@ -36,16 +36,10 @@ use journaldb::JournalDB; use keccak_hasher::KeccakHasher; use memory_cache::MemoryLruCache; -/// Value used to initialize bloom bitmap size. -/// -/// Bitmap size is the size in bytes (not bits) that will be allocated in memory. -// todo[dvdplm] deprecate this one -const LEGACY_ACCOUNTS_BLOOM_ITEM_COUNT: u64 = 1048576; - - -/// Value used to initialize bloom items count. +/// Value used to initialize the bloom items count for new DBs /// /// Items count is an estimation of the maximum number of items to store. 
+// todo[dvdplm] Determine the best value here. Should probably be twice as big. pub const ACCOUNTS_BLOOM_ITEM_COUNT: u64 = 100_000_000; /// False positive rate for the accounts bloom filter: 1 in 100. pub const ACCOUNTS_BLOOM_FP_RATE: f64 = 0.01; @@ -59,6 +53,8 @@ const STATE_CACHE_BLOCKS: usize = 12; // The percentage of supplied cache size to go to accounts. const ACCOUNT_CACHE_RATIO: usize = 90; +const DB_ERROR: &'static str = "Low-level database error"; + /// Shared canonical state cache. struct AccountCache { /// DB Account cache. `None` indicates that account is known to be missing. @@ -164,44 +160,11 @@ impl StateDB { } } - /// Loads accounts bloom from the database - /// This bloom is used to quickly handle requests for non-existent accounts. - pub fn load_bloom(db: &dyn KeyValueDB) -> Bloom { - // todo[dvdplm] Now this isn't needed anymore – how to handle new DBs? and legacy DBs? - let bloom_hash_functions = db.get(COL_ACCOUNT_BLOOM, ACCOUNTS_BLOOM_HASHCOUNT_KEY) - .expect("Low-level database error") - .and_then(|bytes| { - assert_eq!(bytes.len(), 1); - Some(bytes[0] as u32) - }); - let bloom_hash_functions = match bloom_hash_functions { - Some(nr) => nr, - None => { - let mut batch = DBTransaction::new(); - batch.put(COL_ACCOUNT_BLOOM, ACCOUNTS_BLOOM_ITEM_COUNT_KEY, &ACCOUNTS_BLOOM_ITEM_COUNT.to_le_bytes()); - db.write(batch).expect("Low-level database error"); - return Bloom::new_for_fp_rate(ACCOUNTS_BLOOM_ITEM_COUNT, ACCOUNTS_BLOOM_FP_RATE) - }, - }; - - let item_count = db.get(COL_ACCOUNT_BLOOM, ACCOUNTS_BLOOM_ITEM_COUNT_KEY) - .expect("Low-level database error") - .and_then(|bytes| { - assert_eq!(bytes.len(), 8, "Expected a u64"); - let mut buf = [0u8; 8]; - buf.copy_from_slice(&*bytes); - trace!(target: "accounts_bloom", "DB had a value under 'accounts_bloom_item_count': {:?} (as u64: {})", &bytes, u64::from_le_bytes(buf)); - Some(u64::from_le_bytes(buf)) - }) - // Assume this is an old bloom - .unwrap_or(LEGACY_ACCOUNTS_BLOOM_ITEM_COUNT); - 
debug!(target: "accounts_bloom", "Accounts bloom is sized for {} entries", item_count); - - let bitmap_size = Bloom::compute_bitmap_size(item_count, ACCOUNTS_BLOOM_FP_RATE); + fn fetch_bloom_parts(db: &dyn KeyValueDB, bitmap_size: u64) -> Vec { let mut bloom_parts = vec![0u64; (bitmap_size / 8) as usize]; - for i in 0..bitmap_size / 8 { + for i in 0..bloom_parts.len() { let key: [u8; 8] = i.to_le_bytes(); - bloom_parts[i as usize] = db.get(COL_ACCOUNT_BLOOM, &key).expect("low-level database error") + bloom_parts[i as usize] = db.get(COL_ACCOUNT_BLOOM, &key).expect(DB_ERROR) .map(|val| { assert_eq!(val.len(), 8, "Expected a u64"); let mut buff = [0u8; 8]; @@ -210,17 +173,58 @@ impl StateDB { }) .unwrap_or(0u64); } + bloom_parts + } + + /// Loads accounts bloom from the database + /// This bloom is used to quickly handle requests for non-existent accounts. + pub fn load_bloom(db: &dyn KeyValueDB) -> Bloom { + let (bloom, item_count) = + if db.get(COL_ACCOUNT_BLOOM, ACCOUNTS_BLOOM_HASHCOUNT_KEY) + .expect(DB_ERROR) + .is_some() { + // The legacy values for bitmap size and hash function count + // (ACCOUNT_BLOOM_SPACE, DEFAULT_ACCOUNT_PRESET) are not + // optimal, so we can't calculate them. 
+ let parts = Self::fetch_bloom_parts(db, 1048576); + (Bloom::from_parts_legacy(parts, 6), 1_000_000) + } else { + let item_count = + db.get(COL_ACCOUNT_BLOOM, ACCOUNTS_BLOOM_ITEM_COUNT_KEY) + .expect(DB_ERROR) + .and_then(|bytes| { + assert_eq!(bytes.len(), 8, "Expected a u64"); + let mut buf = [0u8; 8]; + buf.copy_from_slice(&*bytes); + let val = u64::from_le_bytes(buf); + trace!(target: "accounts_bloom", "DB has a value under 'accounts_bloom_item_count': {}", val); + Some(val) + }) + // Assume this is a new DB + .unwrap_or_else(|| { + trace!(target: "accounts_bloom", "New database, building default bloom with space for {} accounts", ACCOUNTS_BLOOM_ITEM_COUNT); + let mut tx = DBTransaction::new(); + tx.put(COL_ACCOUNT_BLOOM, ACCOUNTS_BLOOM_ITEM_COUNT_KEY, &ACCOUNTS_BLOOM_ITEM_COUNT.to_le_bytes()); + db.write(tx).expect(DB_ERROR); + ACCOUNTS_BLOOM_ITEM_COUNT + }); + let bitmap_size = Bloom::compute_bitmap_size(item_count, ACCOUNTS_BLOOM_FP_RATE); + let parts = Self::fetch_bloom_parts(db, bitmap_size); + (Bloom::from_parts(parts, item_count), item_count) + }; + + debug!(target: "accounts_bloom", "Bloom saturation: {:?}, hash functions: {:?}, bitmap size: {} bits", + bloom.saturation(), bloom.number_of_hash_functions(), bloom.number_of_bits()); + if bloom.saturation() > 0.9 { + warn!("Your accounts bloom is almost full ({}). Please rebuild it with more space. 
Your current filter uses {} bits and was built for {} accounts.", + bloom.saturation(), bloom.number_of_bits(), item_count); + } - let bloom = Bloom::from_parts(bloom_parts, item_count); - debug!(target: "accounts_bloom", "Bloom saturation: {:?}, hash functions: {:?}, bitmap size: {} bits", bloom.saturation(), bloom.number_of_hash_functions(), bloom.number_of_bits()); bloom } /// Commit blooms journal to the database transaction pub fn commit_bloom(batch: &mut DBTransaction, journal: BloomJournal) -> io::Result<()> { - assert!(journal.hash_functions <= 255); - batch.put(COL_ACCOUNT_BLOOM, ACCOUNTS_BLOOM_HASHCOUNT_KEY, &[journal.hash_functions as u8]); - for (bloom_part_index, bloom_part_value) in journal.entries { let key: [u8; 8] = (bloom_part_index as u64).to_le_bytes(); let val: [u8; 8] = bloom_part_value.to_le_bytes(); @@ -490,13 +494,13 @@ impl account_state::Backend for StateDB { fn note_non_null_account(&self, address: &Address) { trace!(target: "accounts_bloom", "Note account bloom: {:?}", address); let mut bloom = self.account_bloom.lock(); - bloom.set(keccak(address)); + bloom.set(keccak(address).as_bytes()); } fn is_known_null(&self, address: &Address) -> bool { trace!(target: "accounts_bloom", "Check account bloom: {:?}", address); let bloom = self.account_bloom.lock(); - let is_null = !bloom.check(keccak(address)); + let is_null = !bloom.check(keccak(address).as_bytes()); is_null } } diff --git a/util/bloom/src/lib.rs b/util/bloom/src/lib.rs index 5f0daaac6ae..fa00a3ca0db 100644 --- a/util/bloom/src/lib.rs +++ b/util/bloom/src/lib.rs @@ -29,7 +29,7 @@ struct BitVecJournal { } impl BitVecJournal { - pub fn new(size: usize) -> BitVecJournal { + fn new(size: usize) -> BitVecJournal { let extra = if size % 64 > 0 { 1 } else { 0 }; BitVecJournal { elems: vec![0u64; size / 64 + extra], @@ -37,14 +37,14 @@ impl BitVecJournal { } } - pub fn from_parts(parts: Vec) -> BitVecJournal { + fn from_parts(parts: Vec) -> BitVecJournal { BitVecJournal { elems: parts, 
journal: HashSet::new(), } } - pub fn set(&mut self, index: usize) { + fn set(&mut self, index: usize) { let e_index = index / 64; let bit_index = index % 64; let val = self.elems.get_mut(e_index).unwrap(); @@ -52,18 +52,18 @@ impl BitVecJournal { self.journal.insert(e_index); } - pub fn get(&self, index: usize) -> bool { + fn get(&self, index: usize) -> bool { let e_index = index / 64; let bit_index = index % 64; self.elems[e_index] & (1 << bit_index) != 0 } - pub fn drain(&mut self) -> Vec<(usize, u64)> { + fn drain(&mut self) -> Vec<(usize, u64)> { let journal = mem::replace(&mut self.journal, HashSet::new()).into_iter(); journal.map(|idx| (idx, self.elems[idx])).collect::>() } - pub fn saturation(&self) -> f64 { + fn saturation(&self) -> f64 { self.elems.iter().fold(0u64, |acc, e| acc + e.count_ones() as u64) as f64 / (self.elems.len() * 64) as f64 } } @@ -79,7 +79,7 @@ impl Bloom { /// Create a new bloom filter structure. /// `bitmap_size` is the size in bytes (not bits) that will be allocated in memory /// `items_count` is an estimation of the maximum number of items to store. - pub fn new(bitmap_size: u64, item_count: u64) -> Bloom { + fn new(bitmap_size: u64, item_count: u64) -> Bloom { assert!(bitmap_size > 0 && item_count > 0); let bitmap_bits = bitmap_size * 8; let k_num = Bloom::optimal_k_num(bitmap_bits, item_count); @@ -91,17 +91,23 @@ impl Bloom { } } - /// Initializes bloom filter from saved state + /// The legacy accounts bloom filter used non-optimal parameters that cannot + /// be calculated with the facilities in this crate, hence this method that + /// allows the instantiation of a non-optimal filter so that older databases + /// can continue to work. DO NOT USE FOR OTHER PURPOSES. 
+ pub fn from_parts_legacy(parts: Vec, k_num: u32) -> Bloom { + let bitmap_bits = parts.len() as u64 * 64 ; + let bitmap = BitVecJournal::from_parts(parts); + Bloom { bitmap, bitmap_bits, k_num } + } + + /// Initializes a bloom filter from saved state pub fn from_parts(parts: Vec, item_count: u64) -> Bloom { let bitmap_size = parts.len() * 8; let bitmap_bits = (bitmap_size as u64) * 8u64; let bitmap = BitVecJournal::from_parts(parts); let k_num = Self::optimal_k_num(bitmap_bits, item_count); - Bloom { - bitmap, - bitmap_bits, - k_num, - } + Bloom { bitmap, bitmap_bits, k_num } } /// Create a new bloom filter structure. From 599cc7f3295b8f9be1d1c75e43d2c2743bb0ce53 Mon Sep 17 00:00:00 2001 From: David Palm Date: Sun, 29 Mar 2020 20:06:06 +0200 Subject: [PATCH 16/18] Speed up fetching bloom parts from the DB --- ethcore/state-db/src/lib.rs | 35 ++++++++++++++++++++++++++--------- 1 file changed, 26 insertions(+), 9 deletions(-) diff --git a/ethcore/state-db/src/lib.rs b/ethcore/state-db/src/lib.rs index 8e4032b5975..6448cec3c02 100644 --- a/ethcore/state-db/src/lib.rs +++ b/ethcore/state-db/src/lib.rs @@ -161,18 +161,35 @@ impl StateDB { } fn fetch_bloom_parts(db: &dyn KeyValueDB, bitmap_size: u64) -> Vec { - let mut bloom_parts = vec![0u64; (bitmap_size / 8) as usize]; - for i in 0..bloom_parts.len() { - let key: [u8; 8] = i.to_le_bytes(); - bloom_parts[i as usize] = db.get(COL_ACCOUNT_BLOOM, &key).expect(DB_ERROR) - .map(|val| { - assert_eq!(val.len(), 8, "Expected a u64"); + let nr_parts = bitmap_size / 8; + let mut bloom_parts = vec![0u64; nr_parts as usize]; + trace!(target: "accounts_bloom]", "Fething bloom from disk. 
bitmap_size={}, nr_parts={}", bitmap_size, nr_parts); + + let start = std::time::Instant::now(); + for (k, v) in db.iter(COL_ACCOUNT_BLOOM) { + // The only keys in the `COL_ACCOUNT_BLOOM` that are not `u64`s are + // the two keys where we store the number of hash functions for + // legacy blooms (ACCOUNTS_BLOOM_HASHCOUNT_KEY) and the number of + // estimated items for the bloom (ACCOUNTS_BLOOM_ITEM_COUNT_KEY). + if k.len() == 8 { + let part_idx = { let mut buff = [0u8; 8]; - buff.copy_from_slice(&*val); + buff.copy_from_slice(&*k); u64::from_le_bytes(buff) - }) - .unwrap_or(0u64); + }; + bloom_parts[part_idx as usize] = { + let mut buff = [0u8; 8]; + buff.copy_from_slice(&*v); + u64::from_le_bytes(buff) + }; + } else { + assert!( + &*k == ACCOUNTS_BLOOM_HASHCOUNT_KEY || &*k == ACCOUNTS_BLOOM_ITEM_COUNT_KEY, + "Expect the DB to contain `u64`s or the above two keys – corrupt db?" + ) + } } + debug!(target: "accounts_bloom", "Fetched the bloom from the DB in {:?}. bloom_parts.len={}", start.elapsed(), bloom_parts.len()); bloom_parts } From ccec3c5ad5a47ed4ad691edcc8ea5ac6b4bb00b7 Mon Sep 17 00:00:00 2001 From: David Palm Date: Sun, 29 Mar 2020 21:35:30 +0200 Subject: [PATCH 17/18] Multithreaded blooms rebuilder --- Cargo.lock | 1 + Cargo.toml | 1 + parity/rebuild_accounts_bloom.rs | 135 ++++++++++++++++++++++--------- 3 files changed, 97 insertions(+), 40 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d5e9017cac8..e8fe97aa2c1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3184,6 +3184,7 @@ dependencies = [ "cli-signer", "client-traits", "common-types", + "crossbeam-utils 0.7.2", "ctrlc", "dir", "docopt", diff --git a/Cargo.toml b/Cargo.toml index 34c2ddfde5c..44ba879ad22 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,6 +15,7 @@ clap = "2" cli-signer= { path = "cli-signer" } client-traits = { path = "ethcore/client-traits" } common-types = { path = "ethcore/types" } +crossbeam-utils = "0.7.2" ctrlc = { git = "https://github.com/paritytech/rust-ctrlc.git" } dir 
= { path = "util/dir" } docopt = "1.0" diff --git a/parity/rebuild_accounts_bloom.rs b/parity/rebuild_accounts_bloom.rs index f64dc4a25b8..82725a7781f 100644 --- a/parity/rebuild_accounts_bloom.rs +++ b/parity/rebuild_accounts_bloom.rs @@ -22,28 +22,31 @@ //! ethereum accounts is ~85M and increasing. This module implements backing up, //! clearing, rebuilding and restoring the accounts bloom filter. -extern crate kvdb_rocksdb; -extern crate state_db; -extern crate patricia_trie_ethereum as ethtrie; extern crate account_state; extern crate ethcore_bloom_journal as accounts_bloom; +extern crate kvdb_rocksdb; +extern crate patricia_trie_ethereum as ethtrie; +extern crate state_db; extern crate trie_db; +extern crate crossbeam_utils; use std::{ path::Path, - sync::Arc, + sync::{Arc, atomic::{AtomicU64, Ordering}}, }; use ethcore_db::{COL_EXTRA, COL_HEADERS, COL_STATE, COL_ACCOUNT_BLOOM}; use ethereum_types::{H256, U256}; use journaldb; use kvdb::DBTransaction; +use parking_lot::Mutex; use self::{ account_state::account::Account as StateAccount, accounts_bloom::Bloom, // todo[dvdplm] rename this crate + crossbeam_utils::thread, ethtrie::TrieDB, kvdb_rocksdb::{CompactionProfile, Database, DatabaseConfig}, - state_db::StateDB, + state_db::{StateDB, ACCOUNTS_BLOOM_ITEM_COUNT_KEY}, trie_db::Trie, }; use types::{ @@ -52,7 +55,6 @@ use types::{ views::{HeaderView, ViewRlp}, }; use rlp::{RlpStream, Rlp}; -use self::state_db::ACCOUNTS_BLOOM_ITEM_COUNT_KEY; pub fn rebuild_accounts_bloom>( db_path: P, @@ -154,7 +156,7 @@ fn backup_bloom>( use std::io::Write; let written = bloom_backup.write(&stream.out())?; - info!("Saved old bloom as of block#{} to '{}' ({} bytes, {} keys)", best_block, bloom_backup_path.as_ref().display(), written, num_keys); + info!("Saved old bloom as of block #{} to '{}' ({} bytes, {} keys)", best_block, bloom_backup_path.as_ref().display(), written, num_keys); Ok(()) } @@ -213,7 +215,7 @@ fn rebuild_bloom( best_block: BlockNumber, ) -> Result<(), Error> { 
let num_keys = source.num_keys(COL_STATE)? / 2; - info!(target: "migration", "Account bloom rebuild started for chain at #{}. There are {} accounts in the DB", best_block, num_keys); + info!(target: "migration", "Accounts bloom rebuild started for chain at #{}. There are {} accounts in the DB", best_block, num_keys); if account_count <= num_keys { warn!("Rebuilding the bloom with space for {} accounts when the DB contains {} keys is not a good idea: the bloom filter will be saturated right away.", account_count, num_keys @@ -221,43 +223,95 @@ fn rebuild_bloom( } clear_bloom(source.clone())?; - let mut empty_accounts = 0u64; - let mut non_empty_accounts = 0u64; - let mut bloom = { - let mut bloom = Bloom::new_for_fp_rate(account_count, 0.01); - let state_db = journaldb::new( - source.clone(), - // It does not matter which `journaldb::Algorithm` is used since - // there will be no writes to the state column. - journaldb::Algorithm::OverlayRecent, - COL_STATE); + // Progress counters + let empty_accounts = Arc::new(AtomicU64::new(0)); + let non_empty_accounts = Arc::new(AtomicU64::new(0)); + let total_accounts = Arc::new(AtomicU64::new(0)); + + let state_db = journaldb::new( + source.clone(), + // It does not matter which `journaldb::Algorithm` is used since + // there will be no writes to the state column. + journaldb::Algorithm::OverlayRecent, + COL_STATE); - let db = state_db.as_hash_db(); - let account_trie = TrieDB::new(&db, &state_root)?; - // Don't insert empty accounts into the bloom - let empty_account_rlp = StateAccount::new_basic(U256::zero(), U256::zero()).rlp(); - let start = std::time::Instant::now(); - let mut batch_start = std::time::Instant::now(); - for (n, (account_key, account_data)) in account_trie.iter()?.filter_map(Result::ok).enumerate() { - if n > 0 && n % 50_000 == 0 { - info!(" Accounts processed: {} in {:?}. 
Bloom saturation: {}", n, batch_start.elapsed(), bloom.saturation()); - batch_start = std::time::Instant::now(); - } - if account_data != empty_account_rlp { - non_empty_accounts += 1; - let account_key_hash = H256::from_slice(&account_key); - bloom.set(account_key_hash); - } else { - empty_accounts += 1; - } + let db = state_db.as_hash_db(); + let start = std::time::Instant::now(); + + let threads = 6; + // Chunk up the state in this many parts; each thread will be assigned one part at a time. + const STATE_SUBPARTS: usize = 16; + let bloom_result = thread::scope(|scope| -> Result>, Error> { + let bloom = Bloom::new_for_fp_rate(account_count, 0.01); + let bloom = Arc::new(Mutex::new(bloom)); + for thr_idx in 0..threads { + let tb = scope.builder().name(format!("accounts worker #{}", thr_idx).to_string()); + let my_bloom = bloom.clone(); + let my_total_accounts = total_accounts.clone(); + let my_non_empty_accounts = non_empty_accounts.clone(); + let my_empty_accounts = empty_accounts.clone(); + tb.spawn(move |_| -> Result<(), Error> { + let mut part_start = std::time::Instant::now(); + // Don't insert empty accounts into the bloom + let empty_account_rlp = StateAccount::new_basic(U256::zero(), U256::zero()).rlp(); + for part in (thr_idx..STATE_SUBPARTS).step_by(threads) { + info!("Processing part {} of the accounts in thread {}", part, thr_idx); + let account_trie = TrieDB::new(&db, &state_root)?; + let mut account_iter = account_trie.iter()?; + + // Seek to the start of this data segment + let mut seek_from = vec![0; 32]; + seek_from[0] = (part * STATE_SUBPARTS) as u8; + account_iter.seek(&seek_from)?; + // Set the upper-bound for this section of the data (but let the last part finish the whole range). 
+ let seek_to = + if part < STATE_SUBPARTS - 1 { + Some(((part + 1) * STATE_SUBPARTS) as u8) + } else { + None + }; + let mut batch_start = std::time::Instant::now(); + for (n, (account_key, account_data)) in account_iter.filter_map(Result::ok).enumerate() { + if seek_to.map_or(false, |seek_to| account_key[0] >= seek_to) { + my_total_accounts.fetch_add(n as u64, Ordering::Relaxed); + let sat = my_bloom.lock().saturation(); + info!(" {} accounts processed in {:?} – end of part {} by thread {}. Bloom saturation: {}", n, part_start.elapsed(), part, thr_idx, sat); + part_start = std::time::Instant::now(); + + break; + } + if n > 0 && n % 50_000 == 0 { + info!(" Accounts processed: {} in {:?} by thread {}", n, batch_start.elapsed(), thr_idx); + batch_start = std::time::Instant::now(); + } + if account_data != empty_account_rlp { + my_bloom.lock().set(&account_key); + my_non_empty_accounts.fetch_add(1, Ordering::Relaxed); + } else { + my_empty_accounts.fetch_add(1, Ordering::Relaxed); + } + } + } + Ok(()) + })?; } - info!("Finished iterating over the accounts as of block #{} in: {:?}. Bloom saturation: {}", best_block, start.elapsed(), bloom.saturation()); - bloom - }; + Ok(bloom) + }); + let bloom = match bloom_result { + Ok(bloom_arc) => bloom_arc?, + Err(e) => { + warn!("One of the bloom-building threads panicked: {:?}", e); + return Err("One of the bloom-building threads panicked".into()) + } + }; + let mut bloom = bloom.lock(); + info!("Finished iterating over {} accounts as of block #{} in: {:?}. 
Bloom saturation: {}", + total_accounts.load(Ordering::Relaxed), best_block, start.elapsed(), bloom.saturation()); let bloom_journal = bloom.drain_journal(); - info!(target: "migration", "Generated {} bloom entries; the DB has {} empty accounts and {} non-empty accounts", bloom_journal.entries.len(), empty_accounts, non_empty_accounts); + info!(target: "migration", "Generated {} bloom entries; the DB has {} empty accounts and {} non-empty accounts", + bloom_journal.entries.len(), empty_accounts.load(Ordering::Relaxed), non_empty_accounts.load(Ordering::Relaxed)); info!(target: "migration", "New bloom has {} k_bits (aka 'hash functions') and a bitmap size of {} bits", bloom_journal.hash_functions, bloom.number_of_bits()); let mut batch = DBTransaction::new(); StateDB::commit_bloom(&mut batch, bloom_journal)?; @@ -266,5 +320,6 @@ fn rebuild_bloom( source.write(batch)?; source.flush()?; info!(target: "migration", "Finished bloom update for chain at #{}", best_block); + Ok(()) } From c14667c4b9603640631fdb7c16375bcdf6e77998 Mon Sep 17 00:00:00 2001 From: David Palm Date: Tue, 31 Mar 2020 22:23:01 +0200 Subject: [PATCH 18/18] Use all available CPUs to rebuild the bloom Don't allow bloom loading to panic the client: check bounds Report on state memory used while importing blocks from disk Measure time elapsed for actual import when importing from disk --- Cargo.lock | 14 ++++++++++++-- Cargo.toml | 2 +- accounts/ethstore/cli/Cargo.toml | 2 +- ethcore/state-db/src/lib.rs | 14 +++++++++----- parity/blockchain.rs | 23 ++++++++++++++++------- parity/rebuild_accounts_bloom.rs | 9 +++++++-- 6 files changed, 46 insertions(+), 18 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e8fe97aa2c1..4605b64f597 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2122,6 +2122,15 @@ dependencies = [ "unicode-segmentation", ] +[[package]] +name = "hermit-abi" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"725cf19794cf90aa94e65050cb4191ff5d8fa87a498383774c47b332e3af952e" +dependencies = [ + "libc", +] + [[package]] name = "hex-literal" version = "0.2.1" @@ -3134,10 +3143,11 @@ checksum = "0b3a5d7cc97d6d30d8b9bc8fa19bf45349ffe46241e8816f50f62f6d6aaabee1" [[package]] name = "num_cpus" -version = "1.10.1" +version = "1.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bcef43580c035376c0705c42792c294b66974abbfd2789b511784023f71f3273" +checksum = "46203554f085ff89c235cd12f7075f3233af9b11ed7c9e16dfe2560d03313ce6" dependencies = [ + "hermit-abi", "libc", ] diff --git a/Cargo.toml b/Cargo.toml index 44ba879ad22..4ee6c19413d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -48,7 +48,7 @@ kvdb-rocksdb = "0.6.0" log = "0.4" migration-rocksdb = { path = "util/migration-rocksdb" } node-filter = { path = "ethcore/node-filter" } -num_cpus = "1.2" +num_cpus = "1.12" number_prefix = "0.2" panic_hook = { path = "util/panic-hook" } parity-bytes = "0.1" diff --git a/accounts/ethstore/cli/Cargo.toml b/accounts/ethstore/cli/Cargo.toml index 4bc70ce16a2..1ff7d36fe45 100644 --- a/accounts/ethstore/cli/Cargo.toml +++ b/accounts/ethstore/cli/Cargo.toml @@ -7,7 +7,7 @@ authors = ["Parity Technologies "] [dependencies] docopt = "1.0" env_logger = "0.5" -num_cpus = "1.6" +num_cpus = "1.12" rustc-hex = "2.1.0" serde = "1.0" serde_derive = "1.0" diff --git a/ethcore/state-db/src/lib.rs b/ethcore/state-db/src/lib.rs index 6448cec3c02..0fb973025b6 100644 --- a/ethcore/state-db/src/lib.rs +++ b/ethcore/state-db/src/lib.rs @@ -177,11 +177,15 @@ impl StateDB { buff.copy_from_slice(&*k); u64::from_le_bytes(buff) }; - bloom_parts[part_idx as usize] = { - let mut buff = [0u8; 8]; - buff.copy_from_slice(&*v); - u64::from_le_bytes(buff) - }; + if part_idx >= nr_parts { + warn!(target: "accounts_bloom", "Accounts bloom DB has a key out of bounds: {}/{:?}. 
Expected {} bloom parts.", part_idx, k, nr_parts); + } else { + bloom_parts[part_idx as usize] = { + let mut buff = [0u8; 8]; + buff.copy_from_slice(&*v); + u64::from_le_bytes(buff) + }; + } } else { assert!( &*k == ACCOUNTS_BLOOM_HASHCOUNT_KEY || &*k == ACCOUNTS_BLOOM_ITEM_COUNT_KEY, diff --git a/parity/blockchain.rs b/parity/blockchain.rs index 2ac6287ed1f..dd91c1c61b2 100644 --- a/parity/blockchain.rs +++ b/parity/blockchain.rs @@ -333,8 +333,6 @@ fn execute_import_light(cmd: ImportBlockchain) -> Result<(), String> { } fn execute_import(cmd: ImportBlockchain) -> Result<(), String> { - let timer = Instant::now(); - // load spec file let spec = cmd.spec.spec(&cmd.dirs.cache)?; @@ -430,19 +428,20 @@ fn execute_import(cmd: ImportBlockchain) -> Result<(), String> { )); service.register_io_handler(informant).map_err(|_| "Unable to register informant handler".to_owned())?; - + let timer = Instant::now(); client.import_blocks(instream, cmd.format)?; - + let elapsed = timer.elapsed(); + let ms = timer.elapsed().as_millis(); // save user defaults user_defaults.pruning = algorithm; user_defaults.tracing = tracing; user_defaults.fat_db = fat_db; user_defaults.save(&user_defaults_path)?; + std::thread::sleep(Duration::from_secs(1)); let report = client.report(); - let elapsed = timer.elapsed(); - let ms = timer.elapsed().as_millis(); - info!("Import completed in {} seconds, {} blocks, {} blk/s, {} transactions, {} tx/s, {} Mgas, {} Mgas/s", + + info!("Import completed in {} seconds, {} blocks, {} blk/s, {} transactions, {} tx/s, {} Mgas, {} Mgas/s, {} state DB memory", elapsed.as_secs(), report.blocks_imported, (report.blocks_imported as u128 * 1000) / ms, @@ -450,6 +449,16 @@ fn execute_import(cmd: ImportBlockchain) -> Result<(), String> { (report.transactions_applied as u128 * 1000) / ms, report.gas_processed / 1_000_000, report.gas_processed / (ms * 1000), + report.state_db_mem, + ); + info!("Import i/o stats. 
{} reads, {} bytes read, {} cached reads, {} bytes cached reads, {} writes, {} bytes written, {} db transactions", + report.io_stats.reads, + report.io_stats.bytes_read, + report.io_stats.cache_reads, + report.io_stats.cache_read_bytes, + report.io_stats.writes, + report.io_stats.bytes_written, + report.io_stats.transactions, ); Ok(()) } diff --git a/parity/rebuild_accounts_bloom.rs b/parity/rebuild_accounts_bloom.rs index 82725a7781f..24a73d66630 100644 --- a/parity/rebuild_accounts_bloom.rs +++ b/parity/rebuild_accounts_bloom.rs @@ -215,7 +215,7 @@ fn rebuild_bloom( best_block: BlockNumber, ) -> Result<(), Error> { let num_keys = source.num_keys(COL_STATE)? / 2; - info!(target: "migration", "Accounts bloom rebuild started for chain at #{}. There are {} accounts in the DB", best_block, num_keys); + info!(target: "migration", "Accounts bloom rebuild started for chain at #{}. There are {} accounts in the DB (estimate).", best_block, num_keys); if account_count <= num_keys { warn!("Rebuilding the bloom with space for {} accounts when the DB contains {} keys is not a good idea: the bloom filter will be saturated right away.", account_count, num_keys @@ -239,7 +239,12 @@ fn rebuild_bloom( let db = state_db.as_hash_db(); let start = std::time::Instant::now(); - let threads = 6; + // 1 thread: 49627s –> ~14h + // 4 threads: 10825s –> ~3h + // 6 threads: 9399s –> ~2.6h + // 12 threads: 9401s -> ~2.6h (slightly bigger chain though) + // 16 threads: 8805s –> ~2.45h + let threads = num_cpus::get(); // Chunk up the state in this many parts; each thread will be assigned one part at a time. const STATE_SUBPARTS: usize = 16; let bloom_result = thread::scope(|scope| -> Result>, Error> {