diff --git a/ledger/src/blockstore_db.rs b/ledger/src/blockstore_db.rs index f7003112c7a..c9ef037032f 100644 --- a/ledger/src/blockstore_db.rs +++ b/ledger/src/blockstore_db.rs @@ -95,9 +95,6 @@ const PROGRAM_COSTS_CF: &str = "program_costs"; /// Column family for optimistic slots const OPTIMISTIC_SLOTS_CF: &str = "optimistic_slots"; -// 1 day is chosen for the same reasoning of DEFAULT_COMPACTION_SLOT_INTERVAL -const PERIODIC_COMPACTION_SECONDS: u64 = 60 * 60 * 24; - #[derive(Error, Debug)] pub enum BlockstoreError { ShredForIndexExists, @@ -281,7 +278,7 @@ impl Rocks { path, Self::cf_descriptors(&options, &oldest_slot), )?, - access_type: access_type.clone(), + access_type, oldest_slot, column_options, write_batch_perf_status: PerfSamplingStatus::default(), @@ -302,70 +299,13 @@ impl Rocks { &secondary_path, Self::cf_descriptors(&options, &oldest_slot), )?, - access_type: access_type.clone(), + access_type, oldest_slot, column_options, write_batch_perf_status: PerfSamplingStatus::default(), } } }; - // This is only needed by solana-validator for LedgerCleanupService so guard with AccessType::Primary - if matches!(access_type, AccessType::Primary) { - for cf_name in Self::columns() { - // these special column families must be excluded from LedgerCleanupService's rocksdb - // compactions - if should_exclude_from_compaction(cf_name) { - continue; - } - - // This is the crux of our write-stall-free storage cleaning strategy with consistent - // state view for higher-layers - // - // For the consistent view, we commit delete_range on pruned slot range by LedgerCleanupService. - // simple story here. - // - // For actual storage cleaning, we employ RocksDB compaction. But default RocksDB compaction - // settings don't work well for us. That's because we're using it rather like a really big - // (100 GBs) ring-buffer. RocksDB is basically assuming uniform data write over the key space for - // efficient compaction, which isn't true for our use as a ring buffer. - // - // So, we customize the compaction strategy with 2 combined tweaks: - // (1) compaction_filter and (2) shortening its periodic cycles. - // - // Via the compaction_filter, we finally reclaim previously delete_range()-ed storage occupied - // by pruned slots. When compaction_filter is set, each SST files are re-compacted periodically - // to hunt for keys newly expired by the compaction_filter re-evaluation. But RocksDb's default - // `periodic_compaction_seconds` is 30 days, which is too long for our case. So, we - // shorten it to a day (24 hours). - // - // As we write newer SST files over time at rather consistent rate of speed, this - // effectively makes each newly-created sets be re-compacted for the filter at - // well-dispersed different timings. - // As a whole, we rewrite the whole dataset at every PERIODIC_COMPACTION_SECONDS, - // slowly over the duration of PERIODIC_COMPACTION_SECONDS. So, this results in - // amortization. - // So, there is a bit inefficiency here because we'll rewrite not-so-old SST files - // too. But longer period would introduce higher variance of ledger storage sizes over - // the long period. And it's much better than the daily IO spike caused by compact_range() by - // previous implementation. - // - // `ttl` and `compact_range`(`ManualCompaction`), doesn't work nicely. That's - // because its original intention is delete_range()s to reclaim disk space. So it tries to merge - // them with N+1 SST files all way down to the bottommost SSTs, often leading to vastly large amount - // (= all) of invalidated SST files, when combined with newer writes happening at the opposite - // edge of the key space. This causes a long and heavy disk IOs and possible write - // stall and ultimately, the deadly Replay/Banking stage stall at higher layers. - db.db - .set_options_cf( - db.cf_handle(cf_name), - &[( - "periodic_compaction_seconds", - &format!("{}", PERIODIC_COMPACTION_SECONDS), - )], - ) - .unwrap(); - } - } Ok(db) }