diff --git a/Cargo.lock b/Cargo.lock index ab5874283bc..46a8ffe2e68 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -169,6 +169,7 @@ dependencies = [ "histogram", "itertools 0.12.1", "log", + "mimalloc", "num_cpus", "rayon", "regex", @@ -192,6 +193,7 @@ dependencies = [ "solana-log-collector", "solana-logger", "solana-measure", + "solana-poh", "solana-program-runtime", "solana-rpc", "solana-runtime", @@ -207,7 +209,6 @@ dependencies = [ "solana-vote-program", "solana_rbpf", "thiserror 2.0.4", - "tikv-jemallocator", "tokio", ] @@ -263,6 +264,7 @@ dependencies = [ "libc", "libloading", "log", + "mimalloc", "num_cpus", "predicates", "rand 0.8.5", @@ -308,7 +310,6 @@ dependencies = [ "symlink", "tempfile", "thiserror 2.0.4", - "tikv-jemallocator", "tokio", ] @@ -907,6 +908,17 @@ version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "349f9b6a179ed607305526ca489b34ad0a41aed5f7980fa90eb03160b69598fb" +[[package]] +name = "bitfield-struct" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "de05f8756f1c68937349406d4632ae96ae35901019b5e59c508d9c38c64715fb" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.90", +] + [[package]] name = "bitflags" version = "1.3.2" @@ -1064,6 +1076,12 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "branches" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7958fb9748a08a6f46ef773e87c43997a844709bc293b4c3de48135debaf9d2a" + [[package]] name = "brotli" version = "3.3.4" @@ -1131,6 +1149,12 @@ dependencies = [ "serde", ] +[[package]] +name = "by_address" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "64fa3c856b712db6612c019f14756e64e4bcea13337a6b33b696333a9eaa2d06" + [[package]] name = "byte-tools" version = "0.3.1" @@ -1572,6 +1596,15 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6245d59a3e82a7fc217c5828a6692dbc6dfb63a0c8c90495621f7b9d79704a0e" +[[package]] +name = "convert_case" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec182b0ca2f35d8fc196cf3404988fd8b8c739a4d270ff118a398feb0cbec1ca" +dependencies = [ + "unicode-segmentation", +] + [[package]] name = "core-foundation" version = "0.9.4" @@ -1600,6 +1633,16 @@ dependencies = [ "winapi 0.2.8", ] +[[package]] +name = "cpu-time" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e9e393a7668fe1fad3075085b86c781883000b4ede868f43627b34a87c8b7ded" +dependencies = [ + "libc", + "winapi 0.3.9", +] + [[package]] name = "cpufeatures" version = "0.2.7" @@ -1819,9 +1862,9 @@ dependencies = [ [[package]] name = "darling" -version = "0.20.1" +version = "0.20.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0558d22a7b463ed0241e993f76f09f30b126687447751a8638587b864e4b3944" +checksum = "6f63b86c8a8826a49b8c21f08a2d07338eec8d900540f8630dc76284be802989" dependencies = [ "darling_core", "darling_macro", @@ -1829,29 +1872,35 @@ dependencies = [ [[package]] name = "darling_core" -version = "0.20.1" +version = "0.20.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ab8bfa2e259f8ee1ce5e97824a3c55ec4404a0d772ca7fa96bf19f0752a046eb" +checksum = "95133861a8032aaea082871032f5815eb9e98cef03fa916ab4500513994df9e5" dependencies = [ "fnv", "ident_case", "proc-macro2", "quote", - "strsim 0.10.0", + "strsim 0.11.1", "syn 2.0.90", ] 
[[package]] name = "darling_macro" -version = "0.20.1" +version = "0.20.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "29a358ff9f12ec09c3e61fef9b5a9902623a695a46a917b07f269bff1445611a" +checksum = "d336a2a514f6ccccaa3e09b02d41d35330c07ddf03a62165fcec10bb561c7806" dependencies = [ "darling_core", "quote", "syn 2.0.90", ] +[[package]] +name = "dary_heap" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7762d17f1241643615821a8455a0b2c3e803784b058693d990b11f2dce25a0ca" + [[package]] name = "dashmap" version = "5.5.3" @@ -1886,6 +1935,15 @@ dependencies = [ "rusticata-macros", ] +[[package]] +name = "deranged" +version = "0.3.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b42b6fa04a440b495c8b04d0e71b707c585f83cb9cb28cf8cd0d976c315e31b4" +dependencies = [ + "powerfmt", +] + [[package]] name = "derivation-path" version = "0.2.0" @@ -1931,13 +1989,35 @@ version = "0.99.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "40eebddd2156ce1bb37b20bbe5151340a31828b1f2d22ba4141f3531710e38df" dependencies = [ - "convert_case", + "convert_case 0.4.0", "proc-macro2", "quote", "rustc_version 0.3.3", "syn 1.0.109", ] +[[package]] +name = "derive_more" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4a9b99b9cbbe49445b21764dc0625032a89b145a2642e67603e1c936f5458d05" +dependencies = [ + "derive_more-impl", +] + +[[package]] +name = "derive_more-impl" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb7330aeadfbe296029522e6c40f315320aba36fc43a5b3632f3795348f3bd22" +dependencies = [ + "convert_case 0.6.0", + "proc-macro2", + "quote", + "syn 2.0.90", + "unicode-xid", +] + [[package]] name = "dialoguer" version = "0.10.4" @@ -2061,6 +2141,12 @@ version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1435fa1053d8b2fbbe9be7e97eca7f33d37b28409959813daefc1446a14247f1" +[[package]] +name = "dyn-clone" +version = "1.0.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0d6ef0072f8a535281e4876be788938b528e9a1d43900b82c2569af7da799125" + [[package]] name = "eager" version = "0.1.0" @@ -2168,6 +2254,26 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "enum-ptr" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b9955cf577337ddbfe2a03307e48bde02ae714346d587fd6f8bb5f262a7e574" +dependencies = [ + "enum-ptr-derive", +] + +[[package]] +name = "enum-ptr-derive" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "367a8dac40699e965e2fb8ac3b272b20058a107ac285b40041155227e4e93fba" +dependencies = [ + "darling", + "quote", + "syn 2.0.90", +] + [[package]] name = "env_logger" version = "0.9.3" @@ -3331,7 +3437,7 @@ version = "18.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d2b99d4207e2a04fb4581746903c2bb7eb376f88de9c699d0f3e10feeac0cd3a" dependencies = [ - "derive_more", + "derive_more 0.99.16", "futures 0.3.31", "jsonrpc-core", "jsonrpc-pubsub", @@ -3507,6 +3613,16 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c7d73b3f436185384286bd8098d17ec07c9a7d2388a6599f824d8502b529702a" +[[package]] +name = "libmimalloc-sys" +version = "0.1.39" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"23aa6811d3bd4deb8a84dde645f943476d13b248d818edcf8ce0b2f37f036b44" +dependencies = [ + "cc", + "libc", +] + [[package]] name = "librocksdb-sys" version = "0.16.0+8.10.0" @@ -3724,6 +3840,15 @@ dependencies = [ "zeroize", ] +[[package]] +name = "mimalloc" +version = "0.1.43" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68914350ae34959d83f732418d51e2427a794055d0b9529f48259ac07af65633" +dependencies = [ + "libmimalloc-sys", +] + [[package]] name = "mime" version = "0.3.16" @@ -3820,6 +3945,12 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "more-asserts" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fafa6961cabd9c63bcd77a45d7e3b7f3b552b70417831fb0f56db717e72407e" + [[package]] name = "multimap" version = "0.8.3" @@ -3942,6 +4073,12 @@ dependencies = [ "num-traits", ] +[[package]] +name = "num-conv" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9" + [[package]] name = "num-derive" version = "0.4.2" @@ -4026,15 +4163,6 @@ dependencies = [ "syn 2.0.90", ] -[[package]] -name = "num_threads" -version = "0.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97ba99ba6393e2c3734791401b66902d981cb03bf190af674ca69949b6d5fb15" -dependencies = [ - "libc", -] - [[package]] name = "number_prefix" version = "0.4.0" @@ -4437,6 +4565,12 @@ version = "1.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "da544ee218f0d287a911e9c99a39a8c9bc8fcad3cb8db5959940044ecfc67265" +[[package]] +name = "powerfmt" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391" + [[package]] name = "ppv-lite86" version = "0.2.15" @@ -4907,6 +5041,15 @@ dependencies = [ name = "rbpf-cli" version = "2.2.0" +[[package]] +name = "rclite" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee9f0c2e8b8ef3ea8b0d074b9a0a192d99d47e2023bec8fd6336f2d8543a43b9" +dependencies = [ + "branches", +] + [[package]] name = "rdrand" version = "0.4.0" @@ -6026,9 +6169,11 @@ dependencies = [ name = "solana-banking-bench" version = "2.2.0" dependencies = [ + "assert_matches", "clap 3.2.23", "crossbeam-channel", "log", + "mimalloc", "rand 0.8.5", "rayon", "solana-client", @@ -6043,6 +6188,7 @@ dependencies = [ "solana-sdk", "solana-streamer", "solana-tpu-client", + "solana-unified-scheduler-pool", "solana-version", ] @@ -6113,6 +6259,7 @@ dependencies = [ "crossbeam-channel", "csv", "log", + "mimalloc", "rand 0.8.5", "rayon", "serde", @@ -6673,6 +6820,7 @@ dependencies = [ "chrono", "crossbeam-channel", "dashmap", + "derive_more 1.0.0", "etcd-client", "fs_extra", "futures 0.3.31", @@ -6739,6 +6887,7 @@ dependencies = [ "solana-tpu-client", "solana-transaction-status", "solana-turbine", + "solana-unified-scheduler-logic", "solana-unified-scheduler-pool", "solana-version", "solana-vote", @@ -7697,6 +7846,7 @@ dependencies = [ "bincode", "bv", "caps", + "crossbeam-channel", "curve25519-dalek 4.1.3", "dlopen2", "fnv", @@ -9517,6 +9667,12 @@ name = "solana-unified-scheduler-logic" version = "2.2.0" dependencies = [ "assert_matches", + "bitfield-struct", + "by_address", + "dary_heap", + "enum-ptr", + "more-asserts", + "rclite", "solana-runtime-transaction", "solana-sdk", "static_assertions", @@ -9526,22 +9682,35 @@ dependencies = [ name = 
"solana-unified-scheduler-pool" version = "2.2.0" dependencies = [ + "ahash 0.8.11", "assert_matches", + "cpu-time", "crossbeam-channel", "dashmap", "derive-where", + "derive_more 1.0.0", + "dyn-clone", + "enum-ptr", "lazy_static", "log", - "qualifier_attr", "scopeguard", + "serde", + "solana-cost-model", + "solana-feature-set", "solana-ledger", "solana-logger", + "solana-perf", + "solana-poh", + "solana-program-runtime", "solana-runtime", "solana-runtime-transaction", "solana-sdk", + "solana-svm", "solana-timings", "solana-unified-scheduler-logic", + "solana-unified-scheduler-pool", "static_assertions", + "trait-set", "vec_extract_if_polyfill", ] @@ -10178,6 +10347,12 @@ version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" +[[package]] +name = "strsim" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" + [[package]] name = "strum" version = "0.24.1" @@ -10516,44 +10691,36 @@ dependencies = [ ] [[package]] -name = "tikv-jemalloc-sys" -version = "0.4.2+5.2.1-patched.2" +name = "time" +version = "0.3.36" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5844e429d797c62945a566f8da4e24c7fe3fbd5d6617fd8bf7a0b7dc1ee0f22e" +checksum = "5dfd88e563464686c916c7e46e623e520ddc6d79fa6641390f2e3fa86e83e885" dependencies = [ - "cc", - "fs_extra", - "libc", + "deranged", + "itoa", + "num-conv", + "powerfmt", + "serde", + "time-core", + "time-macros", ] [[package]] -name = "tikv-jemallocator" -version = "0.4.1" +name = "time-core" +version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c14a5a604eb8715bc5785018a37d00739b180bcf609916ddf4393d33d49ccdf" -dependencies = [ - "libc", - "tikv-jemalloc-sys", -] +checksum = "ef927ca75afb808a4d64dd374f00a2adf8d0fcff8e7b184af886c3c87ec4a3f3" [[package]] -name = "time" -version = "0.3.9" +name = "time-macros" +version = "0.2.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c2702e08a7a860f005826c6815dcac101b19b5eb330c27fe4a5928fec1d20ddd" +checksum = "3f252a68540fde3a3877aeea552b832b40ab9a69e318efd078774a01ddee1ccf" dependencies = [ - "itoa", - "libc", - "num_threads", - "time-macros", + "num-conv", + "time-core", ] -[[package]] -name = "time-macros" -version = "0.2.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "42657b1a6f4d817cda8e7a0ace261fe0cc946cf3a80314390b22cc61ae080792" - [[package]] name = "tiny-bip39" version = "0.8.2" @@ -10924,6 +11091,17 @@ dependencies = [ "tracing-core", ] +[[package]] +name = "trait-set" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b79e2e9c9ab44c6d7c20d5976961b47e8f49ac199154daa514b77cd1ab536625" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", +] + [[package]] name = "trees" version = "0.4.2" @@ -11005,6 +11183,12 @@ dependencies = [ "tinyvec", ] +[[package]] +name = "unicode-segmentation" +version = "1.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1dd624098567895118886609431a7c3b8f516e41d30e0643f03d94592a147e36" + [[package]] name = "unicode-width" version = "0.1.9" diff --git a/Cargo.toml b/Cargo.toml index c62c59226c7..b842082febc 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -294,6 +294,7 @@ curve25519-dalek = { version = "4.1.3", features = ["digest", "rand_core"] } dashmap = 
"5.5.3" derivation-path = { version = "0.2.0", default-features = false } derive-where = "1.2.7" +derive_more = { version = "1.0.0", features = ["full"] } dialoguer = "0.10.4" digest = "0.10.7" dir-diff = "0.3.3" @@ -331,9 +332,6 @@ index_list = "0.2.14" indexmap = "2.7.0" indicatif = "0.17.9" itertools = "0.12.1" -jemallocator = { package = "tikv-jemallocator", version = "0.4.1", features = [ - "unprefixed_malloc_on_supported_platforms", -] } js-sys = "0.3.74" json5 = "0.4.1" jsonrpc-core = "18.0.0" @@ -357,9 +355,11 @@ lz4 = "1.28.0" memmap2 = "0.5.10" memoffset = "0.9" merlin = "3" +mimalloc = "0.1.43" min-max-heap = "1.3.0" mockall = "0.11.4" modular-bitfield = "0.11.2" +more-asserts = "0.3.1" nix = "0.29.0" num-bigint = "0.4.6" num-derive = "0.4" diff --git a/banking-bench/Cargo.toml b/banking-bench/Cargo.toml index 67ca53f8832..1baa9ba9616 100644 --- a/banking-bench/Cargo.toml +++ b/banking-bench/Cargo.toml @@ -9,13 +9,15 @@ license = { workspace = true } edition = { workspace = true } [dependencies] +assert_matches = { workspace = true } clap = { version = "3.1.8", features = ["derive", "cargo"] } crossbeam-channel = { workspace = true } log = { workspace = true } +mimalloc = { workspace = true } rand = { workspace = true } rayon = { workspace = true } solana-client = { workspace = true } -solana-core = { workspace = true } +solana-core = { workspace = true, features = ["dev-context-only-utils"] } solana-gossip = { workspace = true } solana-ledger = { workspace = true } solana-logger = { workspace = true } @@ -26,6 +28,7 @@ solana-runtime = { workspace = true, features = ["dev-context-only-utils"] } solana-sdk = { workspace = true } solana-streamer = { workspace = true } solana-tpu-client = { workspace = true } +solana-unified-scheduler-pool = { workspace = true } solana-version = { workspace = true } [features] diff --git a/banking-bench/src/main.rs b/banking-bench/src/main.rs index c80e96005c8..a3ab2b5657c 100644 --- a/banking-bench/src/main.rs +++ b/banking-bench/src/main.rs @@ -1,5 +1,6 @@ #![allow(clippy::arithmetic_side_effects)] use { + assert_matches::assert_matches, clap::{crate_description, crate_name, Arg, ArgEnum, Command}, crossbeam_channel::{unbounded, Receiver}, log::*, @@ -7,8 +8,10 @@ use { rayon::prelude::*, solana_client::connection_cache::ConnectionCache, solana_core::{ - banking_stage::BankingStage, - banking_trace::{BankingPacketBatch, BankingTracer, BANKING_TRACE_DIR_DEFAULT_BYTE_LIMIT}, + banking_stage::{update_bank_forks_and_poh_recorder_for_new_tpu_bank, BankingStage}, + banking_trace::{ + BankingPacketBatch, BankingTracer, Channels, BANKING_TRACE_DIR_DEFAULT_BYTE_LIMIT, + }, validator::BlockProductionMethod, }, solana_gossip::cluster_info::{ClusterInfo, Node}, @@ -29,6 +32,7 @@ use { hash::Hash, message::Message, pubkey::{self, Pubkey}, + scheduling::SchedulingMode, signature::{Keypair, Signature, Signer}, system_instruction, system_transaction, timing::timestamp, @@ -36,6 +40,7 @@ use { }, solana_streamer::socket::SocketAddrSpace, solana_tpu_client::tpu_client::DEFAULT_TPU_CONNECTION_POOL_SIZE, + solana_unified_scheduler_pool::{DefaultSchedulerPool, SupportedSchedulingMode}, std::{ sync::{atomic::Ordering, Arc, RwLock}, thread::sleep, @@ -43,6 +48,9 @@ use { }, }; +#[global_allocator] +static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc; + // transfer transaction cost = 1 * SIGNATURE_COST + // 2 * WRITE_LOCK_UNITS + // 1 * system_program @@ -347,7 +355,7 @@ fn main() { let (replay_vote_sender, _replay_vote_receiver) = unbounded(); let bank0 = 
Bank::new_for_benches(&genesis_config); let bank_forks = BankForks::new_rw_arc(bank0); - let mut bank = bank_forks.read().unwrap().working_bank(); + let mut bank = bank_forks.read().unwrap().working_bank_with_scheduler(); // set cost tracker limits to MAX so it will not filter out TXs bank.write_cost_tracker() @@ -440,9 +448,36 @@ fn main() { BANKING_TRACE_DIR_DEFAULT_BYTE_LIMIT, ))) .unwrap(); - let (non_vote_sender, non_vote_receiver) = banking_tracer.create_channel_non_vote(); - let (tpu_vote_sender, tpu_vote_receiver) = banking_tracer.create_channel_tpu_vote(); - let (gossip_vote_sender, gossip_vote_receiver) = banking_tracer.create_channel_gossip_vote(); + let prioritization_fee_cache = Arc::new(PrioritizationFeeCache::new(0u64)); + let scheduler_pool = if matches!( + block_production_method, + BlockProductionMethod::UnifiedScheduler + ) { + let pool = DefaultSchedulerPool::new( + SupportedSchedulingMode::Either(SchedulingMode::BlockProduction), + None, + None, + None, + Some(replay_vote_sender.clone()), + prioritization_fee_cache.clone(), + poh_recorder.read().unwrap().new_recorder(), + ); + bank_forks + .write() + .unwrap() + .install_scheduler_pool(pool.clone()); + Some(pool) + } else { + None + }; + let Channels { + non_vote_sender, + non_vote_receiver, + tpu_vote_sender, + tpu_vote_receiver, + gossip_vote_sender, + gossip_vote_receiver, + } = banking_tracer.create_channels(scheduler_pool.as_ref()); let cluster_info = { let keypair = Arc::new(Keypair::new()); let node = Node::new_localhost_with_pubkey(&keypair.pubkey()); @@ -461,7 +496,7 @@ fn main() { ), }; let banking_stage = BankingStage::new_num_threads( - block_production_method, + block_production_method.clone(), &cluster_info, &poh_recorder, non_vote_receiver, @@ -473,10 +508,23 @@ fn main() { None, Arc::new(connection_cache), bank_forks.clone(), - &Arc::new(PrioritizationFeeCache::new(0u64)), + &prioritization_fee_cache, false, + scheduler_pool, ); + // This bench processes transactions, starting from the very first bank, so special-casing is + // needed for unified scheduler. + if matches!( + block_production_method, + BlockProductionMethod::UnifiedScheduler + ) { + bank = bank_forks + .write() + .unwrap() + .reinstall_block_production_scheduler_into_working_genesis_bank(); + } + // This is so that the signal_receiver does not go out of scope after the closure. // If it is dropped before poh_service, then poh_service will error when // calling send() on the channel. 
@@ -537,33 +585,31 @@ fn main() { tx_total_us += now.elapsed().as_micros() as u64; let mut poh_time = Measure::start("poh_time"); - poh_recorder + let cleared_bank = poh_recorder .write() .unwrap() .reset(bank.clone(), Some((bank.slot(), bank.slot() + 1))); + assert_matches!(cleared_bank, None); poh_time.stop(); let mut new_bank_time = Measure::start("new_bank"); + if let Some((result, _timings)) = bank.wait_for_completed_scheduler() { + assert_matches!(result, Ok(_)); + } let new_slot = bank.slot() + 1; - let new_bank = Bank::new_from_parent(bank, &collector, new_slot); + let new_bank = Bank::new_from_parent(bank.clone(), &collector, new_slot); new_bank_time.stop(); let mut insert_time = Measure::start("insert_time"); - bank_forks.write().unwrap().insert(new_bank); - bank = bank_forks.read().unwrap().working_bank(); + update_bank_forks_and_poh_recorder_for_new_tpu_bank( + &bank_forks, + &poh_recorder, + new_bank, + false, + ); + bank = bank_forks.read().unwrap().working_bank_with_scheduler(); insert_time.stop(); - // set cost tracker limits to MAX so it will not filter out TXs - bank.write_cost_tracker() - .unwrap() - .set_limits(u64::MAX, u64::MAX, u64::MAX); - - assert!(poh_recorder.read().unwrap().bank().is_none()); - poh_recorder - .write() - .unwrap() - .set_bank_for_test(bank.clone()); - assert!(poh_recorder.read().unwrap().bank().is_some()); debug!( "new_bank_time: {}us insert_time: {}us poh_time: {}us", new_bank_time.as_us(), diff --git a/bench-tps/Cargo.toml b/bench-tps/Cargo.toml index 3c3c5f71899..da144d58047 100644 --- a/bench-tps/Cargo.toml +++ b/bench-tps/Cargo.toml @@ -14,6 +14,7 @@ clap = { workspace = true } crossbeam-channel = { workspace = true } csv = { workspace = true } log = { workspace = true } +mimalloc = { workspace = true } rand = { workspace = true } rayon = { workspace = true } serde = { workspace = true } diff --git a/bench-tps/src/main.rs b/bench-tps/src/main.rs index 7222df96fa4..d085327d765 100644 --- a/bench-tps/src/main.rs +++ b/bench-tps/src/main.rs @@ -31,6 +31,9 @@ use { }, }; +#[global_allocator] +static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc; + /// Number of signatures for all transactions in ~1 week at ~100K TPS pub const NUM_SIGNATURES_FOR_TXS: u64 = 100_000 * 60 * 60 * 24 * 7; diff --git a/ci/test-checks.sh b/ci/test-checks.sh index 873c3a9469e..43cc154a37d 100755 --- a/ci/test-checks.sh +++ b/ci/test-checks.sh @@ -83,7 +83,7 @@ _ scripts/check-dev-context-only-utils.sh tree _ scripts/cargo-for-all-lock-files.sh -- "+${rust_nightly}" fmt --all -- --check -_ ci/do-audit.sh +# _ ci/do-audit.sh if [[ -n $CI ]] && [[ $CHANNEL = "stable" ]]; then _ ci/check-install-all.sh diff --git a/core/Cargo.toml b/core/Cargo.toml index df42ec84657..4511fd5be3f 100644 --- a/core/Cargo.toml +++ b/core/Cargo.toml @@ -24,6 +24,7 @@ bytes = { workspace = true } chrono = { workspace = true, features = ["default", "serde"] } crossbeam-channel = { workspace = true } dashmap = { workspace = true, features = ["rayon", "raw-api"] } +derive_more = { workspace = true } etcd-client = { workspace = true, features = ["tls"] } futures = { workspace = true } histogram = { workspace = true } @@ -87,6 +88,7 @@ solana-tls-utils = { workspace = true } solana-tpu-client = { workspace = true } solana-transaction-status = { workspace = true } solana-turbine = { workspace = true } +solana-unified-scheduler-logic = { workspace = true } solana-unified-scheduler-pool = { workspace = true } solana-version = { workspace = true } solana-vote = { workspace = true } diff --git 
a/core/benches/banking_stage.rs b/core/benches/banking_stage.rs index 0f449719ce3..51292d4d267 100644 --- a/core/benches/banking_stage.rs +++ b/core/benches/banking_stage.rs @@ -2,7 +2,7 @@ #![feature(test)] use { - solana_core::validator::BlockProductionMethod, + solana_core::{banking_trace::Channels, validator::BlockProductionMethod}, solana_vote_program::{vote_state::TowerSync, vote_transaction::new_tower_sync_transaction}, }; @@ -211,9 +211,14 @@ fn bench_banking(bencher: &mut Bencher, tx_type: TransactionType) { genesis_config.ticks_per_slot = 10_000; let banking_tracer = BankingTracer::new_disabled(); - let (non_vote_sender, non_vote_receiver) = banking_tracer.create_channel_non_vote(); - let (tpu_vote_sender, tpu_vote_receiver) = banking_tracer.create_channel_tpu_vote(); - let (gossip_vote_sender, gossip_vote_receiver) = banking_tracer.create_channel_gossip_vote(); + let Channels { + non_vote_sender, + non_vote_receiver, + tpu_vote_sender, + tpu_vote_receiver, + gossip_vote_sender, + gossip_vote_receiver, + } = banking_tracer.create_channels(None); let mut bank = Bank::new_for_benches(&genesis_config); // Allow arbitrary transaction processing time for the purposes of this bench @@ -304,6 +309,7 @@ fn bench_banking(bencher: &mut Bencher, tx_type: TransactionType) { bank_forks, &Arc::new(PrioritizationFeeCache::new(0u64)), false, + None, ); let chunk_len = verified.len() / CHUNKS; diff --git a/core/benches/banking_trace.rs b/core/benches/banking_trace.rs index fb93deebc17..34ab2aaf78f 100644 --- a/core/benches/banking_trace.rs +++ b/core/benches/banking_trace.rs @@ -7,7 +7,7 @@ use { for_test::{ drop_and_clean_temp_dir_unless_suppressed, sample_packet_batch, terminate_tracer, }, - receiving_loop_with_minimized_sender_overhead, BankingPacketBatch, BankingTracer, + receiving_loop_with_minimized_sender_overhead, BankingPacketBatch, BankingTracer, Channels, TraceError, TracerThreadResult, BANKING_TRACE_DIR_DEFAULT_BYTE_LIMIT, }, std::{ @@ -35,7 +35,11 @@ fn black_box_packet_batch(packet_batch: BankingPacketBatch) -> TracerThreadResul fn bench_banking_tracer_main_thread_overhead_noop_baseline(bencher: &mut Bencher) { let exit = Arc::::default(); let tracer = BankingTracer::new_disabled(); - let (non_vote_sender, non_vote_receiver) = tracer.create_channel_non_vote(); + let Channels { + non_vote_sender, + non_vote_receiver, + .. + } = tracer.create_channels(None); let exit_for_dummy_thread = exit.clone(); let dummy_main_thread = thread::spawn(move || { @@ -64,7 +68,11 @@ fn bench_banking_tracer_main_thread_overhead_under_peak_write(bencher: &mut Benc BANKING_TRACE_DIR_DEFAULT_BYTE_LIMIT, ))) .unwrap(); - let (non_vote_sender, non_vote_receiver) = tracer.create_channel_non_vote(); + let Channels { + non_vote_sender, + non_vote_receiver, + .. + } = tracer.create_channels(None); let exit_for_dummy_thread = exit.clone(); let dummy_main_thread = thread::spawn(move || { @@ -101,7 +109,11 @@ fn bench_banking_tracer_main_thread_overhead_under_sustained_write(bencher: &mut 1024 * 1024, // cause more frequent trace file rotation ))) .unwrap(); - let (non_vote_sender, non_vote_receiver) = tracer.create_channel_non_vote(); + let Channels { + non_vote_sender, + non_vote_receiver, + .. 
+ } = tracer.create_channels(None); let exit_for_dummy_thread = exit.clone(); let dummy_main_thread = thread::spawn(move || { @@ -142,7 +154,11 @@ fn bench_banking_tracer_background_thread_throughput(bencher: &mut Bencher) { let (tracer, tracer_thread) = BankingTracer::new(Some((&path, exit.clone(), 50 * 1024 * 1024))).unwrap(); - let (non_vote_sender, non_vote_receiver) = tracer.create_channel_non_vote(); + let Channels { + non_vote_sender, + non_vote_receiver, + .. + } = tracer.create_channels(None); let dummy_main_thread = thread::spawn(move || { receiving_loop_with_minimized_sender_overhead::<_, TraceError, 0>( diff --git a/core/benches/sigverify_stage.rs b/core/benches/sigverify_stage.rs index 3f11cc15057..5ca5a00ccaf 100644 --- a/core/benches/sigverify_stage.rs +++ b/core/benches/sigverify_stage.rs @@ -185,8 +185,9 @@ fn bench_sigverify_stage(bencher: &mut Bencher, use_same_tx: bool) { if let Ok(message) = verified_r.recv_timeout(Duration::from_millis(10)) { let (verifieds, tracer_packet_stats) = (&message.0, message.1.as_ref().unwrap()); received += verifieds.iter().map(|batch| batch.len()).sum::(); - total_tracer_packets_received_in_sigverify_stage += - tracer_packet_stats.total_tracer_packets_received_in_sigverify_stage; + total_tracer_packets_received_in_sigverify_stage += tracer_packet_stats + .total_tracer_packets_received_in_sigverify_stage + .0; test::black_box(message); if total_tracer_packets_received_in_sigverify_stage >= sent_len { break; diff --git a/core/src/banking_simulation.rs b/core/src/banking_simulation.rs index 6e5113ded67..d9915585e33 100644 --- a/core/src/banking_simulation.rs +++ b/core/src/banking_simulation.rs @@ -1,10 +1,13 @@ #![cfg(feature = "dev-context-only-utils")] use { crate::{ - banking_stage::{BankingStage, LikeClusterInfo}, + banking_stage::{ + update_bank_forks_and_poh_recorder_for_new_tpu_bank, BankingStage, LikeClusterInfo, + }, banking_trace::{ - BankingPacketBatch, BankingTracer, ChannelLabel, TimedTracedEvent, TracedEvent, - TracedSender, TracerThread, BANKING_TRACE_DIR_DEFAULT_BYTE_LIMIT, BASENAME, + BankingPacketBatch, BankingTracer, ChannelLabel, Channels, TimedTracedEvent, + TracedEvent, TracedSender, TracerThread, BANKING_TRACE_DIR_DEFAULT_BYTE_LIMIT, + BASENAME, }, validator::BlockProductionMethod, }, @@ -23,7 +26,7 @@ use { }, solana_net_utils::bind_to_localhost, solana_poh::{ - poh_recorder::{PohRecorder, GRACE_TICKS_FACTOR, MAX_GRACE_SLOTS}, + poh_recorder::{NewPohRecorder, PohRecorder, GRACE_TICKS_FACTOR, MAX_GRACE_SLOTS}, poh_service::{PohService, DEFAULT_HASHES_PER_BATCH, DEFAULT_PINNED_CPU_CORE}, }, solana_runtime::{ @@ -33,15 +36,12 @@ use { prioritization_fee_cache::PrioritizationFeeCache, }, solana_sdk::{ - clock::{Slot, DEFAULT_MS_PER_SLOT, HOLD_TRANSACTIONS_SLOT_OFFSET}, - genesis_config::GenesisConfig, - pubkey::Pubkey, - shred_version::compute_shred_version, - signature::Signer, - signer::keypair::Keypair, + clock::Slot, genesis_config::GenesisConfig, pubkey::Pubkey, + shred_version::compute_shred_version, signature::Signer, signer::keypair::Keypair, }, solana_streamer::socket::SocketAddrSpace, solana_turbine::broadcast_stage::{BroadcastStage, BroadcastStageType}, + solana_unified_scheduler_pool::DefaultSchedulerPool, std::{ collections::BTreeMap, fmt::Display, @@ -53,7 +53,7 @@ use { Arc, RwLock, }, thread::{self, sleep, JoinHandle}, - time::{Duration, SystemTime}, + time::{Duration, Instant, SystemTime}, }, thiserror::Error, }; @@ -126,10 +126,6 @@ pub enum SimulateError { DeserializeError(#[from] bincode::Error), } -// 
Defined to be enough to cover the holding phase prior to leader slots with some idling (+5 secs) -const WARMUP_DURATION: Duration = - Duration::from_millis(HOLD_TRANSACTIONS_SLOT_OFFSET * DEFAULT_MS_PER_SLOT + 5000); - /// BTreeMap is intentional because events could be unordered slightly due to tracing jitter. type PacketBatchesByTime = BTreeMap; @@ -266,19 +262,25 @@ impl SimulatorLoopLogger { .unwrap() } - fn log_frozen_bank_cost(&self, bank: &Bank) { + fn log_frozen_bank_cost(&self, bank: &Bank, bank_elapsed: Duration) { info!( - "bank cost: slot: {} {:?} (frozen)", + "simulated bank slot+delta: {}+{}ms costs: {:?} fees: {} txs: {} (frozen)", bank.slot(), + bank_elapsed.as_millis(), Self::bank_costs(bank), + bank.collector_fees(), + bank.executed_transaction_count(), ); } - fn log_ongoing_bank_cost(&self, bank: &Bank) { + fn log_ongoing_bank_cost(&self, bank: &Bank, bank_elapsed: Duration) { debug!( - "bank cost: slot: {} {:?} (ongoing)", + "simulated bank slot+delta: {}+{}ms costs: {:?} fees: {} txs: {} (ongoing)", bank.slot(), + bank_elapsed.as_millis(), Self::bank_costs(bank), + bank.collector_fees(), + bank.executed_transaction_count(), ); } @@ -311,8 +313,14 @@ impl SimulatorLoopLogger { } } - fn on_new_leader(&self, bank: &Bank, new_slot: Slot, new_leader: Pubkey) { - self.log_frozen_bank_cost(bank); + fn on_new_leader( + &self, + bank: &Bank, + bank_elapsed: Duration, + new_slot: Slot, + new_leader: Pubkey, + ) { + self.log_frozen_bank_cost(bank, bank_elapsed); info!( "{} isn't leader anymore at slot {}; new leader: {}", self.simulated_leader, new_slot, new_leader @@ -330,6 +338,7 @@ struct SenderLoop { raw_base_event_time: SystemTime, total_batch_count: usize, timed_batches_to_send: TimedBatchesToSend, + warmup_duration: Duration, } impl SenderLoop { @@ -338,7 +347,7 @@ impl SenderLoop { "simulating events: {} (out of {}), starting at slot {} (based on {} from traced event slot: {}) (warmup: -{:?})", self.timed_batches_to_send.len(), self.total_batch_count, self.first_simulated_slot, SenderLoopLogger::format_as_timestamp(self.raw_base_event_time), - self.parent_slot, WARMUP_DURATION, + self.parent_slot, self.warmup_duration, ); } @@ -414,16 +423,32 @@ impl SimulatorLoop { self, base_simulation_time: SystemTime, sender_thread: EventSenderThread, + warmed_up_bank: Bank, ) -> (EventSenderThread, Sender) { - sleep(WARMUP_DURATION); + info!("warmup hack!"); + sleep(Duration::from_millis(330)); + // todo: proper assert + let _ = self.poh_recorder.write().unwrap().reset( + self.bank_forks.write().unwrap().root_bank(), + Some((self.first_simulated_slot, self.first_simulated_slot + 4)), + ); + info!("warmup start!"); + loop { + let current_slot = self.poh_recorder.read().unwrap().slot(); + if current_slot >= self.first_simulated_slot { + break; + } + sleep(Duration::from_millis(10)); + } info!("warmup done!"); - self.start(base_simulation_time, sender_thread) + self.start(base_simulation_time, sender_thread, warmed_up_bank) } fn start( self, base_simulation_time: SystemTime, sender_thread: EventSenderThread, + warmed_up_bank: Bank, ) -> (EventSenderThread, Sender) { let logger = SimulatorLoopLogger { simulated_leader: self.simulated_leader, @@ -431,7 +456,8 @@ impl SimulatorLoop { base_simulation_time, freeze_time_by_slot: self.freeze_time_by_slot, }; - let mut bank = self.bank; + let (mut bank, mut bank_created) = (self.bank, Instant::now()); + let mut warmed_up_bank = Some(warmed_up_bank); loop { if self.poh_recorder.read().unwrap().bank().is_none() { let next_leader_slot = 
self.leader_schedule_cache.next_leader_slot( @@ -442,13 +468,21 @@ impl SimulatorLoop { GRACE_TICKS_FACTOR * MAX_GRACE_SLOTS, ); debug!("{next_leader_slot:?}"); - self.poh_recorder + // todo: proper assert + let _ = self + .poh_recorder .write() .unwrap() .reset(bank.clone_without_scheduler(), next_leader_slot); info!("Bank::new_from_parent()!"); logger.log_jitter(&bank); + assert!(bank.is_complete()); + if let Some((result, _completed_execute_timings)) = + bank.wait_for_completed_scheduler() + { + info!("us result: {:?}", result); + } bank.freeze(); let new_slot = if bank.slot() == self.parent_slot { info!("initial leader block!"); @@ -462,7 +496,7 @@ impl SimulatorLoop { .slot_leader_at(new_slot, None) .unwrap(); if new_leader != self.simulated_leader { - logger.on_new_leader(&bank, new_slot, new_leader); + logger.on_new_leader(&bank, bank_created.elapsed(), new_slot, new_leader); break; } else if sender_thread.is_finished() { warn!("sender thread existed maybe due to completion of sending traced events"); @@ -470,32 +504,37 @@ impl SimulatorLoop { } else { info!("new leader bank slot: {new_slot}"); } - let new_bank = Bank::new_from_parent( - bank.clone_without_scheduler(), - &self.simulated_leader, - new_slot, - ); + let new_bank = warmed_up_bank.take().unwrap_or_else(|| { + Bank::new_from_parent( + bank.clone_without_scheduler(), + &self.simulated_leader, + new_slot, + ) + }); // make sure parent is frozen for finalized hashes via the above // new()-ing of its child bank self.retracer .hash_event(bank.slot(), &bank.last_blockhash(), &bank.hash()); if *bank.collector_id() == self.simulated_leader { - logger.log_frozen_bank_cost(&bank); + logger.log_frozen_bank_cost(&bank, bank_created.elapsed()); } self.retransmit_slots_sender.send(bank.slot()).unwrap(); - self.bank_forks.write().unwrap().insert(new_bank); - bank = self - .bank_forks - .read() - .unwrap() - .working_bank_with_scheduler() - .clone_with_scheduler(); - self.poh_recorder - .write() - .unwrap() - .set_bank(bank.clone_with_scheduler(), false); + update_bank_forks_and_poh_recorder_for_new_tpu_bank( + &self.bank_forks, + &self.poh_recorder, + new_bank, + false, + ); + (bank, bank_created) = ( + self.bank_forks + .read() + .unwrap() + .working_bank_with_scheduler(), + Instant::now(), + ); + logger.log_ongoing_bank_cost(&bank, bank_created.elapsed()); } else { - logger.log_ongoing_bank_cost(&bank); + logger.log_ongoing_bank_cost(&bank, bank_created.elapsed()); } sleep(Duration::from_millis(10)); @@ -516,7 +555,8 @@ struct SimulatorThreads { impl SimulatorThreads { fn finish(self, sender_thread: EventSenderThread, retransmit_slots_sender: Sender) { info!("Sleeping a bit before signaling exit"); - sleep(Duration::from_millis(100)); + // this is needed for metrics flush + sleep(Duration::from_millis(3000)); self.exit.store(true, Ordering::Relaxed); // The order is important. Consuming sender_thread by joining will drop some channels. 
That @@ -672,15 +712,13 @@ impl BankingSimulator { bank_forks: Arc<RwLock<BankForks>>, blockstore: Arc<Blockstore>, block_production_method: BlockProductionMethod, + unified_scheduler_pool: Option<Arc<DefaultSchedulerPool>>, + new_poh_recorder: Option<NewPohRecorder>, ) -> (SenderLoop, SimulatorLoop, SimulatorThreads) { let parent_slot = self.parent_slot().unwrap(); let mut packet_batches_by_time = self.banking_trace_events.packet_batches_by_time; let freeze_time_by_slot = self.banking_trace_events.freeze_time_by_slot; - let bank = bank_forks - .read() - .unwrap() - .working_bank_with_scheduler() - .clone_with_scheduler(); + let bank = bank_forks.read().unwrap().working_bank_with_scheduler(); let leader_schedule_cache = Arc::new(LeaderScheduleCache::new_from_bank(&bank)); assert_eq!(parent_slot, bank.slot()); @@ -693,7 +731,10 @@ impl BankingSimulator { simulated_leader, self.first_simulated_slot, ); - let exit = Arc::new(AtomicBool::default()); + let exit = new_poh_recorder + .as_ref() + .map(|(poh_recorder, ..)| poh_recorder.is_exited.clone()) + .unwrap_or_else(|| Arc::new(AtomicBool::default())); if let Some(end_slot) = blockstore .slot_meta_iterator(self.first_simulated_slot) @@ -709,33 +750,6 @@ impl BankingSimulator { info!("skipping purging..."); } - info!("Poh is starting!"); - - let (poh_recorder, entry_receiver, record_receiver) = PohRecorder::new_with_clear_signal( - bank.tick_height(), - bank.last_blockhash(), - bank.clone(), - None, - bank.ticks_per_slot(), - false, - blockstore.clone(), - blockstore.get_new_shred_signal(0), - &leader_schedule_cache, - &genesis_config.poh_config, - None, - exit.clone(), - ); - let poh_recorder = Arc::new(RwLock::new(poh_recorder)); - let poh_service = PohService::new( - poh_recorder.clone(), - &genesis_config.poh_config, - exit.clone(), - bank.ticks_per_slot(), - DEFAULT_PINNED_CPU_CORE, - DEFAULT_HASHES_PER_BATCH, - record_receiver, - ); - // Enable BankingTracer to approximate the real environment as close as possible because // it's not expected to disable BankingTracer on production environments. // @@ -758,70 +772,51 @@ impl BankingSimulator { BANKING_TRACE_DIR_DEFAULT_BYTE_LIMIT, ); - let (non_vote_sender, non_vote_receiver) = retracer.create_channel_non_vote(); - let (tpu_vote_sender, tpu_vote_receiver) = retracer.create_channel_tpu_vote(); - let (gossip_vote_sender, gossip_vote_receiver) = retracer.create_channel_gossip_vote(); + let Channels { + non_vote_sender, + non_vote_receiver, + tpu_vote_sender, + tpu_vote_receiver, + gossip_vote_sender, + gossip_vote_receiver, + } = retracer.create_channels(unified_scheduler_pool.as_ref()); let connection_cache = Arc::new(ConnectionCache::new("connection_cache_sim")); let (replay_vote_sender, _replay_vote_receiver) = unbounded(); let (retransmit_slots_sender, retransmit_slots_receiver) = unbounded(); - let shred_version = compute_shred_version( - &genesis_config.hash(), - Some(&bank_forks.read().unwrap().root_bank().hard_forks()), - ); + let shred_version = compute_shred_version(&genesis_config.hash(), Some(&bank.hard_forks())); let (sender, _receiver) = tokio::sync::mpsc::channel(1); // Create a completely-dummy ClusterInfo for the broadcast stage. // We only need it to write shreds into the blockstore and it seems given ClusterInfo is // irrelevant for the necessary minimum work for this simulation. 
let random_keypair = Arc::new(Keypair::new()); - let cluster_info = Arc::new(ClusterInfo::new( + let cluster_info_for_broadcast_stage = Arc::new(ClusterInfo::new( Node::new_localhost_with_pubkey(&random_keypair.pubkey()).info, random_keypair, SocketAddrSpace::Unspecified, )); - // Broadcast stage is needed to save the simulated blocks for post-run analysis by - // inserting produced shreds into the blockstore. - let broadcast_stage = BroadcastStageType::Standard.new_broadcast_stage( - vec![bind_to_localhost().unwrap()], - cluster_info.clone(), - entry_receiver, - retransmit_slots_receiver, - exit.clone(), - blockstore.clone(), - bank_forks.clone(), - shred_version, - sender, - ); - - info!("Start banking stage!..."); // Create a partially-dummy ClusterInfo for the banking stage. - let cluster_info = Arc::new(DummyClusterInfo { + let cluster_info_for_banking_stage = Arc::new(DummyClusterInfo { id: simulated_leader.into(), }); let prioritization_fee_cache = &Arc::new(PrioritizationFeeCache::new(0u64)); - let banking_stage = BankingStage::new_num_threads( - block_production_method.clone(), - &cluster_info, - &poh_recorder, - non_vote_receiver, - tpu_vote_receiver, - gossip_vote_receiver, - BankingStage::num_threads(), - None, - replay_vote_sender, - None, - connection_cache, - bank_forks.clone(), - prioritization_fee_cache, - false, - ); - let (&_slot, &raw_base_event_time) = freeze_time_by_slot .range(parent_slot..) .next() .expect("timed hashes"); - let base_event_time = raw_base_event_time - WARMUP_DURATION; + + let poh_bank = bank_forks.read().unwrap().root_bank(); + let target_ns_per_slot = solana_poh::poh_service::PohService::target_ns_per_tick( + poh_bank.ticks_per_slot(), + genesis_config.poh_config.target_tick_duration.as_nanos() as u64, + ) * poh_bank.ticks_per_slot(); + let warmup_duration = Duration::from_nanos( + (self.first_simulated_slot - poh_bank.slot()) * target_ns_per_slot, + ); + // if slot is too short => bail + info!("warmup_duration: {:?}", warmup_duration); + let base_event_time = raw_base_event_time - warmup_duration; let total_batch_count = packet_batches_by_time.len(); let timed_batches_to_send = packet_batches_by_time.split_off(&base_event_time); @@ -845,6 +840,68 @@ impl BankingSimulator { .zip_eq(batch_and_tx_counts) .collect::>(); + info!("Poh is starting!"); + let (poh_recorder, entry_receiver, record_receiver) = + new_poh_recorder.unwrap_or_else(|| { + PohRecorder::new_with_clear_signal( + poh_bank.tick_height(), + poh_bank.last_blockhash(), + poh_bank.clone(), + None, + poh_bank.ticks_per_slot(), + false, + blockstore.clone(), + blockstore.get_new_shred_signal(0), + &leader_schedule_cache, + &genesis_config.poh_config, + None, + exit.clone(), + ) + }); + drop(poh_bank); + let poh_recorder = Arc::new(RwLock::new(poh_recorder)); + let poh_service = PohService::new( + poh_recorder.clone(), + &genesis_config.poh_config, + exit.clone(), + bank.ticks_per_slot(), + DEFAULT_PINNED_CPU_CORE, + DEFAULT_HASHES_PER_BATCH, + record_receiver, + ); + // Broadcast stage is needed to save the simulated blocks for post-run analysis by + // inserting produced shreds into the blockstore. 
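+    // (It must be constructed after the poh recorder above, because it consumes the
+    // entry_receiver that comes out of the, possibly pre-built, NewPohRecorder tuple.)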
+ let broadcast_stage = BroadcastStageType::Standard.new_broadcast_stage( + vec![bind_to_localhost().unwrap()], + cluster_info_for_broadcast_stage.clone(), + entry_receiver, + retransmit_slots_receiver, + exit.clone(), + blockstore.clone(), + bank_forks.clone(), + shred_version, + sender, + ); + + info!("Start banking stage!..."); + let banking_stage = BankingStage::new_num_threads( + block_production_method.clone(), + &cluster_info_for_banking_stage, + &poh_recorder, + non_vote_receiver, + tpu_vote_receiver, + gossip_vote_receiver, + BankingStage::num_threads(), + None, + replay_vote_sender, + None, + connection_cache, + bank_forks.clone(), + prioritization_fee_cache, + false, + unified_scheduler_pool, + ); + let sender_loop = SenderLoop { parent_slot, first_simulated_slot: self.first_simulated_slot, @@ -855,6 +912,7 @@ impl BankingSimulator { raw_base_event_time, total_batch_count, timed_batches_to_send, + warmup_duration, }; let simulator_loop = SimulatorLoop { @@ -889,23 +947,42 @@ impl BankingSimulator { bank_forks: Arc<RwLock<BankForks>>, blockstore: Arc<Blockstore>, block_production_method: BlockProductionMethod, + unified_scheduler_pool: Option<Arc<DefaultSchedulerPool>>, + new_poh_recorder: Option<NewPohRecorder>, ) -> Result<(), SimulateError> { let (sender_loop, simulator_loop, simulator_threads) = self.prepare_simulation( genesis_config, bank_forks, blockstore, block_production_method, + unified_scheduler_pool, + new_poh_recorder, + ); + + info!("warmed up bank: creating...."); + let warmed_up_bank = Bank::new_from_parent( + simulator_loop.bank.clone_without_scheduler(), + &simulator_loop.simulated_leader, + simulator_loop.first_simulated_slot, ); + info!("warmed up bank: done!!!"); sender_loop.log_starting(); let base_simulation_time = SystemTime::now(); // Spawning and entering these two loops must be done at the same time as they're timed. // So, all the mundane setup must be done in advance. let sender_thread = sender_loop.spawn(base_simulation_time)?; - let (sender_thread, retransmit_slots_sender) = - simulator_loop.enter(base_simulation_time, sender_thread); - simulator_threads.finish(sender_thread, retransmit_slots_sender); + let handle = thread::Builder::new() + .name("solSimLoop".into()) + .spawn(move || { + let (sender_thread, retransmit_slots_sender) = + simulator_loop.enter(base_simulation_time, sender_thread, warmed_up_bank); + + simulator_threads.finish(sender_thread, retransmit_slots_sender); + }) + .unwrap(); + let () = handle.join().unwrap(); Ok(()) } diff --git a/core/src/banking_stage.rs b/core/src/banking_stage.rs index 49ccdb6ae15..2ecbe4ea537 100644 --- a/core/src/banking_stage.rs +++ b/core/src/banking_stage.rs @@ -2,6 +2,8 @@ //! to construct a software pipeline. The stage uses all available CPU cores and //! can do its processing in parallel with signature verification on the GPU. 
+#[cfg(feature = "dev-context-only-utils")] +use qualifier_attr::qualifiers; use { self::{ committer::Committer, @@ -21,7 +23,9 @@ use { packet_deserializer::PacketDeserializer, transaction_scheduler::{ prio_graph_scheduler::PrioGraphScheduler, - scheduler_controller::SchedulerController, scheduler_error::SchedulerError, + receive_and_buffer::{calculate_max_age, calculate_priority_and_cost}, + scheduler_controller::SchedulerController, + scheduler_error::SchedulerError, }, }, banking_trace::BankingPacketReceiver, @@ -30,6 +34,7 @@ use { }, crossbeam_channel::{unbounded, Receiver, RecvTimeoutError, Sender}, histogram::Histogram, + solana_accounts_db::account_locks::validate_account_locks, solana_client::connection_cache::ConnectionCache, solana_gossip::{cluster_info::ClusterInfo, contact_info::ContactInfo}, solana_ledger::blockstore_processor::TransactionStatusSender, @@ -37,10 +42,18 @@ use { solana_perf::{data_budget::DataBudget, packet::PACKETS_PER_BATCH}, solana_poh::poh_recorder::{PohRecorder, TransactionRecorder}, solana_runtime::{ - bank_forks::BankForks, prioritization_fee_cache::PrioritizationFeeCache, + bank::Bank, bank_forks::BankForks, prioritization_fee_cache::PrioritizationFeeCache, vote_sender_types::ReplayVoteSender, }, - solana_sdk::{pubkey::Pubkey, timing::AtomicInterval}, + solana_runtime_transaction::instructions_processor::process_compute_budget_instructions, + solana_sdk::{ + pubkey::Pubkey, + scheduling::{SchedulingMode, TaskKey}, + timing::AtomicInterval, + }, + solana_svm_transaction::svm_message::SVMMessage, + solana_unified_scheduler_logic::TransactionContext, + solana_unified_scheduler_pool::{BankingStageAdapter, DefaultSchedulerPool}, std::{ cmp, env, ops::Deref, @@ -364,6 +377,7 @@ impl BankingStage { bank_forks: Arc>, prioritization_fee_cache: &Arc, enable_forwarding: bool, + unified_scheduler_pool: Option>, ) -> Self { Self::new_num_threads( block_production_method, @@ -380,6 +394,7 @@ impl BankingStage { bank_forks, prioritization_fee_cache, enable_forwarding, + unified_scheduler_pool, ) } @@ -399,9 +414,12 @@ impl BankingStage { bank_forks: Arc>, prioritization_fee_cache: &Arc, enable_forwarding: bool, + unified_scheduler_pool: Option>, ) -> Self { + use BlockProductionMethod::*; + match block_production_method { - BlockProductionMethod::CentralScheduler => Self::new_central_scheduler( + CentralScheduler => Self::new_central_scheduler( cluster_info, poh_recorder, non_vote_receiver, @@ -416,6 +434,16 @@ impl BankingStage { prioritization_fee_cache, enable_forwarding, ), + UnifiedScheduler => Self::new_unified_scheduler( + cluster_info, + poh_recorder, + non_vote_receiver, + tpu_vote_receiver, + gossip_vote_receiver, + num_threads, + bank_forks, + unified_scheduler_pool.unwrap(), + ), } } @@ -645,6 +673,109 @@ impl BankingStage { Self { bank_thread_hdls } } + pub fn new_unified_scheduler( + cluster_info: &impl LikeClusterInfo, + poh_recorder: &Arc>, + non_vote_receiver: BankingPacketReceiver, + tpu_vote_receiver: BankingPacketReceiver, + gossip_vote_receiver: BankingPacketReceiver, + num_threads: u32, + bank_forks: Arc>, + unified_scheduler_pool: Arc, + ) -> Self { + assert!(non_vote_receiver.same_channel(&tpu_vote_receiver)); + assert!(non_vote_receiver.same_channel(&gossip_vote_receiver)); + drop((tpu_vote_receiver, gossip_vote_receiver)); + let unified_receiver = non_vote_receiver; + + // todo: forwarding + let decision_maker = DecisionMaker::new(cluster_info.id(), poh_recorder.clone()); + + let banking_stage_monitor = Box::new(decision_maker.clone()); + + 
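+        // The pool takes over from here: the factory closure below is handed a
+        // BankingStageAdapter and returns a per-batch handler that deserializes, filters,
+        // and submits packets as unified-scheduler tasks. BankingStage itself spawns no
+        // threads in this mode (note the empty bank_thread_hdls returned below).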
unified_scheduler_pool.register_banking_stage( + unified_receiver, + (num_threads - NUM_VOTE_PROCESSING_THREADS) as usize, + banking_stage_monitor, + Box::new(move |adapter: Arc<BankingStageAdapter>| { + let decision_maker = decision_maker.clone(); + let bank_forks = bank_forks.clone(); + + Box::new(move |batches, task_submitter| { + let decision = decision_maker.make_consume_or_forward_decision(); + if matches!(decision, BufferedPacketsDecision::Forward) { + return; + } + let bank = bank_forks.read().unwrap().root_bank(); + let alt_resolved_slot = bank.slot(); + let sanitized_epoch = bank.epoch(); + let transaction_account_lock_limit = bank.get_transaction_account_lock_limit(); + let batches = batches.0.iter(); + for batch in batches { + // Over-provision task ids; some of the packets could turn out to be invalid. + let task_id_base = adapter.generate_task_ids(batch.len() as u64); + let packets = PacketDeserializer::deserialize_packets_with_indexes(batch); + + for (packet, packet_index) in packets { + let Some((transaction, deactivation_slot)) = packet + .build_sanitized_transaction( + bank.vote_only_bank(), + &bank, + bank.get_reserved_account_keys(), + ) + else { + continue; + }; + + let Some(_) = validate_account_locks( + transaction.account_keys(), + transaction_account_lock_limit, + ) + .ok() else { + continue; + }; + + let Some(compute_budget_limits) = process_compute_budget_instructions( + SVMMessage::program_instructions_iter(transaction.message()), + ) + .ok() else { + continue; + }; + + let (priority, _cost) = calculate_priority_and_cost( + &transaction, + &compute_budget_limits.into(), + &bank, + ); + + let context = TransactionContext::BlockProduction(calculate_max_age( + sanitized_epoch, + deactivation_slot, + alt_resolved_slot, + )); + + let index = { + let reversed_priority = (u64::MAX - priority) as TaskKey; + let task_id = (task_id_base + packet_index as u64) as TaskKey; + reversed_priority << const { TaskKey::BITS / 2 } | task_id + }; + + let Some(task) = adapter.create_new_task(transaction, context, index) + else { + continue; + }; + task_submitter(task); + } + } + }) + }), + ); + + Self { + bank_thread_hdls: vec![], + } + } + fn spawn_thread_local_multi_iterator_thread( id: u32, packet_receiver: BankingPacketReceiver, @@ -809,11 +940,32 @@ impl BankingStage { } } +#[cfg_attr(feature = "dev-context-only-utils", qualifiers(pub))] +pub(crate) fn update_bank_forks_and_poh_recorder_for_new_tpu_bank( + bank_forks: &RwLock<BankForks>, + poh_recorder: &RwLock<PohRecorder>, + tpu_bank: Bank, + track_transaction_indexes: bool, +) { + // A write lock for the poh recorder must be grabbed for the entire duration of inserting the + // new tpu bank into the bank forks. That's because any buffered transactions could be + // executed immediately after the bank forks update, when unified scheduler is enabled for + // block production. Otherwise, the unified scheduler would be hit with false errors due to + // there being no bank in the poh recorder yet. 
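+    // For example: were the tpu bank inserted into bank forks before this lock is taken, a
+    // block-production scheduler thread could pick it up and try to record before set_bank()
+    // below has run, and fail spuriously.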
+ let mut poh_recorder = poh_recorder.write().unwrap(); + + let tpu_bank = bank_forks + .write() + .unwrap() + .insert_with_scheduling_mode(SchedulingMode::BlockProduction, tpu_bank); + poh_recorder.set_bank(tpu_bank, track_transaction_indexes); +} + #[cfg(test)] mod tests { use { super::*, - crate::banking_trace::{BankingPacketBatch, BankingTracer}, + crate::banking_trace::{BankingPacketBatch, BankingTracer, Channels}, crossbeam_channel::{unbounded, Receiver}, itertools::Itertools, solana_entry::entry::{self, Entry, EntrySlice}, @@ -874,10 +1026,14 @@ mod tests { let genesis_config = create_genesis_config(2).genesis_config; let (bank, bank_forks) = Bank::new_no_wallclock_throttle_for_tests(&genesis_config); let banking_tracer = BankingTracer::new_disabled(); - let (non_vote_sender, non_vote_receiver) = banking_tracer.create_channel_non_vote(); - let (tpu_vote_sender, tpu_vote_receiver) = banking_tracer.create_channel_tpu_vote(); - let (gossip_vote_sender, gossip_vote_receiver) = - banking_tracer.create_channel_gossip_vote(); + let Channels { + non_vote_sender, + non_vote_receiver, + tpu_vote_sender, + tpu_vote_receiver, + gossip_vote_sender, + gossip_vote_receiver, + } = banking_tracer.create_channels(None); let ledger_path = get_tmp_ledger_path_auto_delete!(); { let blockstore = Arc::new( @@ -904,6 +1060,7 @@ mod tests { bank_forks, &Arc::new(PrioritizationFeeCache::new(0u64)), false, + None, ); drop(non_vote_sender); drop(tpu_vote_sender); @@ -926,10 +1083,14 @@ mod tests { let (bank, bank_forks) = Bank::new_no_wallclock_throttle_for_tests(&genesis_config); let start_hash = bank.last_blockhash(); let banking_tracer = BankingTracer::new_disabled(); - let (non_vote_sender, non_vote_receiver) = banking_tracer.create_channel_non_vote(); - let (tpu_vote_sender, tpu_vote_receiver) = banking_tracer.create_channel_tpu_vote(); - let (gossip_vote_sender, gossip_vote_receiver) = - banking_tracer.create_channel_gossip_vote(); + let Channels { + non_vote_sender, + non_vote_receiver, + tpu_vote_sender, + tpu_vote_receiver, + gossip_vote_sender, + gossip_vote_receiver, + } = banking_tracer.create_channels(None); let ledger_path = get_tmp_ledger_path_auto_delete!(); { let blockstore = Arc::new( @@ -960,6 +1121,7 @@ mod tests { bank_forks, &Arc::new(PrioritizationFeeCache::new(0u64)), false, + None, ); trace!("sending bank"); drop(non_vote_sender); @@ -1004,10 +1166,14 @@ mod tests { let (bank, bank_forks) = Bank::new_no_wallclock_throttle_for_tests(&genesis_config); let start_hash = bank.last_blockhash(); let banking_tracer = BankingTracer::new_disabled(); - let (non_vote_sender, non_vote_receiver) = banking_tracer.create_channel_non_vote(); - let (tpu_vote_sender, tpu_vote_receiver) = banking_tracer.create_channel_tpu_vote(); - let (gossip_vote_sender, gossip_vote_receiver) = - banking_tracer.create_channel_gossip_vote(); + let Channels { + non_vote_sender, + non_vote_receiver, + tpu_vote_sender, + tpu_vote_receiver, + gossip_vote_sender, + gossip_vote_receiver, + } = banking_tracer.create_channels(None); let ledger_path = get_tmp_ledger_path_auto_delete!(); { let blockstore = Arc::new( @@ -1040,6 +1206,7 @@ mod tests { bank_forks.clone(), // keep a local-copy of bank-forks so worker threads do not lose weak access to bank-forks &Arc::new(PrioritizationFeeCache::new(0u64)), false, + None, ); // fund another account so we can send 2 good transactions in a single batch. @@ -1138,7 +1305,14 @@ mod tests { .. 
} = create_slow_genesis_config(2); let banking_tracer = BankingTracer::new_disabled(); - let (non_vote_sender, non_vote_receiver) = banking_tracer.create_channel_non_vote(); + let Channels { + non_vote_sender, + non_vote_receiver, + tpu_vote_sender, + tpu_vote_receiver, + gossip_vote_sender, + gossip_vote_receiver, + } = banking_tracer.create_channels(None); // Process a batch that includes a transaction that receives two lamports. let alice = Keypair::new(); @@ -1168,9 +1342,6 @@ mod tests { .send(BankingPacketBatch::new((packet_batches, None))) .unwrap(); - let (tpu_vote_sender, tpu_vote_receiver) = banking_tracer.create_channel_tpu_vote(); - let (gossip_vote_sender, gossip_vote_receiver) = - banking_tracer.create_channel_gossip_vote(); let ledger_path = get_tmp_ledger_path_auto_delete!(); { let (replay_vote_sender, _replay_vote_receiver) = unbounded(); @@ -1361,10 +1532,14 @@ mod tests { let (bank, bank_forks) = Bank::new_no_wallclock_throttle_for_tests(&genesis_config); let start_hash = bank.last_blockhash(); let banking_tracer = BankingTracer::new_disabled(); - let (non_vote_sender, non_vote_receiver) = banking_tracer.create_channel_non_vote(); - let (tpu_vote_sender, tpu_vote_receiver) = banking_tracer.create_channel_tpu_vote(); - let (gossip_vote_sender, gossip_vote_receiver) = - banking_tracer.create_channel_gossip_vote(); + let Channels { + non_vote_sender, + non_vote_receiver, + tpu_vote_sender, + tpu_vote_receiver, + gossip_vote_sender, + gossip_vote_receiver, + } = banking_tracer.create_channels(None); let ledger_path = get_tmp_ledger_path_auto_delete!(); { let blockstore = Arc::new( @@ -1397,6 +1572,7 @@ mod tests { bank_forks, &Arc::new(PrioritizationFeeCache::new(0u64)), false, + None, ); let keypairs = (0..100).map(|_| Keypair::new()).collect_vec(); diff --git a/core/src/banking_stage/consume_worker.rs b/core/src/banking_stage/consume_worker.rs index 815c51b9b6b..2e8355bac81 100644 --- a/core/src/banking_stage/consume_worker.rs +++ b/core/src/banking_stage/consume_worker.rs @@ -752,7 +752,7 @@ mod tests { crate::banking_stage::{ committer::Committer, qos_service::QosService, - scheduler_messages::{MaxAge, TransactionBatchId}, + scheduler_messages::TransactionBatchId, tests::{create_slow_genesis_config, sanitize_transactions, simulate_poh}, }, crossbeam_channel::unbounded, @@ -776,6 +776,7 @@ mod tests { }, poh_config::PohConfig, pubkey::Pubkey, + scheduling::MaxAge, signature::Keypair, signer::Signer, system_instruction, system_transaction, diff --git a/core/src/banking_stage/consumer.rs b/core/src/banking_stage/consumer.rs index b28a46cd512..32da36a98f9 100644 --- a/core/src/banking_stage/consumer.rs +++ b/core/src/banking_stage/consumer.rs @@ -7,7 +7,6 @@ use { }, leader_slot_timing_metrics::LeaderExecuteAndCommitTimings, qos_service::QosService, - scheduler_messages::MaxAge, unprocessed_transaction_storage::{ConsumeScannerPayload, UnprocessedTransactionStorage}, BankingStageStats, }, @@ -32,6 +31,7 @@ use { clock::{FORWARD_TRANSACTIONS_TO_LEADER_AT_SLOT_OFFSET, MAX_PROCESSING_AGE}, fee::FeeBudgetLimits, saturating_add_assign, + scheduling::MaxAge, timing::timestamp, transaction::{self, TransactionError}, }, @@ -438,36 +438,12 @@ impl Consumer { .feature_set .is_active(&feature_set::move_precompile_verification_to_svm::id()); - // Need to filter out transactions since they were sanitized earlier. - // This means that the transaction may cross and epoch boundary (not allowed), - // or account lookup tables may have been closed. 
let pre_results = txs.iter().zip(max_ages).map(|(tx, max_age)| { - // If the transaction was sanitized before this bank's epoch, - // additional checks are necessary. - if bank.epoch() != max_age.sanitized_epoch { - // Reserved key set may have changed, so we must verify that - // no writable keys are reserved. - bank.check_reserved_keys(tx)?; - } - - if bank.slot() > max_age.alt_invalidation_slot { - // The address table lookup **may** have expired, but the - // expiration is not guaranteed since there may have been - // skipped slot. - // If the addresses still resolve here, then the transaction is still - // valid, and we can continue with processing. - // If they do not, then the ATL has expired and the transaction - // can be dropped. - let (_addresses, _deactivation_slot) = - bank.load_addresses_from_ref(tx.message_address_table_lookups())?; - } - - // Verify pre-compiles. - if !move_precompile_verification_to_svm { - verify_precompiles(tx, &bank.feature_set)?; - } - - Ok(()) + bank.refilter_prebuilt_block_production_transaction( + tx, + max_age, + move_precompile_verification_to_svm, + ) }); self.process_and_record_transactions_with_pre_results(bank, txs, 0, pre_results) } diff --git a/core/src/banking_stage/decision_maker.rs b/core/src/banking_stage/decision_maker.rs index 1bd0b224fdf..99ae967c7e5 100644 --- a/core/src/banking_stage/decision_maker.rs +++ b/core/src/banking_stage/decision_maker.rs @@ -7,7 +7,8 @@ use { }, pubkey::Pubkey, }, - std::sync::{Arc, RwLock}, + solana_unified_scheduler_pool::{BankingStageMonitor, BankingStageStatus}, + std::sync::{atomic::Ordering::Relaxed, Arc, RwLock}, }; #[derive(Debug, Clone)] @@ -28,9 +29,10 @@ impl BufferedPacketsDecision { } } -#[derive(Clone)] +#[derive(Clone, derive_more::Debug)] pub struct DecisionMaker { my_pubkey: Pubkey, + #[debug("{poh_recorder:p}")] poh_recorder: Arc>, } @@ -112,6 +114,21 @@ impl DecisionMaker { } } +impl BankingStageMonitor for DecisionMaker { + fn status(&self) -> BankingStageStatus { + if self.poh_recorder.read().unwrap().is_exited.load(Relaxed) { + BankingStageStatus::Exited + } else if matches!( + self.make_consume_or_forward_decision(), + BufferedPacketsDecision::Forward, + ) { + BankingStageStatus::Inactive + } else { + BankingStageStatus::Active + } + } +} + #[cfg(test)] mod tests { use { @@ -159,7 +176,7 @@ mod tests { let my_pubkey = Pubkey::new_unique(); let decision_maker = DecisionMaker::new(my_pubkey, poh_recorder.clone()); - poh_recorder.write().unwrap().reset(bank.clone(), None); + let _ = poh_recorder.write().unwrap().reset(bank.clone(), None); let slot = bank.slot() + 1; let bank = Arc::new(Bank::new_from_parent(bank, &my_pubkey, slot)); @@ -176,7 +193,7 @@ mod tests { // Will be leader shortly - Hold for next_leader_slot_offset in [0, 1].into_iter() { let next_leader_slot = bank.slot() + next_leader_slot_offset; - poh_recorder.write().unwrap().reset( + let _ = poh_recorder.write().unwrap().reset( bank.clone(), Some(( next_leader_slot, @@ -193,7 +210,7 @@ mod tests { // Will be leader - ForwardAndHold for next_leader_slot_offset in [2, 19].into_iter() { let next_leader_slot = bank.slot() + next_leader_slot_offset; - poh_recorder.write().unwrap().reset( + let _ = poh_recorder.write().unwrap().reset( bank.clone(), Some(( next_leader_slot, @@ -209,7 +226,7 @@ mod tests { // Known leader, not me - Forward { - poh_recorder.write().unwrap().reset(bank, None); + let _ = poh_recorder.write().unwrap().reset(bank, None); let decision = decision_maker.make_consume_or_forward_decision(); 
assert_matches!(decision, BufferedPacketsDecision::Forward); } diff --git a/core/src/banking_stage/packet_deserializer.rs b/core/src/banking_stage/packet_deserializer.rs index 78fab371825..b3ed73fb33c 100644 --- a/core/src/banking_stage/packet_deserializer.rs +++ b/core/src/banking_stage/packet_deserializer.rs @@ -5,12 +5,9 @@ use { immutable_deserialized_packet::{DeserializedPacketError, ImmutableDeserializedPacket}, packet_filter::PacketFilterFailure, }, - crate::{ - banking_trace::{BankingPacketBatch, BankingPacketReceiver}, - sigverify::SigverifyTracerPacketStats, - }, + crate::banking_trace::{BankingPacketBatch, BankingPacketReceiver}, crossbeam_channel::RecvTimeoutError, - solana_perf::packet::PacketBatch, + solana_perf::packet::{PacketBatch, SigverifyTracerPacketStats}, solana_sdk::saturating_add_assign, std::time::{Duration, Instant}, }; @@ -219,6 +216,18 @@ impl PacketDeserializer { } }) } + + pub(crate) fn deserialize_packets_with_indexes( + packet_batch: &PacketBatch, + ) -> impl Iterator + '_ { + let packet_indexes = PacketDeserializer::generate_packet_indexes(packet_batch); + packet_indexes.into_iter().filter_map(move |packet_index| { + let packet = packet_batch[packet_index].clone(); + ImmutableDeserializedPacket::new(packet) + .ok() + .map(|packet| (packet, packet_index)) + }) + } } #[cfg(test)] diff --git a/core/src/banking_stage/scheduler_messages.rs b/core/src/banking_stage/scheduler_messages.rs index 1c7cf31592b..65ca027fa74 100644 --- a/core/src/banking_stage/scheduler_messages.rs +++ b/core/src/banking_stage/scheduler_messages.rs @@ -1,7 +1,4 @@ -use { - solana_sdk::clock::{Epoch, Slot}, - std::fmt::Display, -}; +use {solana_sdk::scheduling::MaxAge, std::fmt::Display}; /// A unique identifier for a transaction batch. #[derive(Clone, Copy, Debug, Hash, PartialEq, Eq)] @@ -21,19 +18,6 @@ impl Display for TransactionBatchId { pub type TransactionId = usize; -#[derive(Copy, Clone, Debug, PartialEq, Eq)] -pub struct MaxAge { - pub sanitized_epoch: Epoch, - pub alt_invalidation_slot: Slot, -} - -impl MaxAge { - pub const MAX: Self = Self { - sanitized_epoch: Epoch::MAX, - alt_invalidation_slot: Slot::MAX, - }; -} - /// Message: [Scheduler -> Worker] /// Transactions to be consumed (i.e. 
executed, recorded, and committed) pub struct ConsumeWork { diff --git a/core/src/banking_stage/transaction_scheduler/prio_graph_scheduler.rs b/core/src/banking_stage/transaction_scheduler/prio_graph_scheduler.rs index 950f506fd51..1d14b42cb5a 100644 --- a/core/src/banking_stage/transaction_scheduler/prio_graph_scheduler.rs +++ b/core/src/banking_stage/transaction_scheduler/prio_graph_scheduler.rs @@ -8,9 +8,7 @@ use { crate::banking_stage::{ consumer::TARGET_NUM_TRANSACTIONS_PER_BATCH, read_write_account_set::ReadWriteAccountSet, - scheduler_messages::{ - ConsumeWork, FinishedConsumeWork, MaxAge, TransactionBatchId, TransactionId, - }, + scheduler_messages::{ConsumeWork, FinishedConsumeWork, TransactionBatchId, TransactionId}, transaction_scheduler::{ transaction_priority_id::TransactionPriorityId, transaction_state::TransactionState, transaction_state_container::StateContainer, @@ -22,7 +20,7 @@ use { solana_cost_model::block_cost_limits::MAX_BLOCK_UNITS, solana_measure::measure_us, solana_runtime_transaction::transaction_with_meta::TransactionWithMeta, - solana_sdk::{pubkey::Pubkey, saturating_add_assign}, + solana_sdk::{pubkey::Pubkey, saturating_add_assign, scheduling::MaxAge}, solana_svm_transaction::svm_message::SVMMessage, }; diff --git a/core/src/banking_stage/transaction_scheduler/receive_and_buffer.rs b/core/src/banking_stage/transaction_scheduler/receive_and_buffer.rs index 52d35f93e54..2ccb4b5a573 100644 --- a/core/src/banking_stage/transaction_scheduler/receive_and_buffer.rs +++ b/core/src/banking_stage/transaction_scheduler/receive_and_buffer.rs @@ -6,7 +6,7 @@ use { crate::banking_stage::{ decision_maker::BufferedPacketsDecision, immutable_deserialized_packet::ImmutableDeserializedPacket, - packet_deserializer::PacketDeserializer, scheduler_messages::MaxAge, + packet_deserializer::PacketDeserializer, transaction_scheduler::transaction_state::SanitizedTransactionTTL, TransactionStateContainer, }, @@ -26,6 +26,7 @@ use { clock::{Epoch, Slot, MAX_PROCESSING_AGE}, fee::FeeBudgetLimits, saturating_add_assign, + scheduling::MaxAge, transaction::SanitizedTransaction, }, solana_svm::transaction_error_metrics::TransactionErrorMetrics, @@ -296,7 +297,7 @@ impl SanitizedTransactionReceiveAndBuffer { /// from user input. They should never be zero. /// Any difference in the prioritization is negligible for /// the current transaction costs. -fn calculate_priority_and_cost( +pub(crate) fn calculate_priority_and_cost( transaction: &RuntimeTransaction, fee_budget_limits: &FeeBudgetLimits, bank: &Bank, @@ -333,7 +334,7 @@ fn calculate_priority_and_cost( /// slots, the value used here is the lower-bound on the deactivation /// period, i.e. the transaction's address lookups are valid until /// AT LEAST this slot. 
-fn calculate_max_age( +pub(crate) fn calculate_max_age( sanitized_epoch: Epoch, deactivation_slot: Slot, current_slot: Slot, diff --git a/core/src/banking_stage/transaction_scheduler/scheduler_controller.rs b/core/src/banking_stage/transaction_scheduler/scheduler_controller.rs index 14a175b2018..ffa9c09b3bf 100644 --- a/core/src/banking_stage/transaction_scheduler/scheduler_controller.rs +++ b/core/src/banking_stage/transaction_scheduler/scheduler_controller.rs @@ -444,7 +444,6 @@ mod tests { transaction_scheduler::receive_and_buffer::SanitizedTransactionReceiveAndBuffer, }, banking_trace::BankingPacketBatch, - sigverify::SigverifyTracerPacketStats, }, crossbeam_channel::{unbounded, Receiver, Sender}, itertools::Itertools, @@ -453,7 +452,9 @@ mod tests { blockstore::Blockstore, genesis_utils::GenesisConfigInfo, get_tmp_ledger_path_auto_delete, leader_schedule_cache::LeaderScheduleCache, }, - solana_perf::packet::{to_packet_batches, PacketBatch, NUM_PACKETS}, + solana_perf::packet::{ + to_packet_batches, PacketBatch, SigverifyTracerPacketStats, NUM_PACKETS, + }, solana_poh::poh_recorder::{PohRecorder, Record, WorkingBankEntry}, solana_runtime::bank::Bank, solana_runtime_transaction::runtime_transaction::RuntimeTransaction, diff --git a/core/src/banking_stage/transaction_scheduler/transaction_state.rs b/core/src/banking_stage/transaction_scheduler/transaction_state.rs index 12c2b5de5f0..679ab462f64 100644 --- a/core/src/banking_stage/transaction_scheduler/transaction_state.rs +++ b/core/src/banking_stage/transaction_scheduler/transaction_state.rs @@ -1,8 +1,6 @@ use { - crate::banking_stage::{ - immutable_deserialized_packet::ImmutableDeserializedPacket, scheduler_messages::MaxAge, - }, - std::sync::Arc, + crate::banking_stage::immutable_deserialized_packet::ImmutableDeserializedPacket, + solana_sdk::scheduling::MaxAge, std::sync::Arc, }; /// Simple wrapper type to tie a sanitized transaction to max age slot. 
diff --git a/core/src/banking_stage/transaction_scheduler/transaction_state_container.rs b/core/src/banking_stage/transaction_scheduler/transaction_state_container.rs index cb57ee13c20..e58e9b5094c 100644 --- a/core/src/banking_stage/transaction_scheduler/transaction_state_container.rs +++ b/core/src/banking_stage/transaction_scheduler/transaction_state_container.rs @@ -210,13 +210,13 @@ impl TransactionStateContainer { mod tests { use { super::*, - crate::banking_stage::scheduler_messages::MaxAge, solana_runtime_transaction::runtime_transaction::RuntimeTransaction, solana_sdk::{ compute_budget::ComputeBudgetInstruction, hash::Hash, message::Message, packet::Packet, + scheduling::MaxAge, signature::Keypair, signer::Signer, system_instruction, diff --git a/core/src/banking_trace.rs b/core/src/banking_trace.rs index 6e0797c8c38..26bb5e486f9 100644 --- a/core/src/banking_trace.rs +++ b/core/src/banking_trace.rs @@ -1,11 +1,11 @@ use { - crate::sigverify::SigverifyTracerPacketStats, bincode::serialize_into, chrono::{DateTime, Local}, crossbeam_channel::{unbounded, Receiver, SendError, Sender, TryRecvError}, rolling_file::{RollingCondition, RollingConditionBasic, RollingFileAppender}, - solana_perf::packet::PacketBatch, + solana_perf::packet::{PacketBatch, SigverifyTracerPacketStats}, solana_sdk::{hash::Hash, slot_history::Slot}, + solana_unified_scheduler_pool::DefaultSchedulerPool, std::{ fs::{create_dir_all, remove_dir_all}, io::{self, Write}, @@ -65,7 +65,7 @@ pub struct BankingTracer { #[cfg_attr( feature = "frozen-abi", derive(AbiExample), - frozen_abi(digest = "6PCDw6YSEivfbwhbPmE4NAsXb88ZX6hkFnruP8B38nma") + frozen_abi(digest = "2skEFDxJCXuMq2LmRK7tJQk1Mzh6Xnouu4uJNixn3ezQ") )] #[derive(Serialize, Deserialize, Debug)] pub struct TimedTracedEvent(pub std::time::SystemTime, pub TracedEvent); @@ -178,6 +178,15 @@ pub fn receiving_loop_with_minimized_sender_overhead( Ok(()) } +pub struct Channels { + pub non_vote_sender: BankingPacketSender, + pub non_vote_receiver: BankingPacketReceiver, + pub tpu_vote_sender: BankingPacketSender, + pub tpu_vote_receiver: BankingPacketReceiver, + pub gossip_vote_sender: BankingPacketSender, + pub gossip_vote_receiver: BankingPacketReceiver, +} + impl BankingTracer { pub fn new( maybe_config: Option<(&PathBuf, Arc, DirByteLimit)>, @@ -220,22 +229,80 @@ impl BankingTracer { self.active_tracer.is_some() } + pub fn create_channels(&self, pool: Option<&Arc>) -> Channels { + if let Some(true) = pool.map(|pool| pool.block_production_supported()) { + let (non_vote_sender, non_vote_receiver) = self.create_channel_non_vote(); + let (tpu_vote_sender, tpu_vote_receiver) = + self.create_unified_channel_tpu_vote(&non_vote_sender, &non_vote_receiver); + let (gossip_vote_sender, gossip_vote_receiver) = + self.create_unified_channel_gossip_vote(&non_vote_sender, &non_vote_receiver); + + Channels { + non_vote_sender, + non_vote_receiver, + tpu_vote_sender, + tpu_vote_receiver, + gossip_vote_sender, + gossip_vote_receiver, + } + } else { + let (non_vote_sender, non_vote_receiver) = self.create_channel_non_vote(); + let (tpu_vote_sender, tpu_vote_receiver) = self.create_channel_tpu_vote(); + let (gossip_vote_sender, gossip_vote_receiver) = self.create_channel_gossip_vote(); + + Channels { + non_vote_sender, + non_vote_receiver, + tpu_vote_sender, + tpu_vote_receiver, + gossip_vote_sender, + gossip_vote_receiver, + } + } + } + fn create_channel(&self, label: ChannelLabel) -> (BankingPacketSender, BankingPacketReceiver) { Self::channel(label, 
self.active_tracer.as_ref().cloned()) } - pub fn create_channel_non_vote(&self) -> (BankingPacketSender, BankingPacketReceiver) { + fn create_channel_non_vote(&self) -> (BankingPacketSender, BankingPacketReceiver) { self.create_channel(ChannelLabel::NonVote) } - pub fn create_channel_tpu_vote(&self) -> (BankingPacketSender, BankingPacketReceiver) { + fn create_channel_tpu_vote(&self) -> (BankingPacketSender, BankingPacketReceiver) { self.create_channel(ChannelLabel::TpuVote) } - pub fn create_channel_gossip_vote(&self) -> (BankingPacketSender, BankingPacketReceiver) { + fn create_channel_gossip_vote(&self) -> (BankingPacketSender, BankingPacketReceiver) { self.create_channel(ChannelLabel::GossipVote) } + fn create_unified_channel_tpu_vote( + &self, + sender: &TracedSender, + receiver: &BankingPacketReceiver, + ) -> (BankingPacketSender, BankingPacketReceiver) { + Self::channel_inner( + ChannelLabel::TpuVote, + self.active_tracer.as_ref().cloned(), + sender.sender.clone(), + receiver.clone(), + ) + } + + fn create_unified_channel_gossip_vote( + &self, + sender: &TracedSender, + receiver: &BankingPacketReceiver, + ) -> (BankingPacketSender, BankingPacketReceiver) { + Self::channel_inner( + ChannelLabel::GossipVote, + self.active_tracer.as_ref().cloned(), + sender.sender.clone(), + receiver.clone(), + ) + } + pub fn hash_event(&self, slot: Slot, blockhash: &Hash, bank_hash: &Hash) { self.trace_event(|| { TimedTracedEvent( @@ -264,6 +331,15 @@ impl BankingTracer { active_tracer: Option, ) -> (TracedSender, Receiver) { let (sender, receiver) = unbounded(); + Self::channel_inner(label, active_tracer, sender, receiver) + } + + fn channel_inner( + label: ChannelLabel, + active_tracer: Option, + sender: Sender, + receiver: BankingPacketReceiver, + ) -> (TracedSender, Receiver) { (TracedSender::new(label, sender, active_tracer), receiver) } diff --git a/core/src/cluster_info_vote_listener.rs b/core/src/cluster_info_vote_listener.rs index 56869624812..526d297dc21 100644 --- a/core/src/cluster_info_vote_listener.rs +++ b/core/src/cluster_info_vote_listener.rs @@ -194,6 +194,7 @@ impl ClusterInfoVoteListener { verified_packets_sender: BankingPacketSender, vote_tracker: Arc, bank_forks: Arc>, + mut root_bank_cache: RootBankCache, subscriptions: Arc, verified_vote_sender: VerifiedVoteSender, gossip_verified_vote_hash_sender: GossipVerifiedVoteHashSender, @@ -205,7 +206,6 @@ impl ClusterInfoVoteListener { let (verified_vote_transactions_sender, verified_vote_transactions_receiver) = unbounded(); let listen_thread = { let exit = exit.clone(); - let mut root_bank_cache = RootBankCache::new(bank_forks.clone()); Builder::new() .name("solCiVoteLstnr".to_string()) .spawn(move || { diff --git a/core/src/replay_stage.rs b/core/src/replay_stage.rs index 9b998ca9a99..2edf18ce863 100644 --- a/core/src/replay_stage.rs +++ b/core/src/replay_stage.rs @@ -2,6 +2,7 @@ use { crate::{ + banking_stage::update_bank_forks_and_poh_recorder_for_new_tpu_bank, banking_trace::BankingTracer, cache_block_meta_service::CacheBlockMetaSender, cluster_info_vote_listener::{ @@ -70,6 +71,7 @@ use { hash::Hash, pubkey::Pubkey, saturating_add_assign, + scheduling::SchedulingMode, signature::{Keypair, Signature, Signer}, timing::timestamp, transaction::Transaction, @@ -2220,11 +2222,12 @@ impl ReplayStage { // new()-ing of its child bank banking_tracer.hash_event(parent.slot(), &parent.last_blockhash(), &parent.hash()); - let tpu_bank = bank_forks.write().unwrap().insert(tpu_bank); - poh_recorder - .write() - .unwrap() - .set_bank(tpu_bank, 
track_transaction_indexes); + update_bank_forks_and_poh_recorder_for_new_tpu_bank( + bank_forks, + poh_recorder, + tpu_bank, + track_transaction_indexes, + ); true } else { error!("{} No next leader found", my_pubkey); @@ -2804,6 +2807,28 @@ impl ReplayStage { } } + fn wait_for_cleared_bank(bank: BankWithScheduler) { + if matches!( + bank.scheduling_mode(), + Some(SchedulingMode::BlockProduction) + ) { + info!("Reaping cleared tpu_bank: {}...", bank.slot()); + if let Some((result, _completed_execute_timings)) = bank.wait_for_completed_scheduler() + { + info!( + "Reaped aborted tpu_bank with unified scheduler: {} {:?}", + bank.slot(), + result + ); + } else { + info!( + "Skipped to reap a tpu_bank (seems unified scheduler is disabled): {}", + bank.slot() + ); + } + } + } + fn reset_poh_recorder( my_pubkey: &Pubkey, blockstore: &Blockstore, @@ -2822,7 +2847,10 @@ impl ReplayStage { GRACE_TICKS_FACTOR * MAX_GRACE_SLOTS, ); - poh_recorder.write().unwrap().reset(bank, next_leader_slot); + let cleared_bank = poh_recorder.write().unwrap().reset(bank, next_leader_slot); + if let Some(cleared_bank) = cleared_bank { + Self::wait_for_cleared_bank(cleared_bank); + } let next_leader_msg = if let Some(next_leader_slot) = next_leader_slot { format!("My next leader slot is {}", next_leader_slot.0) @@ -8384,7 +8412,7 @@ pub(crate) mod tests { .expect("Just inserted"); progress.get_retransmit_info_mut(0).unwrap().retry_time = Instant::now(); - poh_recorder + let _ = poh_recorder .write() .unwrap() .reset(bank_to_dump, Some((slot_to_dump + 1, slot_to_dump + 1))); diff --git a/core/src/sigverify.rs b/core/src/sigverify.rs index 18984ecc4ef..e384b1cb3ea 100644 --- a/core/src/sigverify.rs +++ b/core/src/sigverify.rs @@ -12,49 +12,15 @@ use { banking_trace::{BankingPacketBatch, BankingPacketSender}, sigverify_stage::{SigVerifier, SigVerifyServiceError}, }, - solana_perf::{cuda_runtime::PinnedVec, packet::PacketBatch, recycler::Recycler, sigverify}, - solana_sdk::{packet::Packet, saturating_add_assign}, + solana_perf::{ + cuda_runtime::PinnedVec, + packet::{PacketBatch, SigverifyTracerPacketStats}, + recycler::Recycler, + sigverify, + }, + solana_sdk::packet::Packet, }; -#[cfg_attr(feature = "frozen-abi", derive(AbiExample))] -#[derive(Debug, Default, Clone, PartialEq, Eq, Serialize, Deserialize)] -pub struct SigverifyTracerPacketStats { - pub total_removed_before_sigverify_stage: usize, - pub total_tracer_packets_received_in_sigverify_stage: usize, - pub total_tracer_packets_deduped: usize, - pub total_excess_tracer_packets: usize, - pub total_tracker_packets_passed_sigverify: usize, -} - -impl SigverifyTracerPacketStats { - pub fn is_default(&self) -> bool { - *self == SigverifyTracerPacketStats::default() - } - - pub fn aggregate(&mut self, other: &SigverifyTracerPacketStats) { - saturating_add_assign!( - self.total_removed_before_sigverify_stage, - other.total_removed_before_sigverify_stage - ); - saturating_add_assign!( - self.total_tracer_packets_received_in_sigverify_stage, - other.total_tracer_packets_received_in_sigverify_stage - ); - saturating_add_assign!( - self.total_tracer_packets_deduped, - other.total_tracer_packets_deduped - ); - saturating_add_assign!( - self.total_excess_tracer_packets, - other.total_excess_tracer_packets - ); - saturating_add_assign!( - self.total_tracker_packets_passed_sigverify, - other.total_tracker_packets_passed_sigverify - ); - } -} - pub struct TransactionSigVerifier { packet_sender: BankingPacketSender, tracer_packet_stats: SigverifyTracerPacketStats, diff --git 
a/core/src/sigverify_stage.rs b/core/src/sigverify_stage.rs index ac7d9889db0..3059e4528a3 100644 --- a/core/src/sigverify_stage.rs +++ b/core/src/sigverify_stage.rs @@ -579,10 +579,13 @@ mod tests { loop { if let Ok(message) = verified_r.recv() { let (verifieds, tracer_packet_stats) = (&message.0, message.1.as_ref().unwrap()); - total_tracer_packets_received_in_sigverify_stage += - tracer_packet_stats.total_tracer_packets_received_in_sigverify_stage; + total_tracer_packets_received_in_sigverify_stage += tracer_packet_stats + .total_tracer_packets_received_in_sigverify_stage + .0; assert_eq!( - tracer_packet_stats.total_tracer_packets_received_in_sigverify_stage + tracer_packet_stats + .total_tracer_packets_received_in_sigverify_stage + .0 % packets_per_batch, 0, ); @@ -594,27 +597,28 @@ mod tests { // Also have to account for the fact that deduper could be cleared periodically, // in which case the first transaction in the next batch won't be deduped assert!( - (tracer_packet_stats.total_tracer_packets_deduped + (tracer_packet_stats.total_tracer_packets_deduped.0 == tracer_packet_stats .total_tracer_packets_received_in_sigverify_stage + .0 - 1) || (tracer_packet_stats.total_tracer_packets_deduped == tracer_packet_stats .total_tracer_packets_received_in_sigverify_stage) ); assert!( - (tracer_packet_stats.total_tracker_packets_passed_sigverify == 1) - || (tracer_packet_stats.total_tracker_packets_passed_sigverify == 0) + (tracer_packet_stats.total_tracker_packets_passed_sigverify.0 == 1) + || (tracer_packet_stats.total_tracker_packets_passed_sigverify.0 == 0) ); } else { - assert_eq!(tracer_packet_stats.total_tracer_packets_deduped, 0); + assert_eq!(tracer_packet_stats.total_tracer_packets_deduped.0, 0); assert!( (tracer_packet_stats.total_tracker_packets_passed_sigverify == tracer_packet_stats .total_tracer_packets_received_in_sigverify_stage) ); } - assert_eq!(tracer_packet_stats.total_excess_tracer_packets, 0); + assert_eq!(tracer_packet_stats.total_excess_tracer_packets.0, 0); received += verifieds.iter().map(|batch| batch.len()).sum::(); } diff --git a/core/src/tpu.rs b/core/src/tpu.rs index 091a5901c23..837f3c1d5a4 100644 --- a/core/src/tpu.rs +++ b/core/src/tpu.rs @@ -5,7 +5,7 @@ pub use solana_sdk::net::DEFAULT_TPU_COALESCE; use { crate::{ banking_stage::BankingStage, - banking_trace::{BankingTracer, TracerThread}, + banking_trace::{BankingTracer, Channels, TracerThread}, cluster_info_vote_listener::{ ClusterInfoVoteListener, DuplicateConfirmedSlotsSender, GossipVerifiedVoteHashSender, VerifiedVoteSender, VoteTracker, @@ -33,6 +33,7 @@ use { solana_runtime::{ bank_forks::BankForks, prioritization_fee_cache::PrioritizationFeeCache, + root_bank_cache::RootBankCache, vote_sender_types::{ReplayVoteReceiver, ReplayVoteSender}, }, solana_sdk::{clock::Slot, pubkey::Pubkey, quic::NotifyKeyUpdate, signature::Keypair}, @@ -44,6 +45,7 @@ use { streamer::StakedNodes, }, solana_turbine::broadcast_stage::{BroadcastStage, BroadcastStageType}, + solana_unified_scheduler_pool::DefaultSchedulerPool, std::{ collections::HashMap, net::{SocketAddr, UdpSocket}, @@ -99,6 +101,7 @@ impl Tpu { shred_version: u16, vote_tracker: Arc, bank_forks: Arc>, + root_bank_cache: RootBankCache, verified_vote_sender: VerifiedVoteSender, gossip_verified_vote_hash_sender: GossipVerifiedVoteHashSender, replay_vote_receiver: ReplayVoteReceiver, @@ -120,6 +123,7 @@ impl Tpu { block_production_method: BlockProductionMethod, enable_block_production_forwarding: bool, _generator_config: Option, /* vestigial code for replay 
invalidator */ + unified_scheduler_pool: Option>, ) -> (Self, Vec>) { let TpuSockets { transactions: transactions_sockets, @@ -156,7 +160,14 @@ impl Tpu { shared_staked_nodes_overrides, ); - let (non_vote_sender, non_vote_receiver) = banking_tracer.create_channel_non_vote(); + let Channels { + non_vote_sender, + non_vote_receiver, + tpu_vote_sender, + tpu_vote_receiver, + gossip_vote_sender, + gossip_vote_receiver, + } = banking_tracer.create_channels(unified_scheduler_pool.as_ref()); // Streamer for Votes: let SpawnServerResult { @@ -235,8 +246,6 @@ impl Tpu { SigVerifyStage::new(packet_receiver, verifier, "solSigVerTpu", "tpu-verifier") }; - let (tpu_vote_sender, tpu_vote_receiver) = banking_tracer.create_channel_tpu_vote(); - let vote_sigverify_stage = { let verifier = TransactionSigVerifier::new_reject_non_vote(tpu_vote_sender); SigVerifyStage::new( @@ -247,14 +256,13 @@ impl Tpu { ) }; - let (gossip_vote_sender, gossip_vote_receiver) = - banking_tracer.create_channel_gossip_vote(); let cluster_info_vote_listener = ClusterInfoVoteListener::new( exit.clone(), cluster_info.clone(), gossip_vote_sender, vote_tracker, bank_forks.clone(), + root_bank_cache, subscriptions.clone(), verified_vote_sender, gossip_verified_vote_hash_sender, @@ -278,6 +286,7 @@ impl Tpu { bank_forks.clone(), prioritization_fee_cache, enable_block_production_forwarding, + unified_scheduler_pool, ); let (entry_receiver, tpu_entry_notifier) = diff --git a/core/src/tracer_packet_stats.rs b/core/src/tracer_packet_stats.rs index 2269b35cc70..a9ff49c4d70 100644 --- a/core/src/tracer_packet_stats.rs +++ b/core/src/tracer_packet_stats.rs @@ -1,5 +1,5 @@ use { - crate::sigverify::SigverifyTracerPacketStats, + solana_perf::packet::SigverifyTracerPacketStats, solana_sdk::{pubkey::Pubkey, saturating_add_assign, timing::timestamp}, std::collections::HashSet, }; @@ -128,14 +128,14 @@ impl TracerPacketStats { "total_removed_before_sigverify", modifiable_tracer_packet_stats .sigverify_tracer_packet_stats - .total_removed_before_sigverify_stage as i64, + .total_removed_before_sigverify_stage.0 as i64, i64 ), ( "total_tracer_packets_received_in_sigverify", modifiable_tracer_packet_stats .sigverify_tracer_packet_stats - .total_tracer_packets_received_in_sigverify_stage + .total_tracer_packets_received_in_sigverify_stage.0 as i64, i64 ), @@ -143,21 +143,21 @@ impl TracerPacketStats { "total_tracer_packets_deduped_in_sigverify", modifiable_tracer_packet_stats .sigverify_tracer_packet_stats - .total_tracer_packets_deduped as i64, + .total_tracer_packets_deduped.0 as i64, i64 ), ( "total_excess_tracer_packets_discarded_in_sigverify", modifiable_tracer_packet_stats .sigverify_tracer_packet_stats - .total_excess_tracer_packets as i64, + .total_excess_tracer_packets.0 as i64, i64 ), ( "total_tracker_packets_passed_sigverify", modifiable_tracer_packet_stats .sigverify_tracer_packet_stats - .total_tracker_packets_passed_sigverify as i64, + .total_tracker_packets_passed_sigverify.0 as i64, i64 ), ( diff --git a/core/src/validator.rs b/core/src/validator.rs index e2d05f3f09e..e894b267b66 100644 --- a/core/src/validator.rs +++ b/core/src/validator.rs @@ -105,6 +105,7 @@ use { bank_forks::BankForks, commitment::BlockCommitmentCache, prioritization_fee_cache::PrioritizationFeeCache, + root_bank_cache::RootBankCache, runtime_config::RuntimeConfig, snapshot_archive_info::SnapshotArchiveInfoGetter, snapshot_bank_utils::{self, DISABLED_SNAPSHOT_ARCHIVE_INTERVAL}, @@ -120,6 +121,7 @@ use { hard_forks::HardForks, hash::Hash, pubkey::Pubkey, + 
scheduling::SchedulingMode, shred_version::compute_shred_version, signature::{Keypair, Signer}, timing::timestamp, @@ -127,7 +129,7 @@ use { solana_send_transaction_service::send_transaction_service, solana_streamer::{socket::SocketAddrSpace, streamer::StakedNodes}, solana_turbine::{self, broadcast_stage::BroadcastStageType}, - solana_unified_scheduler_pool::DefaultSchedulerPool, + solana_unified_scheduler_pool::{DefaultSchedulerPool, SupportedSchedulingMode}, solana_vote_program::vote_state, solana_wen_restart::wen_restart::{wait_for_wen_restart, WenRestartConfig}, std::{ @@ -184,11 +186,14 @@ impl BlockVerificationMethod { } } -#[derive(Clone, EnumString, EnumVariantNames, Default, IntoStaticStr, Display)] +#[derive( + Clone, EnumCount, EnumIter, EnumString, EnumVariantNames, Default, IntoStaticStr, Display, +)] #[strum(serialize_all = "kebab-case")] pub enum BlockProductionMethod { - #[default] CentralScheduler, + #[default] + UnifiedScheduler, } impl BlockProductionMethod { @@ -208,6 +213,23 @@ impl BlockProductionMethod { } } +pub fn supported_scheduling_mode( + (verification, production): (&BlockVerificationMethod, &BlockProductionMethod), +) -> SupportedSchedulingMode { + match (verification, production) { + (BlockVerificationMethod::UnifiedScheduler, BlockProductionMethod::UnifiedScheduler) => { + SupportedSchedulingMode::Both + } + (BlockVerificationMethod::UnifiedScheduler, _) => { + SupportedSchedulingMode::Either(SchedulingMode::BlockVerification) + } + (_, BlockProductionMethod::UnifiedScheduler) => { + SupportedSchedulingMode::Either(SchedulingMode::BlockProduction) + } + _ => unreachable!("seems unified scheduler is disabled"), + } +} + /// Configuration for the block generator invalidator for replay. #[derive(Clone, Debug)] pub struct GeneratorConfig { @@ -866,32 +888,61 @@ impl Validator { // (by both replay stage and banking stage) let prioritization_fee_cache = Arc::new(PrioritizationFeeCache::default()); - match &config.block_verification_method { - BlockVerificationMethod::BlockstoreProcessor => { - info!("no scheduler pool is installed for block verification..."); - if let Some(count) = config.unified_scheduler_handler_threads { - warn!( - "--unified-scheduler-handler-threads={count} is ignored because unified \ - scheduler isn't enabled" - ); - } - } - BlockVerificationMethod::UnifiedScheduler => { - let scheduler_pool = DefaultSchedulerPool::new_dyn( + let leader_schedule_cache = Arc::new(leader_schedule_cache); + let startup_verification_complete; + let (poh_recorder, entry_receiver, record_receiver) = { + let bank = &bank_forks.read().unwrap().working_bank(); + startup_verification_complete = Arc::clone(bank.get_startup_verification_complete()); + PohRecorder::new_with_clear_signal( + bank.tick_height(), + bank.last_blockhash(), + bank.clone(), + None, + bank.ticks_per_slot(), + config.delay_leader_block_for_pending_fork, + blockstore.clone(), + blockstore.get_new_shred_signal(0), + &leader_schedule_cache, + &genesis_config.poh_config, + Some(poh_timing_point_sender), + exit.clone(), + ) + }; + let poh_recorder = Arc::new(RwLock::new(poh_recorder)); + + let unified_scheduler_pool = match ( + &config.block_verification_method, + &config.block_production_method, + ) { + methods @ (BlockVerificationMethod::UnifiedScheduler, _) + | methods @ (_, BlockProductionMethod::UnifiedScheduler) => { + let pool = DefaultSchedulerPool::new( + supported_scheduling_mode(methods), config.unified_scheduler_handler_threads, config.runtime_config.log_messages_bytes_limit, 
transaction_status_sender.clone(), Some(replay_vote_sender.clone()), prioritization_fee_cache.clone(), + poh_recorder.read().unwrap().new_recorder(), ); bank_forks .write() .unwrap() - .install_scheduler_pool(scheduler_pool); + .install_scheduler_pool(pool.clone()); + Some(pool) } - } + _ => { + info!("no scheduler pool is installed for block verification/production..."); + if let Some(count) = config.unified_scheduler_handler_threads { + warn!( + "--unified-scheduler-handler-threads={count} is ignored because unified \ + scheduler isn't enabled" + ); + } + None + } + }; - let leader_schedule_cache = Arc::new(leader_schedule_cache); let entry_notification_sender = entry_notifier_service .as_ref() .map(|service| service.sender()); @@ -967,27 +1018,6 @@ impl Validator { let max_slots = Arc::new(MaxSlots::default()); - let startup_verification_complete; - let (poh_recorder, entry_receiver, record_receiver) = { - let bank = &bank_forks.read().unwrap().working_bank(); - startup_verification_complete = Arc::clone(bank.get_startup_verification_complete()); - PohRecorder::new_with_clear_signal( - bank.tick_height(), - bank.last_blockhash(), - bank.clone(), - None, - bank.ticks_per_slot(), - config.delay_leader_block_for_pending_fork, - blockstore.clone(), - blockstore.get_new_shred_signal(0), - &leader_schedule_cache, - &genesis_config.poh_config, - Some(poh_timing_point_sender), - exit.clone(), - ) - }; - let poh_recorder = Arc::new(RwLock::new(poh_recorder)); - let staked_nodes = Arc::new(RwLock::new(StakedNodes::default())); let connection_cache = match use_quic { @@ -1367,6 +1397,7 @@ impl Validator { let cluster_slots = Arc::new(crate::cluster_slots_service::cluster_slots::ClusterSlots::default()); + let root_bank_cache = RootBankCache::new(bank_forks.clone()); let tvu = Tvu::new( vote_account, authorized_voter_keypairs, @@ -1474,6 +1505,7 @@ impl Validator { node.info.shred_version(), vote_tracker, bank_forks.clone(), + root_bank_cache, verified_vote_sender, gossip_verified_vote_hash_sender, replay_vote_receiver, @@ -1495,6 +1527,7 @@ impl Validator { config.block_production_method.clone(), config.enable_block_production_forwarding, config.generator_config.clone(), + unified_scheduler_pool, ); datapoint_info!( diff --git a/core/tests/unified_scheduler.rs b/core/tests/unified_scheduler.rs index 75795f2f6c0..9a8b66fc04e 100644 --- a/core/tests/unified_scheduler.rs +++ b/core/tests/unified_scheduler.rs @@ -17,16 +17,13 @@ use { solana_ledger::genesis_utils::create_genesis_config, solana_runtime::{ accounts_background_service::AbsRequestSender, bank::Bank, bank_forks::BankForks, - genesis_utils::GenesisConfigInfo, prioritization_fee_cache::PrioritizationFeeCache, + genesis_utils::GenesisConfigInfo, installed_scheduler_pool::SchedulingContext, + prioritization_fee_cache::PrioritizationFeeCache, }, solana_runtime_transaction::runtime_transaction::RuntimeTransaction, - solana_sdk::{ - hash::Hash, - pubkey::Pubkey, - system_transaction, - transaction::{Result, SanitizedTransaction}, - }, + solana_sdk::{hash::Hash, pubkey::Pubkey, system_transaction, transaction::Result}, solana_timings::ExecuteTimings, + solana_unified_scheduler_logic::Task, solana_unified_scheduler_pool::{ DefaultTaskHandler, HandlerContext, PooledScheduler, SchedulerPool, TaskHandler, }, @@ -48,9 +45,8 @@ fn test_scheduler_waited_by_drop_bank_service() { fn handle( result: &mut Result<()>, timings: &mut ExecuteTimings, - bank: &Arc, - transaction: &RuntimeTransaction, - index: usize, + scheduling_context: &SchedulingContext, + 
task: &Task, handler_context: &HandlerContext, ) { info!("Stalling at StallingHandler::handle()..."); @@ -59,7 +55,7 @@ fn test_scheduler_waited_by_drop_bank_service() { std::thread::sleep(std::time::Duration::from_secs(3)); info!("Now entering into DefaultTaskHandler::handle()..."); - DefaultTaskHandler::handle(result, timings, bank, transaction, index, handler_context); + DefaultTaskHandler::handle(result, timings, scheduling_context, task, handler_context); } } @@ -73,7 +69,7 @@ fn test_scheduler_waited_by_drop_bank_service() { let genesis_bank = Bank::new_for_tests(&genesis_config); let bank_forks = BankForks::new_rw_arc(genesis_bank); let ignored_prioritization_fee_cache = Arc::new(PrioritizationFeeCache::new(0u64)); - let pool_raw = SchedulerPool::, _>::new( + let pool_raw = SchedulerPool::, _>::new_for_verification( None, None, None, diff --git a/ledger-tool/Cargo.toml b/ledger-tool/Cargo.toml index d8e63900bd4..e94a8a67a83 100644 --- a/ledger-tool/Cargo.toml +++ b/ledger-tool/Cargo.toml @@ -20,6 +20,7 @@ futures = { workspace = true } histogram = { workspace = true } itertools = { workspace = true } log = { workspace = true } +mimalloc = { workspace = true } num_cpus = { workspace = true } rayon = { workspace = true } regex = { workspace = true } @@ -42,6 +43,7 @@ solana-ledger = { workspace = true, features = ["dev-context-only-utils"] } solana-log-collector = { workspace = true } solana-logger = { workspace = true } solana-measure = { workspace = true } +solana-poh = { workspace = true } solana-program-runtime = { workspace = true } solana-rpc = { workspace = true } solana-runtime = { workspace = true, features = ["dev-context-only-utils"] } @@ -59,9 +61,6 @@ solana_rbpf = { workspace = true, features = ["debugger"] } thiserror = { workspace = true } tokio = { workspace = true, features = ["full"] } -[target.'cfg(not(any(target_env = "msvc", target_os = "freebsd")))'.dependencies] -jemallocator = { workspace = true } - [dev-dependencies] assert_cmd = { workspace = true } bytecount = { workspace = true } diff --git a/ledger-tool/src/ledger_utils.rs b/ledger-tool/src/ledger_utils.rs index 03856acbbf7..89814bedc5e 100644 --- a/ledger-tool/src/ledger_utils.rs +++ b/ledger-tool/src/ledger_utils.rs @@ -10,7 +10,8 @@ use { solana_core::{ accounts_hash_verifier::AccountsHashVerifier, rewards_recorder_service::RewardsRecorderService, - snapshot_packager_service::PendingSnapshotPackages, validator::BlockVerificationMethod, + snapshot_packager_service::PendingSnapshotPackages, + validator::{supported_scheduling_mode, BlockProductionMethod, BlockVerificationMethod}, }, solana_geyser_plugin_manager::geyser_plugin_service::{ GeyserPluginService, GeyserPluginServiceError, @@ -27,6 +28,7 @@ use { use_snapshot_archives_at_startup::UseSnapshotArchivesAtStartup, }, solana_measure::measure_time, + solana_poh::poh_recorder::{NewPohRecorder, PohRecorder}, solana_rpc::transaction_status_service::TransactionStatusService, solana_runtime::{ accounts_background_service::{ @@ -66,6 +68,8 @@ pub struct LoadAndProcessLedgerOutput { // not. 
It is safe to let ABS continue in the background, and ABS will stop // if/when it finally checks the exit flag pub accounts_background_service: AccountsBackgroundService, + pub unified_scheduler_pool: Option>, + pub new_poh_recorder: Option, } const PROCESS_SLOTS_HELP_STRING: &str = @@ -350,43 +354,86 @@ pub fn load_and_process_ledger( exit.clone(), ) .map_err(LoadAndProcessLedgerError::LoadBankForks)?; + let leader_schedule_cache = Arc::new(leader_schedule_cache); let block_verification_method = value_t!( arg_matches, "block_verification_method", BlockVerificationMethod ) .unwrap_or_default(); + let block_production_method = value_t!( + arg_matches, + "block_production_method", + BlockProductionMethod + ) + .inspect(|method| { + if matches!(method, BlockProductionMethod::UnifiedScheduler) + && !arg_matches.is_present("enable_experimental_block_production_method") + { + error!( + "Currently, the unified-scheduler method is experimental for block-production. \ + Explicitly pass --enable-experimental-block-production-method to supress this error" + ); + } + }) + .unwrap_or_default(); info!( - "Using: block-verification-method: {}", - block_verification_method, + "Using: block-verification-method: {}, block-production-method: {}", + block_verification_method, block_production_method ); let unified_scheduler_handler_threads = value_t!(arg_matches, "unified_scheduler_handler_threads", usize).ok(); - match block_verification_method { - BlockVerificationMethod::BlockstoreProcessor => { - info!("no scheduler pool is installed for block verification..."); - if let Some(count) = unified_scheduler_handler_threads { - warn!( - "--unified-scheduler-handler-threads={count} is ignored because unified \ - scheduler isn't enabled" + let (unified_scheduler_pool, new_poh_recorder) = + match (&block_verification_method, &block_production_method) { + methods @ (BlockVerificationMethod::UnifiedScheduler, _) + | methods @ (_, BlockProductionMethod::UnifiedScheduler) => { + let no_replay_vote_sender = None; + let ignored_prioritization_fee_cache = Arc::new(PrioritizationFeeCache::new(0u64)); + + let exit = Arc::new(AtomicBool::new(false)); + let poh_bank = bank_forks.read().unwrap().working_bank(); + let new_poh_recorder = PohRecorder::new_with_clear_signal( + poh_bank.tick_height(), + poh_bank.last_blockhash(), + poh_bank.clone(), + None, + poh_bank.ticks_per_slot(), + false, + blockstore.clone(), + blockstore.get_new_shred_signal(0), + &leader_schedule_cache, + &genesis_config.poh_config, + None, + exit.clone(), ); - } - } - BlockVerificationMethod::UnifiedScheduler => { - let no_replay_vote_sender = None; - let ignored_prioritization_fee_cache = Arc::new(PrioritizationFeeCache::new(0u64)); - bank_forks - .write() - .unwrap() - .install_scheduler_pool(DefaultSchedulerPool::new_dyn( + drop(poh_bank); + + let pool = DefaultSchedulerPool::new( + supported_scheduling_mode(methods), unified_scheduler_handler_threads, process_options.runtime_config.log_messages_bytes_limit, transaction_status_sender.clone(), no_replay_vote_sender, ignored_prioritization_fee_cache, - )); - } - } + new_poh_recorder.0.new_recorder(), + ); + bank_forks + .write() + .unwrap() + .install_scheduler_pool(pool.clone()); + (Some(pool), Some(new_poh_recorder)) + } + _ => { + info!("no scheduler pool is installed for block verification/production..."); + if let Some(count) = unified_scheduler_handler_threads { + warn!( + "--unified-scheduler-handler-threads={count} is ignored because unified \ + scheduler isn't enabled" + ); + } + (None, None) + 
} + }; let pending_snapshot_packages = Arc::new(Mutex::new(PendingSnapshotPackages::default())); let (accounts_package_sender, accounts_package_receiver) = crossbeam_channel::unbounded(); @@ -436,6 +483,8 @@ pub fn load_and_process_ledger( bank_forks, starting_snapshot_hashes, accounts_background_service, + unified_scheduler_pool, + new_poh_recorder, }) .map_err(LoadAndProcessLedgerError::ProcessBlockstoreFromRoot); diff --git a/ledger-tool/src/main.rs b/ledger-tool/src/main.rs index 4580c0b7731..09f1ae1bc63 100644 --- a/ledger-tool/src/main.rs +++ b/ledger-tool/src/main.rs @@ -796,12 +796,8 @@ fn record_transactions( } } -#[cfg(not(any(target_env = "msvc", target_os = "freebsd")))] -use jemallocator::Jemalloc; - -#[cfg(not(any(target_env = "msvc", target_os = "freebsd")))] #[global_allocator] -static GLOBAL: Jemalloc = Jemalloc; +static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc; #[allow(clippy::cognitive_complexity)] fn main() { @@ -991,6 +987,15 @@ fn main() { .global(true) .help(DefaultSchedulerPool::cli_message()), ) + .arg( + Arg::with_name("enable_experimental_block_production_method") + .long("enable-experimental-block-production-method") + .takes_value(false) + .help( + "Accept unified-scheduler to be used as an experimental block \ + production method", + ), + ) .arg( Arg::with_name("output_format") .long("output") @@ -2068,6 +2073,7 @@ fn main() { bank_forks, starting_snapshot_hashes, accounts_background_service, + .. } = load_and_process_ledger_or_exit( arg_matches, &genesis_config, @@ -2500,14 +2506,18 @@ fn main() { AccessType::Primary, // needed for purging already existing simulated block shreds... )); let genesis_config = open_genesis_config_by(&ledger_path, arg_matches); - let LoadAndProcessLedgerOutput { bank_forks, .. } = - load_and_process_ledger_or_exit( - arg_matches, - &genesis_config, - blockstore.clone(), - process_options, - None, // transaction status sender - ); + let LoadAndProcessLedgerOutput { + bank_forks, + unified_scheduler_pool, + new_poh_recorder, + .. 
+ } = load_and_process_ledger_or_exit( + arg_matches, + &genesis_config, + blockstore.clone(), + process_options, + None, // transaction status sender + ); let block_production_method = value_t!( arg_matches, @@ -2516,13 +2526,13 @@ fn main() { ) .unwrap_or_default(); - info!("Using: block-production-method: {block_production_method}"); - match simulator.start( genesis_config, bank_forks, blockstore, block_production_method, + unified_scheduler_pool, + new_poh_recorder, ) { Ok(()) => println!("Ok"), Err(error) => { diff --git a/ledger/benches/blockstore_processor.rs b/ledger/benches/blockstore_processor.rs index 44f65db1d54..711c5381b63 100644 --- a/ledger/benches/blockstore_processor.rs +++ b/ledger/benches/blockstore_processor.rs @@ -162,6 +162,7 @@ fn bench_execute_batch( &mut timing, None, &prioritization_fee_cache, + None:: Option>>, ); } }); diff --git a/ledger/src/blockstore_processor.rs b/ledger/src/blockstore_processor.rs index 2a514286b6e..58be6e21774 100644 --- a/ledger/src/blockstore_processor.rs +++ b/ledger/src/blockstore_processor.rs @@ -47,6 +47,7 @@ use { hash::Hash, pubkey::Pubkey, saturating_add_assign, + scheduling::TaskKey, signature::{Keypair, Signature}, transaction::{ Result, SanitizedTransaction, TransactionError, TransactionVerificationMode, @@ -110,6 +111,7 @@ fn first_err(results: &[Result<()>]) -> Result<()> { fn get_first_error( batch: &TransactionBatch, commit_results: &[TransactionCommitResult], + is_unified_scheduler_for_block_production: bool, ) -> Option<(Result<()>, Signature)> { let mut first_err = None; for (commit_result, transaction) in commit_results.iter().zip(batch.sanitized_transactions()) { @@ -117,18 +119,20 @@ fn get_first_error( if first_err.is_none() { first_err = Some((Err(err.clone()), *transaction.signature())); } - warn!( - "Unexpected validator error: {:?}, transaction: {:?}", - err, transaction - ); - datapoint_error!( - "validator_process_entry_error", - ( - "error", - format!("error: {err:?}, transaction: {transaction:?}"), - String - ) - ); + if !is_unified_scheduler_for_block_production { + warn!( + "Unexpected validator error: {:?}, transaction: {:?}", + err, transaction + ); + datapoint_error!( + "validator_process_entry_error", + ( + "error", + format!("error: {err:?}, transaction: {transaction:?}"), + String + ) + ); + } } } first_err @@ -150,12 +154,14 @@ pub fn execute_batch( timings: &mut ExecuteTimings, log_messages_bytes_limit: Option, prioritization_fee_cache: &PrioritizationFeeCache, + pre_commit_callback: Option Option>>, ) -> Result<()> { let TransactionBatchWithIndexes { batch, transaction_indexes, } = batch; let record_token_balances = transaction_status_sender.is_some(); + let mut transaction_indexes = transaction_indexes.to_vec(); let mut mint_decimals: HashMap = HashMap::new(); @@ -165,14 +171,32 @@ pub fn execute_batch( vec![] }; - let (commit_results, balances) = batch.bank().load_execute_and_commit_transactions( + let is_unified_scheduler_for_block_production = pre_commit_callback.is_some(); + let pre_commit_callback = pre_commit_callback.map(|original_callback| { + || { + if let Some(maybe_index) = original_callback() { + if let Some(index) = maybe_index { + assert!(transaction_indexes.is_empty()); + transaction_indexes.push(index); + } + true + } else { + false + } + } + }); + + let Some((commit_results, balances)) = batch.bank().do_load_execute_and_commit_transactions( batch, MAX_PROCESSING_AGE, transaction_status_sender.is_some(), 
ExecutionRecordingConfig::new_single_setting(transaction_status_sender.is_some()), timings, log_messages_bytes_limit, - ); + pre_commit_callback, + ) else { + return Err(TransactionError::CommitFailed); + }; bank_utils::find_and_send_votes( batch.sanitized_transactions(), @@ -201,7 +225,11 @@ pub fn execute_batch( .filter_map(|(commit_result, tx)| commit_result.was_committed().then_some(tx)) .collect_vec(); - let first_err = get_first_error(batch, &commit_results); + let first_err = get_first_error( + batch, + &commit_results, + is_unified_scheduler_for_block_production, + ); if let Some(transaction_status_sender) = transaction_status_sender { let transactions: Vec = batch @@ -224,7 +252,7 @@ pub fn execute_batch( commit_results, balances, token_balances, - transaction_indexes.to_vec(), + transaction_indexes, ); } @@ -322,6 +350,7 @@ fn execute_batches_internal( &mut timings, log_messages_bytes_limit, prioritization_fee_cache, + None:: Option>>, )); let thread_index = replay_tx_thread_pool.current_thread_index().unwrap(); @@ -448,6 +477,7 @@ fn schedule_batches_for_execution( // scheduling is skipped if we have already detected an error in this loop let indexes = starting_index..starting_index + transactions.len(); first_err = first_err.and_then(|()| { + let indexes = indexes.into_iter().map(|i| i as TaskKey); bank.schedule_transaction_executions(transactions.into_iter().zip_eq(indexes)) }); } @@ -598,7 +628,7 @@ pub fn process_entries_for_tests( ) -> Result<()> { let replay_tx_thread_pool = create_thread_pool(1); let verify_transaction = { - let bank = bank.clone_with_scheduler(); + let bank = bank.clone_without_scheduler(); move |versioned_tx: VersionedTransaction| -> Result> { bank.verify_transaction(versioned_tx, TransactionVerificationMode::FullVerification) } @@ -4427,7 +4457,7 @@ pub mod tests { &mut ExecuteTimings::default(), None, ); - let (err, signature) = get_first_error(&batch, &commit_results).unwrap(); + let (err, signature) = get_first_error(&batch, &commit_results, false).unwrap(); assert_eq!(err.unwrap_err(), TransactionError::AccountNotFound); assert_eq!(signature, account_not_found_sig); } @@ -4992,16 +5022,17 @@ pub mod tests { .. 
} = create_genesis_config_with_leader(500, &dummy_leader_pubkey, 100); let bank = Arc::new(Bank::new_for_tests(&genesis_config)); - let context = SchedulingContext::new(bank.clone()); + let context = SchedulingContext::for_verification(bank.clone()); let txs = create_test_transactions(&mint_keypair, &genesis_config.hash()); let mut mocked_scheduler = MockInstalledScheduler::new(); let seq = Arc::new(Mutex::new(mockall::Sequence::new())); let seq_cloned = seq.clone(); + // Used for assertions in BankWithScheduler::{new, schedule_transaction_executions} mocked_scheduler .expect_context() - .times(1) + .times(2) .in_sequence(&mut seq.lock().unwrap()) .return_const(context); if should_succeed { diff --git a/local-cluster/tests/local_cluster.rs b/local-cluster/tests/local_cluster.rs index 7f847d848a0..3989ba5f649 100644 --- a/local-cluster/tests/local_cluster.rs +++ b/local-cluster/tests/local_cluster.rs @@ -16,7 +16,7 @@ use { }, optimistic_confirmation_verifier::OptimisticConfirmationVerifier, replay_stage::DUPLICATE_THRESHOLD, - validator::{BlockVerificationMethod, ValidatorConfig}, + validator::{BlockProductionMethod, BlockVerificationMethod, ValidatorConfig}, }, solana_download_utils::download_snapshot_archive, solana_entry::entry::create_ticks, @@ -5799,6 +5799,43 @@ fn test_randomly_mixed_block_verification_methods_between_bootstrap_and_not() { ); } +#[test] +#[serial] +fn test_randomly_mixed_block_production_methods_between_bootstrap_and_not() { + // tailored logging just to see two block production methods are working correctly + solana_logger::setup_with_default( + "solana_metrics::metrics=warn,\ + solana_core=warn,\ + solana_runtime::installed_scheduler_pool=trace,\ + solana_ledger::blockstore_processor=debug,\ + info", + ); + + let num_nodes = BlockVerificationMethod::COUNT; + let mut config = ClusterConfig::new_with_equal_stakes( + num_nodes, + DEFAULT_CLUSTER_LAMPORTS, + DEFAULT_NODE_STAKE, + ); + + // Overwrite block_production_method with shuffled variants + let mut methods = BlockProductionMethod::iter().collect::>(); + methods.shuffle(&mut rand::thread_rng()); + for (validator_config, method) in config.validator_configs.iter_mut().zip_eq(methods) { + validator_config.block_production_method = method; + } + + let local = LocalCluster::new(&mut config, SocketAddrSpace::Unspecified); + cluster_tests::spend_and_verify_all_nodes( + &local.entry_point_info, + &local.funding_keypair, + num_nodes, + HashSet::new(), + SocketAddrSpace::Unspecified, + &local.connection_cache, + ); +} + /// Forks previous marked invalid should be marked as such in fork choice on restart #[test] #[ignore] diff --git a/perf/Cargo.toml b/perf/Cargo.toml index 4af056343ba..16b28dbab2d 100644 --- a/perf/Cargo.toml +++ b/perf/Cargo.toml @@ -13,6 +13,7 @@ edition = { workspace = true } ahash = { workspace = true } bincode = { workspace = true } bv = { workspace = true, features = ["serde"] } +crossbeam-channel = { workspace = true } curve25519-dalek = { workspace = true } dlopen2 = { workspace = true } fnv = { workspace = true } diff --git a/perf/src/packet.rs b/perf/src/packet.rs index 73c29bc5378..a5a8ae5dda9 100644 --- a/perf/src/packet.rs +++ b/perf/src/packet.rs @@ -8,8 +8,10 @@ use { std::{ io::Read, net::SocketAddr, + num::Saturating, ops::{Index, IndexMut}, slice::{Iter, IterMut, SliceIndex}, + sync::Arc, }, }; @@ -226,6 +228,38 @@ pub fn to_packet_batches(items: &[T], chunk_size: usize) -> Vec, Option)>; +pub type BankingPacketReceiver = crossbeam_channel::Receiver< + std::sync::Arc<( + Vec, + 
std::option::Option, + )>, +>; +#[cfg_attr(feature = "frozen-abi", derive(AbiExample))] +#[derive(Debug, Default, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct SigverifyTracerPacketStats { + pub total_removed_before_sigverify_stage: Saturating, + pub total_tracer_packets_received_in_sigverify_stage: Saturating, + pub total_tracer_packets_deduped: Saturating, + pub total_excess_tracer_packets: Saturating, + pub total_tracker_packets_passed_sigverify: Saturating, +} + +impl SigverifyTracerPacketStats { + pub fn is_default(&self) -> bool { + *self == SigverifyTracerPacketStats::default() + } + + pub fn aggregate(&mut self, other: &SigverifyTracerPacketStats) { + self.total_removed_before_sigverify_stage += other.total_removed_before_sigverify_stage; + self.total_tracer_packets_received_in_sigverify_stage += + other.total_tracer_packets_received_in_sigverify_stage; + self.total_tracer_packets_deduped += other.total_tracer_packets_deduped; + self.total_excess_tracer_packets += other.total_excess_tracer_packets; + self.total_tracker_packets_passed_sigverify += other.total_tracker_packets_passed_sigverify; + } +} + #[cfg(test)] fn to_packet_batches_for_tests(items: &[T]) -> Vec { to_packet_batches(items, NUM_PACKETS) diff --git a/poh/src/poh_recorder.rs b/poh/src/poh_recorder.rs index 8b95ecec039..542403a681f 100644 --- a/poh/src/poh_recorder.rs +++ b/poh/src/poh_recorder.rs @@ -140,7 +140,7 @@ pub struct RecordTransactionsSummary { pub starting_transaction_index: Option, } -#[derive(Clone)] +#[derive(Clone, Debug)] pub struct TransactionRecorder { // shared by all users of PohRecorder pub record_sender: Sender, @@ -155,6 +155,13 @@ impl TransactionRecorder { } } + pub fn new_dummy() -> Self { + Self { + record_sender: crossbeam_channel::unbounded().0, + is_exited: Arc::new(AtomicBool::default()), + } + } + /// Hashes `transactions` and sends to PoH service for recording. Waits for response up to 1s. /// Panics on unexpected (non-`MaxHeightReached`) errors. pub fn record_transactions( @@ -313,8 +320,12 @@ pub struct PohRecorder { pub is_exited: Arc, } +pub type NewPohRecorder = (PohRecorder, Receiver, Receiver); + impl PohRecorder { - fn clear_bank(&mut self) { + #[must_use] + fn clear_bank(&mut self) -> Option { + let mut cleared_bank = None; if let Some(WorkingBank { bank, start, .. }) = self.working_bank.take() { self.leader_bank_notifier.set_completed(bank.slot()); let next_leader_slot = self.leader_schedule_cache.next_leader_slot( @@ -340,6 +351,7 @@ impl PohRecorder { ("slot", bank.slot(), i64), ("elapsed", start.elapsed().as_millis(), i64), ); + cleared_bank = Some(bank); } if let Some(ref signal) = self.clear_bank_signal { @@ -353,6 +365,7 @@ impl PohRecorder { } } } + cleared_bank } pub fn would_be_leader(&self, within_next_n_ticks: u64) -> bool { @@ -382,6 +395,10 @@ impl PohRecorder { .slot_leader_at(current_slot + slots, None) } + pub fn current_slot(&self) -> Slot { + self.slot_for_tick_height(self.tick_height) + } + /// Return the leader and slot pair after `slots_in_the_future` slots. 
pub fn leader_and_slot_after_n_slots( &self, @@ -436,6 +453,10 @@ impl PohRecorder { self.ticks_per_slot } + pub fn slot(&self) -> Slot { + self.tick_height() / self.ticks_per_slot() + } + pub fn new_recorder(&self) -> TransactionRecorder { TransactionRecorder::new(self.record_sender.clone(), self.is_exited.clone()) } @@ -661,8 +682,13 @@ impl PohRecorder { } // synchronize PoH with a bank - pub fn reset(&mut self, reset_bank: Arc, next_leader_slot: Option<(Slot, Slot)>) { - self.clear_bank(); + #[must_use] + pub fn reset( + &mut self, + reset_bank: Arc, + next_leader_slot: Option<(Slot, Slot)>, + ) -> Option { + let cleared_bank = self.clear_bank(); self.reset_poh(reset_bank, true); if let Some(ref sender) = self.poh_timing_point_sender { @@ -683,6 +709,7 @@ impl PohRecorder { self.leader_first_tick_height_including_grace_ticks = leader_first_tick_height_including_grace_ticks; self.leader_last_tick_height = leader_last_tick_height; + cleared_bank } pub fn set_bank(&mut self, bank: BankWithScheduler, track_transaction_indexes: bool) { @@ -743,7 +770,7 @@ impl PohRecorder { #[cfg(feature = "dev-context-only-utils")] pub fn clear_bank_for_test(&mut self) { - self.clear_bank(); + let _ = self.clear_bank(); } // Flush cache will delay flushing the cache for a bank until it past the WorkingBank::min_tick_height @@ -790,19 +817,19 @@ impl PohRecorder { } if self.tick_height >= working_bank.max_tick_height { info!( - "poh_record: max_tick_height {} reached, clearing working_bank {}", + "poh_record: max_tick_height {} reached, clearing working_bank {:?}", working_bank.max_tick_height, - working_bank.bank.slot() + working_bank.bank.id_and_slot_with_scheduler_status(), ); self.start_bank = working_bank.bank.clone(); let working_slot = self.start_slot(); self.start_tick_height = working_slot * self.ticks_per_slot + 1; - self.clear_bank(); + let _ = self.clear_bank(); } if send_result.is_err() { info!("WorkingBank::sender disconnected {:?}", send_result); // revert the cache, but clear the working bank - self.clear_bank(); + let _ = self.clear_bank(); } else { // commit the flush let _ = self.tick_cache.drain(..entry_count); @@ -1028,7 +1055,7 @@ impl PohRecorder { poh_config: &PohConfig, poh_timing_point_sender: Option, is_exited: Arc, - ) -> (Self, Receiver, Receiver) { + ) -> NewPohRecorder { let tick_number = 0; let poh = Arc::new(Mutex::new(Poh::new_with_slot_info( last_entry_hash, @@ -1098,7 +1125,7 @@ impl PohRecorder { leader_schedule_cache: &Arc, poh_config: &PohConfig, is_exited: Arc, - ) -> (Self, Receiver, Receiver) { + ) -> NewPohRecorder { let delay_leader_block_for_pending_fork = false; Self::new_with_clear_signal( tick_height, @@ -1140,7 +1167,7 @@ impl PohRecorder { pub fn schedule_dummy_max_height_reached_failure(&mut self) { let GenesisConfigInfo { genesis_config, .. 
} = create_genesis_config(2); let bank = Arc::new(Bank::new_for_tests(&genesis_config)); - self.reset(bank, None); + let _ = self.reset(bank, None); } } @@ -1275,7 +1302,7 @@ mod tests { ); poh_recorder.tick(); assert_eq!(poh_recorder.tick_cache.len(), 1); - poh_recorder.reset(bank0, Some((4, 4))); + let _ = poh_recorder.reset(bank0, Some((4, 4))); assert_eq!(poh_recorder.tick_cache.len(), 0); } @@ -1301,7 +1328,7 @@ mod tests { poh_recorder.set_bank_for_test(bank); assert!(poh_recorder.working_bank.is_some()); - poh_recorder.clear_bank(); + let _ = poh_recorder.clear_bank(); assert!(poh_recorder.working_bank.is_none()); } @@ -1715,7 +1742,7 @@ mod tests { poh_recorder.tick(); poh_recorder.tick(); assert_eq!(poh_recorder.tick_cache.len(), 2); - poh_recorder.reset(bank, Some((4, 4))); + let _ = poh_recorder.reset(bank, Some((4, 4))); assert_eq!(poh_recorder.tick_cache.len(), 0); } @@ -1740,7 +1767,7 @@ mod tests { poh_recorder.tick(); poh_recorder.tick(); assert_eq!(poh_recorder.tick_cache.len(), 2); - poh_recorder.reset(bank, Some((4, 4))); + let _ = poh_recorder.reset(bank, Some((4, 4))); assert_eq!(poh_recorder.tick_cache.len(), 0); } @@ -1770,7 +1797,7 @@ mod tests { poh_recorder.tick(); assert_eq!(poh_recorder.tick_cache.len(), 4); assert_eq!(poh_recorder.tick_height, 4); - poh_recorder.reset(bank, Some((4, 4))); // parent slot 0 implies tick_height of 3 + let _ = poh_recorder.reset(bank, Some((4, 4))); // parent slot 0 implies tick_height of 3 assert_eq!(poh_recorder.tick_cache.len(), 0); poh_recorder.tick(); assert_eq!(poh_recorder.tick_height, DEFAULT_TICKS_PER_SLOT + 1); @@ -1797,7 +1824,7 @@ mod tests { poh_recorder.set_bank_for_test(bank.clone()); assert_eq!(bank.slot(), 0); - poh_recorder.reset(bank, Some((4, 4))); + let _ = poh_recorder.reset(bank, Some((4, 4))); assert!(poh_recorder.working_bank.is_none()); } @@ -1825,7 +1852,7 @@ mod tests { Arc::new(AtomicBool::default()), ); poh_recorder.set_bank_for_test(bank); - poh_recorder.clear_bank(); + let _ = poh_recorder.clear_bank(); assert!(receiver.try_recv().is_ok()); } @@ -2032,14 +2059,14 @@ mod tests { // Test that with no next leader slot in reset(), we don't reach the leader slot assert_eq!(bank0.slot(), 0); - poh_recorder.reset(bank0.clone(), None); + let _ = poh_recorder.reset(bank0.clone(), None); assert_eq!( poh_recorder.reached_leader_slot(&validator_pubkey), PohLeaderStatus::NotReached ); // Provide a leader slot one slot down - poh_recorder.reset(bank0.clone(), Some((2, 2))); + let _ = poh_recorder.reset(bank0.clone(), Some((2, 2))); let init_ticks = poh_recorder.tick_height(); @@ -2076,7 +2103,7 @@ mod tests { // reset poh now. 
we should immediately be leader let bank1 = Arc::new(Bank::new_from_parent(bank0, &Pubkey::default(), 1)); assert_eq!(bank1.slot(), 1); - poh_recorder.reset(bank1.clone(), Some((2, 2))); + let _ = poh_recorder.reset(bank1.clone(), Some((2, 2))); assert_eq!( poh_recorder.reached_leader_slot(&validator_pubkey), PohLeaderStatus::Reached { @@ -2087,7 +2114,7 @@ mod tests { // Now test that with grace ticks we can reach leader slot // Set the leader slot one slot down - poh_recorder.reset(bank1.clone(), Some((3, 3))); + let _ = poh_recorder.reset(bank1.clone(), Some((3, 3))); // Send one slot worth of ticks ("skips" slot 2) for _ in 0..bank1.ticks_per_slot() { @@ -2126,7 +2153,7 @@ mod tests { // Let's test that correct grace ticks are reported // Set the leader slot one slot down let bank2 = Arc::new(Bank::new_from_parent(bank1.clone(), &Pubkey::default(), 2)); - poh_recorder.reset(bank2.clone(), Some((4, 4))); + let _ = poh_recorder.reset(bank2.clone(), Some((4, 4))); // send ticks for a slot for _ in 0..bank1.ticks_per_slot() { @@ -2140,7 +2167,7 @@ mod tests { ); let bank3 = Arc::new(Bank::new_from_parent(bank2, &Pubkey::default(), 3)); assert_eq!(bank3.slot(), 3); - poh_recorder.reset(bank3.clone(), Some((4, 4))); + let _ = poh_recorder.reset(bank3.clone(), Some((4, 4))); // without sending more ticks, we should be leader now assert_eq!( @@ -2155,7 +2182,7 @@ mod tests { // leader slot, reached_leader_slot() will return true, because it's overdue // Set the leader slot one slot down let bank4 = Arc::new(Bank::new_from_parent(bank3, &Pubkey::default(), 4)); - poh_recorder.reset(bank4.clone(), Some((5, 5))); + let _ = poh_recorder.reset(bank4.clone(), Some((5, 5))); // Overshoot ticks for the slot let overshoot_factor = 4; @@ -2175,7 +2202,7 @@ mod tests { // Test that grace ticks are not required if the previous leader's 4 // slots got skipped. 
{ - poh_recorder.reset(bank4.clone(), Some((9, 9))); + let _ = poh_recorder.reset(bank4.clone(), Some((9, 9))); // Tick until leader slot for _ in 0..4 * bank4.ticks_per_slot() { @@ -2252,13 +2279,13 @@ mod tests { assert!(!poh_recorder.would_be_leader(2 * bank.ticks_per_slot())); assert_eq!(bank.slot(), 0); - poh_recorder.reset(bank.clone(), None); + let _ = poh_recorder.reset(bank.clone(), None); assert!(!poh_recorder.would_be_leader(2 * bank.ticks_per_slot())); // We reset with leader slot after 3 slots let bank_slot = bank.slot() + 3; - poh_recorder.reset(bank.clone(), Some((bank_slot, bank_slot))); + let _ = poh_recorder.reset(bank.clone(), Some((bank_slot, bank_slot))); // Test that the node won't be leader in next 2 slots assert!(!poh_recorder.would_be_leader(2 * bank.ticks_per_slot())); diff --git a/programs/sbf/Cargo.lock b/programs/sbf/Cargo.lock index 4aabc16f3c9..2a680419e98 100644 --- a/programs/sbf/Cargo.lock +++ b/programs/sbf/Cargo.lock @@ -103,6 +103,7 @@ dependencies = [ "libc", "libloading 0.7.4", "log", + "mimalloc", "num_cpus", "rand 0.8.5", "rayon", @@ -143,7 +144,6 @@ dependencies = [ "solana-vote-program", "symlink", "thiserror 2.0.4", - "tikv-jemallocator", "tokio", ] @@ -668,6 +668,17 @@ dependencies = [ "syn 2.0.87", ] +[[package]] +name = "bitfield-struct" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "de05f8756f1c68937349406d4632ae96ae35901019b5e59c508d9c38c64715fb" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.87", +] + [[package]] name = "bitflags" version = "1.3.2" @@ -792,6 +803,12 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "branches" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7958fb9748a08a6f46ef773e87c43997a844709bc293b4c3de48135debaf9d2a" + [[package]] name = "brotli" version = "3.3.4" @@ -848,6 +865,12 @@ dependencies = [ "serde", ] +[[package]] +name = "by_address" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "64fa3c856b712db6612c019f14756e64e4bcea13337a6b33b696333a9eaa2d06" + [[package]] name = "bytemuck" version = "1.20.0" @@ -1109,6 +1132,15 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6245d59a3e82a7fc217c5828a6692dbc6dfb63a0c8c90495621f7b9d79704a0e" +[[package]] +name = "convert_case" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec182b0ca2f35d8fc196cf3404988fd8b8c739a4d270ff118a398feb0cbec1ca" +dependencies = [ + "unicode-segmentation", +] + [[package]] name = "core-foundation" version = "0.9.3" @@ -1137,6 +1169,16 @@ dependencies = [ "winapi 0.2.8", ] +[[package]] +name = "cpu-time" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e9e393a7668fe1fad3075085b86c781883000b4ede868f43627b34a87c8b7ded" +dependencies = [ + "libc", + "winapi 0.3.9", +] + [[package]] name = "cpufeatures" version = "0.2.7" @@ -1277,9 +1319,9 @@ dependencies = [ [[package]] name = "darling" -version = "0.20.1" +version = "0.20.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0558d22a7b463ed0241e993f76f09f30b126687447751a8638587b864e4b3944" +checksum = "6f63b86c8a8826a49b8c21f08a2d07338eec8d900540f8630dc76284be802989" dependencies = [ "darling_core", "darling_macro", @@ -1287,29 +1329,35 @@ dependencies = [ [[package]] name = "darling_core" -version = "0.20.1" +version = "0.20.10" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "ab8bfa2e259f8ee1ce5e97824a3c55ec4404a0d772ca7fa96bf19f0752a046eb" +checksum = "95133861a8032aaea082871032f5815eb9e98cef03fa916ab4500513994df9e5" dependencies = [ "fnv", "ident_case", "proc-macro2", "quote", - "strsim 0.10.0", + "strsim 0.11.1", "syn 2.0.87", ] [[package]] name = "darling_macro" -version = "0.20.1" +version = "0.20.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "29a358ff9f12ec09c3e61fef9b5a9902623a695a46a917b07f269bff1445611a" +checksum = "d336a2a514f6ccccaa3e09b02d41d35330c07ddf03a62165fcec10bb561c7806" dependencies = [ "darling_core", "quote", "syn 2.0.87", ] +[[package]] +name = "dary_heap" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04d2cd9c18b9f454ed67da600630b021a8a80bf33f8c95896ab33aaf1c26b728" + [[package]] name = "dashmap" version = "5.5.3" @@ -1387,13 +1435,35 @@ version = "0.99.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4fb810d30a7c1953f91334de7244731fc3f3c10d7fe163338a35b9f640960321" dependencies = [ - "convert_case", + "convert_case 0.4.0", "proc-macro2", "quote", "rustc_version", "syn 1.0.109", ] +[[package]] +name = "derive_more" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4a9b99b9cbbe49445b21764dc0625032a89b145a2642e67603e1c936f5458d05" +dependencies = [ + "derive_more-impl", +] + +[[package]] +name = "derive_more-impl" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb7330aeadfbe296029522e6c40f315320aba36fc43a5b3632f3795348f3bd22" +dependencies = [ + "convert_case 0.6.0", + "proc-macro2", + "quote", + "syn 2.0.87", + "unicode-xid", +] + [[package]] name = "dialoguer" version = "0.10.4" @@ -1502,6 +1572,12 @@ version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1435fa1053d8b2fbbe9be7e97eca7f33d37b28409959813daefc1446a14247f1" +[[package]] +name = "dyn-clone" +version = "1.0.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0d6ef0072f8a535281e4876be788938b528e9a1d43900b82c2569af7da799125" + [[package]] name = "eager" version = "0.1.0" @@ -1618,6 +1694,26 @@ dependencies = [ "syn 2.0.87", ] +[[package]] +name = "enum-ptr" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b9955cf577337ddbfe2a03307e48bde02ae714346d587fd6f8bb5f262a7e574" +dependencies = [ + "enum-ptr-derive", +] + +[[package]] +name = "enum-ptr-derive" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "367a8dac40699e965e2fb8ac3b272b20058a107ac285b40041155227e4e93fba" +dependencies = [ + "darling", + "quote", + "syn 2.0.87", +] + [[package]] name = "env_logger" version = "0.9.3" @@ -2665,7 +2761,7 @@ version = "18.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d2b99d4207e2a04fb4581746903c2bb7eb376f88de9c699d0f3e10feeac0cd3a" dependencies = [ - "derive_more", + "derive_more 0.99.17", "futures 0.3.31", "jsonrpc-core", "jsonrpc-pubsub", @@ -2851,6 +2947,16 @@ version = "0.2.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058" +[[package]] +name = "libmimalloc-sys" +version = "0.1.39" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"23aa6811d3bd4deb8a84dde645f943476d13b248d818edcf8ce0b2f37f036b44" +dependencies = [ + "cc", + "libc", +] + [[package]] name = "librocksdb-sys" version = "0.16.0+8.10.0" @@ -3095,6 +3201,15 @@ dependencies = [ "zeroize", ] +[[package]] +name = "mimalloc" +version = "0.1.43" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68914350ae34959d83f732418d51e2427a794055d0b9529f48259ac07af65633" +dependencies = [ + "libmimalloc-sys", +] + [[package]] name = "mime" version = "0.3.16" @@ -3200,6 +3315,12 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "more-asserts" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fafa6961cabd9c63bcd77a45d7e3b7f3b552b70417831fb0f56db717e72407e" + [[package]] name = "multimap" version = "0.8.3" @@ -4137,6 +4258,15 @@ dependencies = [ "crossbeam-utils", ] +[[package]] +name = "rclite" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee9f0c2e8b8ef3ea8b0d074b9a0a192d99d47e2023bec8fd6336f2d8543a43b9" +dependencies = [ + "branches", +] + [[package]] name = "redox_syscall" version = "0.1.56" @@ -5375,6 +5505,7 @@ dependencies = [ "chrono", "crossbeam-channel", "dashmap", + "derive_more 1.0.0", "etcd-client", "futures 0.3.31", "histogram", @@ -5432,6 +5563,7 @@ dependencies = [ "solana-tpu-client", "solana-transaction-status", "solana-turbine", + "solana-unified-scheduler-logic", "solana-unified-scheduler-pool", "solana-version", "solana-vote", @@ -6053,6 +6185,7 @@ dependencies = [ "bincode", "bv", "caps", + "crossbeam-channel", "curve25519-dalek 4.1.3", "dlopen2", "fnv", @@ -6592,6 +6725,7 @@ dependencies = [ "ahash 0.8.11", "aquamarine", "arrayref", + "assert_matches", "base64 0.22.1", "bincode", "blake3", @@ -7966,6 +8100,12 @@ name = "solana-unified-scheduler-logic" version = "2.2.0" dependencies = [ "assert_matches", + "bitfield-struct", + "by_address", + "dary_heap", + "enum-ptr", + "more-asserts", + "rclite", "solana-runtime-transaction", "solana-sdk", "static_assertions", @@ -7975,20 +8115,32 @@ dependencies = [ name = "solana-unified-scheduler-pool" version = "2.2.0" dependencies = [ + "ahash 0.8.11", "assert_matches", + "cpu-time", "crossbeam-channel", "dashmap", "derive-where", + "derive_more 1.0.0", + "dyn-clone", + "enum-ptr", "log", - "qualifier_attr", "scopeguard", + "serde", + "solana-cost-model", + "solana-feature-set", "solana-ledger", + "solana-perf", + "solana-poh", + "solana-program-runtime", "solana-runtime", "solana-runtime-transaction", "solana-sdk", + "solana-svm", "solana-timings", "solana-unified-scheduler-logic", "static_assertions", + "trait-set", "vec_extract_if_polyfill", ] @@ -8542,9 +8694,9 @@ checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a" [[package]] name = "strsim" -version = "0.10.0" +version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" [[package]] name = "strum" @@ -8817,27 +8969,6 @@ dependencies = [ "once_cell", ] -[[package]] -name = "tikv-jemalloc-sys" -version = "0.4.3+5.2.1-patched.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1792ccb507d955b46af42c123ea8863668fae24d03721e40cad6a41773dbb49" -dependencies = [ - "cc", - "fs_extra", - "libc", -] - -[[package]] -name = "tikv-jemallocator" -version = "0.4.3" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "a5b7bcecfafe4998587d636f9ae9d55eb9d0499877b88757767c346875067098" -dependencies = [ - "libc", - "tikv-jemalloc-sys", -] - [[package]] name = "time" version = "0.3.36" @@ -9206,6 +9337,17 @@ dependencies = [ "tracing-core", ] +[[package]] +name = "trait-set" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b79e2e9c9ab44c6d7c20d5976961b47e8f49ac199154daa514b77cd1ab536625" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", +] + [[package]] name = "trees" version = "0.4.2" @@ -9281,6 +9423,12 @@ dependencies = [ "tinyvec", ] +[[package]] +name = "unicode-segmentation" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6ccf251212114b54433ec949fd6a7841275f9ada20dddd2f29e9ceea4501493" + [[package]] name = "unicode-width" version = "0.1.8" diff --git a/runtime-transaction/src/compute_budget_instruction_details.rs b/runtime-transaction/src/compute_budget_instruction_details.rs index 5b3ef8de0e7..1ac9ac235db 100644 --- a/runtime-transaction/src/compute_budget_instruction_details.rs +++ b/runtime-transaction/src/compute_budget_instruction_details.rs @@ -14,8 +14,7 @@ use { }; #[cfg_attr(test, derive(Eq, PartialEq))] -#[cfg_attr(feature = "dev-context-only-utils", derive(Clone))] -#[derive(Default, Debug)] +#[derive(Clone, Default, Debug)] pub struct ComputeBudgetInstructionDetails { // compute-budget instruction details: // the first field in tuple is instruction index, second field is the unsanitized value set by user diff --git a/runtime-transaction/src/runtime_transaction.rs b/runtime-transaction/src/runtime_transaction.rs index c4fa24d99ce..599264be790 100644 --- a/runtime-transaction/src/runtime_transaction.rs +++ b/runtime-transaction/src/runtime_transaction.rs @@ -30,8 +30,7 @@ use { mod sdk_transactions; mod transaction_view; -#[cfg_attr(feature = "dev-context-only-utils", derive(Clone))] -#[derive(Debug)] +#[derive(Clone, Debug)] pub struct RuntimeTransaction { transaction: T, // transaction meta is a collection of fields, it is updated diff --git a/runtime-transaction/src/transaction_meta.rs b/runtime-transaction/src/transaction_meta.rs index d779fbd15cb..564bff04a50 100644 --- a/runtime-transaction/src/transaction_meta.rs +++ b/runtime-transaction/src/transaction_meta.rs @@ -32,8 +32,7 @@ pub trait StaticMeta { /// on-chain ALT, examples are: transaction usage costs, nonce account. 
pub trait DynamicMeta: StaticMeta {} -#[cfg_attr(feature = "dev-context-only-utils", derive(Clone))] -#[derive(Debug)] +#[derive(Clone, Debug)] pub struct TransactionMeta { pub(crate) message_hash: Hash, pub(crate) is_simple_vote_transaction: bool, diff --git a/runtime/Cargo.toml b/runtime/Cargo.toml index b7eefcea5a7..f3fd96ef7e9 100644 --- a/runtime/Cargo.toml +++ b/runtime/Cargo.toml @@ -13,6 +13,7 @@ edition = { workspace = true } ahash = { workspace = true } aquamarine = { workspace = true } arrayref = { workspace = true } +assert_matches = { workspace = true } base64 = { workspace = true } bincode = { workspace = true } blake3 = { workspace = true } @@ -104,7 +105,6 @@ name = "solana_runtime" [dev-dependencies] agave-transaction-view = { workspace = true } -assert_matches = { workspace = true } ed25519-dalek = { workspace = true } libsecp256k1 = { workspace = true } memoffset = { workspace = true } diff --git a/runtime/src/bank.rs b/runtime/src/bank.rs index 68f735b3d01..7087a5d8568 100644 --- a/runtime/src/bank.rs +++ b/runtime/src/bank.rs @@ -144,6 +144,7 @@ use { rent_debits::RentDebits, reserved_account_keys::ReservedAccountKeys, reward_info::RewardInfo, + scheduling::MaxAge, signature::{Keypair, Signature}, slot_hashes::SlotHashes, slot_history::{Check, SlotHistory}, @@ -256,7 +257,7 @@ struct RentMetrics { pub type BankStatusCache = StatusCache<Result<()>>; #[cfg_attr( feature = "frozen-abi", - frozen_abi(digest = "BHg4qpwegtaJypLUqAdjQYzYeLfEGf6tA4U5cREbHMHi") + frozen_abi(digest = "Fj6ATu6Rr5ossAykzbRSkCsuUzjdAZbYo5JaqfR1A72G") )] pub type BankSlotDelta = SlotDelta<Result<()>>; @@ -4174,7 +4175,8 @@ impl Bank { ) -> Vec<TransactionCommitResult> { assert!( !self.freeze_started(), - "commit_transactions() working on a bank that is already frozen or is undergoing freezing!" + "commit_transactions() working on a bank (slot: {}) that is already frozen or is undergoing freezing!", + self.slot(), ); let ProcessedTransactionCounts { @@ -5029,6 +5031,29 @@ impl Bank { timings: &mut ExecuteTimings, log_messages_bytes_limit: Option<usize>, ) -> (Vec<TransactionCommitResult>, TransactionBalancesSet) { + self.do_load_execute_and_commit_transactions( + batch, + max_age, + collect_balances, + recording_config, + timings, + log_messages_bytes_limit, + None::<fn() -> bool>, + ) + .unwrap() + } + + #[must_use] + pub fn do_load_execute_and_commit_transactions( + &self, + batch: &TransactionBatch, + max_age: usize, + collect_balances: bool, + recording_config: ExecutionRecordingConfig, + timings: &mut ExecuteTimings, + log_messages_bytes_limit: Option<usize>, + pre_commit_callback: Option<impl FnOnce() -> bool>, + ) -> Option<(Vec<TransactionCommitResult>, TransactionBalancesSet)> { let pre_balances = if collect_balances { self.collect_balances(batch) } else { @@ -5054,6 +5079,15 @@ impl Bank { }, ); + if let Some(pre_commit_callback) = pre_commit_callback { + if let Some(e) = processing_results.first() { + assert_eq!(processing_results.len(), 1); + if e.is_ok() && !pre_commit_callback() { + return None; + } + } + } + let commit_results = self.commit_transactions( batch.sanitized_transactions(), processing_results, @@ -5065,10 +5099,48 @@ impl Bank { } else { vec![] }; - ( + Some(( commit_results, TransactionBalancesSet::new(pre_balances, post_balances), - ) + )) + } + + pub fn refilter_prebuilt_block_production_transaction( + &self, + tx: &impl TransactionWithMeta, + max_age: &MaxAge, + move_precompile_verification_to_svm: bool, + ) -> Result<()> { + // Need to filter out transactions since they were sanitized earlier.
+ // This means that the transaction may cross an epoch boundary (not allowed), + // or account lookup tables may have been closed. + + // If the transaction was sanitized before this bank's epoch, + // additional checks are necessary. + if self.epoch() != max_age.sanitized_epoch { + // Reserved key set may have changed, so we must verify that + // no writable keys are reserved. + self.check_reserved_keys(tx)?; + } + + if self.slot() > max_age.alt_invalidation_slot { + // The address table lookup **may** have expired, but the + // expiration is not guaranteed since there may have been + // skipped slots. + // If the addresses still resolve here, then the transaction is still + // valid, and we can continue with processing. + // If they do not, then the ATL has expired and the transaction + // can be dropped. + let (_addresses, _deactivation_slot) = + self.load_addresses_from_ref(tx.message_address_table_lookups())?; + } + + // Verify pre-compiles. + if !move_precompile_verification_to_svm { + verify_precompiles(tx, &self.feature_set)?; + } + + Ok(()) } /// Process a Transaction. This is used for unit tests and simply calls the vector @@ -5750,6 +5822,10 @@ impl Bank { hash } + pub fn collector_fees(&self) -> u64 { + self.collector_fees.load(Relaxed) + } + /// The epoch accounts hash is hashed into the bank's hash once per epoch at a predefined slot. /// Should it be included in *this* bank? fn should_include_epoch_accounts_hash(&self) -> bool { diff --git a/runtime/src/bank_forks.rs b/runtime/src/bank_forks.rs index 83c2e0ab3fd..4ff20a1503f 100644 --- a/runtime/src/bank_forks.rs +++ b/runtime/src/bank_forks.rs @@ -17,6 +17,7 @@ use { solana_sdk::{ clock::{BankId, Slot}, hash::Hash, + scheduling::SchedulingMode, }, std::{ collections::{hash_map::Entry, HashMap, HashSet}, @@ -226,18 +227,22 @@ impl BankForks { ); } - pub fn insert(&mut self, mut bank: Bank) -> BankWithScheduler { + pub fn insert(&mut self, bank: Bank) -> BankWithScheduler { + self.insert_with_scheduling_mode(SchedulingMode::BlockVerification, bank) + } + + pub fn insert_with_scheduling_mode( + &mut self, + mode: SchedulingMode, + mut bank: Bank, + ) -> BankWithScheduler { if self.root.load(Ordering::Relaxed) < self.highest_slot_at_startup { bank.set_check_program_modification_slot(true); } let bank = Arc::new(bank); let bank = if let Some(scheduler_pool) = &self.scheduler_pool { - let context = SchedulingContext::new(bank.clone()); - let scheduler = scheduler_pool.take_scheduler(context); - let bank_with_scheduler = BankWithScheduler::new(bank, Some(scheduler)); - scheduler_pool.register_timeout_listener(bank_with_scheduler.create_timeout_listener()); - bank_with_scheduler + Self::install_scheduler_into_bank(scheduler_pool, mode, bank, false) } else { BankWithScheduler::new_without_scheduler(bank) }; @@ -251,6 +256,42 @@ impl BankForks { bank } + fn install_scheduler_into_bank( + scheduler_pool: &InstalledSchedulerPoolArc, + mode: SchedulingMode, + bank: Arc<Bank>, + is_reinstall: bool, + ) -> BankWithScheduler { + trace!( + "Inserting bank (slot: {}) with scheduler (mode: {:?}, reinstall: {:?})", + bank.slot(), + mode, + is_reinstall, + ); + let context = SchedulingContext::new(mode, Some(bank.clone())); + let Some(scheduler) = scheduler_pool.take_scheduler(context) else { + return BankWithScheduler::new_without_scheduler(bank); + }; + let bank_with_scheduler = BankWithScheduler::new(bank, Some(scheduler)); + scheduler_pool.register_timeout_listener(bank_with_scheduler.create_timeout_listener()); + bank_with_scheduler + } + +
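For orientation, here is a minimal sketch (not part of the patch) of how a block-production caller might drive the `pre_commit_callback` hook added to `Bank::do_load_execute_and_commit_transactions` above: record to PoH between execution and commit, and veto the commit when recording fails. The `bank`, single-transaction `batch`, `recorder`, and `tx` bindings are assumed to be in scope; everything below is illustrative rather than the PR's actual call site.

    // Hypothetical usage sketch; names and setup are assumptions, not part of this diff.
    let mut timings = ExecuteTimings::default();
    let committed = bank.do_load_execute_and_commit_transactions(
        &batch,              // the callback path asserts the batch holds exactly one transaction
        MAX_PROCESSING_AGE,
        false,               // collect_balances
        ExecutionRecordingConfig::new_single_setting(false),
        &mut timings,
        None,                // log_messages_bytes_limit
        Some(|| {
            // Record to PoH first; commit only if the entry reached the ledger.
            recorder
                .record_transactions(bank.slot(), vec![tx.to_versioned_transaction()])
                .result
                .is_ok()
        }),
    );
    if committed.is_none() {
        // The callback vetoed the commit; a caller could surface this as
        // TransactionError::CommitFailed (added later in this patch).
    }
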
#[cfg(feature = "dev-context-only-utils")] + pub fn reinstall_block_production_scheduler_into_working_genesis_bank( + &mut self, + ) -> BankWithScheduler { + let bank = self.working_bank(); + assert!(self.banks.len() == 1 && bank.slot() == 0 && !bank.is_frozen()); + let pool = self.scheduler_pool.as_ref().unwrap(); + let mode = SchedulingMode::BlockProduction; + let bank = Self::install_scheduler_into_bank(pool, mode, bank, true); + self.banks + .insert(bank.slot(), bank.clone_with_scheduler()) + .expect("some removed bank"); + bank + } + pub fn insert_from_ledger(&mut self, bank: Bank) -> BankWithScheduler { self.highest_slot_at_startup = std::cmp::max(self.highest_slot_at_startup, bank.slot()); self.insert(bank) @@ -284,8 +325,8 @@ impl BankForks { self[self.highest_slot()].clone() } - pub fn working_bank_with_scheduler(&self) -> &BankWithScheduler { - &self.banks[&self.highest_slot()] + pub fn working_bank_with_scheduler(&self) -> BankWithScheduler { + self.banks[&self.highest_slot()].clone_with_scheduler() } /// Register to be notified when a bank has been dumped (due to duplicate block handling) diff --git a/runtime/src/installed_scheduler_pool.rs b/runtime/src/installed_scheduler_pool.rs index 9aa4a20e09c..437acb5efd6 100644 --- a/runtime/src/installed_scheduler_pool.rs +++ b/runtime/src/installed_scheduler_pool.rs @@ -22,11 +22,13 @@ use { crate::bank::Bank, + assert_matches::assert_matches, log::*, solana_runtime_transaction::runtime_transaction::RuntimeTransaction, solana_sdk::{ clock::Slot, hash::Hash, + scheduling::{SchedulingMode, TaskKey}, transaction::{Result, SanitizedTransaction, TransactionError}, }, solana_timings::ExecuteTimings, @@ -46,7 +48,7 @@ pub fn initialized_result_with_timings() -> ResultWithTimings { } pub trait InstalledSchedulerPool: Send + Sync + Debug { - fn take_scheduler(&self, context: SchedulingContext) -> InstalledSchedulerBox { + fn take_scheduler(&self, context: SchedulingContext) -> Option { self.take_resumed_scheduler(context, initialized_result_with_timings()) } @@ -54,7 +56,7 @@ pub trait InstalledSchedulerPool: Send + Sync + Debug { &self, context: SchedulingContext, result_with_timings: ResultWithTimings, - ) -> InstalledSchedulerBox; + ) -> Option; fn register_timeout_listener(&self, timeout_listener: TimeoutListener); } @@ -165,7 +167,7 @@ pub trait InstalledScheduler: Send + Sync + Debug + 'static { fn schedule_execution( &self, transaction: RuntimeTransaction, - index: usize, + index: TaskKey, ) -> ScheduleResult; /// Return the error which caused the scheduler to abort. @@ -227,21 +229,30 @@ pub type SchedulerId = u64; /// `SchedulingContext`s. #[derive(Clone, Debug)] pub struct SchedulingContext { - // mode: SchedulingMode, // this will be added later. - bank: Arc, + mode: SchedulingMode, + bank: Option>, } impl SchedulingContext { - pub fn new(bank: Arc) -> Self { - Self { bank } + pub fn new(mode: SchedulingMode, bank: Option>) -> Self { + Self { mode, bank } + } + + #[cfg_attr(feature = "dev-context-only-utils", qualifiers(pub))] + fn for_verification(bank: Arc) -> Self { + Self::new(SchedulingMode::BlockVerification, Some(bank)) + } + + pub fn mode(&self) -> SchedulingMode { + self.mode } pub fn bank(&self) -> &Arc { - &self.bank + self.bank.as_ref().unwrap() } - pub fn slot(&self) -> Slot { - self.bank().slot() + pub fn slot(&self) -> Option { + self.bank.as_ref().map(|bank| bank.slot()) } } @@ -301,7 +312,7 @@ pub enum SchedulerStatus { /// Scheduler is idling for long time, returning scheduler back to the pool. 
/// This will be immediately (i.e. transparently) transitioned to Active as soon as there's /// a new transaction to be executed. - Stale(InstalledSchedulerPoolArc, ResultWithTimings), + Stale(InstalledSchedulerPoolArc, SchedulingMode, ResultWithTimings), } impl SchedulerStatus { @@ -312,13 +323,31 @@ impl SchedulerStatus { } } + fn scheduling_mode(&self) -> Option<SchedulingMode> { + match self { + SchedulerStatus::Unavailable => None, + SchedulerStatus::Active(scheduler) => Some(scheduler.context().mode()), + SchedulerStatus::Stale(_, mode, _) => Some(*mode), + } + } + + fn status(&self) -> String { + match self { + SchedulerStatus::Unavailable => "Unavailable".to_owned(), + SchedulerStatus::Active(scheduler) => format!("Active({})", scheduler.id()), + SchedulerStatus::Stale(_, _, _) => "Stale".to_owned(), + } + } + fn transition_from_stale_to_active( &mut self, f: impl FnOnce(InstalledSchedulerPoolArc, ResultWithTimings) -> InstalledSchedulerBox, ) { - let Self::Stale(pool, result_with_timings) = mem::replace(self, Self::Unavailable) else { + let Self::Stale(pool, mode, result_with_timings) = mem::replace(self, Self::Unavailable) + else { panic!("transition to Active failed: {self:?}"); }; + assert_matches!(mode, SchedulingMode::BlockVerification); *self = Self::Active(f(pool, result_with_timings)); } @@ -332,8 +361,9 @@ impl SchedulerStatus { let Self::Active(scheduler) = mem::replace(self, Self::Unavailable) else { unreachable!("not active: {self:?}"); }; + let mode = scheduler.context().mode; let (pool, result_with_timings) = f(scheduler); - *self = Self::Stale(pool, result_with_timings); + *self = Self::Stale(pool, mode, result_with_timings); } fn transition_from_active_to_unavailable(&mut self) -> InstalledSchedulerBox { @@ -344,7 +374,8 @@ impl SchedulerStatus { } fn transition_from_stale_to_unavailable(&mut self) -> ResultWithTimings { - let Self::Stale(_pool, result_with_timings) = mem::replace(self, Self::Unavailable) else { + let Self::Stale(_pool, _mode, result_with_timings) = mem::replace(self, Self::Unavailable) + else { panic!("transition to Unavailable failed: {self:?}"); }; result_with_timings @@ -445,7 +476,7 @@ impl BankWithScheduler { pub fn schedule_transaction_executions( &self, transactions_with_indexes: impl ExactSizeIterator< - Item = (RuntimeTransaction<SanitizedTransaction>, usize), + Item = (RuntimeTransaction<SanitizedTransaction>, TaskKey), >, ) -> Result<()> { trace!( @@ -454,6 +485,10 @@ impl BankWithScheduler { ); let schedule_result: ScheduleResult = self.inner.with_active_scheduler(|scheduler| { + assert_matches!( + scheduler.context().mode(), + SchedulingMode::BlockVerification + ); for (sanitized_transaction, index) in transactions_with_indexes { scheduler.schedule_execution(sanitized_transaction, index)?; } @@ -507,9 +542,20 @@ impl BankWithScheduler { ) } + pub fn scheduling_mode(&self) -> Option<SchedulingMode> { + self.inner.scheduler.read().unwrap().scheduling_mode() + } + pub const fn no_scheduler_available() -> InstalledSchedulerRwLock { RwLock::new(SchedulerStatus::Unavailable) } + + pub fn id_and_slot_with_scheduler_status(&self) -> (SchedulerId, String) { + ( + self.inner.bank.slot(), + self.inner.scheduler.read().unwrap().status(), + ) + } } impl BankWithSchedulerInner { @@ -523,22 +569,26 @@ impl BankWithSchedulerInner { // This is the fast path, needing a single read-lock most of the time.
f(scheduler) } - SchedulerStatus::Stale(_pool, (result, _timings)) if result.is_err() => { + SchedulerStatus::Stale(_pool, mode, (result, _timings)) if result.is_err() => { + assert_matches!(mode, SchedulingMode::BlockVerification); trace!( "with_active_scheduler: bank (slot: {}) has a stale aborted scheduler...", self.bank.slot(), ); Err(SchedulerAborted) } - SchedulerStatus::Stale(pool, _result_with_timings) => { + SchedulerStatus::Stale(pool, mode, _result_with_timings) => { + assert_matches!(mode, SchedulingMode::BlockVerification); let pool = pool.clone(); drop(scheduler); - let context = SchedulingContext::new(self.bank.clone()); + let context = SchedulingContext::for_verification(self.bank.clone()); let mut scheduler = self.scheduler.write().unwrap(); trace!("with_active_scheduler: {:?}", scheduler); scheduler.transition_from_stale_to_active(|pool, result_with_timings| { - let scheduler = pool.take_resumed_scheduler(context, result_with_timings); + let scheduler = pool + .take_resumed_scheduler(context, result_with_timings) + .expect("successful retaking"); info!( "with_active_scheduler: bank (slot: {}) got active, taking scheduler (id: {})", self.bank.slot(), @@ -571,23 +621,24 @@ impl BankWithSchedulerInner { return; }; + let mut id = None; scheduler.maybe_transition_from_active_to_stale(|scheduler| { // The scheduler hasn't still been wait_for_termination()-ed after awhile... // Return the installed scheduler back to the scheduler pool as soon as the // scheduler gets idle after executing all currently-scheduled transactions. - let id = scheduler.id(); + id = Some(scheduler.id()); let (result_with_timings, uninstalled_scheduler) = scheduler.wait_for_termination(false); uninstalled_scheduler.return_to_pool(); info!( - "timeout_listener: bank (slot: {}) got stale, returning scheduler (id: {})", + "timeout_listener: bank (slot: {}) got stale, returned scheduler (id: {:?})", bank.bank.slot(), id, ); (pool, result_with_timings) }); - trace!("timeout_listener: {:?}", scheduler); + trace!("timeout_listener: {:?}", id); }) } @@ -597,7 +648,8 @@ impl BankWithSchedulerInner { let mut scheduler = self.scheduler.write().unwrap(); match &mut *scheduler { SchedulerStatus::Active(scheduler) => scheduler.recover_error_after_abort(), - SchedulerStatus::Stale(_pool, (result, _timings)) if result.is_err() => { + SchedulerStatus::Stale(_pool, mode, (result, _timings)) if result.is_err() => { + assert_matches!(mode, SchedulingMode::BlockVerification); result.clone().unwrap_err() } _ => unreachable!("no error in {:?}", self.scheduler), @@ -639,12 +691,12 @@ impl BankWithSchedulerInner { uninstalled_scheduler.return_to_pool(); (false, Some(result_with_timings)) } - SchedulerStatus::Stale(_pool, _result_with_timings) if reason.is_paused() => { + SchedulerStatus::Stale(_pool, _mode, _result_with_timings) if reason.is_paused() => { // Do nothing for pauses because the scheduler termination is guaranteed to be // called later. 
(true, None) } - SchedulerStatus::Stale(_pool, _result_with_timings) => { + SchedulerStatus::Stale(_pool, _mode, _result_with_timings) => { let result_with_timings = scheduler.transition_from_stale_to_unavailable(); (true, Some(result_with_timings)) } @@ -711,7 +763,6 @@ mod tests { bank::test_utils::goto_end_of_slot_with_scheduler, genesis_utils::{create_genesis_config, GenesisConfigInfo}, }, - assert_matches::assert_matches, mockall::Sequence, solana_sdk::system_transaction, std::sync::Mutex, @@ -721,14 +772,16 @@ mod tests { bank: Arc, is_dropped_flags: impl Iterator, f: Option, + extra_context_use: usize, ) -> InstalledSchedulerBox { let mut mock = MockInstalledScheduler::new(); let seq = Arc::new(Mutex::new(Sequence::new())); + // Could be used for assertions in BankWithScheduler::{new, schedule_transaction_executions} mock.expect_context() - .times(1) + .times(1 + extra_context_use) .in_sequence(&mut seq.lock().unwrap()) - .return_const(SchedulingContext::new(bank)); + .return_const(SchedulingContext::for_verification(bank)); for wait_reason in is_dropped_flags { let seq_cloned = seq.clone(); @@ -765,6 +818,7 @@ mod tests { bank, is_dropped_flags, None:: ()>, + 0, ) } @@ -826,6 +880,7 @@ mod tests { .times(1) .returning(|| ()); }), + 0, )), ); goto_end_of_slot_with_scheduler(&bank); @@ -867,6 +922,7 @@ mod tests { .returning(|| TransactionError::InsufficientFundsForFee); } }), + 1, ); let bank = BankWithScheduler::new(bank, Some(mocked_scheduler)); diff --git a/sdk/frozen-abi/src/abi_example.rs b/sdk/frozen-abi/src/abi_example.rs index 63b3c1d68c2..46eae5a5ee4 100644 --- a/sdk/frozen-abi/src/abi_example.rs +++ b/sdk/frozen-abi/src/abi_example.rs @@ -189,6 +189,13 @@ example_impls! { i32, 0 } example_impls! { i64, 0 } example_impls! { i128, 0 } +impl AbiExample for std::num::Saturating { + fn example() -> Self { + info!("AbiExample for (Saturating): {}", type_name::()); + std::num::Saturating(T::example()) + } +} + example_impls! { f32, 0.0f32 } example_impls! { f64, 0.0f64 } example_impls! { String, String::new() } diff --git a/sdk/src/lib.rs b/sdk/src/lib.rs index 752ad612a5c..6d230318bbd 100644 --- a/sdk/src/lib.rs +++ b/sdk/src/lib.rs @@ -92,6 +92,7 @@ pub mod reward_type { pub use solana_reward_info::RewardType; } pub mod rpc_port; +pub mod scheduling; pub mod shred_version; pub mod signature; pub mod signer; diff --git a/sdk/src/scheduling.rs b/sdk/src/scheduling.rs new file mode 100644 index 00000000000..210dfd2c06b --- /dev/null +++ b/sdk/src/scheduling.rs @@ -0,0 +1,25 @@ +//! Primitive types relevant to transaction scheduling +#![cfg(feature = "full")] + +use solana_sdk::clock::{Epoch, Slot}; + +#[derive(Debug, Clone, Copy, PartialEq)] +pub enum SchedulingMode { + BlockVerification, + BlockProduction, +} + +pub type TaskKey = u128; + +#[derive(Copy, Clone, Debug, PartialEq, Eq)] +pub struct MaxAge { + pub sanitized_epoch: Epoch, + pub alt_invalidation_slot: Slot, +} + +impl MaxAge { + pub const MAX: Self = Self { + sanitized_epoch: Epoch::MAX, + alt_invalidation_slot: Slot::MAX, + }; +} diff --git a/sdk/transaction-error/src/lib.rs b/sdk/transaction-error/src/lib.rs index 433a48b0122..db08f4fe6ed 100644 --- a/sdk/transaction-error/src/lib.rs +++ b/sdk/transaction-error/src/lib.rs @@ -137,6 +137,9 @@ pub enum TransactionError { /// Program cache hit max limit. ProgramCacheHitMaxLimit, + + /// Commit failed internally. 
+ CommitFailed, } impl std::error::Error for TransactionError {} @@ -220,6 +223,8 @@ impl fmt::Display for TransactionError { => f.write_str("Sum of account balances before and after transaction do not match"), Self::ProgramCacheHitMaxLimit => f.write_str("Program cache hit max limit"), + Self::CommitFailed + => f.write_str("CommitFailed"), } } } diff --git a/sdk/transaction/src/sanitized.rs b/sdk/transaction/src/sanitized.rs index 2d0f3639c2c..81f209030da 100644 --- a/sdk/transaction/src/sanitized.rs +++ b/sdk/transaction/src/sanitized.rs @@ -1,3 +1,5 @@ +#[cfg(any(feature = "verify", feature = "precompiles"))] +use solana_transaction_error::TransactionError; use { crate::versioned::{sanitized::SanitizedVersionedTransaction, VersionedTransaction}, solana_hash::Hash, @@ -9,7 +11,7 @@ use { }, solana_pubkey::Pubkey, solana_signature::Signature, - solana_transaction_error::{TransactionError, TransactionResult as Result}, + solana_transaction_error::TransactionResult as Result, std::collections::HashSet, }; #[cfg(feature = "blake3")] @@ -214,38 +216,6 @@ impl SanitizedTransaction { } } - /// Validate and return the account keys locked by this transaction - pub fn get_account_locks( - &self, - tx_account_lock_limit: usize, - ) -> Result { - Self::validate_account_locks(self.message(), tx_account_lock_limit)?; - Ok(self.get_account_locks_unchecked()) - } - - /// Return the list of accounts that must be locked during processing this transaction. - pub fn get_account_locks_unchecked(&self) -> TransactionAccountLocks { - let message = &self.message; - let account_keys = message.account_keys(); - let num_readonly_accounts = message.num_readonly_accounts(); - let num_writable_accounts = account_keys.len().saturating_sub(num_readonly_accounts); - - let mut account_locks = TransactionAccountLocks { - writable: Vec::with_capacity(num_writable_accounts), - readonly: Vec::with_capacity(num_readonly_accounts), - }; - - for (i, key) in account_keys.iter().enumerate() { - if message.is_writable(i) { - account_locks.writable.push(key); - } else { - account_locks.readonly.push(key); - } - } - - account_locks - } - /// Return the list of addresses loaded from on-chain address lookup tables pub fn get_loaded_addresses(&self) -> LoadedAddresses { match &self.message { @@ -308,20 +278,6 @@ impl SanitizedTransaction { Ok(()) } - /// Validate a transaction message against locked accounts - pub fn validate_account_locks( - message: &SanitizedMessage, - tx_account_lock_limit: usize, - ) -> Result<()> { - if message.has_duplicates() { - Err(TransactionError::AccountLoadedTwice) - } else if message.account_keys().len() > tx_account_lock_limit { - Err(TransactionError::TooManyAccountLocks) - } else { - Ok(()) - } - } - #[cfg(feature = "dev-context-only-utils")] pub fn new_for_tests( message: SanitizedMessage, diff --git a/storage-proto/proto/transaction_by_addr.proto b/storage-proto/proto/transaction_by_addr.proto index d0fa74a2104..c4025dbafe8 100644 --- a/storage-proto/proto/transaction_by_addr.proto +++ b/storage-proto/proto/transaction_by_addr.proto @@ -63,6 +63,7 @@ enum TransactionErrorType { PROGRAM_EXECUTION_TEMPORARILY_RESTRICTED = 35; UNBALANCED_TRANSACTION = 36; PROGRAM_CACHE_HIT_MAX_LIMIT = 37; + COMMIT_FAILED = 38; } message InstructionError { diff --git a/storage-proto/src/convert.rs b/storage-proto/src/convert.rs index 6a6e451b485..55a54c3d06d 100644 --- a/storage-proto/src/convert.rs +++ b/storage-proto/src/convert.rs @@ -852,6 +852,7 @@ impl TryFrom for TransactionError { 34 => 
TransactionError::ResanitizationNeeded, 36 => TransactionError::UnbalancedTransaction, 37 => TransactionError::ProgramCacheHitMaxLimit, + 38 => TransactionError::CommitFailed, _ => return Err("Invalid TransactionError"), }) } @@ -973,6 +974,9 @@ impl From for tx_by_addr::TransactionError { TransactionError::ProgramCacheHitMaxLimit => { tx_by_addr::TransactionErrorType::ProgramCacheHitMaxLimit } + TransactionError::CommitFailed => { + tx_by_addr::TransactionErrorType::CommitFailed + } } as i32, instruction_error: match transaction_error { TransactionError::InstructionError(index, ref instruction_error) => { diff --git a/svm/examples/Cargo.lock b/svm/examples/Cargo.lock index 7bd58e61e95..cfc9b8f4942 100644 --- a/svm/examples/Cargo.lock +++ b/svm/examples/Cargo.lock @@ -588,6 +588,17 @@ dependencies = [ "syn 2.0.87", ] +[[package]] +name = "bitfield-struct" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "de05f8756f1c68937349406d4632ae96ae35901019b5e59c508d9c38c64715fb" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.87", +] + [[package]] name = "bitflags" version = "1.3.2" @@ -712,6 +723,12 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "branches" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7958fb9748a08a6f46ef773e87c43997a844709bc293b4c3de48135debaf9d2a" + [[package]] name = "brotli" version = "7.0.0" @@ -768,6 +785,12 @@ dependencies = [ "serde", ] +[[package]] +name = "by_address" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "64fa3c856b712db6612c019f14756e64e4bcea13337a6b33b696333a9eaa2d06" + [[package]] name = "bytemuck" version = "1.20.0" @@ -1023,6 +1046,15 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6245d59a3e82a7fc217c5828a6692dbc6dfb63a0c8c90495621f7b9d79704a0e" +[[package]] +name = "convert_case" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec182b0ca2f35d8fc196cf3404988fd8b8c739a4d270ff118a398feb0cbec1ca" +dependencies = [ + "unicode-segmentation", +] + [[package]] name = "core-foundation" version = "0.9.4" @@ -1051,6 +1083,16 @@ dependencies = [ "winapi 0.2.8", ] +[[package]] +name = "cpu-time" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e9e393a7668fe1fad3075085b86c781883000b4ede868f43627b34a87c8b7ded" +dependencies = [ + "libc", + "winapi 0.3.9", +] + [[package]] name = "cpufeatures" version = "0.2.15" @@ -1220,6 +1262,12 @@ dependencies = [ "syn 2.0.87", ] +[[package]] +name = "dary_heap" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04d2cd9c18b9f454ed67da600630b021a8a80bf33f8c95896ab33aaf1c26b728" + [[package]] name = "dashmap" version = "5.5.3" @@ -1297,13 +1345,35 @@ version = "0.99.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5f33878137e4dafd7fa914ad4e259e18a4e8e532b9617a2d0150262bf53abfce" dependencies = [ - "convert_case", + "convert_case 0.4.0", "proc-macro2", "quote", "rustc_version", "syn 2.0.87", ] +[[package]] +name = "derive_more" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4a9b99b9cbbe49445b21764dc0625032a89b145a2642e67603e1c936f5458d05" +dependencies = [ + "derive_more-impl", +] + +[[package]] +name = "derive_more-impl" +version = "1.0.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb7330aeadfbe296029522e6c40f315320aba36fc43a5b3632f3795348f3bd22" +dependencies = [ + "convert_case 0.6.0", + "proc-macro2", + "quote", + "syn 2.0.87", + "unicode-xid", +] + [[package]] name = "dialoguer" version = "0.10.4" @@ -1412,6 +1482,12 @@ version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1435fa1053d8b2fbbe9be7e97eca7f33d37b28409959813daefc1446a14247f1" +[[package]] +name = "dyn-clone" +version = "1.0.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0d6ef0072f8a535281e4876be788938b528e9a1d43900b82c2569af7da799125" + [[package]] name = "eager" version = "0.1.0" @@ -1519,6 +1595,26 @@ dependencies = [ "syn 2.0.87", ] +[[package]] +name = "enum-ptr" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b9955cf577337ddbfe2a03307e48bde02ae714346d587fd6f8bb5f262a7e574" +dependencies = [ + "enum-ptr-derive", +] + +[[package]] +name = "enum-ptr-derive" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "367a8dac40699e965e2fb8ac3b272b20058a107ac285b40041155227e4e93fba" +dependencies = [ + "darling", + "quote", + "syn 2.0.87", +] + [[package]] name = "env_logger" version = "0.9.3" @@ -2589,6 +2685,7 @@ dependencies = [ "serde", "serde_json", "solana-account-decoder", + "solana-accounts-db", "solana-bpf-loader-program", "solana-compute-budget", "solana-perf", @@ -2596,6 +2693,7 @@ dependencies = [ "solana-rpc-client-api", "solana-sdk", "solana-svm", + "solana-svm-transaction", "solana-system-program", "solana-transaction-status", "solana-version", @@ -2621,7 +2719,7 @@ version = "18.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d2b99d4207e2a04fb4581746903c2bb7eb376f88de9c699d0f3e10feeac0cd3a" dependencies = [ - "derive_more", + "derive_more 0.99.18", "futures 0.3.31", "jsonrpc-core", "jsonrpc-pubsub", @@ -3102,6 +3200,12 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "more-asserts" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fafa6961cabd9c63bcd77a45d7e3b7f3b552b70417831fb0f56db717e72407e" + [[package]] name = "multimap" version = "0.8.3" @@ -4029,6 +4133,15 @@ dependencies = [ "crossbeam-utils", ] +[[package]] +name = "rclite" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee9f0c2e8b8ef3ea8b0d074b9a0a192d99d47e2023bec8fd6336f2d8543a43b9" +dependencies = [ + "branches", +] + [[package]] name = "redox_syscall" version = "0.2.16" @@ -5226,6 +5339,7 @@ dependencies = [ "chrono", "crossbeam-channel", "dashmap", + "derive_more 1.0.0", "etcd-client", "futures 0.3.31", "histogram", @@ -5283,6 +5397,7 @@ dependencies = [ "solana-tpu-client", "solana-transaction-status", "solana-turbine", + "solana-unified-scheduler-logic", "solana-unified-scheduler-pool", "solana-version", "solana-vote", @@ -5873,6 +5988,7 @@ dependencies = [ "bincode", "bv", "caps", + "crossbeam-channel", "curve25519-dalek 4.1.3", "dlopen2", "fnv", @@ -6412,6 +6528,7 @@ dependencies = [ "ahash 0.8.11", "aquamarine", "arrayref", + "assert_matches", "base64 0.22.1", "bincode", "blake3", @@ -7311,6 +7428,12 @@ name = "solana-unified-scheduler-logic" version = "2.2.0" dependencies = [ "assert_matches", + "bitfield-struct", + "by_address", + "dary_heap", + "enum-ptr", + "more-asserts", + "rclite", "solana-runtime-transaction", "solana-sdk", 
"static_assertions", @@ -7320,20 +7443,32 @@ dependencies = [ name = "solana-unified-scheduler-pool" version = "2.2.0" dependencies = [ + "ahash 0.8.11", "assert_matches", + "cpu-time", "crossbeam-channel", "dashmap", "derive-where", + "derive_more 1.0.0", + "dyn-clone", + "enum-ptr", "log", - "qualifier_attr", "scopeguard", + "serde", + "solana-cost-model", + "solana-feature-set", "solana-ledger", + "solana-perf", + "solana-poh", + "solana-program-runtime", "solana-runtime", "solana-runtime-transaction", "solana-sdk", + "solana-svm", "solana-timings", "solana-unified-scheduler-logic", "static_assertions", + "trait-set", "vec_extract_if_polyfill", ] @@ -8520,6 +8655,17 @@ dependencies = [ "tracing-core", ] +[[package]] +name = "trait-set" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b79e2e9c9ab44c6d7c20d5976961b47e8f49ac199154daa514b77cd1ab536625" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", +] + [[package]] name = "trees" version = "0.4.2" @@ -8592,6 +8738,12 @@ dependencies = [ "tinyvec", ] +[[package]] +name = "unicode-segmentation" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6ccf251212114b54433ec949fd6a7841275f9ada20dddd2f29e9ceea4501493" + [[package]] name = "unicode-width" version = "0.1.14" diff --git a/svm/examples/Cargo.toml b/svm/examples/Cargo.toml index a5df7288e95..b7a9db2db9c 100644 --- a/svm/examples/Cargo.toml +++ b/svm/examples/Cargo.toml @@ -31,6 +31,7 @@ jsonrpc-http-server = "18.0.0" log = "0.4.22" serde = "1.0.214" serde_json = "1.0.132" +solana-accounts-db = { path = "../../accounts-db" } solana-account-decoder = { path = "../../account-decoder" } solana-bpf-loader-program = { path = "../../programs/bpf_loader" } solana-client = { path = "../../client" } @@ -41,6 +42,7 @@ solana-program-runtime = { path = "../../program-runtime" } solana-rpc-client-api = { path = "../../rpc-client-api" } solana-sdk = { path = "../../sdk/" } solana-svm = { path = "../" } +solana-svm-transaction = { path = "../../svm-transaction" } solana-system-program = { path = "../../programs/system" } solana-version = { path = "../../version" } solana-test-validator = { path = "../../test-validator" } diff --git a/svm/examples/json-rpc/server/Cargo.toml b/svm/examples/json-rpc/server/Cargo.toml index fd0558a5cf9..740bc09e12d 100644 --- a/svm/examples/json-rpc/server/Cargo.toml +++ b/svm/examples/json-rpc/server/Cargo.toml @@ -20,6 +20,7 @@ log = { workspace = true } serde = { workspace = true } serde_json = { workspace = true } solana-account-decoder = { workspace = true } +solana-accounts-db = { workspace = true } solana-bpf-loader-program = { workspace = true } solana-compute-budget = { workspace = true } solana-perf = { workspace = true } @@ -27,6 +28,7 @@ solana-program-runtime = { workspace = true } solana-rpc-client-api = { workspace = true } solana-sdk = { workspace = true } solana-svm = { workspace = true } +solana-svm-transaction = { workspace = true } solana-system-program = { workspace = true } solana-transaction-status = { workspace = true } solana-version = { workspace = true } diff --git a/svm/examples/json-rpc/server/src/rpc_process.rs b/svm/examples/json-rpc/server/src/rpc_process.rs index 280496f0903..c70c43a207c 100644 --- a/svm/examples/json-rpc/server/src/rpc_process.rs +++ b/svm/examples/json-rpc/server/src/rpc_process.rs @@ -15,6 +15,7 @@ use { parse_token::{get_token_account_mint, is_known_spl_token_id}, UiAccount, UiAccountEncoding, 
UiDataSliceConfig, MAX_BASE58_BYTES, }, + solana_accounts_db::account_locks::validate_account_locks, solana_compute_budget::compute_budget::ComputeBudget, solana_perf::packet::PACKET_DATA_SIZE, solana_program_runtime::loaded_programs::ProgramCacheEntry, @@ -56,6 +57,7 @@ use { TransactionProcessingConfig, TransactionProcessingEnvironment, }, }, + solana_svm_transaction::svm_message::SVMMessage, solana_system_program::system_processor, solana_transaction_status::{ map_inner_instructions, parse_ui_inner_instructions, TransactionBinaryEncoding, @@ -380,9 +382,7 @@ impl JsonRpcRequestProcessor { transaction: &'a SanitizedTransaction, ) -> TransactionBatch<'a> { let tx_account_lock_limit = solana_sdk::transaction::MAX_TX_ACCOUNT_LOCKS; - let lock_result = transaction - .get_account_locks(tx_account_lock_limit) - .map(|_| ()); + let lock_result = validate_account_locks(transaction.account_keys(), tx_account_lock_limit); let batch = TransactionBatch::new( vec![lock_result], std::borrow::Cow::Borrowed(std::slice::from_ref(transaction)), diff --git a/unified-scheduler-logic/Cargo.toml b/unified-scheduler-logic/Cargo.toml index b48fed86f11..4f7f64be186 100644 --- a/unified-scheduler-logic/Cargo.toml +++ b/unified-scheduler-logic/Cargo.toml @@ -11,6 +11,12 @@ edition = { workspace = true } [dependencies] assert_matches = { workspace = true } +bitfield-struct = "0.8.0" +by_address = "1.2.1" +dary_heap = "0.3.6" +enum-ptr = "0.2.0" +more-asserts = { workspace = true } +rclite = "0.2.4" solana-runtime-transaction = { workspace = true } solana-sdk = { workspace = true } static_assertions = { workspace = true } diff --git a/unified-scheduler-logic/src/lib.rs b/unified-scheduler-logic/src/lib.rs index 2e8caca3b85..1348d29e93c 100644 --- a/unified-scheduler-logic/src/lib.rs +++ b/unified-scheduler-logic/src/lib.rs @@ -1,4 +1,5 @@ #![allow(rustdoc::private_intra_doc_links)] +#![allow(clippy::mutable_key_type)] //! The task (transaction) scheduling code for the unified scheduler //! //! ### High-level API and design //! @@ -10,7 +11,7 @@ //! execute in parallel. Lastly, `SchedulingStateMachine` should be notified about the completion //! of the execution via [`::deschedule_task()`](SchedulingStateMachine::deschedule_task), so that //! conflicting tasks can be returned from -//! [`::schedule_next_unblocked_task()`](SchedulingStateMachine::schedule_next_unblocked_task) as +//! [`::schedule_next_buffered_task()`](SchedulingStateMachine::schedule_next_buffered_task) as //! newly-unblocked runnable ones. //! //! The design principle of this crate (`solana-unified-scheduler-logic`) is simplicity for the @@ -95,13 +96,24 @@ //! susceptible to the buffer bloat problem by itself as explained by the description and validated //! by the mentioned benchmark above. Thus, this should be solved elsewhere, specifically at the //! scheduler pool. +pub use utils::ShortCounter; use { - crate::utils::{ShortCounter, Token, TokenCell}, + crate::utils::{Token, TokenCell}, assert_matches::assert_matches, + by_address::ByAddress, + more_asserts::assert_gt, solana_runtime_transaction::runtime_transaction::RuntimeTransaction, - solana_sdk::{pubkey::Pubkey, transaction::SanitizedTransaction}, + solana_sdk::{ + pubkey::Pubkey, + scheduling::{MaxAge, SchedulingMode, TaskKey}, + transaction::SanitizedTransaction, + }, static_assertions::const_assert_eq, - std::{collections::VecDeque, mem, sync::Arc}, + std::{ + collections::{BTreeSet, HashSet}, + mem, + sync::Arc, + }, }; /// Internal utilities.
Namely this contains [`ShortCounter`] and [`TokenCell`]. @@ -118,50 +130,50 @@ mod utils { /// /// It's the caller's responsibility to ensure this (backed by [`u32`]) never overflows. #[derive(Debug, Clone, Copy)] - pub(super) struct ShortCounter(u32); + pub struct ShortCounter(u32); impl ShortCounter { - pub(super) fn zero() -> Self { + pub fn zero() -> Self { Self(0) } - pub(super) fn one() -> Self { + pub fn one() -> Self { Self(1) } - pub(super) fn is_one(&self) -> bool { - self.0 == 1 - } - - pub(super) fn is_zero(&self) -> bool { + pub fn is_zero(&self) -> bool { self.0 == 0 } - pub(super) fn current(&self) -> u32 { + pub fn current(&self) -> u32 { self.0 } #[must_use] - pub(super) fn increment(self) -> Self { + #[track_caller] + pub fn increment(self) -> Self { Self(self.0.checked_add(1).unwrap()) } #[must_use] - pub(super) fn decrement(self) -> Self { + #[track_caller] + pub fn decrement(self) -> Self { Self(self.0.checked_sub(1).unwrap()) } - pub(super) fn increment_self(&mut self) -> &mut Self { + #[track_caller] + pub fn increment_self(&mut self) -> &mut Self { *self = self.increment(); self } - pub(super) fn decrement_self(&mut self) -> &mut Self { + #[track_caller] + pub fn decrement_self(&mut self) -> &mut Self { *self = self.decrement(); self } - pub(super) fn reset_to_zero(&mut self) -> &mut Self { + pub fn reset_to_zero(&mut self) -> &mut Self { self.0 = 0; self } @@ -200,7 +212,7 @@ mod utils { /// time. Finally, this restriction is traded off for restoration of Rust aliasing rule at zero /// runtime cost. Without this token mechanism, there's no way to realize this. #[derive(Debug, Default)] - pub(super) struct TokenCell<V>(UnsafeCell<V>); + pub struct TokenCell<V>(UnsafeCell<V>); impl TokenCell { /// Creates a new `TokenCell` with the `value` typed as `V`. @@ -250,6 +262,7 @@ mod utils { /// existence of mutable access over them by requiring the token itself to be mutably borrowed /// to get a mutable reference to the internal value of `TokenCell`. // *mut is used to make this type !Send and !Sync + #[derive(Debug)] pub(super) struct Token<V: 'static>(PhantomData<*mut V>); impl Token { @@ -400,7 +413,50 @@ type LockResult = Result<(), ()>; const_assert_eq!(mem::size_of::(), 1); /// Something to be scheduled; usually a wrapper of [`SanitizedTransaction`]. -pub type Task = Arc; +#[derive(Clone, Debug)] +pub struct Task(rclite::Rc<TaskInner>); + +unsafe impl enum_ptr::Aligned for Task { + const ALIGNMENT: usize = std::mem::align_of::<TaskInner>(); +} + +unsafe impl Sync for Task {} +unsafe impl Send for Task {} + +impl Task { + fn new(task: TaskInner) -> Self { + Self(rclite::Rc::new(task)) + } + + #[must_use] + fn try_unblock(self, token: &mut BlockedUsageCountToken) -> Option<Task> { + let did_unblock = self + .blocked_usage_count + .with_borrow_mut(token, |counter_with_status| { + let c = counter_with_status.count(); + counter_with_status.set_count(c.checked_sub(1).unwrap()); + c == 1 + }); + did_unblock.then_some(self) + } + + fn force_unblock(&self, blocked_count: u32, token: &mut BlockedUsageCountToken) { + self.blocked_usage_count + .with_borrow_mut(token, |counter_with_status| { + let c = counter_with_status.count(); + assert_eq!(c, blocked_count); + counter_with_status.set_count(0); + }); + } +} + +impl std::ops::Deref for Task { + type Target = TaskInner; + fn deref(&self) -> &Self::Target { + &self.0 + } +} + const_assert_eq!(mem::size_of::<Task>(), 8); /// [`Token`] for [`UsageQueue`].
@@ -408,117 +464,425 @@ type UsageQueueToken = Token; const_assert_eq!(mem::size_of::(), 0); /// [`Token`] for [task](Task)'s [internal mutable data](`TaskInner::blocked_usage_count`). -type BlockedUsageCountToken = Token; +type BlockedUsageCountToken = Token; const_assert_eq!(mem::size_of::(), 0); +#[derive(Debug, PartialEq, Clone, Copy, Default)] +#[repr(u8)] +enum TaskStatus { + #[default] + Buffered, + Executed, + Unlocked, +} + +/* +impl TaskStatus { + const fn into_bits(self) -> u8 { + self as _ + } + const fn from_bits(value: u8) -> Self { + match value { + 0 => Self::Buffered, + 1 => Self::Executed, + _ => Self::Unlocked, + } + } +} +*/ + +//use bitfield_struct::bitfield; +//#[bitfield(u32)] +#[derive(Debug)] +struct CounterWithStatus { + //#[bits(2)] + status: TaskStatus, + //#[bits(30)] + count: u32, + pending_lock_contexts: HashSet>, +} + +impl CounterWithStatus { + fn new(pending_lock_contexts: HashSet>) -> Self { + Self { + status: TaskStatus::default(), + count: u32::default(), + pending_lock_contexts, + } + } + + fn status(&self) -> TaskStatus { + self.status + } + + fn set_count(&mut self, count: u32) { + self.count = count + } + + fn count(&self) -> u32 { + self.count + } + + fn set_status(&mut self, status: TaskStatus) { + self.status = status + } +} + +#[repr(C, packed)] +#[allow(clippy::type_complexity)] +struct PackedTaskInner { + index: TaskKey, + lock_context_and_transaction: Box<(Vec>, Box)>, +} +const_assert_eq!(mem::size_of::(), 24); + +#[derive(Debug)] +struct TransactionWrapper { + transaction: RuntimeTransaction, + context: TransactionContext, +} + +#[derive(Debug, Clone)] +pub enum TransactionContext { + BlockVerification, + BlockProduction(MaxAge), +} + +impl std::fmt::Debug for PackedTaskInner { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let index = self.index; + f.debug_struct("PackedTaskInner") + .field("index", &index) + .field("lock_contexts", &self.lock_context_and_transaction.0) + .field("transaction", &self.lock_context_and_transaction.1) + .finish() + } +} + /// Internal scheduling data about a particular task. #[derive(Debug)] pub struct TaskInner { - transaction: RuntimeTransaction, /// The index of a transaction in ledger entries; not used by SchedulingStateMachine by itself. /// Carrying this along with the transaction is needed to properly record the execution result /// of it. 
- index: usize, - lock_contexts: Vec, - blocked_usage_count: TokenCell, + packed_task_inner: PackedTaskInner, + blocked_usage_count: TokenCell, } impl TaskInner { - pub fn task_index(&self) -> usize { - self.index + pub fn task_index(&self) -> TaskKey { + self.index() } pub fn transaction(&self) -> &RuntimeTransaction { - &self.transaction + &self + .packed_task_inner + .lock_context_and_transaction + .1 + .transaction } - fn lock_contexts(&self) -> &[LockContext] { - &self.lock_contexts + pub fn context(&self) -> &TransactionContext { + &self + .packed_task_inner + .lock_context_and_transaction + .1 + .context + } + + pub fn index(&self) -> TaskKey { + self.packed_task_inner.index + } + + fn lock_contexts(&self) -> &[Compact] { + &self.packed_task_inner.lock_context_and_transaction.0 + } + + fn blocked_usage_count(&self, token: &mut BlockedUsageCountToken) -> u32 { + self.blocked_usage_count + .with_borrow_mut(token, |counter_with_status| counter_with_status.count()) + } + + fn has_blocked_usage(&self, token: &mut BlockedUsageCountToken) -> bool { + self.blocked_usage_count(token) > 0 } fn set_blocked_usage_count(&self, token: &mut BlockedUsageCountToken, count: ShortCounter) { self.blocked_usage_count - .with_borrow_mut(token, |usage_count| { - *usage_count = count; + .with_borrow_mut(token, |counter_with_status| { + counter_with_status.set_count(count.current()); }) } - #[must_use] - fn try_unblock(self: Task, token: &mut BlockedUsageCountToken) -> Option { - let did_unblock = self - .blocked_usage_count - .with_borrow_mut(token, |usage_count| usage_count.decrement_self().is_zero()); - did_unblock.then_some(self) + fn increment_blocked_usage_count(&self, token: &mut BlockedUsageCountToken) -> bool { + self.blocked_usage_count + .with_borrow_mut(token, |counter_with_status| { + let c = counter_with_status.count(); + counter_with_status.set_count(c.checked_add(1).unwrap()); + c == 0 + }) + } + + fn with_pending_mut( + &self, + token: &mut BlockedUsageCountToken, + f: impl FnOnce(&mut CounterWithStatus) -> R, + ) -> R { + self.blocked_usage_count.with_borrow_mut(token, f) + } + + fn mark_as_executed(&self, token: &mut BlockedUsageCountToken) { + self.blocked_usage_count + .with_borrow_mut(token, |counter_with_status| { + counter_with_status.set_status(TaskStatus::Executed); + }) + } + + /* + fn mark_as_buffered(&self, token: &mut BlockedUsageCountToken) { + self.blocked_usage_count + .with_borrow_mut(token, |counter_with_status| { + counter_with_status.set_status(TaskStatus::Buffered); + }) + } + */ + + fn mark_as_unlocked(&self, token: &mut BlockedUsageCountToken) { + self.blocked_usage_count + .with_borrow_mut(token, |counter_with_status| { + counter_with_status.set_status(TaskStatus::Unlocked); + }) + } + + fn is_buffered(&self, token: &mut BlockedUsageCountToken) -> bool { + self.blocked_usage_count + .with_borrow_mut(token, |counter_with_status| { + matches!(counter_with_status.status(), TaskStatus::Buffered) + }) + } + + fn is_executed(&self, token: &mut BlockedUsageCountToken) -> bool { + self.blocked_usage_count + .with_borrow_mut(token, |counter_with_status| { + matches!(counter_with_status.status(), TaskStatus::Executed) + }) + } + + fn is_unlocked(&self, token: &mut BlockedUsageCountToken) -> bool { + self.blocked_usage_count + .with_borrow_mut(token, |counter_with_status| { + matches!(counter_with_status.status(), TaskStatus::Unlocked) + }) + } + + fn status(&self, token: &mut BlockedUsageCountToken) -> TaskStatus { + self.blocked_usage_count + .with_borrow_mut(token, 
|counter_with_status| counter_with_status.status()) } } /// [`Task`]'s per-address context to lock a [usage_queue](UsageQueue) with [certain kind of /// request](RequestedUsage). -#[derive(Debug)] -struct LockContext { - usage_queue: UsageQueue, - requested_usage: RequestedUsage, +#[derive(Clone, Debug, EnumPtr)] +#[repr(C, usize)] +enum LockContext { + Readonly(UsageQueue), + Writable(UsageQueue), } const_assert_eq!(mem::size_of::(), 16); +const_assert_eq!(mem::size_of::>(), 8); + +impl std::ops::Deref for LockContext { + type Target = TokenCell; + fn deref(&self) -> &::Target { + &self.usage_queue().0 + } +} impl LockContext { fn new(usage_queue: UsageQueue, requested_usage: RequestedUsage) -> Self { - Self { - usage_queue, - requested_usage, + match requested_usage { + RequestedUsage::Readonly => Self::Readonly(usage_queue), + RequestedUsage::Writable => Self::Writable(usage_queue), } } + fn requested_usage2(&self) -> RequestedUsage { + match self { + Self::Readonly(_) => RequestedUsage::Readonly, + Self::Writable(_) => RequestedUsage::Writable, + } + } + + fn usage_from_task(&self, task: Task) -> UsageFromTask { + match self { + Self::Readonly(_) => UsageFromTask::Readonly(task), + Self::Writable(_) => UsageFromTask::Writable(task), + } + } + + fn usage_queue(&self) -> &UsageQueue { + match self { + Self::Readonly(u) | Self::Writable(u) => u, + } + } + + fn is_force_lockable(&self, usage_queue_token: &mut UsageQueueToken) -> bool { + self.with_usage_queue_mut(usage_queue_token, |u| { + u.is_force_lockable(self.requested_usage2()) + }) + } + + fn force_lock( + &self, + usage_queue_token: &mut UsageQueueToken, + new_task: Task, + count_token: &mut BlockedUsageCountToken, + blocked_task_count: &mut ShortCounter, + ) { + self.with_usage_queue_mut(usage_queue_token, |u| { + u.force_lock( + self.usage_queue(), + self.requested_usage2(), + new_task, + count_token, + blocked_task_count, + ) + }) + } + + fn increment_executing_count(&self, usage_queue_token: &mut UsageQueueToken) { + self.with_usage_queue_mut(usage_queue_token, |u| u.increment_executing_count()) + } + fn with_usage_queue_mut( &self, usage_queue_token: &mut UsageQueueToken, f: impl FnOnce(&mut UsageQueueInner) -> R, ) -> R { - self.usage_queue.0.with_borrow_mut(usage_queue_token, f) + self.usage_queue().0.with_borrow_mut(usage_queue_token, f) } } +use std::cmp::Reverse; + /// Status about how the [`UsageQueue`] is used currently. -#[derive(Copy, Clone, Debug)] +#[derive(Debug)] enum Usage { Readonly(ShortCounter), - Writable, -} -const_assert_eq!(mem::size_of::(), 8); - -impl From for Usage { - fn from(requested_usage: RequestedUsage) -> Self { - match requested_usage { - RequestedUsage::Readonly => Usage::Readonly(ShortCounter::one()), - RequestedUsage::Writable => Usage::Writable, - } - } + Writable(Task), } /// Status about how a task is requesting to use a particular [`UsageQueue`]. -#[derive(Clone, Copy, Debug)] +#[derive(Clone, Copy, Debug, PartialEq)] enum RequestedUsage { Readonly, Writable, } +//use std::collections::binary_heap::PeekMut; +use dary_heap::PeekMut; + /// Internal scheduling data about a particular address. 
/// /// Specifically, it holds the current [`Usage`] (or no usage at all, represented as `None`) and which /// [`Task`]s are blocked from executing until the current task is notified to be finished via /// [`::deschedule_task`](`SchedulingStateMachine::deschedule_task`) #[derive(Debug)] -struct UsageQueueInner { +pub struct UsageQueueInner { current_usage: Option<Usage>, - blocked_usages_from_tasks: VecDeque<UsageFromTask>, + executing_count: ShortCounter, + current_readonly_tasks: dary_heap::OctonaryHeap<Reverse<Task>>, + blocked_usages_from_tasks: dary_heap::OctonaryHeap<Compact<UsageFromTask>>, +} + +use enum_ptr::{Compact, EnumPtr}; + +#[repr(C, usize)] +#[derive(Debug, EnumPtr)] +enum UsageFromTask { + Readonly(Task), + Writable(Task), } +const_assert_eq!(mem::size_of::<UsageFromTask>(), 16); +const_assert_eq!(mem::size_of::<Compact<UsageFromTask>>(), 8); + +impl UsageFromTask { + fn index(&self) -> TaskKey { + match self { + Self::Readonly(t) => t.index(), + Self::Writable(t) => t.index(), + } + } + + fn usage(&self) -> RequestedUsage { + match self { + Self::Readonly(_t) => RequestedUsage::Readonly, + Self::Writable(_t) => RequestedUsage::Writable, + } + } -type UsageFromTask = (RequestedUsage, Task); + fn task(&self) -> &Task { + match self { + Self::Readonly(t) | Self::Writable(t) => t, + } + } +} + +impl From<(RequestedUsage, Task)> for UsageFromTask { + fn from((usage, task): (RequestedUsage, Task)) -> Self { + match usage { + RequestedUsage::Readonly => Self::Readonly(task), + RequestedUsage::Writable => Self::Writable(task), + } + } +} + +impl Ord for Task { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + other.index().cmp(&self.index()) + //self.index.cmp(&other.index) + } +} + +impl PartialOrd for Task { + fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> { + Some(self.cmp(other)) + } +} + +impl Eq for Task {} +impl PartialEq for Task { + fn eq(&self, other: &Self) -> bool { + self.index() == other.index() + } +} + +impl Ord for UsageFromTask { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + other.index().cmp(&self.index()) + //self.index().cmp(&other.index()) + } +} + +impl PartialOrd for UsageFromTask { + fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> { + Some(self.cmp(other)) + } +} + +impl Eq for UsageFromTask {} +impl PartialEq for UsageFromTask { + fn eq(&self, other: &Self) -> bool { + self.index() == other.index() + } +} impl Default for UsageQueueInner { fn default() -> Self { Self { current_usage: None, + executing_count: ShortCounter::zero(), // Capacity should be configurable to create with large capacity like 1024 inside the // (multi-threaded) closures passed to create_task(). In this way, reallocs in the // scheduler thread can be avoided. Also, this configurability is desired for @@ -528,75 +892,246 @@ impl Default for UsageQueueInner { // // Note that large cap should be accompanied with proper scheduler cleaning after use, // which should be handled by higher layers (i.e. scheduler pool).
- blocked_usages_from_tasks: VecDeque::with_capacity(128), + current_readonly_tasks: dary_heap::OctonaryHeap::with_capacity(128), + blocked_usages_from_tasks: dary_heap::OctonaryHeap::with_capacity(128), } } } impl UsageQueueInner { - fn try_lock(&mut self, requested_usage: RequestedUsage) -> LockResult { - match self.current_usage { - None => Some(Usage::from(requested_usage)), + fn try_lock(&mut self, requested_usage: RequestedUsage, task: &Task) -> LockResult { + match &mut self.current_usage { + None => { + match requested_usage { + RequestedUsage::Readonly => { + self.current_usage = Some(Usage::Readonly(ShortCounter::one())); + self.current_readonly_tasks.push(Reverse(task.clone())); + } + RequestedUsage::Writable => { + self.current_usage = Some(Usage::Writable(task.clone())); + } + } + Ok(()) + } Some(Usage::Readonly(count)) => match requested_usage { - RequestedUsage::Readonly => Some(Usage::Readonly(count.increment())), - RequestedUsage::Writable => None, + RequestedUsage::Readonly => { + //dbg!(&self.current_readonly_tasks.keys()); + self.current_readonly_tasks.push(Reverse(task.clone())); + count.increment_self(); + Ok(()) + } + RequestedUsage::Writable => Err(()), }, - Some(Usage::Writable) => None, + Some(Usage::Writable(_current_task)) => Err(()), } - .inspect(|&new_usage| { - self.current_usage = Some(new_usage); - }) - .map(|_| ()) - .ok_or(()) } - #[must_use] - fn unlock(&mut self, requested_usage: RequestedUsage) -> Option { - let mut is_unused_now = false; + fn is_force_lockable(&self, requested_usage: RequestedUsage) -> bool { + match &self.current_usage { + None => { + unreachable!(); + } + Some(Usage::Readonly(_count)) => match requested_usage { + RequestedUsage::Readonly => true, + RequestedUsage::Writable => self.executing_count.is_zero(), + }, + Some(Usage::Writable(_current_task)) => self.executing_count.is_zero(), + } + } + + fn force_lock( + &mut self, + u: &UsageQueue, + requested_usage: RequestedUsage, + new_task: Task, + count_token: &mut BlockedUsageCountToken, + blocked_task_count: &mut ShortCounter, + ) { match &mut self.current_usage { - Some(Usage::Readonly(ref mut count)) => match requested_usage { + None => { + unreachable!(); + } + Some(Usage::Readonly(count)) => match requested_usage { RequestedUsage::Readonly => { - if count.is_one() { - is_unused_now = true; - } else { - count.decrement_self(); + self.current_readonly_tasks.push(Reverse(new_task)); + count.increment_self(); + } + RequestedUsage::Writable => { + let cc = count.current(); + let mut c = ShortCounter::zero(); + while let Some(Reverse(reblocked_task)) = self.current_readonly_tasks.pop() { + assert!(!reblocked_task.is_executed(count_token)); + if reblocked_task.is_unlocked(count_token) { + continue; + } + + if reblocked_task.increment_blocked_usage_count(count_token) { + blocked_task_count.increment_self(); + } + reblocked_task.with_pending_mut(count_token, |c| { + c.pending_lock_contexts + .insert(ByAddress(LockContext::new( + u.clone(), + RequestedUsage::Readonly, + ))) + .then_some(()) + .or_else(|| panic!()); + }); + self.insert_blocked_usage_from_task(UsageFromTask::Readonly( + reblocked_task, + )); + c.increment_self(); + //self.reblocked_lock_total.increment_self(); } + assert_eq!(c.current(), cc); + self.current_usage = Some(Usage::Writable(new_task)); } - RequestedUsage::Writable => unreachable!(), }, - Some(Usage::Writable) => match requested_usage { + Some(Usage::Writable(current_task)) => match requested_usage { + RequestedUsage::Readonly => { + let old_usage = 
std::mem::replace( + self.current_usage.as_mut().unwrap(), + Usage::Readonly(ShortCounter::one()), + ); + let Usage::Writable(reblocked_task) = old_usage else { + panic!() + }; + if reblocked_task.increment_blocked_usage_count(count_token) { + blocked_task_count.increment_self(); + } + reblocked_task.with_pending_mut(count_token, |c| { + c.pending_lock_contexts + .insert(ByAddress(LockContext::new( + u.clone(), + RequestedUsage::Writable, + ))) + .then_some(()) + .or_else(|| panic!()); + }); + assert!(self.current_readonly_tasks.is_empty()); + self.current_readonly_tasks.push(Reverse(new_task.clone())); + self.insert_blocked_usage_from_task(UsageFromTask::Writable(reblocked_task)); + } RequestedUsage::Writable => { - is_unused_now = true; + assert_ne!(new_task.index(), current_task.index()); + let reblocked_task = std::mem::replace(current_task, new_task); + if reblocked_task.increment_blocked_usage_count(count_token) { + blocked_task_count.increment_self(); + } + reblocked_task.with_pending_mut(count_token, |c| { + c.pending_lock_contexts + .insert(ByAddress(LockContext::new( + u.clone(), + RequestedUsage::Writable, + ))) + .then_some(()) + .or_else(|| panic!()); + }); + self.insert_blocked_usage_from_task(UsageFromTask::Writable(reblocked_task)); + //self.reblocked_lock_total.increment_self(); + } + }, + } + } + + fn increment_executing_count(&mut self) { + self.executing_count.increment_self(); + } + + #[must_use] + fn unlock( + &mut self, + unlocked_task_context: &LockContext, + unlocked_task_index: TaskKey, + token: &mut BlockedUsageCountToken, + ) -> Option<UsageFromTask> { + self.executing_count.decrement_self(); + let mut is_unused_now = false; match &mut self.current_usage { - Some(Usage::Readonly(ref mut count)) => match requested_usage { + Some(Usage::Readonly(count)) => match unlocked_task_context { + LockContext::Readonly(_) => { + count.decrement_self(); + // TODO: test this for unbounded growth under an infinite stream of read-only locks.
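Editor's note: the loop that follows prunes already-unlocked readers lazily, i.e. stale entries are discarded at peek time rather than being removed from `current_readonly_tasks` eagerly at unlock. A standalone sketch of that peek-and-pop pruning pattern using std's `BinaryHeap`, which shares `dary_heap`'s max-heap API (the odd numbers stand in for stale, already-unlocked entries):

use std::collections::{binary_heap::PeekMut, BinaryHeap};

fn main() {
    let mut heap: BinaryHeap<u32> = [1, 4, 7, 9].into_iter().collect();
    // Prune stale entries from the top of the heap; stop at the first live one.
    while let Some(top) = heap.peek_mut() {
        if *top % 2 == 1 {
            PeekMut::pop(top); // stale: discard and keep scanning
        } else {
            break; // live entry stays at the top
        }
    }
    assert_eq!(heap.peek(), Some(&4)); // 9 and 7 were lazily dropped
}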
+ //dbg!(self.current_readonly_tasks.len()); + while let Some(peeked_task) = self.current_readonly_tasks.peek_mut() { + if peeked_task.0.is_unlocked(token) { + PeekMut::pop(peeked_task); + } else { + break; + } + } + if count.is_zero() { + assert_eq!( + ( + self.current_readonly_tasks.is_empty(), + self.executing_count.current() + ), + (true, 0) + ); + is_unused_now = true; + } + //dbg!(is_unused_now); } - RequestedUsage::Readonly => unreachable!(), + LockContext::Writable(_) => unreachable!(), }, + Some(Usage::Writable(blocking_task)) => { + assert_eq!( + ( + unlocked_task_index, + unlocked_task_context.requested_usage2(), + self.executing_count.current() + ), + (blocking_task.index(), RequestedUsage::Writable, 0) + ); + is_unused_now = true; + } None => unreachable!(), } if is_unused_now { self.current_usage = None; - self.blocked_usages_from_tasks.pop_front() + while let Some(task) = self.blocked_usages_from_tasks.pop() { + if !task.map_ref(|t| t.task().is_buffered(token)) { + continue; + } + return Some(task.into()); + } + None } else { None } } - fn push_blocked_usage_from_task(&mut self, usage_from_task: UsageFromTask) { - assert_matches!(self.current_usage, Some(_)); - self.blocked_usages_from_tasks.push_back(usage_from_task); + fn insert_blocked_usage_from_task(&mut self, uft: UsageFromTask) { + self.blocked_usages_from_tasks.push(uft.into()); + } + + fn first_blocked_task_index(&self) -> Option { + self.blocked_usages_from_tasks + .peek() + .map(|uft| uft.map_ref(|u| u.index())) } #[must_use] - fn pop_unblocked_readonly_usage_from_task(&mut self) -> Option { - if matches!( - self.blocked_usages_from_tasks.front(), - Some((RequestedUsage::Readonly, _)) - ) { - assert_matches!(self.current_usage, Some(Usage::Readonly(_))); - self.blocked_usages_from_tasks.pop_front() - } else { - None + fn pop_buffered_readonly_usage_from_task( + &mut self, + token: &mut BlockedUsageCountToken, + ) -> Option { + while let Some(peeked_task) = self.blocked_usages_from_tasks.peek_mut() { + if !peeked_task.map_ref(|uft| uft.task().is_buffered(token)) { + PeekMut::pop(peeked_task); + continue; + } + if matches!( + peeked_task.map_ref(|uft| uft.usage()), + RequestedUsage::Readonly + ) { + return Some(PeekMut::pop(peeked_task).into()); + } else { + break; + } } + None } fn has_no_blocked_usage(&self) -> bool { @@ -604,7 +1139,7 @@ impl UsageQueueInner { } } -const_assert_eq!(mem::size_of::>(), 40); +//const_assert_eq!(mem::size_of::>(), 40); /// Scheduler's internal data for each address ([`Pubkey`](`solana_sdk::pubkey::Pubkey`)). Very /// opaque wrapper type; no methods just with [`::clone()`](Clone::clone) and @@ -613,46 +1148,186 @@ const_assert_eq!(mem::size_of::>(), 40); pub struct UsageQueue(Arc>); const_assert_eq!(mem::size_of::(), 8); +unsafe impl enum_ptr::Aligned for UsageQueue { + const ALIGNMENT: usize = std::mem::align_of::>(); +} + /// A high-level `struct`, managing the overall scheduling of [tasks](Task), to be used by /// `solana-unified-scheduler-pool`. 
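Editor's note: the state machine defined next replaces the original FIFO `VecDeque`s with 8-ary heaps. `dary_heap::OctonaryHeap` is a max-heap, so the `Ord` impls above compare indices in reverse (`other.index().cmp(&self.index())`), making `pop()` yield the lowest task index, i.e. the highest priority. A sketch of the same effect with std types (the patch bakes the reversal into `Ord` instead of wrapping every element):

use std::{cmp::Reverse, collections::BinaryHeap};

fn main() {
    // BinaryHeap is a max-heap, like OctonaryHeap; Reverse flips it into a
    // min-heap over task indices, so the oldest (lowest) index pops first.
    let mut blocked: BinaryHeap<Reverse<u32>> = BinaryHeap::new();
    for task_index in [104u32, 99, 102] {
        blocked.push(Reverse(task_index));
    }
    assert_eq!(blocked.pop(), Some(Reverse(99))); // priority order,
    assert_eq!(blocked.pop(), Some(Reverse(102))); // not insertion order
}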
+#[derive(Debug)] pub struct SchedulingStateMachine { - unblocked_task_queue: VecDeque, - active_task_count: ShortCounter, - handled_task_count: ShortCounter, - unblocked_task_count: ShortCounter, - total_task_count: ShortCounter, + buffered_task_queue: dary_heap::OctonaryHeap, + alive_tasks: BTreeSet, + alive_task_count: ShortCounter, + executing_task_count: ShortCounter, + max_executing_task_count: u32, + executed_task_total: ShortCounter, + buffered_task_total: ShortCounter, + blocked_task_count: ShortCounter, + reblocked_lock_total: ShortCounter, + eager_lock_total: ShortCounter, + task_total: ShortCounter, count_token: BlockedUsageCountToken, usage_queue_token: UsageQueueToken, + scheduling_mode: SchedulingMode, + last_scan_task: Option, +} + +#[cfg(test)] +impl Drop for SchedulingStateMachine { + fn drop(&mut self) { + if !std::thread::panicking() { + self.reinitialize_for_test(); + } + } } -const_assert_eq!(mem::size_of::(), 48); +//const_assert_eq!(mem::size_of::(), 56); impl SchedulingStateMachine { - pub fn has_no_active_task(&self) -> bool { - self.active_task_count.is_zero() + pub fn mode(&self) -> SchedulingMode { + self.scheduling_mode } - pub fn has_unblocked_task(&self) -> bool { - !self.unblocked_task_queue.is_empty() + pub fn has_no_alive_task(&self) -> bool { + self.alive_task_count.is_zero() } - pub fn unblocked_task_queue_count(&self) -> usize { - self.unblocked_task_queue.len() + pub fn has_buffered_task(&mut self) -> bool { + while let Some(task) = self.buffered_task_queue.peek_mut() { + let status = task.status(&mut self.count_token); + if task.has_blocked_usage(&mut self.count_token) + || status == TaskStatus::Executed + || status == TaskStatus::Unlocked + { + PeekMut::pop(task); + continue; + } else { + return true; + } + } + false } - pub fn active_task_count(&self) -> u32 { - self.active_task_count.current() + pub fn tick_eager_scan(&mut self) -> Option { + match self.mode() { + SchedulingMode::BlockVerification => {} + SchedulingMode::BlockProduction => { + if !self.is_task_runnable() { + return None; + } + + let last_scan_task = self.last_scan_task.take(); + let highest_task = self.alive_tasks.last()?; + + let mut task_iter = if let Some(last_scan_task) = last_scan_task { + self.alive_tasks.range(..last_scan_task).rev() + } else { + self.alive_tasks.range(..=highest_task).rev() + }; + let mut task; + let mut start_task = None; + let mut scanned_task_count = ShortCounter::zero(); + loop { + task = match task_iter.next() { + Some(task) => task, + None => { + // eager can cycle count + task_iter = self.alive_tasks.range(..).rev(); + continue; + } + }; + if &task == start_task.get_or_insert(task) && !scanned_task_count.is_zero() { + break; + } + scanned_task_count.increment_self(); + if scanned_task_count.current() == 200 { + break; + } + //dbg!(("hey", scanned_task_count, self.alive_tasks.len(), task.index(), start_task.map(|t| t.index()))); + + if !task.is_buffered(&mut self.count_token) { + continue; + } + + let force_lockable: bool = task.with_pending_mut(&mut self.count_token, |c| { + if c.pending_lock_contexts.is_empty() { + false + } else { + c.pending_lock_contexts.iter().all(|pending_lock_context| { + pending_lock_context.is_force_lockable(&mut self.usage_queue_token) + }) + } + }); + if force_lockable { + let p = task.with_pending_mut(&mut self.count_token, |c| { + std::mem::take(&mut c.pending_lock_contexts) + }); + let blocked_count = p.len(); + p.into_iter().for_each(|pending_lock_context| { + pending_lock_context.force_lock( + &mut 
self.usage_queue_token, + task.clone(), + &mut self.count_token, + &mut self.blocked_task_count, + ) + }); + task.force_unblock(blocked_count as u32, &mut self.count_token); + self.blocked_task_count.decrement_self(); + self.eager_lock_total.increment_self(); + return Some(task.clone()); + } + //dbg!((task.index(), lockable)); + //panic!("aaa"); + } + self.last_scan_task = Some(task.clone()); + } + } + + None + } + + pub fn has_runnable_task(&mut self) -> bool { + self.is_task_runnable() && self.has_buffered_task() + } + + pub fn has_no_executing_task(&self) -> bool { + self.executing_task_count.current() == 0 + } + + pub fn is_task_runnable(&self) -> bool { + self.executing_task_count.current() < self.max_executing_task_count + } + + pub fn buffered_task_queue_count(&self) -> usize { + self.buffered_task_queue.len() + } + + pub fn alive_task_count(&self) -> u32 { + self.alive_task_count.current() } - pub fn handled_task_count(&self) -> u32 { - self.handled_task_count.current() + pub fn executed_task_total(&self) -> u32 { + self.executed_task_total.current() } - pub fn unblocked_task_count(&self) -> u32 { - self.unblocked_task_count.current() + pub fn buffered_task_total(&self) -> u32 { + self.buffered_task_total.current() } - pub fn total_task_count(&self) -> u32 { - self.total_task_count.current() + pub fn blocked_task_count(&self) -> u32 { + self.blocked_task_count.current() + } + + pub fn reblocked_lock_total(&self) -> u32 { + self.reblocked_lock_total.current() + } + + pub fn eager_lock_total(&self) -> u32 { + self.eager_lock_total.current() + } + + pub fn task_total(&self) -> u32 { + self.task_total.current() } /// Schedules given `task`, returning it if successful. @@ -663,15 +1338,90 @@ impl SchedulingStateMachine { /// Note that this function takes ownership of the task to allow for future optimizations. 
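Editor's note: the struct fields above also introduce admission control that the original state machine lacked. `is_task_runnable()` compares `executing_task_count` against `max_executing_task_count`, and `do_schedule_task()` buffers rather than runs anything over the cap. A distilled, runnable model of just that throttle (illustrative names, not the patch's types):

struct Throttle {
    executing: u32,
    max_executing: u32,
}

impl Throttle {
    fn is_task_runnable(&self) -> bool {
        self.executing < self.max_executing
    }
    fn on_schedule(&mut self) {
        assert!(self.is_task_runnable());
        self.executing += 1; // a task starts executing
    }
    fn on_deschedule(&mut self) {
        self.executing -= 1; // a task finished; capacity freed
    }
}

fn main() {
    let mut throttle = Throttle { executing: 0, max_executing: 2 };
    throttle.on_schedule();
    throttle.on_schedule();
    assert!(!throttle.is_task_runnable()); // cap hit: new tasks get buffered
    throttle.on_deschedule();
    assert!(throttle.is_task_runnable()); // capacity available again
}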
#[must_use] pub fn schedule_task(&mut self, task: Task) -> Option { - self.total_task_count.increment_self(); - self.active_task_count.increment_self(); - self.try_lock_usage_queues(task) + self.do_schedule_task(task, false) } + pub fn do_schedule_task(&mut self, task: Task, force_buffer_mode: bool) -> Option { + self.task_total.increment_self(); + self.alive_task_count.increment_self(); + self.alive_tasks + .insert(task.clone()) + .then_some(()) + .or_else(|| panic!()); + task.with_pending_mut(&mut self.count_token, |c| { + assert_eq!(task.lock_contexts().len(), c.pending_lock_contexts.len()); + }); + self.try_lock_usage_queues(task).and_then(|task| { + if self.is_task_runnable() && !force_buffer_mode { + self.executing_task_count.increment_self(); + task.with_pending_mut(&mut self.count_token, |c| { + assert_eq!(c.count as usize, c.pending_lock_contexts.len()); + assert!(c.pending_lock_contexts.is_empty()); + }); + task.mark_as_executed(&mut self.count_token); + for context in task.lock_contexts() { + context.map_ref(|context| { + context.increment_executing_count(&mut self.usage_queue_token) + }) + } + + Some(task) + } else { + self.buffered_task_total.increment_self(); + self.buffered_task_queue.push(task); + None + } + }) + } + + /* + pub fn rebuffer_executing_task(&mut self, task: Task) { + self.executing_task_count.decrement_self(); + self.buffered_task_total.increment_self(); + task.mark_as_buffered(&mut self.count_token); + self.buffered_task_queue.push(task); + } + */ + #[must_use] - pub fn schedule_next_unblocked_task(&mut self) -> Option { - self.unblocked_task_queue.pop_front().inspect(|_| { - self.unblocked_task_count.increment_self(); + pub fn schedule_next_buffered_task(&mut self) -> Option { + while let Some(task) = self.buffered_task_queue.pop() { + if task.has_blocked_usage(&mut self.count_token) + || !task.is_buffered(&mut self.count_token) + { + continue; + } else { + self.executing_task_count.increment_self(); + task.with_pending_mut(&mut self.count_token, |c| { + assert_eq!(c.count as usize, c.pending_lock_contexts.len()); + assert!(c.pending_lock_contexts.is_empty()); + }); + task.mark_as_executed(&mut self.count_token); + for context in task.lock_contexts() { + context.map_ref(|context| { + context.increment_executing_count(&mut self.usage_queue_token) + }) + } + return Some(task); + } + } + None + } + + #[must_use] + pub fn scan_and_schedule_next_task(&mut self) -> Option { + self.tick_eager_scan().inspect(|task| { + self.executing_task_count.increment_self(); + task.with_pending_mut(&mut self.count_token, |c| { + assert_eq!(c.count as usize, c.pending_lock_contexts.len()); + assert!(c.pending_lock_contexts.is_empty()); + }); + task.mark_as_executed(&mut self.count_token); + for context in task.lock_contexts() { + context.map_ref(|context| { + context.increment_executing_count(&mut self.usage_queue_token) + }) + } }) } @@ -686,64 +1436,290 @@ impl SchedulingStateMachine { /// tasks inside `SchedulingStateMachine` to provide an offloading-based optimization /// opportunity for callers. 
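Editor's note: `deschedule_task()` below is what drives unblocking. Each lock released decrements the blocked task's per-task usage counter, and the task becomes runnable exactly when that counter reaches zero. A distilled model of the rule (the real `try_unblock()` additionally consumes the `Task` handle and returns `Option<Task>`):

// A task blocked on N usage queues becomes runnable after N releases.
struct Blocked {
    count: u32,
}

impl Blocked {
    fn try_unblock(&mut self) -> bool {
        self.count -= 1;
        self.count == 0
    }
}

fn main() {
    // Task blocked on two usage queues; releasing both makes it runnable.
    let mut blocked = Blocked { count: 2 };
    assert!(!blocked.try_unblock()); // first lock released: still blocked
    assert!(blocked.try_unblock()); // second released: now buffered as runnable
}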
pub fn deschedule_task(&mut self, task: &Task) { - self.active_task_count.decrement_self(); - self.handled_task_count.increment_self(); + task.mark_as_unlocked(&mut self.count_token); + self.executing_task_count.decrement_self(); + self.alive_task_count.decrement_self(); + self.alive_tasks + .remove(task) + .then_some(()) + .or_else(|| panic!()); + self.executed_task_total.increment_self(); self.unlock_usage_queues(task); + if self.blocked_task_count() > 0 { + assert_gt!( + self.alive_task_count(), + self.blocked_task_count(), + "no deadlock" + ); + } + } + + fn try_reblock_task( + blocking_task: &Task, + blocked_task_count: &mut ShortCounter, + token: &mut BlockedUsageCountToken, + ) -> bool { + if blocking_task.has_blocked_usage(token) { + // <= this is merged into is_buffered()? + // and how about doing incrementing this???: blocked_task_count.increment_self(); + true + } else if blocking_task.is_buffered(token) { + blocked_task_count.increment_self(); + true + } else { + // don't reblock if no blocked usage and not buffered + false + } } #[must_use] - fn try_lock_usage_queues(&mut self, task: Task) -> Option { + fn try_lock_usage_queues(&mut self, new_task: Task) -> Option { let mut blocked_usage_count = ShortCounter::zero(); - for context in task.lock_contexts() { - context.with_usage_queue_mut(&mut self.usage_queue_token, |usage_queue| { - let lock_result = if usage_queue.has_no_blocked_usage() { - usage_queue.try_lock(context.requested_usage) - } else { - LockResult::Err(()) - }; - if let Err(()) = lock_result { - blocked_usage_count.increment_self(); - let usage_from_task = (context.requested_usage, task.clone()); - usage_queue.push_blocked_usage_from_task(usage_from_task); - } + for context in new_task.lock_contexts() { + context.map_ref(|context| { + let u = context.usage_queue(); + context.with_usage_queue_mut(&mut self.usage_queue_token, |usage_queue| { + let lock_result = (match usage_queue.current_usage.as_mut() { + Some(mut current_usage) => { + match (&mut current_usage, context.requested_usage2()) { + (Usage::Writable(blocking_task), RequestedUsage::Writable) => { + if new_task.index() < blocking_task.index() + && Self::try_reblock_task( + blocking_task, + &mut self.blocked_task_count, + &mut self.count_token, + ) + { + let old_usage = std::mem::replace( + current_usage, + Usage::Writable(new_task.clone()), + ); + let Usage::Writable(reblocked_task) = old_usage else { + panic!() + }; + reblocked_task + .increment_blocked_usage_count(&mut self.count_token); + reblocked_task.with_pending_mut( + &mut self.count_token, + |c| { + c.pending_lock_contexts + .insert(ByAddress(LockContext::new( + u.clone(), + RequestedUsage::Writable, + ))) + .then_some(()) + .or_else(|| panic!()); + }, + ); + usage_queue.insert_blocked_usage_from_task( + UsageFromTask::Writable(reblocked_task), + ); + self.reblocked_lock_total.increment_self(); + Some(Ok(())) + } else { + None + } + } + (Usage::Writable(blocking_task), RequestedUsage::Readonly) => { + if new_task.index() < blocking_task.index() + && Self::try_reblock_task( + blocking_task, + &mut self.blocked_task_count, + &mut self.count_token, + ) + { + let old_usage = std::mem::replace( + current_usage, + Usage::Readonly(ShortCounter::one()), + ); + let Usage::Writable(reblocked_task) = old_usage else { + panic!() + }; + reblocked_task + .increment_blocked_usage_count(&mut self.count_token); + reblocked_task.with_pending_mut( + &mut self.count_token, + |c| { + c.pending_lock_contexts + .insert(ByAddress(LockContext::new( + u.clone(), + 
RequestedUsage::Writable, + ))) + .then_some(()) + .or_else(|| panic!()); + }, + ); + assert!(usage_queue.current_readonly_tasks.is_empty()); + usage_queue + .current_readonly_tasks + .push(Reverse(new_task.clone())); + usage_queue.insert_blocked_usage_from_task( + UsageFromTask::Writable(reblocked_task), + ); + self.reblocked_lock_total.increment_self(); + Some(Ok(())) + } else { + None + } + } + (Usage::Readonly(_count), RequestedUsage::Readonly) => { + let first_blocked_task_index = + usage_queue.first_blocked_task_index(); + if let Some(first_blocked_task_index) = first_blocked_task_index + { + if new_task.index() < first_blocked_task_index { + usage_queue + .try_lock(context.requested_usage2(), &new_task) + .unwrap(); + Some(Ok(())) + // even the following passes the unit tests... think about this + /* + if usage_queue.has_no_blocked_usage() { + usage_queue.try_lock(context.requested_usage, &new_task) + } else { + Err(()) + } + */ + } else { + None + } + } else { + None + } + } + (Usage::Readonly(count), RequestedUsage::Writable) => { + let mut reblocked_tasks = vec![]; + while let Some(blocking_task) = + usage_queue.current_readonly_tasks.peek_mut() + { + let index = blocking_task.0 .0.index(); + if new_task.index() < index + || blocking_task.0.is_unlocked(&mut self.count_token) + { + let blocking_task = PeekMut::pop(blocking_task).0; + + if Self::try_reblock_task( + &blocking_task, + &mut self.blocked_task_count, + &mut self.count_token, + ) { + count.decrement_self(); + reblocked_tasks.push(blocking_task); + } + } else { + break; + } + } + if !reblocked_tasks.is_empty() { + let lock_result = if count.is_zero() { + *current_usage = Usage::Writable(new_task.clone()); + Ok(()) + } else { + Err(()) + }; + for reblocked_task in reblocked_tasks { + reblocked_task.increment_blocked_usage_count( + &mut self.count_token, + ); + reblocked_task.with_pending_mut( + &mut self.count_token, + |c| { + c.pending_lock_contexts + .insert(ByAddress(LockContext::new( + u.clone(), + RequestedUsage::Readonly, + ))) + .then_some(()) + .or_else(|| panic!()); + }, + ); + usage_queue.insert_blocked_usage_from_task( + UsageFromTask::Readonly(reblocked_task), + ); + self.reblocked_lock_total.increment_self(); + } + Some(lock_result) + } else { + None + } + } + } + } + _ => None, + }) + .unwrap_or_else(|| { + if usage_queue.has_no_blocked_usage() { + usage_queue.try_lock(context.requested_usage2(), &new_task) + } else { + Err(()) + } + }); + + if let Err(()) = lock_result { + blocked_usage_count.increment_self(); + let usage_from_task = context.usage_from_task(new_task.clone()); + usage_queue.insert_blocked_usage_from_task(usage_from_task); + } else { + new_task.with_pending_mut(&mut self.count_token, |c| { + c.pending_lock_contexts + .remove(ByAddress::from_ref(context)) + .then_some(()) + .or_else(|| panic!()); + }); + } + }); }); } // no blocked usage count means success if blocked_usage_count.is_zero() { - Some(task) + Some(new_task) } else { - task.set_blocked_usage_count(&mut self.count_token, blocked_usage_count); + self.blocked_task_count.increment_self(); + new_task.set_blocked_usage_count(&mut self.count_token, blocked_usage_count); None } } fn unlock_usage_queues(&mut self, task: &Task) { for context in task.lock_contexts() { + context.map_ref(|context| { context.with_usage_queue_mut(&mut self.usage_queue_token, |usage_queue| { - let mut unblocked_task_from_queue = usage_queue.unlock(context.requested_usage); + let mut buffered_task_from_queue = + usage_queue.unlock(context, task.index(), &mut 
self.count_token); - while let Some((requested_usage, task_with_unblocked_queue)) = - unblocked_task_from_queue - { + while let Some(buffered_task_from_queue2) = buffered_task_from_queue { // When `try_unblock()` returns `None` as a failure of unblocking this time, // this means the task is still blocked by other active tasks' usages. So, - // don't push task into unblocked_task_queue yet. It can be assumed that every + // don't push task into buffered_task_queue yet. It can be assumed that every // task will eventually succeed to be unblocked, and enter in this condition // clause as long as `SchedulingStateMachine` is used correctly. - if let Some(task) = task_with_unblocked_queue.try_unblock(&mut self.count_token) + if let Some(task) = buffered_task_from_queue2.task() + .clone() + .try_unblock(&mut self.count_token) { - self.unblocked_task_queue.push_back(task); + self.blocked_task_count.decrement_self(); + self.buffered_task_total.increment_self(); + self.buffered_task_queue.push(task); } - match usage_queue.try_lock(requested_usage) { + match usage_queue.try_lock( + buffered_task_from_queue2.usage(), + buffered_task_from_queue2.task(), /* was `task` and had a bug; TODO: add a regression test */ + ) { LockResult::Ok(()) => { + assert_ne!(task.index(), buffered_task_from_queue2.task().index()); + buffered_task_from_queue2.task().with_pending_mut(&mut self.count_token, |c| { + c.pending_lock_contexts.remove(ByAddress::from_ref(context)).then_some(()).or_else(|| { + panic!("remove failed: {}", c.pending_lock_contexts.len()); + }); + }); // Try to further schedule blocked task for parallelism in the case of // readonly usages - unblocked_task_from_queue = - if matches!(requested_usage, RequestedUsage::Readonly) { - usage_queue.pop_unblocked_readonly_usage_from_task() + buffered_task_from_queue = + if matches!(buffered_task_from_queue2.usage(), RequestedUsage::Readonly) { + usage_queue.pop_buffered_readonly_usage_from_task(&mut self.count_token) } else { None }; @@ -752,6 +1728,7 @@ impl SchedulingStateMachine { } } }); + }); } } @@ -766,11 +1743,12 @@ /// Closure is used here to delegate the responsibility of primary ownership of `UsageQueue` /// (and caching/pruning if any) to the caller. `SchedulingStateMachine` guarantees that all /// shared ownership of `UsageQueue`s is released and UsageQueue state is identical to just - /// after created, if `has_no_active_task()` is `true`. Also note that this is desired for + /// after created, if `has_no_alive_task()` is `true`. Also note that this is desired for /// separation of concerns. - pub fn create_task( + pub fn do_create_task( transaction: RuntimeTransaction<SanitizedTransaction>, - index: usize, + context: TransactionContext, + index: TaskKey, usage_queue_loader: &mut impl FnMut(Pubkey) -> UsageQueue, ) -> Task { // It's crucial for tasks to be validated with @@ -804,29 +1782,73 @@ impl SchedulingStateMachine { // `Bank::prepare_unlocked_batch_from_single_tx()` as well. // This redundancy is known. It was just left as-is out of abundance // of caution.
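Editor's note: the `pending_lock_contexts` set populated in the code that follows is keyed with `by_address::ByAddress`, which compares and hashes by allocation address rather than by value, so two structurally equal `LockContext`s remain distinct keys. A std-only sketch of those semantics using a hand-rolled wrapper (illustrative; the `by_address` crate provides the same behavior generically):

use std::{
    collections::HashSet,
    hash::{Hash, Hasher},
    rc::Rc,
};

// Wrapper that compares/hashes by pointer identity, not by value.
struct ByAddr<T>(Rc<T>);

impl<T> PartialEq for ByAddr<T> {
    fn eq(&self, other: &Self) -> bool {
        Rc::ptr_eq(&self.0, &other.0)
    }
}
impl<T> Eq for ByAddr<T> {}
impl<T> Hash for ByAddr<T> {
    fn hash<H: Hasher>(&self, state: &mut H) {
        (Rc::as_ptr(&self.0) as usize).hash(state)
    }
}

fn main() {
    let a = Rc::new(42);
    let b = Rc::new(42); // equal value, different allocation
    let mut set = HashSet::new();
    set.insert(ByAddr(a.clone()));
    assert!(set.contains(&ByAddr(a))); // found: same allocation
    assert!(!set.contains(&ByAddr(b))); // not found: value equality is ignored
}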
+ let mut pending_lock_contexts = HashSet::new(); let lock_contexts = transaction .message() .account_keys() .iter() .enumerate() .map(|(index, address)| { - LockContext::new( - usage_queue_loader(*address), - if transaction.message().is_writable(index) { - RequestedUsage::Writable - } else { - RequestedUsage::Readonly - }, - ) + let u = usage_queue_loader(*address); + let ru = if transaction.message().is_writable(index) { + RequestedUsage::Writable + } else { + RequestedUsage::Readonly + }; + let lc1 = LockContext::new(u.clone(), ru); + let lc2 = LockContext::new(u, ru); + pending_lock_contexts.insert(ByAddress(lc1)); + lc2.into() }) .collect(); Task::new(TaskInner { + packed_task_inner: PackedTaskInner { + lock_context_and_transaction: Box::new(( + lock_contexts, + Box::new(TransactionWrapper { + transaction, + context, + }), + )), + index, + }, + blocked_usage_count: TokenCell::new(CounterWithStatus::new(pending_lock_contexts)), + }) + } + + pub fn create_task( + transaction: RuntimeTransaction, + index: TaskKey, + usage_queue_loader: &mut impl FnMut(Pubkey) -> UsageQueue, + ) -> Task { + Self::do_create_task( transaction, + TransactionContext::BlockVerification, index, - lock_contexts, - blocked_usage_count: TokenCell::new(ShortCounter::zero()), - }) + usage_queue_loader, + ) + } + + pub fn reset_task(&mut self, task: &Task) { + task.with_pending_mut(&mut self.count_token, |c| { + //dbg!(&c); + assert!(c.pending_lock_contexts.is_empty()); + assert_matches!(c.status, TaskStatus::Unlocked); + c.status = TaskStatus::default(); + for context in task.lock_contexts() { + c.pending_lock_contexts + .insert(ByAddress(context.clone().into())); + } + }); + } + + pub fn reset_task_total(&mut self) { + self.task_total.reset_to_zero(); + } + + pub fn reset_executed_task_total(&mut self) { + self.executed_task_total.reset_to_zero(); } /// Rewind the inactive state machine to be initialized @@ -839,24 +1861,45 @@ impl SchedulingStateMachine { /// [constructor](SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling) /// as much as possible) and its (possibly cached) associated [`UsageQueue`]s for processing /// other slots. - pub fn reinitialize(&mut self) { - assert!(self.has_no_active_task()); - assert_eq!(self.unblocked_task_queue.len(), 0); + pub fn reinitialize(&mut self, mode: SchedulingMode) { + assert!(self.has_no_alive_task()); + assert_eq!(self.executing_task_count.current(), 0); + assert_eq!(self.buffered_task_queue.len(), 0); + assert_eq!(self.alive_tasks.len(), 0); + assert_eq!(self.blocked_task_count(), 0); + + self.reset_task_total(); + self.reset_executed_task_total(); // nice trick to ensure all fields are handled here if new one is added. let Self { - unblocked_task_queue: _, - active_task_count, - handled_task_count, - unblocked_task_count, - total_task_count, + buffered_task_queue: _, + alive_tasks: _, + alive_task_count, + executing_task_count, + max_executing_task_count: _, + executed_task_total: _, + buffered_task_total, + blocked_task_count: _, + reblocked_lock_total, + eager_lock_total, + task_total: _, count_token: _, usage_queue_token: _, + scheduling_mode, + last_scan_task, // don't add ".." 
here } = self; - active_task_count.reset_to_zero(); - handled_task_count.reset_to_zero(); - unblocked_task_count.reset_to_zero(); - total_task_count.reset_to_zero(); + alive_task_count.reset_to_zero(); + executing_task_count.reset_to_zero(); + buffered_task_total.reset_to_zero(); + reblocked_lock_total.reset_to_zero(); + eager_lock_total.reset_to_zero(); + *scheduling_mode = mode; + *last_scan_task = None; + } + + pub fn reinitialize_for_test(&mut self) { + self.reinitialize(SchedulingMode::BlockProduction); } /// Creates a new instance of [`SchedulingStateMachine`] with its `unsafe` fields created as @@ -865,19 +1908,48 @@ impl SchedulingStateMachine { /// # Safety /// Call this exactly once for each thread. See [`TokenCell`] for details. #[must_use] - pub unsafe fn exclusively_initialize_current_thread_for_scheduling() -> Self { + pub unsafe fn exclusively_initialize_current_thread_for_scheduling( + scheduling_mode: SchedulingMode, + max_executing_task_count: u32, + ) -> Self { Self { // It's very unlikely this is desired to be configurable, like // `UsageQueueInner::blocked_usages_from_tasks`'s cap. - unblocked_task_queue: VecDeque::with_capacity(1024), - active_task_count: ShortCounter::zero(), - handled_task_count: ShortCounter::zero(), - unblocked_task_count: ShortCounter::zero(), - total_task_count: ShortCounter::zero(), + buffered_task_queue: dary_heap::OctonaryHeap::with_capacity(1024), // BTreeMap::new(), //VecDeque::with_capacity(1024), + alive_tasks: BTreeSet::default(), + alive_task_count: ShortCounter::zero(), + executing_task_count: ShortCounter::zero(), + max_executing_task_count, + executed_task_total: ShortCounter::zero(), + buffered_task_total: ShortCounter::zero(), + blocked_task_count: ShortCounter::zero(), + reblocked_lock_total: ShortCounter::zero(), + eager_lock_total: ShortCounter::zero(), + task_total: ShortCounter::zero(), count_token: unsafe { BlockedUsageCountToken::assume_exclusive_mutating_thread() }, usage_queue_token: unsafe { UsageQueueToken::assume_exclusive_mutating_thread() }, + scheduling_mode, + last_scan_task: None, } } + + /// # Safety + /// Call this exactly once for each thread. See [`TokenCell`] for details. + pub unsafe fn exclusively_initialize_current_thread_for_scheduling_for_test() -> Self { + Self::exclusively_initialize_current_thread_for_scheduling( + SchedulingMode::BlockVerification, + 200, + ) + } + + /// # Safety + /// Call this exactly once for each thread. See [`TokenCell`] for details. 
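Editor's note: these `unsafe` constructors all share one contract, stated in their `# Safety` sections: at most one token per thread, because a `&mut Token` borrow is what stands in for exclusive access to every `TokenCell` it guards. A distilled sketch of that pattern (simplified from the crate's `TokenCell`; the real implementation layers further invariants on top):

use std::{cell::UnsafeCell, marker::PhantomData};

// Zero-sized token; one per thread. *mut () makes it !Send + !Sync.
struct DemoToken(PhantomData<*mut ()>);

struct DemoCell<V>(UnsafeCell<V>);

impl<V> DemoCell<V> {
    fn new(value: V) -> Self {
        Self(UnsafeCell::new(value))
    }
    // Holding `&mut DemoToken` proves exclusive access on this thread,
    // so handing out `&mut V` cannot alias another live borrow.
    fn with_borrow_mut<R>(&self, _token: &mut DemoToken, f: impl FnOnce(&mut V) -> R) -> R {
        f(unsafe { &mut *self.0.get() })
    }
}

fn main() {
    // Contract (mirroring the patch): construct exactly one token per thread.
    let mut token = DemoToken(PhantomData);
    let cell = DemoCell::new(0u32);
    cell.with_borrow_mut(&mut token, |v| *v += 1);
    assert_eq!(cell.with_borrow_mut(&mut token, |v| *v), 1);
}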
+ pub unsafe fn exclusively_initialize_current_thread_for_scheduling_for_test2() -> Self { + Self::exclusively_initialize_current_thread_for_scheduling( + SchedulingMode::BlockProduction, + 200, + ) + } } #[cfg(test)] @@ -899,12 +1971,42 @@ mod tests { RuntimeTransaction::from_transaction_for_tests(unsigned) } - fn transaction_with_readonly_address( + fn transaction_with_readonly_address( + address: Pubkey, + ) -> RuntimeTransaction { + let instruction = Instruction { + program_id: Pubkey::default(), + accounts: vec![AccountMeta::new_readonly(address, false)], + data: vec![], + }; + let message = Message::new(&[instruction], Some(&Pubkey::new_unique())); + let unsigned = Transaction::new_unsigned(message); + RuntimeTransaction::from_transaction_for_tests(unsigned) + } + + fn transaction_with_writable_address( + address: Pubkey, + ) -> RuntimeTransaction { + let instruction = Instruction { + program_id: Pubkey::default(), + accounts: vec![AccountMeta::new(address, false)], + data: vec![], + }; + let message = Message::new(&[instruction], Some(&Pubkey::new_unique())); + let unsigned = Transaction::new_unsigned(message); + RuntimeTransaction::from_transaction_for_tests(unsigned) + } + + fn transaction_with_writable_address2( address: Pubkey, + address2: Pubkey, ) -> RuntimeTransaction { let instruction = Instruction { program_id: Pubkey::default(), - accounts: vec![AccountMeta::new_readonly(address, false)], + accounts: vec![ + AccountMeta::new(address, false), + AccountMeta::new(address2, false), + ], data: vec![], }; let message = Message::new(&[instruction], Some(&Pubkey::new_unique())); @@ -912,12 +2014,16 @@ mod tests { RuntimeTransaction::from_transaction_for_tests(unsigned) } - fn transaction_with_writable_address( + fn transaction_with_writable_read2( address: Pubkey, + address2: Pubkey, ) -> RuntimeTransaction { let instruction = Instruction { program_id: Pubkey::default(), - accounts: vec![AccountMeta::new(address, false)], + accounts: vec![ + AccountMeta::new(address, false), + AccountMeta::new_readonly(address2, false), + ], data: vec![], }; let message = Message::new(&[instruction], Some(&Pubkey::new_unique())); @@ -941,34 +2047,35 @@ mod tests { #[test] fn test_scheduling_state_machine_creation() { let state_machine = unsafe { - SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling() + SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling_for_test() }; - assert_eq!(state_machine.active_task_count(), 0); - assert_eq!(state_machine.total_task_count(), 0); - assert!(state_machine.has_no_active_task()); + assert_eq!(state_machine.alive_task_count(), 0); + assert_eq!(state_machine.task_total(), 0); + assert!(state_machine.has_no_alive_task()); } #[test] fn test_scheduling_state_machine_good_reinitialization() { let mut state_machine = unsafe { - SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling() + SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling_for_test() }; - state_machine.total_task_count.increment_self(); - assert_eq!(state_machine.total_task_count(), 1); - state_machine.reinitialize(); - assert_eq!(state_machine.total_task_count(), 0); + state_machine.task_total.increment_self(); + assert_eq!(state_machine.task_total(), 1); + state_machine.reinitialize_for_test(); + assert_eq!(state_machine.task_total(), 0); } #[test] - #[should_panic(expected = "assertion failed: self.has_no_active_task()")] + #[cfg_attr(miri, ignore)] + #[should_panic(expected = "assertion failed: 
self.has_no_alive_task()")] fn test_scheduling_state_machine_bad_reinitialization() { let mut state_machine = unsafe { - SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling() + SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling_for_test() }; let address_loader = &mut create_address_loader(None); let task = SchedulingStateMachine::create_task(simplest_transaction(), 3, address_loader); state_machine.schedule_task(task).unwrap(); - state_machine.reinitialize(); + state_machine.reinitialize_for_test(); } #[test] @@ -988,15 +2095,15 @@ mod tests { let task = SchedulingStateMachine::create_task(sanitized, 3, address_loader); let mut state_machine = unsafe { - SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling() + SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling_for_test() }; let task = state_machine.schedule_task(task).unwrap(); - assert_eq!(state_machine.active_task_count(), 1); - assert_eq!(state_machine.total_task_count(), 1); + assert_eq!(state_machine.alive_task_count(), 1); + assert_eq!(state_machine.task_total(), 1); state_machine.deschedule_task(&task); - assert_eq!(state_machine.active_task_count(), 0); - assert_eq!(state_machine.total_task_count(), 1); - assert!(state_machine.has_no_active_task()); + assert_eq!(state_machine.alive_task_count(), 0); + assert_eq!(state_machine.task_total(), 1); + assert!(state_machine.has_no_alive_task()); } #[test] @@ -1008,7 +2115,7 @@ mod tests { let task3 = SchedulingStateMachine::create_task(sanitized.clone(), 103, address_loader); let mut state_machine = unsafe { - SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling() + SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling_for_test() }; assert_matches!( state_machine @@ -1019,26 +2126,26 @@ mod tests { assert_matches!(state_machine.schedule_task(task2.clone()), None); state_machine.deschedule_task(&task1); - assert!(state_machine.has_unblocked_task()); - assert_eq!(state_machine.unblocked_task_queue_count(), 1); + assert!(state_machine.has_buffered_task()); + assert_eq!(state_machine.buffered_task_queue_count(), 1); - // unblocked_task_count() should be incremented - assert_eq!(state_machine.unblocked_task_count(), 0); + assert_eq!(state_machine.buffered_task_total(), 1); assert_eq!( state_machine - .schedule_next_unblocked_task() + .schedule_next_buffered_task() .map(|t| t.task_index()), Some(102) ); - assert_eq!(state_machine.unblocked_task_count(), 1); + // buffered_task_total() should be incremented + assert_eq!(state_machine.buffered_task_total(), 1); - // there's no blocked task anymore; calling schedule_next_unblocked_task should be noop and - // shouldn't increment the unblocked_task_count(). - assert!(!state_machine.has_unblocked_task()); - assert_matches!(state_machine.schedule_next_unblocked_task(), None); - assert_eq!(state_machine.unblocked_task_count(), 1); + // there's no blocked task anymore; calling schedule_next_buffered_task should be noop and + // shouldn't increment the buffered_task_total(). 
+ assert!(!state_machine.has_buffered_task()); + assert_matches!(state_machine.schedule_next_buffered_task(), None); + assert_eq!(state_machine.buffered_task_total(), 1); - assert_eq!(state_machine.unblocked_task_queue_count(), 0); + assert_eq!(state_machine.buffered_task_queue_count(), 0); state_machine.deschedule_task(&task2); assert_matches!( @@ -1048,7 +2155,7 @@ mod tests { Some(103) ); state_machine.deschedule_task(&task3); - assert!(state_machine.has_no_active_task()); + assert!(state_machine.has_no_alive_task()); } #[test] @@ -1060,7 +2167,7 @@ mod tests { let task3 = SchedulingStateMachine::create_task(sanitized.clone(), 103, address_loader); let mut state_machine = unsafe { - SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling() + SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling_for_test() }; assert_matches!( state_machine @@ -1070,34 +2177,35 @@ mod tests { ); assert_matches!(state_machine.schedule_task(task2.clone()), None); - assert_eq!(state_machine.unblocked_task_queue_count(), 0); + assert_eq!(state_machine.buffered_task_queue_count(), 0); state_machine.deschedule_task(&task1); - assert_eq!(state_machine.unblocked_task_queue_count(), 1); + assert_eq!(state_machine.buffered_task_queue_count(), 1); // new task is arriving after task1 is already descheduled and task2 got unblocked assert_matches!(state_machine.schedule_task(task3.clone()), None); - assert_eq!(state_machine.unblocked_task_count(), 0); + assert_eq!(state_machine.buffered_task_total(), 1); assert_matches!( state_machine - .schedule_next_unblocked_task() + .schedule_next_buffered_task() .map(|t| t.task_index()), Some(102) ); - assert_eq!(state_machine.unblocked_task_count(), 1); + // buffered_task_total() should be incremented + assert_eq!(state_machine.buffered_task_total(), 1); state_machine.deschedule_task(&task2); assert_matches!( state_machine - .schedule_next_unblocked_task() + .schedule_next_buffered_task() .map(|t| t.task_index()), Some(103) ); - assert_eq!(state_machine.unblocked_task_count(), 2); + assert_eq!(state_machine.buffered_task_total(), 2); state_machine.deschedule_task(&task3); - assert!(state_machine.has_no_active_task()); + assert!(state_machine.has_no_alive_task()); } #[test] @@ -1110,7 +2218,7 @@ mod tests { let task2 = SchedulingStateMachine::create_task(sanitized2, 102, address_loader); let mut state_machine = unsafe { - SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling() + SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling_for_test() }; // both of read-only tasks should be immediately runnable assert_matches!( @@ -1126,17 +2234,17 @@ mod tests { Some(102) ); - assert_eq!(state_machine.active_task_count(), 2); - assert_eq!(state_machine.handled_task_count(), 0); - assert_eq!(state_machine.unblocked_task_queue_count(), 0); + assert_eq!(state_machine.alive_task_count(), 2); + assert_eq!(state_machine.executed_task_total(), 0); + assert_eq!(state_machine.buffered_task_queue_count(), 0); state_machine.deschedule_task(&task1); - assert_eq!(state_machine.active_task_count(), 1); - assert_eq!(state_machine.handled_task_count(), 1); - assert_eq!(state_machine.unblocked_task_queue_count(), 0); + assert_eq!(state_machine.alive_task_count(), 1); + assert_eq!(state_machine.executed_task_total(), 1); + assert_eq!(state_machine.buffered_task_queue_count(), 0); state_machine.deschedule_task(&task2); - assert_eq!(state_machine.active_task_count(), 0); - assert_eq!(state_machine.handled_task_count(), 
2); - assert!(state_machine.has_no_active_task()); + assert_eq!(state_machine.alive_task_count(), 0); + assert_eq!(state_machine.executed_task_total(), 2); + assert!(state_machine.has_no_alive_task()); } #[test] @@ -1151,7 +2259,7 @@ mod tests { let task3 = SchedulingStateMachine::create_task(sanitized3, 103, address_loader); let mut state_machine = unsafe { - SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling() + SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling_for_test() }; assert_matches!( state_machine .schedule_task(task1.clone()) .map(|t| t.task_index()), Some(101) ); assert_matches!( state_machine .schedule_task(task2.clone()) .map(|t| t.task_index()), Some(102) ); assert_matches!(state_machine.schedule_task(task3.clone()), None); - assert_eq!(state_machine.active_task_count(), 3); - assert_eq!(state_machine.handled_task_count(), 0); - assert_eq!(state_machine.unblocked_task_queue_count(), 0); + assert_eq!(state_machine.alive_task_count(), 3); + assert_eq!(state_machine.executed_task_total(), 0); + assert_eq!(state_machine.buffered_task_queue_count(), 0); state_machine.deschedule_task(&task1); - assert_eq!(state_machine.active_task_count(), 2); - assert_eq!(state_machine.handled_task_count(), 1); - assert_eq!(state_machine.unblocked_task_queue_count(), 0); - assert_matches!(state_machine.schedule_next_unblocked_task(), None); + assert_eq!(state_machine.alive_task_count(), 2); + assert_eq!(state_machine.executed_task_total(), 1); + assert_eq!(state_machine.buffered_task_queue_count(), 0); + assert_matches!(state_machine.schedule_next_buffered_task(), None); state_machine.deschedule_task(&task2); - assert_eq!(state_machine.active_task_count(), 1); - assert_eq!(state_machine.handled_task_count(), 2); - assert_eq!(state_machine.unblocked_task_queue_count(), 1); + assert_eq!(state_machine.alive_task_count(), 1); + assert_eq!(state_machine.executed_task_total(), 2); + assert_eq!(state_machine.buffered_task_queue_count(), 1); // task3 is finally unblocked after all readable tasks (task1 and task2) are finished.
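Editor's note, before the final assertion: the rule this test exercises can be restated as a tiny runnable model — readers share a usage queue through a counter, and a writer is granted the queue only once no reader holds it (illustrative names, not the patch's types):

#[derive(Debug, PartialEq)]
enum DemoUsage {
    Readonly(u32), // number of readers currently sharing the queue
    Writable,
}

// Returns the new usage state and whether the lock was granted.
fn demo_try_lock(current: Option<DemoUsage>, write: bool) -> (Option<DemoUsage>, bool) {
    match (current, write) {
        (None, false) => (Some(DemoUsage::Readonly(1)), true),
        (None, true) => (Some(DemoUsage::Writable), true),
        (Some(DemoUsage::Readonly(n)), false) => (Some(DemoUsage::Readonly(n + 1)), true),
        (current, _) => (current, false), // a writer is involved: requester waits
    }
}

fn main() {
    let (q, ok1) = demo_try_lock(None, false); // task1 reads
    let (q, ok2) = demo_try_lock(q, false); // task2 reads; the queue is shared
    let (q, ok3) = demo_try_lock(q, true); // task3 writes: blocked behind readers
    assert!(ok1 && ok2 && !ok3);
    assert_eq!(q, Some(DemoUsage::Readonly(2))); // two readers still hold the queue
}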
assert_matches!( state_machine - .schedule_next_unblocked_task() + .schedule_next_buffered_task() .map(|t| t.task_index()), Some(103) ); state_machine.deschedule_task(&task3); - assert!(state_machine.has_no_active_task()); + assert!(state_machine.has_no_alive_task()); } #[test] @@ -1202,7 +2310,7 @@ mod tests { let task3 = SchedulingStateMachine::create_task(sanitized3, 103, address_loader); let mut state_machine = unsafe { - SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling() + SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling_for_test() }; assert_matches!( state_machine @@ -1213,25 +2321,25 @@ mod tests { assert_matches!(state_machine.schedule_task(task2.clone()), None); assert_matches!(state_machine.schedule_task(task3.clone()), None); - assert_matches!(state_machine.schedule_next_unblocked_task(), None); + assert_matches!(state_machine.schedule_next_buffered_task(), None); state_machine.deschedule_task(&task1); assert_matches!( state_machine - .schedule_next_unblocked_task() + .schedule_next_buffered_task() .map(|t| t.task_index()), Some(102) ); - assert_matches!(state_machine.schedule_next_unblocked_task(), None); + assert_matches!(state_machine.schedule_next_buffered_task(), None); state_machine.deschedule_task(&task2); assert_matches!( state_machine - .schedule_next_unblocked_task() + .schedule_next_buffered_task() .map(|t| t.task_index()), Some(103) ); - assert_matches!(state_machine.schedule_next_unblocked_task(), None); + assert_matches!(state_machine.schedule_next_buffered_task(), None); state_machine.deschedule_task(&task3); - assert!(state_machine.has_no_active_task()); + assert!(state_machine.has_no_alive_task()); } #[test] @@ -1244,7 +2352,7 @@ mod tests { let task2 = SchedulingStateMachine::create_task(sanitized2, 102, address_loader); let mut state_machine = unsafe { - SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling() + SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling_for_test() }; assert_matches!( state_machine @@ -1258,12 +2366,12 @@ mod tests { state_machine.deschedule_task(&task1); assert_matches!( state_machine - .schedule_next_unblocked_task() + .schedule_next_buffered_task() .map(|t| t.task_index()), Some(102) ); state_machine.deschedule_task(&task2); - assert!(state_machine.has_no_active_task()); + assert!(state_machine.has_no_alive_task()); } #[test] @@ -1280,7 +2388,7 @@ mod tests { let task4 = SchedulingStateMachine::create_task(sanitized4, 104, address_loader); let mut state_machine = unsafe { - SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling() + SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling_for_test() }; assert_matches!( state_machine @@ -1295,34 +2403,34 @@ mod tests { state_machine.deschedule_task(&task1); assert_matches!( state_machine - .schedule_next_unblocked_task() + .schedule_next_buffered_task() .map(|t| t.task_index()), Some(102) ); assert_matches!( state_machine - .schedule_next_unblocked_task() + .schedule_next_buffered_task() .map(|t| t.task_index()), Some(103) ); // the above deschedule_task(task1) call should only unblock task2 and task3 because these // are read-locking. And shouldn't unblock task4 because it's write-locking - assert_matches!(state_machine.schedule_next_unblocked_task(), None); + assert_matches!(state_machine.schedule_next_buffered_task(), None); state_machine.deschedule_task(&task2); // still task4 is blocked... 
- assert_matches!(state_machine.schedule_next_unblocked_task(), None); + assert_matches!(state_machine.schedule_next_buffered_task(), None); state_machine.deschedule_task(&task3); // finally task4 should be unblocked assert_matches!( state_machine - .schedule_next_unblocked_task() + .schedule_next_buffered_task() .map(|t| t.task_index()), Some(104) ); state_machine.deschedule_task(&task4); - assert!(state_machine.has_no_active_task()); + assert!(state_machine.has_no_alive_task()); } #[test] @@ -1336,7 +2444,7 @@ mod tests { let task2 = SchedulingStateMachine::create_task(sanitized2, 102, address_loader); let mut state_machine = unsafe { - SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling() + SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling_for_test() }; assert_matches!( state_machine @@ -1350,7 +2458,7 @@ mod tests { usage_queue .0 .with_borrow_mut(&mut state_machine.usage_queue_token, |usage_queue| { - assert_matches!(usage_queue.current_usage, Some(Usage::Writable)); + assert_matches!(usage_queue.current_usage, Some(Usage::Writable(_))); }); // task2's fee payer should have been locked already via the above schedule_task(task2) // call, even though task2 itself is still blocked assert_matches!( @@ -1359,60 +2467,572 @@ mod tests { usage_queue .0 .with_borrow_mut(&mut state_machine.usage_queue_token, |usage_queue| { - assert_matches!(usage_queue.current_usage, Some(Usage::Writable)); + assert_matches!(usage_queue.current_usage, Some(Usage::Writable(_))); }); state_machine.deschedule_task(&task1); assert_matches!( state_machine - .schedule_next_unblocked_task() + .schedule_next_buffered_task() + .map(|t| t.task_index()), + Some(102) + ); + state_machine.deschedule_task(&task2); + assert!(state_machine.has_no_alive_task()); + } + + #[test] + fn test_higher_priority_locking_write_read() { + let conflicting_address1 = Pubkey::new_unique(); + let conflicting_address2 = Pubkey::new_unique(); + let sanitized1 = + transaction_with_writable_address2(conflicting_address1, conflicting_address2); + let sanitized2 = + transaction_with_writable_read2(conflicting_address1, conflicting_address2); + let sanitized0_1 = transaction_with_writable_address(conflicting_address1); + //let sanitized0_2 = transaction_with_writable_address( + let usage_queues = Rc::new(RefCell::new(HashMap::new())); + let address_loader = &mut create_address_loader(Some(usage_queues.clone())); + let task0_1 = SchedulingStateMachine::create_task(sanitized0_1, 50, address_loader); + //let task0_2 = SchedulingStateMachine::create_task(sanitized0_2, 51, address_loader); + let task1 = SchedulingStateMachine::create_task(sanitized1, 101, address_loader); + let task2 = SchedulingStateMachine::create_task(sanitized2, 99, address_loader); + + let mut state_machine = unsafe { + SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling_for_test() + }; + + assert_matches!( + state_machine + .schedule_task(task0_1.clone()) + .map(|t| t.task_index()), + Some(50) + ); + // now + // addr1: locked by task_0_1, queue: [] + // addr2: unlocked, queue: [] + + assert_matches!(state_machine.schedule_task(task1.clone()), None); + // now + // addr1: locked by task_0_1, queue: [task1] + // addr2: locked by task1, queue: [] + // + assert_matches!(state_machine.schedule_task(task2.clone()), None); + // now + // addr1: locked by task_0_1, queue: [task2, task1] + // addr2: locked by task2, queue: [task1] + + assert!(!state_machine.has_buffered_task()); + state_machine.deschedule_task(&task0_1); +
assert!(state_machine.has_buffered_task()); + // now + // addr1: locked by task2, queue: [task1] + // addr2: locked by task2, queue: [task1] + + assert_matches!( + state_machine + .schedule_next_buffered_task() + .map(|t| t.task_index()), + Some(99) + ); + + state_machine.deschedule_task(&task2); + assert!(state_machine.has_buffered_task()); + + assert_matches!( + state_machine + .schedule_next_buffered_task() + .map(|t| t.task_index()), + Some(101) + ); + state_machine.deschedule_task(&task1); + + dbg!(state_machine); + // task1 + // blocked by addr1 + // locking addr2 + // task2 + // locking addr1 + // blocked by addr2 + // + /* + assert_matches!( + state_machine + .schedule_task(task0_2.clone()) + .map(|t| t.task_index()), + Some(51) + ); + assert_matches!(state_machine.schedule_task(task2.clone()), None); + */ + } + + #[test] + fn test_higher_priority_locking_write_write_and_read_read() { + let conflicting_address1 = Pubkey::new_unique(); + let conflicting_address2 = Pubkey::new_unique(); + let sanitized1 = + transaction_with_writable_address2(conflicting_address1, conflicting_address2); + let sanitized2 = + transaction_with_writable_address2(conflicting_address1, conflicting_address2); + let sanitized0_1 = transaction_with_writable_address(conflicting_address1); + //let sanitized0_2 = transaction_with_writable_address( + let usage_queues = Rc::new(RefCell::new(HashMap::new())); + let address_loader = &mut create_address_loader(Some(usage_queues.clone())); + let task0_1 = SchedulingStateMachine::create_task(sanitized0_1, 50, address_loader); + //let task0_2 = SchedulingStateMachine::create_task(sanitized0_2, 51, address_loader); + let task1 = SchedulingStateMachine::create_task(sanitized1, 101, address_loader); + let task2 = SchedulingStateMachine::create_task(sanitized2, 99, address_loader); + + let mut state_machine = unsafe { + SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling_for_test() + }; + + assert_matches!( + state_machine + .schedule_task(task0_1.clone()) + .map(|t| t.task_index()), + Some(50) + ); + // now + // addr1: locked by task_0_1, queue: [] + // addr2: unlocked, queue: [] + + assert_matches!(state_machine.schedule_task(task1.clone()), None); + // now + // addr1: locked by task_0_1, queue: [task1] + // addr2: locked by task1, queue: [] + // + assert_matches!(state_machine.schedule_task(task2.clone()), None); + // now + // addr1: locked by task_0_1, queue: [task2, task1] + // addr2: locked by task2, queue: [task1] + + assert!(!state_machine.has_buffered_task()); + state_machine.deschedule_task(&task0_1); + assert!(state_machine.has_buffered_task()); + // now + // addr1: locked by task2, queue: [task1] + // addr2: locked by task2, queue: [task1] + + assert_matches!( + state_machine + .schedule_next_buffered_task() + .map(|t| t.task_index()), + Some(99) + ); + + state_machine.deschedule_task(&task2); + assert!(state_machine.has_buffered_task()); + + assert_matches!( + state_machine + .schedule_next_buffered_task() + .map(|t| t.task_index()), + Some(101) + ); + state_machine.deschedule_task(&task1); + + dbg!(state_machine); + // task1 + // blocked by addr1 + // locking addr2 + // task2 + // locking addr1 + // blocked by addr2 + // + /* + assert_matches!( + state_machine + .schedule_task(task0_2.clone()) + .map(|t| t.task_index()), + Some(51) + ); + assert_matches!(state_machine.schedule_task(task2.clone()), None); + */ + } + + #[test] + fn test_higher_priority_locking_read_write_simple() { + let conflicting_address1 = Pubkey::new_unique(); + let 
conflicting_address2 = Pubkey::new_unique(); + let sanitized1 = + transaction_with_writable_read2(conflicting_address1, conflicting_address2); + let sanitized2 = + transaction_with_writable_address2(conflicting_address1, conflicting_address2); + let sanitized0_1 = transaction_with_writable_address(conflicting_address1); + //let sanitized0_2 = transaction_with_writable_address( + let usage_queues = Rc::new(RefCell::new(HashMap::new())); + let address_loader = &mut create_address_loader(Some(usage_queues.clone())); + let task0_1 = SchedulingStateMachine::create_task(sanitized0_1, 50, address_loader); + //let task0_2 = SchedulingStateMachine::create_task(sanitized0_2, 51, address_loader); + let task1 = SchedulingStateMachine::create_task(sanitized1, 101, address_loader); + let task2 = SchedulingStateMachine::create_task(sanitized2, 99, address_loader); + + let mut state_machine = unsafe { + SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling_for_test() + }; + + assert_matches!( + state_machine + .schedule_task(task0_1.clone()) + .map(|t| t.task_index()), + Some(50) + ); + // now + // addr1: locked by task_0_1, queue: [] + // addr2: unlocked, queue: [] + + assert_matches!(state_machine.schedule_task(task1.clone()), None); + // now + // addr1: locked by task_0_1, queue: [task1] + // addr2: locked by task1, queue: [] + // + assert_matches!(state_machine.schedule_task(task2.clone()), None); + // now + // addr1: locked by task_0_1, queue: [task2, task1] + // addr2: locked by task2, queue: [task1] + + assert!(!state_machine.has_buffered_task()); + state_machine.deschedule_task(&task0_1); + assert!(state_machine.has_buffered_task()); + // now + // addr1: locked by task2, queue: [task1] + // addr2: locked by task2, queue: [task1] + + assert_matches!( + state_machine + .schedule_next_buffered_task() + .map(|t| t.task_index()), + Some(99) + ); + + state_machine.deschedule_task(&task2); + assert!(state_machine.has_buffered_task()); + + assert_matches!( + state_machine + .schedule_next_buffered_task() + .map(|t| t.task_index()), + Some(101) + ); + state_machine.deschedule_task(&task1); + + dbg!(state_machine); + // task1 + // blocked by addr1 + // locking addr2 + // task2 + // locking addr1 + // blocked by addr2 + // + /* + assert_matches!( + state_machine + .schedule_task(task0_2.clone()) + .map(|t| t.task_index()), + Some(51) + ); + assert_matches!(state_machine.schedule_task(task2.clone()), None); + */ + } + + #[test] + fn test_higher_priority_locking_read_write_complex() { + let conflicting_address1 = Pubkey::new_unique(); + let conflicting_address2 = Pubkey::new_unique(); + let sanitized0_1 = transaction_with_readonly_address(conflicting_address2); + let sanitized1 = transaction_with_writable_read2( + *sanitized0_1.message().fee_payer(), + conflicting_address2, + ); + let sanitized1_2 = + transaction_with_writable_read2(conflicting_address1, conflicting_address2); + let sanitized1_3 = + transaction_with_writable_read2(conflicting_address1, conflicting_address2); + let sanitized2 = + transaction_with_writable_address2(Pubkey::new_unique(), conflicting_address2); + //let sanitized0_2 = transaction_with_writable_address( + let usage_queues = Rc::new(RefCell::new(HashMap::new())); + let address_loader = &mut create_address_loader(Some(usage_queues.clone())); + let task0_1 = SchedulingStateMachine::create_task(sanitized0_1, 50, address_loader); + //let task0_2 = SchedulingStateMachine::create_task(sanitized0_2, 51, address_loader); + let task1 = 
SchedulingStateMachine::create_task(sanitized1, 101, address_loader); + let task1_2 = SchedulingStateMachine::create_task(sanitized1_2, 103, address_loader); + let task1_3 = SchedulingStateMachine::create_task(sanitized1_3, 104, address_loader); + let task2 = SchedulingStateMachine::create_task(sanitized2, 99, address_loader); + + let mut state_machine = unsafe { + SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling_for_test() + }; + + assert_matches!( + state_machine + .schedule_task(task0_1.clone()) + .map(|t| t.task_index()), + Some(50) + ); + // now + // addr1: unlocked, queue: [] + // addr2: locked by task0_1, queue: [] + + assert_matches!(state_machine.schedule_task(task1.clone()), None); + // now + // addr1: unlocked, queue: [] + // addr2: locked by [task0_1, task1], queue: [] + + assert_matches!( + state_machine + .schedule_task(task1_2.clone()) + .map(|t| t.task_index()), + Some(103) + ); + // now + // addr1: locked by task1_2, queue: [] + // addr2: locked by [task0_1, task1, task1_2], queue: [] + + assert_matches!( + state_machine + .schedule_task(task1_3.clone()) + .map(|t| t.task_index()), + None + ); + // now + // addr1: locked by task1_2, queue: [task1_3] + // addr2: locked by [task0_1, task1, task1_2, task1_3], queue: [] + + assert_matches!(state_machine.schedule_task(task2.clone()), None); + // now + // addr1: locked by task1_2, queue: [task1_3] + // addr2: locked by [task0_1, task1_2], queue: [task2, task1, task1_3] + + assert!(!state_machine.has_buffered_task()); + dbg!(state_machine.buffered_task_queue_count()); + state_machine.deschedule_task(&task0_1); + dbg!(state_machine.buffered_task_queue_count()); + assert!(!state_machine.has_buffered_task()); + // now + // addr1: locked by task1_2, queue: [task1_3] + // addr2: locked by task1_2, queue: [task2, task1, task1_3] + // + assert!(!state_machine.has_buffered_task()); + state_machine.deschedule_task(&task1_2); + assert!(state_machine.has_buffered_task()); + // now + // addr1: unlocked, queue: [task1_3] + // addr2: unlocked, queue: [task2, task1, task1_3] + + assert_matches!( + state_machine + .schedule_next_buffered_task() + .map(|t| t.task_index()), + Some(99) + ); + // now + // addr1: unlocked, queue: [task1_3] + // addr2: locked by task2, queue: [task1, task1_3] + + assert!(!state_machine.has_buffered_task()); + state_machine.deschedule_task(&task2); + assert!(state_machine.has_buffered_task()); + // now + // addr1: unlocked, queue: [task1_3] + // addr2: unlocked, queue: [task1, task1_3] + + assert_matches!( + state_machine + .schedule_next_buffered_task() + .map(|t| t.task_index()), + Some(101) + ); + assert_matches!( + state_machine + .schedule_next_buffered_task() + .map(|t| t.task_index()), + Some(104) + ); + state_machine.deschedule_task(&task1); + state_machine.deschedule_task(&task1_3); + } + + #[test] + fn test_eager_scheduling() { + let conflicting_address1 = Pubkey::new_unique(); + let conflicting_address2 = Pubkey::new_unique(); + let conflicting_address3 = Pubkey::new_unique(); + let conflicting_address4 = Pubkey::new_unique(); + + let sanitized1 = + transaction_with_writable_address2(conflicting_address1, conflicting_address2); + let sanitized2 = + transaction_with_writable_address2(conflicting_address2, conflicting_address3); + let sanitized3 = + transaction_with_writable_address2(conflicting_address3, conflicting_address4); + let usage_queues = Rc::new(RefCell::new(HashMap::new())); + let address_loader = &mut create_address_loader(Some(usage_queues.clone())); + let task1 = 
SchedulingStateMachine::create_task(sanitized1, 101, address_loader); + let task2 = SchedulingStateMachine::create_task(sanitized2, 102, address_loader); + let task3 = SchedulingStateMachine::create_task(sanitized3, 103, address_loader); + + let mut state_machine = unsafe { + SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling_for_test2() + }; + assert_matches!( + state_machine + .scan_and_schedule_next_task() + .map(|t| t.task_index()), + None + ); + + assert_matches!( + state_machine + .schedule_task(task1.clone()) + .map(|t| t.task_index()), + Some(101) + ); + assert_matches!( + state_machine + .scan_and_schedule_next_task() + .map(|t| t.task_index()), + None + ); + assert_matches!( + state_machine + .schedule_task(task2.clone()) + .map(|t| t.task_index()), + None + ); + assert_matches!( + state_machine + .scan_and_schedule_next_task() + .map(|t| t.task_index()), + None + ); + assert_matches!( + state_machine + .schedule_task(task3.clone()) + .map(|t| t.task_index()), + None + ); + // now + // addr1: task1 | + // addr2: task1 | task2, task3, task4, task5, task6, task7, task8, , task10 + // addr3: | task2, task3, task4, task5, task6, task7, task8, task9, , task11 + // addr4: | task4, task5, task6, task7, task8, task9, task10 + assert_matches!( + state_machine + .scan_and_schedule_next_task() + .map(|t| t.task_index()), + Some(103) + ); + assert_matches!( + state_machine + .scan_and_schedule_next_task() + .map(|t| t.task_index()), + None + ); + state_machine.deschedule_task(&task1); + assert_matches!( + state_machine + .scan_and_schedule_next_task() + .map(|t| t.task_index()), + None + ); + state_machine.deschedule_task(&task3); + assert_matches!( + state_machine + .scan_and_schedule_next_task() + .map(|t| t.task_index()), + None + ); + assert_matches!( + state_machine + .schedule_next_buffered_task() .map(|t| t.task_index()), Some(102) ); + assert_matches!( + state_machine + .scan_and_schedule_next_task() + .map(|t| t.task_index()), + None + ); state_machine.deschedule_task(&task2); - assert!(state_machine.has_no_active_task()); + assert_matches!( + state_machine + .scan_and_schedule_next_task() + .map(|t| t.task_index()), + None + ); } #[test] + #[cfg_attr(miri, ignore)] #[should_panic(expected = "internal error: entered unreachable code")] fn test_unreachable_unlock_conditions1() { let mut state_machine = unsafe { - SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling() + SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling_for_test() }; let usage_queue = UsageQueue::default(); usage_queue .0 .with_borrow_mut(&mut state_machine.usage_queue_token, |usage_queue| { - let _ = usage_queue.unlock(RequestedUsage::Writable); + usage_queue.executing_count.increment_self(); + let _ = usage_queue.unlock( + &LockContext::new(UsageQueue::default(), RequestedUsage::Writable), + 0, + &mut state_machine.count_token, + ); }); } #[test] - #[should_panic(expected = "internal error: entered unreachable code")] + #[cfg_attr(miri, ignore)] + #[should_panic( + expected = "assertion `left == right` failed\n left: (3, Readonly, 0)\n right: (3, Writable, 0)" + )] fn test_unreachable_unlock_conditions2() { let mut state_machine = unsafe { - SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling() + SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling_for_test() }; let usage_queue = UsageQueue::default(); + let sanitized = simplest_transaction(); + let task = 
SchedulingStateMachine::create_task(sanitized.clone(), 3, &mut |_| { + UsageQueue::default() + }); + let lock_context = LockContext::new(usage_queue.clone(), RequestedUsage::Readonly); usage_queue .0 .with_borrow_mut(&mut state_machine.usage_queue_token, |usage_queue| { - usage_queue.current_usage = Some(Usage::Writable); - let _ = usage_queue.unlock(RequestedUsage::Readonly); + usage_queue.executing_count.increment_self(); + let task_index = task.index(); + usage_queue.current_usage = Some(Usage::Writable(task)); + let _ = + usage_queue.unlock(&lock_context, task_index, &mut state_machine.count_token); }); } #[test] + #[cfg_attr(miri, ignore)] #[should_panic(expected = "internal error: entered unreachable code")] fn test_unreachable_unlock_conditions3() { let mut state_machine = unsafe { - SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling() + SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling_for_test() }; let usage_queue = UsageQueue::default(); + let sanitized = simplest_transaction(); + let task = SchedulingStateMachine::create_task(sanitized.clone(), 3, &mut |_| { + UsageQueue::default() + }); usage_queue .0 .with_borrow_mut(&mut state_machine.usage_queue_token, |usage_queue| { + usage_queue.executing_count.increment_self(); + let task_index = task.index(); usage_queue.current_usage = Some(Usage::Readonly(ShortCounter::one())); - let _ = usage_queue.unlock(RequestedUsage::Writable); + let _ = usage_queue.unlock( + &LockContext::new(UsageQueue::default(), RequestedUsage::Writable), + task_index, + &mut state_machine.count_token, + ); }); } } diff --git a/unified-scheduler-pool/Cargo.toml b/unified-scheduler-pool/Cargo.toml index e1b17308633..ff0c43227cf 100644 --- a/unified-scheduler-pool/Cargo.toml +++ b/unified-scheduler-pool/Cargo.toml @@ -10,20 +10,32 @@ license = { workspace = true } edition = { workspace = true } [dependencies] +ahash = { workspace = true } assert_matches = { workspace = true } +cpu-time = "1.0.0" crossbeam-channel = { workspace = true } dashmap = { workspace = true } derive-where = { workspace = true } +derive_more = { workspace = true } +dyn-clone = "1.0.17" +enum-ptr = "0.2.0" log = { workspace = true } -qualifier_attr = { workspace = true } scopeguard = { workspace = true } +serde = { workspace = true } +solana-cost-model = { workspace = true } +solana-feature-set = { workspace = true } solana-ledger = { workspace = true } +solana-perf = { workspace = true } +solana-poh = { workspace = true } +solana-program-runtime = { workspace = true } solana-runtime = { workspace = true } solana-runtime-transaction = { workspace = true } solana-sdk = { workspace = true } +solana-svm = { workspace = true } solana-timings = { workspace = true } solana-unified-scheduler-logic = { workspace = true } static_assertions = { workspace = true } +trait-set = "0.3.0" vec_extract_if_polyfill = { workspace = true } [dev-dependencies] @@ -31,6 +43,8 @@ assert_matches = { workspace = true } lazy_static = { workspace = true } solana-logger = { workspace = true } solana-runtime = { workspace = true, features = ["dev-context-only-utils"] } +# See order-crates-for-publishing.py for using this unusual `path = "."` +solana-unified-scheduler-pool = { path = ".", features = ["dev-context-only-utils"] } [features] dev-context-only-utils = [] diff --git a/unified-scheduler-pool/src/lib.rs b/unified-scheduler-pool/src/lib.rs index 61690f4ff39..4ed1ef006bd 100644 --- a/unified-scheduler-pool/src/lib.rs +++ b/unified-scheduler-pool/src/lib.rs @@ 
-8,36 +8,42 @@ //! and commits any side-effects (i.e. on-chain state changes) into the associated `Bank` via //! `solana-ledger`'s helper function called `execute_batch()`. -#[cfg(feature = "dev-context-only-utils")] -use qualifier_attr::qualifiers; use { assert_matches::assert_matches, crossbeam_channel::{self, never, select_biased, Receiver, RecvError, SendError, Sender}, - dashmap::DashMap, + dashmap::{DashMap, DashSet}, derive_where::derive_where, + dyn_clone::{clone_trait_object, DynClone}, log::*, scopeguard::defer, + solana_cost_model::cost_model::CostModel, + solana_feature_set as feature_set, solana_ledger::blockstore_processor::{ execute_batch, TransactionBatchWithIndexes, TransactionStatusSender, }, + solana_perf::packet::{BankingPacketBatch, BankingPacketReceiver}, + solana_poh::poh_recorder::TransactionRecorder, solana_runtime::{ - bank::Bank, installed_scheduler_pool::{ initialized_result_with_timings, InstalledScheduler, InstalledSchedulerBox, - InstalledSchedulerPool, InstalledSchedulerPoolArc, ResultWithTimings, ScheduleResult, - SchedulerAborted, SchedulerId, SchedulingContext, TimeoutListener, - UninstalledScheduler, UninstalledSchedulerBox, + InstalledSchedulerPool, ResultWithTimings, ScheduleResult, SchedulerAborted, + SchedulerId, SchedulingContext, TimeoutListener, UninstalledScheduler, + UninstalledSchedulerBox, }, prioritization_fee_cache::PrioritizationFeeCache, vote_sender_types::ReplayVoteSender, }, solana_runtime_transaction::runtime_transaction::RuntimeTransaction, solana_sdk::{ + hash::Hash, pubkey::Pubkey, + scheduling::{SchedulingMode, TaskKey}, transaction::{Result, SanitizedTransaction, TransactionError}, }, solana_timings::ExecuteTimings, - solana_unified_scheduler_logic::{SchedulingStateMachine, Task, UsageQueue}, + solana_unified_scheduler_logic::{ + SchedulingStateMachine, ShortCounter, Task, TransactionContext, UsageQueue, + }, static_assertions::const_assert_eq, std::{ fmt::Debug, @@ -45,14 +51,22 @@ use { mem, sync::{ atomic::{AtomicU64, Ordering::Relaxed}, - Arc, Mutex, OnceLock, Weak, + Arc, Condvar, Mutex, MutexGuard, OnceLock, Weak, }, thread::{self, sleep, JoinHandle}, time::{Duration, Instant}, }, + trait_set::trait_set, vec_extract_if_polyfill::MakeExtractIf, }; +#[derive(Clone)] +pub struct BankingStageContext { + adapter: Arc<BankingStageAdapter>, + banking_packet_receiver: BankingPacketReceiver, + on_banking_packet_receive: Box<dyn BatchConverter>, +} + mod sleepless_testing; use crate::sleepless_testing::BuilderTracked; @@ -60,8 +74,8 @@ use crate::sleepless_testing::BuilderTracked; #[allow(dead_code)] #[derive(Debug)] enum CheckPoint { - NewTask(usize), - TaskHandled(usize), + NewTask(TaskKey), + TaskHandled(TaskKey), SchedulerThreadAborted, IdleSchedulerCleaned(usize), TrashedSchedulerCleaned(usize), @@ -70,15 +84,40 @@ enum CheckPoint { type AtomicSchedulerId = AtomicU64; +#[derive(Debug)] +pub enum SupportedSchedulingMode { + Either(SchedulingMode), + Both, +} + +impl SupportedSchedulingMode { + fn is_supported(&self, requested_mode: SchedulingMode) -> bool { + match (self, requested_mode) { + (Self::Both, _) => true, + (Self::Either(ref supported), ref requested) if supported == requested => true, + _ => false, + } + } + + #[cfg(feature = "dev-context-only-utils")] + fn block_verification_only() -> Self { + Self::Either(SchedulingMode::BlockVerification) + } +} + // SchedulerPool must be accessed as a dyn trait from solana-runtime, because SchedulerPool // contains some internal fields, whose types aren't available in solana-runtime (currently // TransactionStatusSender;
also, PohRecorder in the future)... #[derive(Debug)] pub struct SchedulerPool<S: SpawnableScheduler<TH>, TH: TaskHandler> { + supported_scheduling_mode: SupportedSchedulingMode, scheduler_inners: Mutex<Vec<(S::Inner, Instant)>>, + block_production_scheduler_inner: Mutex<(Option<SchedulerId>, Option<S::Inner>)>, + block_production_scheduler_condvar: Condvar, + block_production_scheduler_respawner: Mutex<Option<BlockProductionSchedulerRespawner>>, trashed_scheduler_inners: Mutex<Vec<S::Inner>>, timeout_listeners: Mutex<Vec<(TimeoutListener, Instant)>>, - handler_count: usize, + block_verification_handler_count: usize, handler_context: HandlerContext, // weak_self could be elided by changing InstalledScheduler::take_scheduler()'s receiver to // Arc<Self> from &Self, because SchedulerPool is used as in the form of Arc<SchedulerPool> @@ -102,14 +141,15 @@ pub struct HandlerContext { log_messages_bytes_limit: Option<usize>, transaction_status_sender: Option<TransactionStatusSender>, replay_vote_sender: Option<ReplayVoteSender>, prioritization_fee_cache: Arc<PrioritizationFeeCache>, + transaction_recorder: TransactionRecorder, } pub type DefaultSchedulerPool = SchedulerPool<PooledScheduler<DefaultTaskHandler>, DefaultTaskHandler>; -const DEFAULT_POOL_CLEANER_INTERVAL: Duration = Duration::from_secs(10); +const DEFAULT_POOL_CLEANER_INTERVAL: Duration = Duration::from_secs(5); const DEFAULT_MAX_POOLING_DURATION: Duration = Duration::from_secs(180); -const DEFAULT_TIMEOUT_DURATION: Duration = Duration::from_secs(12); +const DEFAULT_TIMEOUT_DURATION: Duration = Duration::from_secs(5); // Rough estimate of max UsageQueueLoader size in bytes: // UsageFromTask * UsageQueue's capacity * DEFAULT_MAX_USAGE_QUEUE_COUNT // 16 bytes * 128 items * 262_144 entries == 512 MiB @@ -126,27 +166,47 @@ const DEFAULT_TIMEOUT_DURATION: Duration = Duration::from_secs(12); // because UsageQueueLoader won't grow that much to begin with. const DEFAULT_MAX_USAGE_QUEUE_COUNT: usize = 262_144;
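// Quick sanity check of the 512 MiB estimate in the comment above (illustrative; the 16-byte
// UsageFromTask size and the 128-item UsageQueue capacity are taken from the comment, not
// measured from the real types):

const SKETCH_USAGE_FROM_TASK_BYTES: usize = 16;
const SKETCH_USAGE_QUEUE_CAPACITY: usize = 128;
const _: () = assert!(
    SKETCH_USAGE_FROM_TASK_BYTES * SKETCH_USAGE_QUEUE_CAPACITY * 262_144 == 512 * 1024 * 1024
);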
- #[cfg_attr(feature = "dev-context-only-utils", qualifiers(pub))] - fn new( + pub fn new( + supported_scheduling_mode: SupportedSchedulingMode, handler_count: Option<usize>, log_messages_bytes_limit: Option<usize>, transaction_status_sender: Option<TransactionStatusSender>, replay_vote_sender: Option<ReplayVoteSender>, prioritization_fee_cache: Arc<PrioritizationFeeCache>, + transaction_recorder: TransactionRecorder, ) -> Arc<Self> { Self::do_new( + supported_scheduling_mode, handler_count, log_messages_bytes_limit, transaction_status_sender, replay_vote_sender, prioritization_fee_cache, + transaction_recorder, DEFAULT_POOL_CLEANER_INTERVAL, DEFAULT_MAX_POOLING_DURATION, DEFAULT_MAX_USAGE_QUEUE_COUNT, @@ -154,30 +214,62 @@ where ) } + #[cfg(feature = "dev-context-only-utils")] + pub fn new_for_verification( + handler_count: Option<usize>, + log_messages_bytes_limit: Option<usize>, + transaction_status_sender: Option<TransactionStatusSender>, + replay_vote_sender: Option<ReplayVoteSender>, + prioritization_fee_cache: Arc<PrioritizationFeeCache>, + ) -> Arc<Self> { + Self::new( + SupportedSchedulingMode::block_verification_only(), + handler_count, + log_messages_bytes_limit, + transaction_status_sender, + replay_vote_sender, + prioritization_fee_cache, + TransactionRecorder::new_dummy(), + ) + } + + #[allow(clippy::too_many_arguments)] fn do_new( + supported_scheduling_mode: SupportedSchedulingMode, handler_count: Option<usize>, log_messages_bytes_limit: Option<usize>, transaction_status_sender: Option<TransactionStatusSender>, replay_vote_sender: Option<ReplayVoteSender>, prioritization_fee_cache: Arc<PrioritizationFeeCache>, + mut transaction_recorder: TransactionRecorder, pool_cleaner_interval: Duration, max_pooling_duration: Duration, max_usage_queue_count: usize, timeout_duration: Duration, ) -> Arc<Self> { let handler_count = handler_count.unwrap_or(Self::default_handler_count()); - assert!(handler_count >= 1); + let bp_is_supported = + supported_scheduling_mode.is_supported(SchedulingMode::BlockProduction); + + if !bp_is_supported { + transaction_recorder = TransactionRecorder::new_dummy(); + } let scheduler_pool = Arc::new_cyclic(|weak_self| Self { + supported_scheduling_mode, scheduler_inners: Mutex::default(), + block_production_scheduler_inner: Mutex::default(), + block_production_scheduler_condvar: Condvar::default(), + block_production_scheduler_respawner: Mutex::default(), trashed_scheduler_inners: Mutex::default(), timeout_listeners: Mutex::default(), - handler_count, + block_verification_handler_count: handler_count, handler_context: HandlerContext { log_messages_bytes_limit, transaction_status_sender, replay_vote_sender, prioritization_fee_cache, + transaction_recorder, }, weak_self: weak_self.clone(), next_scheduler_id: AtomicSchedulerId::default(), @@ -188,8 +280,10 @@ where let cleaner_main_loop = { let weak_scheduler_pool = Arc::downgrade(&scheduler_pool); + let mut exiting = false; move || loop { sleep(pool_cleaner_interval); + trace!("Scheduler pool cleaner: start!!!",); let Some(scheduler_pool) = weak_scheduler_pool.upgrade() else { break; }; @@ -222,6 +316,12 @@ where idle_inner_count }; + let banking_stage_status = scheduler_pool.banking_stage_status(); + if !exiting && matches!(banking_stage_status, Some(BankingStageStatus::Exited)) { + exiting = true; + scheduler_pool.unregister_banking_stage(); + } + let trashed_inner_count = { let Ok(mut trashed_scheduler_inners) = scheduler_pool.trashed_scheduler_inners.lock() @@ -236,7 +336,7 @@ where trashed_inner_count }; - let triggered_timeout_listener_count = { + let (triggered_timeout_listener_count, active_timeout_listener_count) = { // Pre-allocate rather large capacity to avoid reallocation inside the lock.
let mut expired_listeners = Vec::with_capacity(128); let Ok(mut timeout_listeners) = scheduler_pool.timeout_listeners.lock() else { @@ -248,24 +348,68 @@ where now.duration_since(*registered_at) > timeout_duration }, )); + let not_expired_count = timeout_listeners.len(); drop(timeout_listeners); - let count = expired_listeners.len(); + let expired_count = expired_listeners.len(); for (timeout_listener, _registered_at) in expired_listeners { timeout_listener.trigger(scheduler_pool.clone()); } - count + (expired_count, not_expired_count) }; + if matches!(banking_stage_status, Some(BankingStageStatus::Inactive)) { + let mut id_and_inner = scheduler_pool + .block_production_scheduler_inner + .lock() + .unwrap(); + if let Some(pooled) = &id_and_inner.1 { + info!("sch {} IS idle", pooled.id()); + if pooled.is_overgrown(false) { + info!("sch {} is overgrown!", pooled.id()); + let pooled = id_and_inner.1.take().unwrap(); + assert_eq!(Some(pooled.id()), id_and_inner.0.take()); + scheduler_pool.spawn_block_production_scheduler(&mut id_and_inner); + drop(id_and_inner); + let id = pooled.id(); + info!("dropping overgrown sch {id}"); + drop(pooled); + info!("dropped overgrown sch {id}"); + } else { + info!("sch {} isn't overgrown", pooled.id()); + pooled.reset(); + } + } + } + info!( - "Scheduler pool cleaner: dropped {} idle inners, {} trashed inners, triggered {} timeout listeners", - idle_inner_count, trashed_inner_count, triggered_timeout_listener_count, + "Scheduler pool cleaner: dropped {} idle inners, {} trashed inners, triggered {} timeout listeners, (exit: {:?})", + idle_inner_count, trashed_inner_count, triggered_timeout_listener_count, exiting, ); sleepless_testing::at(CheckPoint::IdleSchedulerCleaned(idle_inner_count)); sleepless_testing::at(CheckPoint::TrashedSchedulerCleaned(trashed_inner_count)); sleepless_testing::at(CheckPoint::TimeoutListenerTriggered( triggered_timeout_listener_count, )); + + if exiting && active_timeout_listener_count == 0 { + // Wait a bit to ensure the replay stage has gone. + sleep(Duration::from_secs(1)); + + let mut id_and_inner = scheduler_pool + .block_production_scheduler_inner + .lock() + .unwrap(); + if let Some(pooled) = id_and_inner.1.take() { + assert_eq!(Some(pooled.id()), id_and_inner.0.take()); + drop(id_and_inner); + let id = pooled.id(); + info!("dropping sch {id} after proper exit"); + drop(pooled); + info!("dropped sch {id} after proper exit"); + } + break; + } } }; @@ -278,24 +422,6 @@ where scheduler_pool }
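// Illustrative sketch (not part of the patch) of the weak_self pattern used by do_new() above:
// Arc::new_cyclic hands the closure a Weak<Self> before the Arc is finished being built, which
// lets &self methods later mint an owning Arc<Self> (compare self_arc() below).

use std::sync::{Arc, Weak};

struct SketchPool {
    weak_self: Weak<SketchPool>,
}

impl SketchPool {
    fn new() -> Arc<Self> {
        // Only a Weak can escape the closure; upgrading it succeeds once new_cyclic returns.
        Arc::new_cyclic(|weak_self| SketchPool {
            weak_self: weak_self.clone(),
        })
    }

    fn self_arc(&self) -> Arc<Self> {
        self.weak_self.upgrade().expect("self-referential arc is alive")
    }
}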
- // This apparently-meaningless wrapper is handy, because some callers explicitly want - // `dyn InstalledSchedulerPool` to be returned for type inference convenience. - pub fn new_dyn( - handler_count: Option<usize>, - log_messages_bytes_limit: Option<usize>, - transaction_status_sender: Option<TransactionStatusSender>, - replay_vote_sender: Option<ReplayVoteSender>, - prioritization_fee_cache: Arc<PrioritizationFeeCache>, - ) -> InstalledSchedulerPoolArc { - Self::new( - handler_count, - log_messages_bytes_limit, - transaction_status_sender, - replay_vote_sender, - prioritization_fee_cache, - ) - } - // See a comment at the weak_self field for justification of this method's existence. fn self_arc(&self) -> Arc<Self> { self.weak_self @@ -309,21 +435,49 @@ where // This fn needs to return immediately due to being part of the blocking // `<dyn InstalledScheduler>::wait_for_termination()` call. - fn return_scheduler(&self, scheduler: S::Inner, should_trash: bool) { + fn return_scheduler(&self, mut scheduler: S::Inner, should_trash: bool) { + let id = scheduler.id(); + debug!("return_scheduler(): id: {id} should_trash: {should_trash}"); + let mut id_and_inner = self.block_production_scheduler_inner.lock().unwrap(); + let is_block_production_scheduler_returned = Some(id) == id_and_inner.0.as_ref().copied(); + if should_trash { + if is_block_production_scheduler_returned { + // Abort this trashed scheduler to stop receiving BankingPacketBatch anymore... + scheduler.ensure_abort(); + } // Delay drop()-ing this trashed returned scheduler inner by stashing it in // self.trashed_scheduler_inners, which is periodically drained by the `solScCleaner` // thread. Dropping it could take long time (in fact, - // PooledSchedulerInner::usage_queue_loader can contain many entries to drop). + // TaskCreator::usage_queue_loader() can contain many entries to drop). self.trashed_scheduler_inners .lock() .expect("not poisoned") .push(scheduler); + + if is_block_production_scheduler_returned && self.should_respawn() { + info!("respawning scheduler after being trashed..."); + assert_eq!(id_and_inner.0.take(), Some(id)); + self.spawn_block_production_scheduler(&mut id_and_inner); + info!("respawned scheduler after being trashed."); + } + drop(id_and_inner); } else { - self.scheduler_inners - .lock() - .expect("not poisoned") - .push((scheduler, Instant::now())); + drop(id_and_inner); + if !is_block_production_scheduler_returned { + self.scheduler_inners + .lock() + .expect("not poisoned") + .push((scheduler, Instant::now())); + } else { + assert!(self + .block_production_scheduler_inner + .lock() + .unwrap() + .1 + .replace(scheduler) + .is_none()); + } } } @@ -339,13 +493,35 @@ where ) -> S { assert_matches!(result_with_timings, (Ok(_), _)); - // pop is intentional for filo, expecting relatively warmed-up scheduler due to having been - // returned recently - if let Some((inner, _pooled_at)) = self.scheduler_inners.lock().expect("not poisoned").pop() - { - S::from_inner(inner, context, result_with_timings) + if matches!(context.mode(), SchedulingMode::BlockVerification) { + // pop is intentional for filo, expecting relatively warmed-up scheduler due to having been + // returned recently + if let Some((inner, _pooled_at)) = + self.scheduler_inners.lock().expect("not poisoned").pop() + { + S::from_inner(inner, context, result_with_timings) + } else { + S::spawn( + self.block_verification_handler_count, + self.self_arc(), + context, + result_with_timings, + None, + ) + } } else { - S::spawn(self.self_arc(), context, result_with_timings) + let mut id_and_inner = self + .block_production_scheduler_inner + .lock() + .expect("not poisoned"); + id_and_inner = self + .block_production_scheduler_condvar + .wait_while(id_and_inner, |id_and_inner| id_and_inner.0.is_none()) + .unwrap(); + let Some(inner) = id_and_inner.1.take() else { + panic!("double take: {:?}, {:?}", context.slot(), context.mode()); + }; + S::from_inner(inner, context, result_with_timings) } }
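// Illustrative sketch (not part of the patch) of the Condvar handoff in the block-production
// branch above: wait_while atomically releases the mutex while blocking and re-checks the
// predicate on every wakeup, so spurious wakeups and notify_all races are harmless.

use std::sync::{Condvar, Mutex};

struct SketchSlot {
    inner: Mutex<Option<u64>>, // stand-in for (Option<SchedulerId>, Option<S::Inner>)
    ready: Condvar,
}

impl SketchSlot {
    fn put(&self, scheduler: u64) {
        *self.inner.lock().unwrap() = Some(scheduler);
        self.ready.notify_all(); // wake every taker blocked in wait_while()
    }

    fn take(&self) -> u64 {
        let guard = self.inner.lock().unwrap();
        // Blocks until the closure returns false; losers of a race simply wait again.
        let mut guard = self.ready.wait_while(guard, |slot| slot.is_none()).unwrap();
        guard.take().unwrap()
    }
}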
@@ -354,6 +530,100 @@ self.scheduler_inners.lock().expect("not poisoned").len() } + pub fn block_production_supported(&self) -> bool { + self.supported_scheduling_mode + .is_supported(SchedulingMode::BlockProduction) + } + + pub fn register_banking_stage( + &self, + banking_packet_receiver: BankingPacketReceiver, + handler_count: usize, + banking_stage_monitor: Box<dyn BankingStageMonitor>, + on_spawn_block_production_scheduler: BatchConverterCreator, + ) { + *self.block_production_scheduler_respawner.lock().unwrap() = + Some(BlockProductionSchedulerRespawner { + handler_count, + banking_packet_receiver, + on_spawn_block_production_scheduler, + banking_stage_monitor, + }); + self.spawn_block_production_scheduler( + &mut self.block_production_scheduler_inner.lock().unwrap(), + ); + } + + fn unregister_banking_stage(&self) { + assert!(self + .block_production_scheduler_respawner + .lock() + .unwrap() + .take() + .is_some()); + } + + fn banking_stage_status(&self) -> Option<BankingStageStatus> { + self.block_production_scheduler_respawner + .lock() + .unwrap() + .as_ref() + .map(|respawner| respawner.banking_stage_monitor.status()) + } + + fn should_respawn(&self) -> bool { + !matches!( + self.banking_stage_status(), + None | Some(BankingStageStatus::Exited) + ) + } + + fn spawn_block_production_scheduler( + &self, + id_and_inner: &mut MutexGuard<'_, (Option<SchedulerId>, Option<S::Inner>)>, + ) { + trace!("spawn block production scheduler: start!"); + let (handler_count, banking_stage_context) = { + let mut respawner_write = self.block_production_scheduler_respawner.lock().unwrap(); + let BlockProductionSchedulerRespawner { + handler_count, + banking_packet_receiver, + on_spawn_block_production_scheduler, + banking_stage_monitor: _, + } = &mut *respawner_write.as_mut().unwrap(); + + let adapter = Arc::new(BankingStageAdapter { + usage_queue_loader: UsageQueueLoader::default(), + transaction_deduper: DashSet::with_capacity(1_000_000), + next_task_id: AtomicU64::default(), + }); + + ( + *handler_count, + BankingStageContext { + banking_packet_receiver: banking_packet_receiver.clone(), + on_banking_packet_receive: on_spawn_block_production_scheduler(adapter.clone()), + adapter, + }, + ) + }; + + let scheduler = S::spawn( + handler_count, + self.self_arc(), + SchedulingContext::new(SchedulingMode::BlockProduction, None), + initialized_result_with_timings(), + Some(banking_stage_context), + ); + let ((Ok(_result), _timings), inner) = scheduler.into_inner() else { + panic!() + }; + assert!(id_and_inner.0.replace(inner.id()).is_none()); + assert!(id_and_inner.1.replace(inner).is_none()); + self.block_production_scheduler_condvar.notify_all(); + trace!("spawn block production scheduler: end!"); + } + pub fn default_handler_count() -> usize { Self::calculate_default_handler_count( thread::available_parallelism() @@ -395,8 +665,14 @@ where &self, context: SchedulingContext, result_with_timings: ResultWithTimings, - ) -> InstalledSchedulerBox { - Box::new(self.do_take_resumed_scheduler(context, result_with_timings)) + ) -> Option<InstalledSchedulerBox> { + if !self.supported_scheduling_mode.is_supported(context.mode()) { + return None; + } + + Some(Box::new( + self.do_take_resumed_scheduler(context, result_with_timings), + )) } fn register_timeout_listener(&self, timeout_listener: TimeoutListener) { @@ -411,9 +687,8 @@ pub trait TaskHandler: Send + Sync + Debug + Sized + 'static { fn handle( result: &mut Result<()>, timings: &mut ExecuteTimings, - bank: &Arc<Bank>, - transaction: &RuntimeTransaction<SanitizedTransaction>, - index: usize, + scheduling_context: &SchedulingContext, + task: &Task, handler_context: &HandlerContext, ); } @@ -425,28 +700,104 @@ impl TaskHandler for DefaultTaskHandler { fn handle( result: &mut Result<()>, timings: &mut ExecuteTimings, - bank: &Arc<Bank>, - transaction: &RuntimeTransaction<SanitizedTransaction>, - index: usize, + scheduling_context: &SchedulingContext, + task: &Task, handler_context: &HandlerContext, ) { // scheduler must properly prevent conflicting tx executions. thus, task handler isn't // responsible for locking.
- let batch = bank.prepare_unlocked_batch_from_single_tx(transaction); - let batch_with_indexes = TransactionBatchWithIndexes { - batch, - transaction_indexes: vec![index], - }; + let bank = scheduling_context.bank(); + let transaction = task.transaction(); + let index = task.index(); + + let (cost, added_cost) = + if matches!(scheduling_context.mode(), SchedulingMode::BlockProduction) { + let TransactionContext::BlockProduction(max_age) = task.context() else { + panic!() + }; + + let move_precompile_verification_to_svm = bank + .feature_set + .is_active(&feature_set::move_precompile_verification_to_svm::id()); + if let Err(error) = bank.refilter_prebuilt_block_production_transaction( + transaction, + max_age, + move_precompile_verification_to_svm, + ) { + *result = Err(error); + (None, false) + } else { + let cost = CostModel::calculate_cost(transaction, &bank.feature_set); + loop { + let r = bank.write_cost_tracker().unwrap().try_add(&cost); + if let Err(e) = r { + use solana_cost_model::cost_tracker::CostTrackerError; + if matches!(e, CostTrackerError::WouldExceedAccountDataBlockLimit) { + sleep(Duration::from_millis(10)); + continue; + } else { + *result = Err(e.into()); + break (Some(cost), false); + } + } else { + break (Some(cost), true); + } + } + } + } else { + (None, false) + }; + + if result.is_ok() { + let batch = bank.prepare_unlocked_batch_from_single_tx(transaction); + let transaction_indexes = match scheduling_context.mode() { + SchedulingMode::BlockVerification => vec![index.try_into().unwrap()], + SchedulingMode::BlockProduction => { + if handler_context.transaction_status_sender.is_some() { + // will be filled inside execute_batch() + Vec::with_capacity(1) + } else { + vec![] + } + } + }; + let batch_with_indexes = TransactionBatchWithIndexes { + batch, + transaction_indexes, + }; + + let pre_commit_callback = match scheduling_context.mode() { + SchedulingMode::BlockVerification => None, + SchedulingMode::BlockProduction => Some(|| { + let summary = handler_context.transaction_recorder.record_transactions( + bank.slot(), + vec![transaction.to_versioned_transaction()], + ); + summary + .result + .ok() + .map(|()| summary.starting_transaction_index) + }), + }; + + *result = execute_batch( + &batch_with_indexes, + bank, + handler_context.transaction_status_sender.as_ref(), + handler_context.replay_vote_sender.as_ref(), + timings, + handler_context.log_messages_bytes_limit, + &handler_context.prioritization_fee_cache, + pre_commit_callback, + ); + } + + if result.is_err() && added_cost { + if let Some(cost2) = cost { + bank.write_cost_tracker().unwrap().remove(&cost2); + } + } - *result = execute_batch( - &batch_with_indexes, - bank, - handler_context.transaction_status_sender.as_ref(), - handler_context.replay_vote_sender.as_ref(), - timings, - handler_context.log_messages_bytes_limit, - &handler_context.prioritization_fee_cache, - ); sleepless_testing::at(CheckPoint::TaskHandled(index)); } } @@ -471,14 +822,22 @@ impl ExecutedTask { // Note that the above properties can be upheld only when this is used inside MPSC or SPSC channels // (i.e. the consumer side needs to be single threaded). For the multiple consumer cases, // ChainedChannel can be used instead. 
-enum SubchanneledPayload<P1, P2> { +use enum_ptr::{Aligned, Compact, EnumPtr, Unit}; + +#[repr(C, usize)] +#[derive(Debug, EnumPtr)] +pub enum SubchanneledPayload<P1, P2> { Payload(P1), OpenSubchannel(P2), - CloseSubchannel, + CloseSubchannel(Unit), + Disconnect(Unit), + Reset(Unit), } type NewTaskPayload = SubchanneledPayload<Task, Box<(SchedulingContext, ResultWithTimings)>>; +type CompactNewTaskPayload = Compact<NewTaskPayload>; const_assert_eq!(mem::size_of::<NewTaskPayload>(), 16); +const_assert_eq!(mem::size_of::<CompactNewTaskPayload>(), 8);
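// Illustrative sketch (not part of the patch) of the enum-ptr technique used above: a
// #[repr(C, usize)] enum whose variants hold pointer-like data can be squeezed from two words
// (tag + pointer) into one word, by packing the tag into the pointer's low alignment bits.
// This mirrors the enum-ptr README example; assumes a 64-bit target.

use enum_ptr::{Compact, EnumPtr};

#[derive(EnumPtr)]
#[repr(C, usize)]
enum SketchMsg {
    A(Box<i32>),
    B(Box<u32>),
}

fn sketch_compact_roundtrip() {
    // Two words uncompacted (discriminant + pointer), one word compacted.
    assert_eq!(core::mem::size_of::<SketchMsg>(), 2 * core::mem::size_of::<usize>());
    assert_eq!(core::mem::size_of::<Compact<SketchMsg>>(), core::mem::size_of::<usize>());

    let compact: Compact<SketchMsg> = SketchMsg::A(Box::new(1)).into(); // pack
    let msg: SketchMsg = compact.into(); // unpack losslessly
    let SketchMsg::A(value) = msg else { unreachable!() };
    assert_eq!(*value, 1);
}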
// A tiny generic message type to synchronize multiple threads every time some contextual data needs // to be switched (i.e. SchedulingContext), just using a single communication channel. @@ -504,82 +863,99 @@ const_assert_eq!(mem::size_of::<NewTaskPayload>(), 16); mod chained_channel { use super::*; - // hide variants by putting this inside newtype - enum ChainedChannelPrivate<P, C> { + #[derive(EnumPtr)] + #[repr(C, usize)] + #[allow(clippy::type_complexity)] + pub(super) enum ChainedChannel<P, C> { Payload(P), - ContextAndChannels(C, Receiver<ChainedChannel<P, C>>, Receiver<P>), + ContextAndChannels(Box<(C, Receiver<Compact<ChainedChannel<P, C>>>, Receiver<P>)>), } - pub(super) struct ChainedChannel<P, C>(ChainedChannelPrivate<P, C>); - - impl<P, C> ChainedChannel<P, C> { fn chain_to_new_channel( context: C, - receiver: Receiver<Self>, + receiver: Receiver<Compact<Self>>, aux_receiver: Receiver<P>, ) -> Self { - Self(ChainedChannelPrivate::ContextAndChannels( - context, - receiver, - aux_receiver, - )) + ChainedChannel::ContextAndChannels(Box::new((context, receiver, aux_receiver))) } } - pub(super) struct ChainedChannelSender<P, C> { - sender: Sender<ChainedChannel<P, C>>, + pub(super) struct ChainedChannelSender<P: Aligned, C> { + sender: Sender<Compact<ChainedChannel<P, C>>>, aux_sender: Sender<P>, } - impl<P, C: Clone> ChainedChannelSender<P, C> { - fn new(sender: Sender<ChainedChannel<P, C>>, aux_sender: Sender<P>) -> Self { + #[allow(dead_code)] + pub(super) trait WithMessageType { + type ChannelMessage; + } + + impl<P: Aligned, C> WithMessageType for ChainedChannelSender<P, C> { + type ChannelMessage = ChainedChannel<P, C>; + } + + impl<P: Aligned, C: Clone> ChainedChannelSender<P, C> { + fn new(sender: Sender<Compact<ChainedChannel<P, C>>>, aux_sender: Sender<P>) -> Self { Self { sender, aux_sender } } pub(super) fn send_payload( &self, payload: P, - ) -> std::result::Result<(), SendError<ChainedChannel<P, C>>> { - self.sender - .send(ChainedChannel(ChainedChannelPrivate::Payload(payload))) + ) -> std::result::Result<(), SendError<Compact<ChainedChannel<P, C>>>> { + self.sender.send(ChainedChannel::Payload(payload).into()) } + /* pub(super) fn send_aux_payload(&self, payload: P) -> std::result::Result<(), SendError<P>> { self.aux_sender.send(payload) } + */ pub(super) fn send_chained_channel( &mut self, context: C, count: usize, - ) -> std::result::Result<(), SendError<ChainedChannel<P, C>>> { + ) -> std::result::Result<(), SendError<Compact<ChainedChannel<P, C>>>> { let (chained_sender, chained_receiver) = crossbeam_channel::unbounded(); let (chained_aux_sender, chained_aux_receiver) = crossbeam_channel::unbounded(); for _ in 0..count { - self.sender.send(ChainedChannel::chain_to_new_channel( - context.clone(), - chained_receiver.clone(), - chained_aux_receiver.clone(), - ))? + self.sender.send( + ChainedChannel::chain_to_new_channel( + context.clone(), + chained_receiver.clone(), + chained_aux_receiver.clone(), + ) + .into(), + )? } self.sender = chained_sender; self.aux_sender = chained_aux_sender; Ok(()) } + + pub(super) fn len(&self) -> usize { + self.sender.len() + } + + pub(super) fn aux_len(&self) -> usize { + self.aux_sender.len() + } } // P doesn't need to be `: Clone`, yet rustc derive can't handle it. // see https://github.com/rust-lang/rust/issues/26925 #[derive_where(Clone)] - pub(super) struct ChainedChannelReceiver<P, C> { - receiver: Receiver<ChainedChannel<P, C>>, + pub(super) struct ChainedChannelReceiver<P: Aligned, C> { + receiver: Receiver<Compact<ChainedChannel<P, C>>>, aux_receiver: Receiver<P>, context: C, } - impl<P, C: Clone> ChainedChannelReceiver<P, C> { + impl<P: Aligned, C: Clone> ChainedChannelReceiver<P, C> { fn new( - receiver: Receiver<ChainedChannel<P, C>>, + receiver: Receiver<Compact<ChainedChannel<P, C>>>, aux_receiver: Receiver<P>, initial_context: C, ) -> Self { @@ -594,10 +970,11 @@ mod chained_channel { &self.context } - pub(super) fn for_select(&self) -> &Receiver<ChainedChannel<P, C>> { + pub(super) fn for_select(&self) -> &Receiver<Compact<ChainedChannel<P, C>>> { &self.receiver } + /* pub(super) fn aux_for_select(&self) -> &Receiver<P> { &self.aux_receiver } @@ -605,11 +982,14 @@ mod chained_channel { pub(super) fn never_receive_from_aux(&mut self) { self.aux_receiver = never(); } + */ pub(super) fn after_select(&mut self, message: ChainedChannel<P, C>) -> Option<P> { - match message.0 { - ChainedChannelPrivate::Payload(payload) => Some(payload), - ChainedChannelPrivate::ContextAndChannels(context, channel, idle_channel) => { + match message { + ChainedChannel::Payload(payload) => Some(payload), + ChainedChannel::ContextAndChannels(b) => { + let (context, channel, idle_channel) = *b; + self.context = context; self.receiver = channel; self.aux_receiver = idle_channel; @@ -619,7 +999,7 @@ } } - pub(super) fn unbounded<P, C: Clone>( + pub(super) fn unbounded<P: Aligned, C: Clone>( initial_context: C, ) -> (ChainedChannelSender<P, C>, ChainedChannelReceiver<P, C>) { let (sender, receiver) = crossbeam_channel::unbounded(); @@ -639,12 +1019,16 @@ /// pruning will be implemented in this type. #[derive(Default, Debug)] pub struct UsageQueueLoader { - usage_queues: DashMap<Pubkey, UsageQueue>, + usage_queues: DashMap<Pubkey, UsageQueue, ahash::RandomState>, } impl UsageQueueLoader { pub fn load(&self, address: Pubkey) -> UsageQueue { - self.usage_queues.entry(address).or_default().clone() + // taken from https://github.com/xacrimon/dashmap/issues/292#issuecomment-1916621009 + match self.usage_queues.get(&address) { + Some(bar_read_guard) => bar_read_guard.value().clone(), + None => self.usage_queues.entry(address).or_default().clone(), + } }
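// Illustrative sketch (not part of the patch) of why load() above tries get() before entry():
// with the dashmap crate, get() only takes a shard read lock, so concurrent loads of
// already-known addresses don't serialize, while entry() takes a shard write lock and is paid
// only on first sight of an address.

use dashmap::DashMap;

fn sketch_load_or_default(map: &DashMap<u64, String>, key: u64) -> String {
    // Fast path: shared read lock on the key's shard.
    if let Some(guard) = map.get(&key) {
        return guard.value().clone();
    }
    // Slow path: exclusive write lock; a racing thread simply finds the
    // value inserted by whoever won the entry() race.
    map.entry(key).or_default().clone()
}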
fn count(&self) -> usize { @@ -666,10 +1050,73 @@ pub struct PooledScheduler<TH: TaskHandler> { context: SchedulingContext, } +#[derive(Debug)] +enum TaskCreator { + BlockVerification { + usage_queue_loader: UsageQueueLoader, + }, + BlockProduction { + banking_stage_adapter: Arc<BankingStageAdapter>, + }, +} + +impl TaskCreator { + fn usage_queue_loader(&self) -> &UsageQueueLoader { + use TaskCreator::*; + + match self { + BlockVerification { usage_queue_loader } => usage_queue_loader, + BlockProduction { + banking_stage_adapter, + } => &banking_stage_adapter.usage_queue_loader, + } + } + + fn reset(&self) { + use TaskCreator::*; + + match self { + BlockVerification { + usage_queue_loader: _, + } => todo!(), + BlockProduction { + banking_stage_adapter, + } => banking_stage_adapter.reset(), + } + } + + fn is_overgrown(&self, max_usage_queue_count: usize, on_hot_path: bool) -> bool { + use TaskCreator::*; + + match self { + BlockVerification { usage_queue_loader } => { + assert!(on_hot_path); + // This check must be done on the hot path every time schedulers are returned to + // reliably detect too large loaders... + usage_queue_loader.count() > max_usage_queue_count + } + BlockProduction { + banking_stage_adapter, + } => { + // the slow path can be ensured to be called periodically. + // well, not so for a single-validator cluster.... + // should check next_task_id as well for ShortCounter::MAX/2 ? + let current_usage_queue_count = banking_stage_adapter.usage_queue_loader.count(); + let current_transaction_count = banking_stage_adapter.transaction_deduper.len(); + info!("bsa: {current_usage_queue_count} {current_transaction_count}"); + + //current_usage_queue_count > max_usage_queue_count + // || current_transaction_count > 1_000_000 + current_usage_queue_count > 300_000 || current_transaction_count > 200_000 + } + } + } +} #[derive(Debug)] pub struct PooledSchedulerInner<S: SpawnableScheduler<TH>, TH: TaskHandler> { thread_manager: ThreadManager<S, TH>, - usage_queue_loader: UsageQueueLoader, + task_creator: TaskCreator, } impl<S, TH> Drop for ThreadManager<S, TH> where @@ -700,7 +1147,7 @@ // Ensure to initiate thread shutdown via disconnected new_task_receiver by replacing the // current new_task_sender with a random one...
- self.new_task_sender = crossbeam_channel::unbounded().0; + self.disconnect_new_task_sender(); self.ensure_join_threads(true); assert_matches!(self.session_result_with_timings, Some((Ok(_), _))); } @@ -709,15 +1156,11 @@ impl<S, TH> PooledSchedulerInner<S, TH> where - S: SpawnableScheduler<TH>, + S: SpawnableScheduler<TH, Inner = Self>, TH: TaskHandler, { - fn id(&self) -> SchedulerId { - self.thread_manager.scheduler_id - } - - fn is_trashed(&self) -> bool { - self.is_aborted() || self.is_overgrown() + fn is_trashed(&self, on_hot_path: bool) -> bool { + self.is_aborted() || self.is_overgrown(on_hot_path) } fn is_aborted(&self) -> bool { @@ -739,10 +1182,6 @@ where // scheduler to the pool, considering is_trashed() is checked immediately before that. self.thread_manager.are_threads_joined() } - - fn is_overgrown(&self) -> bool { - self.usage_queue_loader.count() > self.thread_manager.pool.max_usage_queue_count - } } // This type manages the OS threads for scheduling and executing transactions. The term @@ -754,8 +1193,8 @@ struct ThreadManager<S: SpawnableScheduler<TH>, TH: TaskHandler> { scheduler_id: SchedulerId, pool: Arc<SchedulerPool<S, TH>>, - new_task_sender: Sender<NewTaskPayload>, - new_task_receiver: Option<Receiver<NewTaskPayload>>, + new_task_sender: Arc<Sender<CompactNewTaskPayload>>, + new_task_receiver: Option<Receiver<CompactNewTaskPayload>>, session_result_sender: Sender<ResultWithTimings>, session_result_receiver: Receiver<ResultWithTimings>, session_result_with_timings: Option<ResultWithTimings>, @@ -763,30 +1202,40 @@ struct ThreadManager<S: SpawnableScheduler<TH>, TH: TaskHandler> { handler_threads: Vec<JoinHandle<()>>, } +#[derive(Default)] +struct LogInterval(usize); + +impl LogInterval { + fn increment(&mut self) -> bool { + self.0 = self.0.checked_add(1).unwrap(); + self.0 % 2000 == 0 + } +} + struct HandlerPanicked; type HandlerResult = std::result::Result<Box<ExecutedTask>, HandlerPanicked>; +const_assert_eq!(mem::size_of::<HandlerResult>(), 8);
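// Why the const_assert above can hold (illustrative; assumes a 64-bit target): HandlerPanicked
// is a zero-sized type, and Box's non-null niche lets the Err case be encoded as the null bit
// pattern, so the whole Result collapses to pointer size.

fn sketch_niche_sizes() {
    struct ZeroSizedError; // stand-in for HandlerPanicked
    assert_eq!(core::mem::size_of::<Box<[u8; 1024]>>(), core::mem::size_of::<usize>());
    assert_eq!(
        core::mem::size_of::<Result<Box<[u8; 1024]>, ZeroSizedError>>(),
        core::mem::size_of::<usize>()
    );
}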
accumulating....: {error:?}"); + *result = Err(error); + None + } + }, + SchedulingMode::BlockProduction => match executed_task.result_with_timings.0 { + Ok(()) => Some((executed_task, false)), + Err(TransactionError::CommitFailed) => { + if !already_finishing { + info!("maybe reached max tick height..."); + } + error_count.increment_self(); + Some((executed_task, true)) + } + Err(ref e @ TransactionError::WouldExceedMaxBlockCostLimit) + | Err(ref e @ TransactionError::WouldExceedMaxVoteCostLimit) + | Err(ref e @ TransactionError::WouldExceedMaxAccountCostLimit) + | Err(ref e @ TransactionError::WouldExceedAccountDataBlockLimit) => { + if !already_finishing { + info!("hit block cost: {e:?}"); + } + error_count.increment_self(); + Some((executed_task, true)) + } + Err(ref error) => { + debug!("error is detected while accumulating....: {error:?}"); + error_count.increment_self(); + Some((executed_task, false)) + } + }, } } @@ -838,9 +1316,21 @@ impl, TH: TaskHandler> ThreadManager { // for type safety. fn start_threads( &mut self, - context: SchedulingContext, + handler_count: usize, + mut context: SchedulingContext, mut result_with_timings: ResultWithTimings, + banking_stage_context: Option, ) { + assert!(handler_count >= 1); + + let scheduler_id = self.scheduler_id; + let mut slot = context.slot(); + + let postfix = match context.mode() { + SchedulingMode::BlockVerification => "V", + SchedulingMode::BlockProduction => "P", + }; + // Firstly, setup bi-directional messaging between the scheduler and handlers to pass // around tasks, by creating 2 channels (one for to-be-handled tasks from the scheduler to // the handlers and the other for finished tasks from the handlers to the scheduler). @@ -917,16 +1407,27 @@ impl, TH: TaskHandler> ThreadManager { // another blocking new task is arriving to finalize the tentatively extended // prioritization further. Consequently, this also contributes to alleviate the known // heuristic's caveat for the first task of linearized runs, which is described above. - let (mut runnable_task_sender, runnable_task_receiver) = - chained_channel::unbounded::(context); + let mode = context.mode(); + use crate::chained_channel::{ChainedChannelSender, WithMessageType}; + type RunnableTaskSender = ChainedChannelSender; + let (mut runnable_task_sender, runnable_task_receiver): (RunnableTaskSender, _) = + chained_channel::unbounded(context.clone()); + const_assert_eq!( + mem::size_of::<::ChannelMessage>(), + 16 + ); + const_assert_eq!( + mem::size_of::::ChannelMessage>>(), + 8 + ); // Create two handler-to-scheduler channels to prioritize the finishing of blocked tasks, // because it is more likely that a blocked task will have more blocked tasks behind it, // which should be scheduled while minimizing the delay to clear buffered linearized runs // as fast as possible. let (finished_blocked_task_sender, finished_blocked_task_receiver) = crossbeam_channel::unbounded::(); - let (finished_idle_task_sender, finished_idle_task_receiver) = - crossbeam_channel::unbounded::(); + //let (finished_idle_task_sender, finished_idle_task_receiver) = + // crossbeam_channel::unbounded::(); assert_matches!(self.session_result_with_timings, None); @@ -938,7 +1439,7 @@ impl, TH: TaskHandler> ThreadManager { // 5. the handler thread reply back to the scheduler thread as an executed task. // 6. the scheduler thread post-processes the executed task. 
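// Aside: `dummy_receiver` (used in the select loop further below) emulates a
// conditional select arm, which crossbeam's `select_biased!` cannot express
// directly. A minimal sketch of the trick, assuming crossbeam-channel
// semantics (not a verbatim copy of this module's helper):
//
//     fn dummy_receiver<T>(trigger: bool) -> Receiver<T> {
//         if trigger {
//             // The paired sender is dropped right away, so recv() yields
//             // Err(RecvError) immediately and the arm always fires.
//             crossbeam_channel::unbounded().1
//         } else {
//             // never() blocks forever, effectively disabling the arm.
//             crossbeam_channel::never()
//         }
//     }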
        let scheduler_main_loop = {
-            let handler_count = self.pool.handler_count;
+            let banking_stage_context = banking_stage_context.clone();
             let session_result_sender = self.session_result_sender.clone();
             // Taking new_task_receiver here is important to ensure there's a single receiver. In
             // this way, the replay stage will get .send() failures reliably, after this scheduler
             let new_task_receiver = self
                 .new_task_receiver
                 .take()
                 .expect("no 2nd start_threads()");

             let mut session_ending = false;
+            let (mut session_pausing, mut is_finished) =
+                if matches!(context.mode(), SchedulingMode::BlockProduction) {
+                    (true, true)
+                } else {
+                    (false, false)
+                };
+            let mut session_resetting = false;

             // Now, this is the main loop for the scheduler thread, which is a special beast.
             //
@@ -999,9 +1507,111 @@ impl<S: SpawnableScheduler<TH>, TH: TaskHandler> ThreadManager<S, TH> {
             };
             let mut state_machine = unsafe {
-                SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling()
+                SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling(
+                    mode,
+                    handler_count.checked_mul(2).unwrap().try_into().unwrap(),
+                )
+            };
+            let mut log_interval = LogInterval::default();
+            let mut session_started_at = Instant::now();
+            let mut cpu_session_started_at = cpu_time::ThreadTime::now();
+            let (
+                mut log_reported_at,
+                mut reported_task_total,
+                mut reported_executed_task_total,
+            ) = (session_started_at, 0, 0);
+            let mut cpu_log_reported_at = cpu_session_started_at;
+            let mut error_count = ShortCounter::zero();
+
+            let banking_packet_receiver = if let Some(b) = banking_stage_context.as_ref() {
+                &b.banking_packet_receiver
+            } else {
+                &never()
+            };
+            macro_rules! log_scheduler {
+                ($level:ident, $prefix:tt) => {
+                    #[allow(clippy::arithmetic_side_effects)]
+                    {
+                        $level! {
+                            "sch: {}: slot: {}({})[{:12}]({}{}): state_machine(({}({}b{}B{}F)=>{}({}+{}))/{}|{}TB|{}Lr) channels(<{} >{}+{} <{}+{} <{}) {}",
+                            scheduler_id,
+                            slot.unwrap_or_default(),
+                            match mode {
+                                SchedulingMode::BlockVerification => "v",
+                                SchedulingMode::BlockProduction => "p",
+                            },
+                            $prefix,
+                            (if session_ending {"S"} else {"-"}),
+                            (if session_pausing {"P"} else {"-"}),
+                            state_machine.alive_task_count(),
+                            state_machine.blocked_task_count(), state_machine.buffered_task_queue_count(), state_machine.eager_lock_total(),
+                            state_machine.executed_task_total(), state_machine.executed_task_total() - error_count.current(), error_count.current(),
+                            state_machine.task_total(),
+                            state_machine.buffered_task_total(),
+                            state_machine.reblocked_lock_total(),
+                            new_task_receiver.len(),
+                            runnable_task_sender.len(), runnable_task_sender.aux_len(),
+                            finished_blocked_task_receiver.len(), 0 /*finished_idle_task_receiver.len()*/,
+                            banking_packet_receiver.len(),
+                            {
+                                let now = Instant::now();
+                                let cpu_now = cpu_time::ThreadTime::now();
+                                let session_elapsed_us = now.duration_since(session_started_at).as_micros();
+                                let cpu_session_elapsed_us = cpu_now.duration_since(cpu_session_started_at).as_micros();
+                                let log_elapsed_us = now.duration_since(log_reported_at).as_micros();
+                                let cpu_log_elapsed_us = cpu_now.duration_since(cpu_log_reported_at).as_micros();

+                                let l = format!(
+                                    "tps({}us|{}us): ({}|{}) ({}us|{}us): ({}|{})",
+                                    log_elapsed_us,
+                                    session_elapsed_us,
+                                    if log_elapsed_us > 0 {
+                                        format!(
+                                            "<{}>{}",
+                                            1_000_000_u128 * ((state_machine.task_total() - reported_task_total) as u128) / log_elapsed_us,
+                                            1_000_000_u128 * ((state_machine.executed_task_total() - reported_executed_task_total) as u128) / log_elapsed_us,
+                                        )
+                                    } else { "-".to_string() },
+                                    if session_elapsed_us > 0 {
+                                        format!(
+                                            "<{}>{}",
+                                            1_000_000_u128 * (state_machine.task_total() as u128) /
session_elapsed_us, + 1_000_000_u128 * (state_machine.executed_task_total() as u128) / session_elapsed_us, + ) + } else { "-".to_string() }, + cpu_log_elapsed_us, + cpu_session_elapsed_us, + if cpu_log_elapsed_us > 0 { + format!( + "<{}>{}", + 1_000_000_u128 * ((state_machine.task_total() - reported_task_total) as u128) / cpu_log_elapsed_us, + 1_000_000_u128 * ((state_machine.executed_task_total() - reported_executed_task_total) as u128) / cpu_log_elapsed_us, + ) + } else { "-".to_string() }, + if cpu_session_elapsed_us > 0 { + format!( + "<{}>{}", + 1_000_000_u128 * (state_machine.task_total() as u128) / cpu_session_elapsed_us, + 1_000_000_u128 * (state_machine.executed_task_total() as u128) / cpu_session_elapsed_us, + ) + } else { "-".to_string() }, + ); + #[allow(unused_assignments)] + { + (log_reported_at, reported_task_total, reported_executed_task_total) = (now, state_machine.task_total(), state_machine.executed_task_total()); + } + cpu_log_reported_at = cpu_now; + l + }, + } + } + } + } + + if !is_finished { + log_scheduler!(info, "started"); + } + // The following loop maintains and updates ResultWithTimings as its // externally-provided mutable state for each session in this way: // @@ -1009,14 +1619,13 @@ impl, TH: TaskHandler> ThreadManager { // 2. Subsequent result_with_timings are propagated explicitly from // the new_task_receiver.recv() invocation located at the end of loop. 'nonaborted_main_loop: loop { - let mut is_finished = false; while !is_finished { // ALL recv selectors are eager-evaluated ALWAYS by current crossbeam impl, // which isn't great and is inconsistent with `if`s in the Rust's match // arm. So, eagerly binding the result to a variable unconditionally here // makes no perf. difference... - let dummy_unblocked_task_receiver = - dummy_receiver(state_machine.has_unblocked_task()); + let dummy_buffered_task_receiver = + dummy_receiver(state_machine.has_runnable_task() && !session_pausing); // There's something special called dummy_unblocked_task_receiver here. // This odd pattern was needed to react to newly unblocked tasks from @@ -1028,40 +1637,96 @@ impl, TH: TaskHandler> ThreadManager { // consistent. Note that unified scheduler will go // into busy looping to seek lowest latency eventually. However, not now, // to measure _actual_ cpu usage easily with the select approach. - select_biased! { + let step_type = select_biased! 
{ recv(finished_blocked_task_receiver) -> executed_task => { - let Some(executed_task) = Self::accumulate_result_with_timings( + let Ok(executed_task) = executed_task else { + error!("all handlers gone!!!"); + assert_matches!(state_machine.mode(), SchedulingMode::BlockProduction); + break 'nonaborted_main_loop; + }; + + let Some((executed_task, should_pause)) = Self::accumulate_result_with_timings( + &context, &mut result_with_timings, - executed_task.expect("alive handler") + executed_task, + &mut error_count, + session_ending || session_pausing, ) else { break 'nonaborted_main_loop; }; state_machine.deschedule_task(&executed_task.task); + if should_pause && !session_ending { + let task = banking_stage_context.as_ref().unwrap().adapter.recreate_task( + executed_task.task.transaction().clone(), + executed_task.task.context().clone(), + executed_task.task.index(), + ); + debug!("requeued tx!!!!"); + state_machine.do_schedule_task(task, true); + } + drop(executed_task); + if should_pause && !session_pausing && slot != Some(282254387/*00*/) { + session_pausing = true; + "pausing" + } else { + "desc_b_task" + } }, - recv(dummy_unblocked_task_receiver) -> dummy => { + recv(dummy_buffered_task_receiver) -> dummy => { assert_matches!(dummy, Err(RecvError)); let task = state_machine - .schedule_next_unblocked_task() + .schedule_next_buffered_task() .expect("unblocked task"); runnable_task_sender.send_payload(task).unwrap(); + "sc_b_task" }, recv(new_task_receiver) -> message => { - assert!(!session_ending); + assert!(state_machine.mode() == SchedulingMode::BlockProduction || !session_ending); - match message { + match message.map(|a| a.into()) { Ok(NewTaskPayload::Payload(task)) => { + if session_ending { + continue; + } sleepless_testing::at(CheckPoint::NewTask(task.task_index())); - if let Some(task) = state_machine.schedule_task(task) { - runnable_task_sender.send_aux_payload(task).unwrap(); + if let Some(task) = state_machine.do_schedule_task(task, session_pausing) { + //runnable_task_sender.send_aux_payload(task).unwrap(); + runnable_task_sender.send_payload(task).unwrap(); + "sc_i_task" + } else { + "new_b_task" } } - Ok(NewTaskPayload::CloseSubchannel) => { - session_ending = true; + Ok(NewTaskPayload::CloseSubchannel(_)) => { + match state_machine.mode() { + SchedulingMode::BlockVerification => { + session_ending = true; + "ending" + }, + SchedulingMode::BlockProduction => { + if slot == Some(282254387/*00*/) { + // can't assert pause signal may have been emitted.. + session_ending = true; + "ending" + } else if !session_pausing { + session_pausing = true; + "pausing" + } else { + info!("ignoring duplicate close subch"); + continue; + } + }, + } + } + Ok(NewTaskPayload::Reset(_)) => { + session_pausing = true; + session_resetting = true; + "draining" } Ok(NewTaskPayload::OpenSubchannel(_context_and_result_with_timings)) => unreachable!(), - Err(RecvError) => { + Ok(NewTaskPayload::Disconnect(_)) | Err(RecvError) => { // Mostly likely is that this scheduler is dropped for pruned blocks of // abandoned forks... // This short-circuiting is tested with test_scheduler_drop_short_circuiting. 
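// For orientation, the session-control messages matched above imply an enum of
// roughly this shape (a sketch pieced together from the usages in this diff;
// `Unit` and the exact payload types are assumptions):
//
//     enum NewTaskPayload {
//         Payload(Task),                   // schedule (or, while paused, buffer) one task
//         OpenSubchannel(Box<(SchedulingContext, ResultWithTimings)>), // start a session
//         CloseSubchannel(Unit),           // end (verification) or pause (production)
//         Reset(Unit),                     // drain buffered block-production tasks
//         Disconnect(Unit),                // tear the scheduler thread down
//     }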
@@ -1069,49 +1734,172 @@ impl, TH: TaskHandler> ThreadManager { } } }, + /* recv(finished_idle_task_receiver) -> executed_task => { - let Some(executed_task) = Self::accumulate_result_with_timings( + let Some((executed_task, should_pause)) = Self::accumulate_result_with_timings( + &context, &mut result_with_timings, - executed_task.expect("alive handler") + executed_task.expect("alive handler"), + &mut error_count, + session_ending || session_pausing, ) else { break 'nonaborted_main_loop; }; state_machine.deschedule_task(&executed_task.task); + std::mem::forget(executed_task); + if should_pause && !session_pausing { + session_pausing = true; + "pausing" + } else { + "desc_i_task" + } }, + */ + default => { + if let Some(task) = (!session_pausing).then(|| state_machine.scan_and_schedule_next_task()).flatten() { + runnable_task_sender.send_payload(task).unwrap(); + "scan" + } else { + continue; + } + } }; + let force_log = step_type == "ending" + || step_type == "pausing" + || step_type == "draining"; + if log_interval.increment() || force_log { + log_scheduler!(info, step_type); + } else { + log_scheduler!(trace, step_type); + } - is_finished = session_ending && state_machine.has_no_active_task(); + is_finished = session_ending && state_machine.has_no_alive_task() + || session_pausing && state_machine.has_no_executing_task(); } + assert!(mem::replace(&mut is_finished, false)); // Finalize the current session after asserting it's explicitly requested so. - assert!(session_ending); + assert!(session_ending || session_pausing); // Send result first because this is blocking the replay code-path. session_result_sender .send(result_with_timings) .expect("always outlived receiver"); - state_machine.reinitialize(); - session_ending = false; + if session_ending { + log_scheduler!(info, "ended"); + } else { + log_scheduler!(info, "paused"); + } + match state_machine.mode() { + SchedulingMode::BlockVerification => { + reported_task_total = 0; + reported_executed_task_total = 0; + assert_eq!(error_count.current(), 0); + } + SchedulingMode::BlockProduction => { + session_started_at = Instant::now(); + cpu_session_started_at = cpu_time::ThreadTime::now(); + state_machine.reset_task_total(); + state_machine.reset_executed_task_total(); + reported_task_total = 0; + reported_executed_task_total = 0; + error_count.reset_to_zero(); + } + } // Prepare for the new session. - match new_task_receiver.recv() { - Ok(NewTaskPayload::OpenSubchannel(context_and_result_with_timings)) => { - let (new_context, new_result_with_timings) = - *context_and_result_with_timings; - // We just received subsequent (= not initial) session and about to - // enter into the preceding `while(!is_finished) {...}` loop again. 
- // Before that, propagate new SchedulingContext to handler threads - runnable_task_sender - .send_chained_channel(new_context, handler_count) - .unwrap(); - result_with_timings = new_result_with_timings; + loop { + if session_resetting { + while let Some(task) = state_machine.schedule_next_buffered_task() { + state_machine.deschedule_task(&task); + if log_interval.increment() { + log_scheduler!(info, "drained_desc"); + } else { + log_scheduler!(trace, "drained_desc"); + } + drop(task); + } + log_scheduler!(info, "drained"); + session_started_at = Instant::now(); + cpu_session_started_at = cpu_time::ThreadTime::now(); + reported_task_total = 0; + reported_executed_task_total = 0; + error_count.reset_to_zero(); + session_resetting = false; } - Err(_) => { - // This unusual condition must be triggered by ThreadManager::drop(). - // Initialize result_with_timings with a harmless value... - result_with_timings = initialized_result_with_timings(); - break 'nonaborted_main_loop; + match new_task_receiver.recv().map(|a| a.into()) { + Ok(NewTaskPayload::OpenSubchannel(context_and_result_with_timings)) => { + let (new_context, new_result_with_timings) = + *context_and_result_with_timings; + // We just received subsequent (= not initial) session and about to + // enter into the preceding `while(!is_finished) {...}` loop again. + // Before that, propagate new SchedulingContext to handler threads + assert_eq!(state_machine.mode(), new_context.mode()); + slot = new_context.slot(); + session_started_at = Instant::now(); + cpu_session_started_at = cpu_time::ThreadTime::now(); + + if session_ending { + log_interval = LogInterval::default(); + state_machine.reinitialize(new_context.mode()); + session_ending = false; + log_scheduler!(info, "started"); + } else { + state_machine.reset_task_total(); + state_machine.reset_executed_task_total(); + reported_task_total = 0; + reported_executed_task_total = 0; + error_count.reset_to_zero(); + session_pausing = false; + log_scheduler!(info, "unpaused"); + } + + runnable_task_sender + .send_chained_channel(new_context.clone(), handler_count) + .unwrap(); + context = new_context; + result_with_timings = new_result_with_timings; + break; + } + Ok(NewTaskPayload::CloseSubchannel(_)) + if matches!( + state_machine.mode(), + SchedulingMode::BlockProduction + ) => + { + info!("ignoring duplicate CloseSubchannel..."); + } + Ok(NewTaskPayload::Reset(_)) + if matches!( + state_machine.mode(), + SchedulingMode::BlockProduction + ) => + { + session_resetting = true; + log_scheduler!(info, "draining"); + } + Ok(NewTaskPayload::Payload(task)) + if matches!( + state_machine.mode(), + SchedulingMode::BlockProduction + ) => + { + assert!(state_machine.do_schedule_task(task, true).is_none()); + if log_interval.increment() { + log_scheduler!(info, "rebuffer"); + } else { + log_scheduler!(trace, "rebuffer"); + } + } + Ok(NewTaskPayload::Disconnect(_)) | Err(_) => { + // This unusual condition must be triggered by ThreadManager::drop(). + // Initialize result_with_timings with a harmless value... + result_with_timings = initialized_result_with_timings(); + session_ending = false; + session_pausing = false; + break 'nonaborted_main_loop; + } + Ok(_) => unreachable!(), } - Ok(_) => unreachable!(), } } @@ -1127,6 +1915,8 @@ impl, TH: TaskHandler> ThreadManager { session_result_sender .send(result_with_timings) .expect("always outlived receiver"); + log_scheduler!(info, "aborted"); + let _ = cpu_log_reported_at; // Next, drop `new_task_receiver`. 
After that, the paired singleton // `new_task_sender` will start to error when called by external threads, resulting @@ -1139,10 +1929,13 @@ impl, TH: TaskHandler> ThreadManager { }; let handler_main_loop = || { + let banking_stage_context = banking_stage_context.clone(); + let new_task_sender = Arc::downgrade(&self.new_task_sender); + let pool = self.pool.clone(); let mut runnable_task_receiver = runnable_task_receiver.clone(); let finished_blocked_task_sender = finished_blocked_task_sender.clone(); - let finished_idle_task_sender = finished_idle_task_sender.clone(); + //let finished_idle_task_sender = finished_idle_task_sender.clone(); // The following loop maintains and updates SchedulingContext as its // externally-provided state for each session in this way: @@ -1152,69 +1945,116 @@ impl, TH: TaskHandler> ThreadManager { // 2. Subsequent contexts are propagated explicitly inside `.after_select()` as part of // `select_biased!`, which are sent from `.send_chained_channel()` in the scheduler // thread for all-but-initial sessions. - move || loop { - let (task, sender) = select_biased! { - recv(runnable_task_receiver.for_select()) -> message => { - let Ok(message) = message else { - break; - }; - if let Some(task) = runnable_task_receiver.after_select(message) { - (task, &finished_blocked_task_sender) - } else { + move || { + let banking_packet_receiver = if let Some(b) = banking_stage_context.as_ref() { + &b.banking_packet_receiver + } else { + &never() + }; + let (do_now, dont_now) = (&disconnected::<()>(), &never::<()>()); + + let mut busy_start = Instant::now(); + loop { + let busy_waker = if busy_start.elapsed() < Duration::from_micros(10) { + do_now + } else { + dont_now + }; + + let (task, sender) = select_biased! { + recv(runnable_task_receiver.for_select()) -> message => { + let Ok(message) = message else { + break; + }; + if let Some(task) = runnable_task_receiver.after_select(message.into()) { + (task, &finished_blocked_task_sender) + } else { + continue; + } + }, + recv(banking_packet_receiver) -> banking_packet => { + let Some(new_task_sender) = new_task_sender.upgrade() else { + info!("dead new_task_sender"); + break; + }; + + let Ok(banking_packet) = banking_packet else { + info!("disconnected banking_packet_receiver"); + let current_thread = thread::current(); + if new_task_sender.send(NewTaskPayload::Disconnect(Unit::new()).into()).is_ok() { + info!("notified a disconnect from {:?}", current_thread); + } else { + // It seems that the scheduler thread has been aborted already... + warn!("failed to notify a disconnect from {:?}", current_thread); + } + break; + }; + (banking_stage_context.as_ref().unwrap().on_banking_packet_receive)(banking_packet, &move |task| { + new_task_sender + .send(NewTaskPayload::Payload(task).into()) + .unwrap(); + }); + continue; + }, + recv(busy_waker) -> _ => { continue; + }, + /* + recv(runnable_task_receiver.aux_for_select()) -> task => { + if let Ok(task) = task { + (task, &finished_idle_task_sender) + } else { + runnable_task_receiver.never_receive_from_aux(); + continue; + } + }, + */ + //default => { continue }, + }; + defer! { busy_start = Instant::now() } + defer! { + if !thread::panicking() { + return; } - }, - recv(runnable_task_receiver.aux_for_select()) -> task => { - if let Ok(task) = task { - (task, &finished_idle_task_sender) + + // The scheduler thread can't detect panics in handler threads with + // disconnected channel errors, unless all of them has died. So, send an + // explicit Err promptly. 
+ let current_thread = thread::current(); + error!("handler thread is panicking: {:?}", current_thread); + if sender.send(Err(HandlerPanicked)).is_ok() { + info!("notified a panic from {:?}", current_thread); } else { - runnable_task_receiver.never_receive_from_aux(); - continue; + // It seems that the scheduler thread has been aborted already... + warn!("failed to notify a panic from {:?}", current_thread); } - }, - }; - defer! { - if !thread::panicking() { - return; } - - // The scheduler thread can't detect panics in handler threads with - // disconnected channel errors, unless all of them has died. So, send an - // explicit Err promptly. - let current_thread = thread::current(); - error!("handler thread is panicking: {:?}", current_thread); - if sender.send(Err(HandlerPanicked)).is_ok() { - info!("notified a panic from {:?}", current_thread); - } else { - // It seems that the scheduler thread has been aborted already... - warn!("failed to notify a panic from {:?}", current_thread); + let mut task = ExecutedTask::new_boxed(task); + Self::execute_task_with_handler( + runnable_task_receiver.context(), + &mut task, + &pool.handler_context, + ); + if sender.send(Ok(task)).is_err() { + warn!("handler_thread: scheduler thread aborted..."); + break; } } - let mut task = ExecutedTask::new_boxed(task); - Self::execute_task_with_handler( - runnable_task_receiver.context().bank(), - &mut task, - &pool.handler_context, - ); - if sender.send(Ok(task)).is_err() { - warn!("handler_thread: scheduler thread aborted..."); - break; - } } }; self.scheduler_thread = Some( thread::Builder::new() - .name("solScheduler".to_owned()) + .name(format!("solSchedule{postfix}")) .spawn_tracked(scheduler_main_loop) .unwrap(), ); - self.handler_threads = (0..self.pool.handler_count) + self.handler_threads = (0..handler_count) .map({ |thx| { thread::Builder::new() - .name(format!("solScHandler{:02}", thx)) + .name(format!("solScHandle{postfix}{:02}", thx)) .spawn_tracked(handler_main_loop()) .unwrap() } @@ -1225,7 +2065,7 @@ impl, TH: TaskHandler> ThreadManager { fn send_task(&self, task: Task) -> ScheduleResult { debug!("send_task()"); self.new_task_sender - .send(NewTaskPayload::Payload(task)) + .send(NewTaskPayload::Payload(task).into()) .map_err(|_| SchedulerAborted) } @@ -1266,17 +2106,8 @@ impl, TH: TaskHandler> ThreadManager { }; } - fn ensure_join_threads_after_abort( - &mut self, - should_receive_aborted_session_result: bool, - ) -> TransactionError { + fn ensure_join_threads_after_abort(&mut self, should_receive_aborted_session_result: bool) { self.ensure_join_threads(should_receive_aborted_session_result); - self.session_result_with_timings - .as_mut() - .unwrap() - .0 - .clone() - .unwrap_err() } fn are_threads_joined(&self) -> bool { @@ -1290,7 +2121,7 @@ impl, TH: TaskHandler> ThreadManager { } } - fn end_session(&mut self) { + fn do_end_session(&mut self, nonblocking: bool) { if self.are_threads_joined() { assert!(self.session_result_with_timings.is_some()); debug!("end_session(): skipping; already joined the aborted threads.."); @@ -1306,7 +2137,7 @@ impl, TH: TaskHandler> ThreadManager { let mut abort_detected = self .new_task_sender - .send(NewTaskPayload::CloseSubchannel) + .send(NewTaskPayload::CloseSubchannel(Unit::new()).into()) .is_err(); if abort_detected { @@ -1314,6 +2145,10 @@ impl, TH: TaskHandler> ThreadManager { return; } + if nonblocking { + return; + } + // Even if abort is detected, it's guaranteed that the scheduler thread puts the last // message into the session_result_sender before 
terminating. let result_with_timings = self.session_result_receiver.recv().unwrap(); @@ -1325,6 +2160,10 @@ impl, TH: TaskHandler> ThreadManager { debug!("end_session(): ended session at {:?}...", thread::current()); } + fn end_session(&mut self) { + self.do_end_session(false) + } + fn start_session( &mut self, context: SchedulingContext, @@ -1333,16 +2172,24 @@ impl, TH: TaskHandler> ThreadManager { assert!(!self.are_threads_joined()); assert_matches!(self.session_result_with_timings, None); self.new_task_sender - .send(NewTaskPayload::OpenSubchannel(Box::new(( - context, - result_with_timings, - )))) + .send(NewTaskPayload::OpenSubchannel(Box::new((context, result_with_timings))).into()) .expect("no new session after aborted"); } + + fn disconnect_new_task_sender(&mut self) { + self.new_task_sender = Arc::new(crossbeam_channel::unbounded().0); + } +} + +pub trait SchedulerInner { + fn id(&self) -> SchedulerId; + fn is_overgrown(&self, on_hot_path: bool) -> bool; + fn reset(&self); + fn ensure_abort(&mut self); } pub trait SpawnableScheduler: InstalledScheduler { - type Inner: Debug + Send + Sync; + type Inner: SchedulerInner + Debug + Send + Sync; fn into_inner(self) -> (ResultWithTimings, Self::Inner); @@ -1353,9 +2200,11 @@ pub trait SpawnableScheduler: InstalledScheduler { ) -> Self; fn spawn( + handler_count: usize, pool: Arc>, context: SchedulingContext, result_with_timings: ResultWithTimings, + banking_stage_context: Option, ) -> Self where Self: Sized; @@ -1385,21 +2234,115 @@ impl SpawnableScheduler for PooledScheduler { } fn spawn( + handler_count: usize, pool: Arc>, context: SchedulingContext, result_with_timings: ResultWithTimings, + banking_stage_context: Option, ) -> Self { + info!("spawning new scheduler for slot: {:?}", context.slot()); + let task_creator = match context.mode() { + SchedulingMode::BlockVerification => TaskCreator::BlockVerification { + usage_queue_loader: UsageQueueLoader::default(), + }, + SchedulingMode::BlockProduction => TaskCreator::BlockProduction { + banking_stage_adapter: banking_stage_context.as_ref().unwrap().adapter.clone(), + }, + }; let mut inner = Self::Inner { thread_manager: ThreadManager::new(pool), - usage_queue_loader: UsageQueueLoader::default(), + task_creator, }; - inner - .thread_manager - .start_threads(context.clone(), result_with_timings); + inner.thread_manager.start_threads( + handler_count, + context.clone(), + result_with_timings, + banking_stage_context, + ); Self { inner, context } } } +#[derive(Debug)] +pub enum BankingStageStatus { + Active, + Inactive, + Exited, +} + +pub trait BankingStageMonitor: Send + Debug { + fn status(&self) -> BankingStageStatus; +} + +#[derive(Debug)] +pub struct BankingStageAdapter { + usage_queue_loader: UsageQueueLoader, + transaction_deduper: DashSet, + next_task_id: AtomicU64, +} + +impl BankingStageAdapter { + pub fn generate_task_ids(&self, count: u64) -> u64 { + self.next_task_id.fetch_add(count, Relaxed) + } + + fn do_create_task( + &self, + transaction: RuntimeTransaction, + context: TransactionContext, + index: TaskKey, + ) -> Task { + SchedulingStateMachine::do_create_task(transaction, context, index, &mut |pubkey| { + self.usage_queue_loader.load(pubkey) + }) + } + + pub fn create_new_task( + &self, + transaction: RuntimeTransaction, + context: TransactionContext, + index: TaskKey, + ) -> Option { + let hash = transaction.message_hash(); + // Tolerate double lookup to avoid a write-lock.... 
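// (With DashSet, contains() needs only a shard read-lock while insert() takes
// a write-lock, so probing first keeps the common duplicate case cheap; a
// racing duplicate can still slip past both checks, which is acceptable for
// best-effort dedup.)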
+ if self.transaction_deduper.contains(hash) || !self.transaction_deduper.insert(*hash) { + //return None; + } + + Some(self.do_create_task(transaction, context, index)) + } + + fn recreate_task( + &self, + transaction: RuntimeTransaction, + context: TransactionContext, + old_index: TaskKey, + ) -> Task { + let new_index = { + let inherited_priority = + old_index & const { (u64::MAX as TaskKey) << (TaskKey::BITS / 2) }; + let new_task_id = self.generate_task_ids(1) as TaskKey; + inherited_priority | new_task_id + }; + + self.do_create_task(transaction, context, new_index) + } + + fn reset(&self) { + info!( + "resetting transaction_deduper... {}", + self.transaction_deduper.len() + ); + self.transaction_deduper.clear(); + info!( + "resetting transaction_deduper... done: {}", + self.transaction_deduper.len() + ); + // We can't reset self.usage_queue_loader because task (re)creation is multi-threaded + // without any synchronization + } +} + impl InstalledScheduler for PooledScheduler { fn id(&self) -> SchedulerId { self.inner.id() @@ -1412,10 +2355,11 @@ impl InstalledScheduler for PooledScheduler { fn schedule_execution( &self, transaction: RuntimeTransaction, - index: usize, + index: TaskKey, ) -> ScheduleResult { + assert_matches!(self.context().mode(), SchedulingMode::BlockVerification); let task = SchedulingStateMachine::create_task(transaction, index, &mut |pubkey| { - self.inner.usage_queue_loader.load(pubkey) + self.inner.task_creator.usage_queue_loader().load(pubkey) }); self.inner.thread_manager.send_task(task) } @@ -1423,7 +2367,15 @@ impl InstalledScheduler for PooledScheduler { fn recover_error_after_abort(&mut self) -> TransactionError { self.inner .thread_manager - .ensure_join_threads_after_abort(true) + .ensure_join_threads_after_abort(true); + self.inner + .thread_manager + .session_result_with_timings + .as_mut() + .unwrap() + .0 + .clone() + .unwrap_err() } fn wait_for_termination( @@ -1435,19 +2387,22 @@ impl InstalledScheduler for PooledScheduler { } fn pause_for_recent_blockhash(&mut self) { - self.inner.thread_manager.end_session(); + // this fn is called from poh thread, while it's being locked. so, we can't wait scheduler + // termination here to avoid deadlock. just async signaling is enough + let nonblocking = matches!(self.context().mode(), SchedulingMode::BlockProduction); + self.inner.thread_manager.do_end_session(nonblocking); } } impl UninstalledScheduler for PooledSchedulerInner where - S: SpawnableScheduler>, + S: SpawnableScheduler, TH: TaskHandler, { fn return_to_pool(self: Box) { // Refer to the comment in is_trashed() as to the exact definition of the concept of // _trashed_ and the interaction among different parts of unified scheduler. 
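// (The `true` argument below is on_hot_path: return_to_pool() runs on the
// block-processing critical path, so the overgrowth check can presumably take
// a cheaper route here than when the background pool cleaner asks with
// on_hot_path = false.)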
- let should_trash = self.is_trashed(); + let should_trash = self.is_trashed(true); if should_trash { info!("trashing scheduler (id: {})...", self.id()); } @@ -1458,6 +2413,39 @@ where } } +impl SchedulerInner for PooledSchedulerInner +where + S: SpawnableScheduler, + TH: TaskHandler, +{ + fn id(&self) -> SchedulerId { + self.thread_manager.scheduler_id + } + + fn is_overgrown(&self, on_hot_path: bool) -> bool { + self.task_creator + .is_overgrown(self.thread_manager.pool.max_usage_queue_count, on_hot_path) + } + + fn reset(&self) { + if let Err(a) = self + .thread_manager + .new_task_sender + .send(NewTaskPayload::Reset(Unit::new()).into()) + { + warn!("failed to send a reset due to error: {a:?}"); + } + self.task_creator.reset() + } + + fn ensure_abort(&mut self) { + if self.thread_manager.are_threads_joined() { + return; + } + self.thread_manager.disconnect_new_task_sender() + } +} + #[cfg(test)] mod tests { use { @@ -1468,7 +2456,9 @@ mod tests { bank::Bank, bank_forks::BankForks, genesis_utils::{create_genesis_config, GenesisConfigInfo}, - installed_scheduler_pool::{BankWithScheduler, SchedulingContext}, + installed_scheduler_pool::{ + BankWithScheduler, InstalledSchedulerPoolArc, SchedulingContext, + }, prioritization_fee_cache::PrioritizationFeeCache, }, solana_sdk::{ @@ -1485,6 +2475,58 @@ mod tests { }, }; + impl SchedulerPool + where + S: SpawnableScheduler, + TH: TaskHandler, + { + fn do_new_for_verification( + handler_count: Option, + log_messages_bytes_limit: Option, + transaction_status_sender: Option, + replay_vote_sender: Option, + prioritization_fee_cache: Arc, + pool_cleaner_interval: Duration, + max_pooling_duration: Duration, + max_usage_queue_count: usize, + timeout_duration: Duration, + ) -> Arc { + Self::do_new( + SupportedSchedulingMode::block_verification_only(), + handler_count, + log_messages_bytes_limit, + transaction_status_sender, + replay_vote_sender, + prioritization_fee_cache, + TransactionRecorder::new_dummy(), + pool_cleaner_interval, + max_pooling_duration, + max_usage_queue_count, + timeout_duration, + ) + } + + // This apparently-meaningless wrapper is handy, because some callers explicitly want + // `dyn InstalledSchedulerPool` to be returned for type inference convenience. 
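// For instance (mirroring the test usage further below), the dyn-returning
// constructor lets the pool be handed straight to BankForks, which stores an
// InstalledSchedulerPoolArc:
//
//     let pool = DefaultSchedulerPool::new_dyn_for_verification(
//         None, None, None, None, ignored_prioritization_fee_cache,
//     );
//     bank_forks.install_scheduler_pool(pool);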
+ fn new_dyn_for_verification( + handler_count: Option, + log_messages_bytes_limit: Option, + transaction_status_sender: Option, + replay_vote_sender: Option, + prioritization_fee_cache: Arc, + ) -> InstalledSchedulerPoolArc { + Self::new( + SupportedSchedulingMode::block_verification_only(), + handler_count, + log_messages_bytes_limit, + transaction_status_sender, + replay_vote_sender, + prioritization_fee_cache, + TransactionRecorder::new_dummy(), + ) + } + } + #[derive(Debug)] enum TestCheckPoint { BeforeNewTask, @@ -1505,8 +2547,13 @@ mod tests { solana_logger::setup(); let ignored_prioritization_fee_cache = Arc::new(PrioritizationFeeCache::new(0u64)); - let pool = - DefaultSchedulerPool::new_dyn(None, None, None, None, ignored_prioritization_fee_cache); + let pool = DefaultSchedulerPool::new_dyn_for_verification( + None, + None, + None, + None, + ignored_prioritization_fee_cache, + ); // this indirectly proves that there should be circular link because there's only one Arc // at this moment now @@ -1521,11 +2568,16 @@ mod tests { solana_logger::setup(); let ignored_prioritization_fee_cache = Arc::new(PrioritizationFeeCache::new(0u64)); - let pool = - DefaultSchedulerPool::new_dyn(None, None, None, None, ignored_prioritization_fee_cache); + let pool = DefaultSchedulerPool::new_dyn_for_verification( + None, + None, + None, + None, + ignored_prioritization_fee_cache, + ); let bank = Arc::new(Bank::default_for_tests()); - let context = SchedulingContext::new(bank); - let scheduler = pool.take_scheduler(context); + let context = SchedulingContext::for_verification(bank); + let scheduler = pool.take_scheduler(context).unwrap(); let debug = format!("{scheduler:#?}"); assert!(!debug.is_empty()); @@ -1546,7 +2598,7 @@ mod tests { ]); let ignored_prioritization_fee_cache = Arc::new(PrioritizationFeeCache::new(0u64)); - let pool_raw = DefaultSchedulerPool::do_new( + let pool_raw = DefaultSchedulerPool::do_new_for_verification( None, None, None, @@ -1559,7 +2611,7 @@ mod tests { ); let pool = pool_raw.clone(); let bank = Arc::new(Bank::default_for_tests()); - let context1 = SchedulingContext::new(bank); + let context1 = SchedulingContext::for_verification(bank); let context2 = context1.clone(); let old_scheduler = pool.do_take_scheduler(context1); @@ -1611,7 +2663,7 @@ mod tests { let ignored_prioritization_fee_cache = Arc::new(PrioritizationFeeCache::new(0u64)); const REDUCED_MAX_USAGE_QUEUE_COUNT: usize = 1; - let pool_raw = DefaultSchedulerPool::do_new( + let pool_raw = DefaultSchedulerPool::do_new_for_verification( None, None, None, @@ -1624,7 +2676,7 @@ mod tests { ); let pool = pool_raw.clone(); let bank = Arc::new(Bank::default_for_tests()); - let context1 = SchedulingContext::new(bank); + let context1 = SchedulingContext::for_verification(bank); let context2 = context1.clone(); let small_scheduler = pool.do_take_scheduler(context1); @@ -1632,14 +2684,16 @@ mod tests { for _ in 0..REDUCED_MAX_USAGE_QUEUE_COUNT { small_scheduler .inner - .usage_queue_loader + .task_creator + .usage_queue_loader() .load(Pubkey::new_unique()); } let big_scheduler = pool.do_take_scheduler(context2); for _ in 0..REDUCED_MAX_USAGE_QUEUE_COUNT + 1 { big_scheduler .inner - .usage_queue_loader + .task_creator + .usage_queue_loader() .load(Pubkey::new_unique()); } @@ -1686,7 +2740,7 @@ mod tests { ]); let ignored_prioritization_fee_cache = Arc::new(PrioritizationFeeCache::new(0u64)); - let pool_raw = DefaultSchedulerPool::do_new( + let pool_raw = DefaultSchedulerPool::do_new_for_verification( None, None, None, @@ 
-1699,8 +2753,8 @@ mod tests { ); let pool = pool_raw.clone(); let bank = Arc::new(Bank::default_for_tests()); - let context = SchedulingContext::new(bank.clone()); - let scheduler = pool.take_scheduler(context); + let context = SchedulingContext::for_verification(bank.clone()); + let scheduler = pool.take_scheduler(context).unwrap(); let bank = BankWithScheduler::new(bank, Some(scheduler)); pool.register_timeout_listener(bank.create_timeout_listener()); assert_eq!(pool_raw.scheduler_inners.lock().unwrap().len(), 0); @@ -1734,17 +2788,18 @@ mod tests { ]); let ignored_prioritization_fee_cache = Arc::new(PrioritizationFeeCache::new(0u64)); - let pool_raw = SchedulerPool::, _>::do_new( - None, - None, - None, - None, - ignored_prioritization_fee_cache, - SHORTENED_POOL_CLEANER_INTERVAL, - DEFAULT_MAX_POOLING_DURATION, - DEFAULT_MAX_USAGE_QUEUE_COUNT, - SHORTENED_TIMEOUT_DURATION, - ); + let pool_raw = + SchedulerPool::, _>::do_new_for_verification( + None, + None, + None, + None, + ignored_prioritization_fee_cache, + SHORTENED_POOL_CLEANER_INTERVAL, + DEFAULT_MAX_POOLING_DURATION, + DEFAULT_MAX_USAGE_QUEUE_COUNT, + SHORTENED_TIMEOUT_DURATION, + ); #[derive(Debug)] struct ExecuteTimingCounter; @@ -1752,9 +2807,8 @@ mod tests { fn handle( _result: &mut Result<()>, timings: &mut ExecuteTimings, - _bank: &Arc, - _transaction: &RuntimeTransaction, - _index: usize, + _bank: &SchedulingContext, + _task: &Task, _handler_context: &HandlerContext, ) { timings.metrics[ExecuteTimingType::CheckUs] += 123; @@ -1770,9 +2824,9 @@ mod tests { let bank = Bank::new_for_tests(&genesis_config); let (bank, _bank_forks) = setup_dummy_fork_graph(bank); - let context = SchedulingContext::new(bank.clone()); + let context = SchedulingContext::for_verification(bank.clone()); - let scheduler = pool.take_scheduler(context); + let scheduler = pool.take_scheduler(context).unwrap(); let bank = BankWithScheduler::new(bank, Some(scheduler)); pool.register_timeout_listener(bank.create_timeout_listener()); @@ -1821,7 +2875,7 @@ mod tests { ]); let ignored_prioritization_fee_cache = Arc::new(PrioritizationFeeCache::new(0u64)); - let pool_raw = DefaultSchedulerPool::do_new( + let pool_raw = DefaultSchedulerPool::do_new_for_verification( None, None, None, @@ -1838,9 +2892,9 @@ mod tests { let bank = Bank::new_for_tests(&genesis_config); let (bank, _bank_forks) = setup_dummy_fork_graph(bank); - let context = SchedulingContext::new(bank.clone()); + let context = SchedulingContext::for_verification(bank.clone()); - let scheduler = pool.take_scheduler(context); + let scheduler = pool.take_scheduler(context).unwrap(); let bank = BankWithScheduler::new(bank, Some(scheduler)); pool.register_timeout_listener(bank.create_timeout_listener()); @@ -1868,7 +2922,7 @@ mod tests { ]); let ignored_prioritization_fee_cache = Arc::new(PrioritizationFeeCache::new(0u64)); - let pool_raw = SchedulerPool::, _>::do_new( + let pool_raw = SchedulerPool::, _>::do_new_for_verification( None, None, None, @@ -1890,9 +2944,9 @@ mod tests { let bank = Bank::new_for_tests(&genesis_config); let (bank, _bank_forks) = setup_dummy_fork_graph(bank); - let context = SchedulingContext::new(bank.clone()); + let context = SchedulingContext::for_verification(bank.clone()); - let scheduler = pool.take_scheduler(context); + let scheduler = pool.take_scheduler(context).unwrap(); let bank = BankWithScheduler::new(bank, Some(scheduler)); pool.register_timeout_listener(bank.create_timeout_listener()); @@ -1935,9 +2989,8 @@ mod tests { fn handle( result: &mut Result<()>, 
_timings: &mut ExecuteTimings, - _bank: &Arc, - _transaction: &RuntimeTransaction, - _index: usize, + _bank: &SchedulingContext, + _task: &Task, _handler_context: &HandlerContext, ) { *result = Err(TransactionError::AccountNotFound); @@ -1971,14 +3024,14 @@ mod tests { let bank = Bank::new_for_tests(&genesis_config); let (bank, _bank_forks) = setup_dummy_fork_graph(bank); let ignored_prioritization_fee_cache = Arc::new(PrioritizationFeeCache::new(0u64)); - let pool = SchedulerPool::, _>::new( + let pool = SchedulerPool::, _>::new_for_verification( None, None, None, None, ignored_prioritization_fee_cache, ); - let context = SchedulingContext::new(bank.clone()); + let context = SchedulingContext::for_verification(bank.clone()); let scheduler = pool.do_take_scheduler(context); scheduler.schedule_execution(tx, 0).unwrap(); @@ -2028,6 +3081,7 @@ mod tests { } #[test] + #[ignore] fn test_scheduler_drop_short_circuiting() { solana_logger::setup(); @@ -2038,7 +3092,7 @@ mod tests { &TestCheckPoint::AfterSchedulerThreadAborted, ]); - static TASK_COUNT: Mutex = Mutex::new(0); + static TASK_COUNT: Mutex = Mutex::new(0); #[derive(Debug)] struct CountingHandler; @@ -2046,9 +3100,8 @@ mod tests { fn handle( _result: &mut Result<()>, _timings: &mut ExecuteTimings, - _bank: &Arc, - _transaction: &RuntimeTransaction, - _index: usize, + _bank: &SchedulingContext, + _task: &Task, _handler_context: &HandlerContext, ) { *TASK_COUNT.lock().unwrap() += 1; @@ -2064,14 +3117,14 @@ mod tests { let bank = Bank::new_for_tests(&genesis_config); let (bank, _bank_forks) = setup_dummy_fork_graph(bank); let ignored_prioritization_fee_cache = Arc::new(PrioritizationFeeCache::new(0u64)); - let pool = SchedulerPool::, _>::new( + let pool = SchedulerPool::, _>::new_for_verification( None, None, None, None, ignored_prioritization_fee_cache, ); - let context = SchedulingContext::new(bank.clone()); + let context = SchedulingContext::for_verification(bank.clone()); let scheduler = pool.do_take_scheduler(context); // This test is racy. @@ -2079,7 +3132,7 @@ mod tests { // That's because the scheduler needs to be aborted quickly as an expected behavior, // leaving some readily-available work untouched. So, schedule rather large number of tasks // to make the short-cutting abort code-path win the race easily. 
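// (The index type switches from usize to TaskKey across these tests because
// TaskKey is the new scheduling key of this change; as recreate_task above
// illustrates, its upper half can carry a priority while the lower half
// remains a plain per-task id, so small literal indices keep working.)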
- const MAX_TASK_COUNT: usize = 100; + const MAX_TASK_COUNT: TaskKey = 100; for i in 0..MAX_TASK_COUNT { let tx = RuntimeTransaction::from_transaction_for_tests(system_transaction::transfer( @@ -2104,10 +3157,15 @@ mod tests { solana_logger::setup(); let ignored_prioritization_fee_cache = Arc::new(PrioritizationFeeCache::new(0u64)); - let pool = - DefaultSchedulerPool::new(None, None, None, None, ignored_prioritization_fee_cache); + let pool = DefaultSchedulerPool::new_for_verification( + None, + None, + None, + None, + ignored_prioritization_fee_cache, + ); let bank = Arc::new(Bank::default_for_tests()); - let context = &SchedulingContext::new(bank); + let context = &SchedulingContext::for_verification(bank); let scheduler1 = pool.do_take_scheduler(context.clone()); let scheduler_id1 = scheduler1.id(); @@ -2133,10 +3191,15 @@ mod tests { solana_logger::setup(); let ignored_prioritization_fee_cache = Arc::new(PrioritizationFeeCache::new(0u64)); - let pool = - DefaultSchedulerPool::new(None, None, None, None, ignored_prioritization_fee_cache); + let pool = DefaultSchedulerPool::new_for_verification( + None, + None, + None, + None, + ignored_prioritization_fee_cache, + ); let bank = Arc::new(Bank::default_for_tests()); - let context = &SchedulingContext::new(bank); + let context = &SchedulingContext::for_verification(bank); let mut scheduler = pool.do_take_scheduler(context.clone()); // should never panic. @@ -2152,20 +3215,25 @@ mod tests { solana_logger::setup(); let ignored_prioritization_fee_cache = Arc::new(PrioritizationFeeCache::new(0u64)); - let pool = - DefaultSchedulerPool::new(None, None, None, None, ignored_prioritization_fee_cache); + let pool = DefaultSchedulerPool::new_for_verification( + None, + None, + None, + None, + ignored_prioritization_fee_cache, + ); let old_bank = &Arc::new(Bank::default_for_tests()); let new_bank = &Arc::new(Bank::default_for_tests()); assert!(!Arc::ptr_eq(old_bank, new_bank)); - let old_context = &SchedulingContext::new(old_bank.clone()); - let new_context = &SchedulingContext::new(new_bank.clone()); + let old_context = &SchedulingContext::for_verification(old_bank.clone()); + let new_context = &SchedulingContext::for_verification(new_bank.clone()); let scheduler = pool.do_take_scheduler(old_context.clone()); let scheduler_id = scheduler.id(); pool.return_scheduler(scheduler.into_inner().1, false); - let scheduler = pool.take_scheduler(new_context.clone()); + let scheduler = pool.take_scheduler(new_context.clone()).unwrap(); assert_eq!(scheduler_id, scheduler.id()); assert!(Arc::ptr_eq(scheduler.context().bank(), new_bank)); } @@ -2178,8 +3246,13 @@ mod tests { let bank_forks = BankForks::new_rw_arc(bank); let mut bank_forks = bank_forks.write().unwrap(); let ignored_prioritization_fee_cache = Arc::new(PrioritizationFeeCache::new(0u64)); - let pool = - DefaultSchedulerPool::new_dyn(None, None, None, None, ignored_prioritization_fee_cache); + let pool = DefaultSchedulerPool::new_dyn_for_verification( + None, + None, + None, + None, + ignored_prioritization_fee_cache, + ); bank_forks.install_scheduler_pool(pool); } @@ -2192,8 +3265,13 @@ mod tests { let child_bank = Bank::new_from_parent(bank, &Pubkey::default(), 1); let ignored_prioritization_fee_cache = Arc::new(PrioritizationFeeCache::new(0u64)); - let pool = - DefaultSchedulerPool::new_dyn(None, None, None, None, ignored_prioritization_fee_cache); + let pool = DefaultSchedulerPool::new_dyn_for_verification( + None, + None, + None, + None, + ignored_prioritization_fee_cache, + ); let bank = 
Bank::default_for_tests(); let bank_forks = BankForks::new_rw_arc(bank); @@ -2242,12 +3320,17 @@ mod tests { let bank = Bank::new_for_tests(&genesis_config); let (bank, _bank_forks) = setup_dummy_fork_graph(bank); let ignored_prioritization_fee_cache = Arc::new(PrioritizationFeeCache::new(0u64)); - let pool = - DefaultSchedulerPool::new_dyn(None, None, None, None, ignored_prioritization_fee_cache); - let context = SchedulingContext::new(bank.clone()); + let pool = DefaultSchedulerPool::new_dyn_for_verification( + None, + None, + None, + None, + ignored_prioritization_fee_cache, + ); + let context = SchedulingContext::for_verification(bank.clone()); assert_eq!(bank.transaction_count(), 0); - let scheduler = pool.take_scheduler(context); + let scheduler = pool.take_scheduler(context).unwrap(); scheduler.schedule_execution(tx0, 0).unwrap(); let bank = BankWithScheduler::new(bank, Some(scheduler)); assert_matches!(bank.wait_for_completed_scheduler(), Some((Ok(()), _))); @@ -2277,7 +3360,7 @@ mod tests { let (bank, _bank_forks) = setup_dummy_fork_graph(bank); let ignored_prioritization_fee_cache = Arc::new(PrioritizationFeeCache::new(0u64)); - let pool_raw = DefaultSchedulerPool::do_new( + let pool_raw = DefaultSchedulerPool::do_new_for_verification( None, None, None, @@ -2289,8 +3372,8 @@ mod tests { DEFAULT_TIMEOUT_DURATION, ); let pool = pool_raw.clone(); - let context = SchedulingContext::new(bank.clone()); - let scheduler = pool.take_scheduler(context); + let context = SchedulingContext::for_verification(bank.clone()); + let scheduler = pool.take_scheduler(context).unwrap(); let unfunded_keypair = Keypair::new(); let bad_tx = RuntimeTransaction::from_transaction_for_tests(system_transaction::transfer( @@ -2383,11 +3466,11 @@ mod tests { fn handle( _result: &mut Result<()>, _timings: &mut ExecuteTimings, - _bank: &Arc, - _transaction: &RuntimeTransaction, - index: usize, + _bank: &SchedulingContext, + task: &Task, _handler_context: &HandlerContext, ) { + let index = task.index(); if index == 0 { sleepless_testing::at(PanickingHanlderCheckPoint::BeforeNotifiedPanic); } else if index == 1 { @@ -2407,19 +3490,19 @@ mod tests { // Use 2 transactions with different timings to deliberately cover the two code paths of // notifying panics in the handler threads, taken conditionally depending on whether the // scheduler thread has been aborted already or not. - const TX_COUNT: usize = 2; + const TX_COUNT: TaskKey = 2; let ignored_prioritization_fee_cache = Arc::new(PrioritizationFeeCache::new(0u64)); - let pool = SchedulerPool::, _>::new_dyn( - Some(TX_COUNT), // fix to use exactly 2 handlers + let pool = SchedulerPool::, _>::new_dyn_for_verification( + Some(TX_COUNT as usize), // fix to use exactly 2 handlers None, None, None, ignored_prioritization_fee_cache, ); - let context = SchedulingContext::new(bank.clone()); + let context = SchedulingContext::for_verification(bank.clone()); - let scheduler = pool.take_scheduler(context); + let scheduler = pool.take_scheduler(context).unwrap(); for index in 0..TX_COUNT { // Use 2 non-conflicting txes to exercise the channel disconnected case as well. 
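// (The two notification paths exercised here correspond to the defer! block
// in handler_main_loop above: sender.send(Err(HandlerPanicked)) succeeds
// while the scheduler thread is still alive, and merely logs a warning once
// the scheduler has already been aborted.)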
@@ -2463,11 +3546,11 @@ mod tests { fn handle( result: &mut Result<()>, _timings: &mut ExecuteTimings, - _bank: &Arc, - _transaction: &RuntimeTransaction, - index: usize, + _bank: &SchedulingContext, + task: &Task, _handler_context: &HandlerContext, ) { + let index = task.index(); *TASK_COUNT.lock().unwrap() += 1; if index == 1 { *result = Err(TransactionError::AccountNotFound); @@ -2485,14 +3568,14 @@ mod tests { let bank = Bank::new_for_tests(&genesis_config); let (bank, _bank_forks) = setup_dummy_fork_graph(bank); let ignored_prioritization_fee_cache = Arc::new(PrioritizationFeeCache::new(0u64)); - let pool = SchedulerPool::, _>::new( + let pool = SchedulerPool::, _>::new_for_verification( None, None, None, None, ignored_prioritization_fee_cache, ); - let context = SchedulingContext::new(bank.clone()); + let context = SchedulingContext::for_verification(bank.clone()); let scheduler = pool.do_take_scheduler(context); for i in 0..10 { @@ -2522,8 +3605,8 @@ mod tests { fn test_scheduler_schedule_execution_blocked() { solana_logger::setup(); - const STALLED_TRANSACTION_INDEX: usize = 0; - const BLOCKED_TRANSACTION_INDEX: usize = 1; + const STALLED_TRANSACTION_INDEX: TaskKey = 0; + const BLOCKED_TRANSACTION_INDEX: TaskKey = 1; static LOCK_TO_STALL: Mutex<()> = Mutex::new(()); #[derive(Debug)] @@ -2532,24 +3615,17 @@ mod tests { fn handle( result: &mut Result<()>, timings: &mut ExecuteTimings, - bank: &Arc, - transaction: &RuntimeTransaction, - index: usize, + bank: &SchedulingContext, + task: &Task, handler_context: &HandlerContext, ) { + let index = task.index(); match index { STALLED_TRANSACTION_INDEX => *LOCK_TO_STALL.lock().unwrap(), BLOCKED_TRANSACTION_INDEX => {} _ => unreachable!(), }; - DefaultTaskHandler::handle( - result, - timings, - bank, - transaction, - index, - handler_context, - ); + DefaultTaskHandler::handle(result, timings, bank, task, handler_context); } } @@ -2576,17 +3652,17 @@ mod tests { let bank = Bank::new_for_tests(&genesis_config); let (bank, _bank_forks) = setup_dummy_fork_graph(bank); let ignored_prioritization_fee_cache = Arc::new(PrioritizationFeeCache::new(0u64)); - let pool = SchedulerPool::, _>::new_dyn( + let pool = SchedulerPool::, _>::new_dyn_for_verification( None, None, None, None, ignored_prioritization_fee_cache, ); - let context = SchedulingContext::new(bank.clone()); + let context = SchedulingContext::for_verification(bank.clone()); assert_eq!(bank.transaction_count(), 0); - let scheduler = pool.take_scheduler(context); + let scheduler = pool.take_scheduler(context).unwrap(); // Stall handling tx0 and tx1 let lock_to_stall = LOCK_TO_STALL.lock().unwrap(); @@ -2617,13 +3693,12 @@ mod tests { fn handle( _result: &mut Result<()>, _timings: &mut ExecuteTimings, - bank: &Arc, - _transaction: &RuntimeTransaction, - index: usize, + context: &SchedulingContext, + task: &Task, _handler_context: &HandlerContext, ) { // The task index must always be matched to the slot. 
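// (That is, each handler invocation checks that the task it receives is
// paired with the SchedulingContext of the bank it was scheduled against;
// the busy-looping test below would trip this assertion if contexts were
// ever crossed between pooled schedulers.)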
- assert_eq!(index as Slot, bank.slot()); + assert_eq!(task.index() as Slot, context.bank().slot()); } } @@ -2643,7 +3718,7 @@ mod tests { )); let ignored_prioritization_fee_cache = Arc::new(PrioritizationFeeCache::new(0u64)); - let pool = SchedulerPool::, _>::new( + let pool = SchedulerPool::, _>::new_for_verification( Some(4), // spawn 4 threads None, None, @@ -2659,8 +3734,8 @@ mod tests { 2, genesis_config.hash(), )); - let context0 = &SchedulingContext::new(bank0.clone()); - let context1 = &SchedulingContext::new(bank1.clone()); + let context0 = &SchedulingContext::for_verification(bank0.clone()); + let context1 = &SchedulingContext::for_verification(bank1.clone()); // Exercise the scheduler by busy-looping to expose the race condition for (context, index) in [(context0, 0), (context1, 1)] @@ -2668,7 +3743,7 @@ mod tests { .cycle() .take(10000) { - let scheduler = pool.take_scheduler(context.clone()); + let scheduler = pool.take_scheduler(context.clone()).unwrap(); scheduler .schedule_execution(dummy_tx.clone(), index) .unwrap(); @@ -2714,9 +3789,8 @@ mod tests { fn schedule_execution( &self, transaction: RuntimeTransaction, - index: usize, + index: TaskKey, ) -> ScheduleResult { - let transaction_and_index = (transaction, index); let context = self.context().clone(); let pool = self.3.clone(); @@ -2728,12 +3802,15 @@ mod tests { let mut result = Ok(()); let mut timings = ExecuteTimings::default(); + let task = SchedulingStateMachine::create_task(transaction, index, &mut |_| { + UsageQueue::default() + }); + ::handle( &mut result, &mut timings, - context.bank(), - &transaction_and_index.0, - transaction_and_index.1, + &context, + &task, &pool.handler_context, ); (result, timings) @@ -2776,6 +3853,24 @@ mod tests { } } + impl SchedulerInner for AsyncScheduler { + fn id(&self) -> SchedulerId { + 42 + } + + fn is_overgrown(&self, _on_hot_path: bool) -> bool { + todo!() + } + + fn reset(&self) { + todo!() + } + + fn ensure_abort(&mut self) { + todo!() + } + } + impl SpawnableScheduler for AsyncScheduler { @@ -2795,9 +3890,11 @@ mod tests { } fn spawn( + _handler_count: usize, pool: Arc>, context: SchedulingContext, _result_with_timings: ResultWithTimings, + _banking_stage_context: Option, ) -> Self { AsyncScheduler::( Mutex::new(initialized_result_with_timings()), @@ -2837,18 +3934,18 @@ mod tests { ); } let (bank, _bank_forks) = setup_dummy_fork_graph(bank); - let context = SchedulingContext::new(bank.clone()); + let context = SchedulingContext::for_verification(bank.clone()); let ignored_prioritization_fee_cache = Arc::new(PrioritizationFeeCache::new(0u64)); let pool = - SchedulerPool::, DefaultTaskHandler>::new_dyn( + SchedulerPool::, DefaultTaskHandler>::new_dyn_for_verification( None, None, None, None, ignored_prioritization_fee_cache, ); - let scheduler = pool.take_scheduler(context); + let scheduler = pool.take_scheduler(context).unwrap(); let bank = BankWithScheduler::new(bank, Some(scheduler)); assert_eq!(bank.transaction_count(), 0); @@ -2923,14 +4020,17 @@ mod tests { let result = &mut Ok(()); let timings = &mut ExecuteTimings::default(); let prioritization_fee_cache = Arc::new(PrioritizationFeeCache::new(0u64)); + let scheduling_context = &SchedulingContext::for_verification(bank.clone()); let handler_context = &HandlerContext { log_messages_bytes_limit: None, transaction_status_sender: None, replay_vote_sender: None, prioritization_fee_cache, + transaction_recorder: TransactionRecorder::new_dummy(), }; - DefaultTaskHandler::handle(result, timings, bank, &tx, 0, 
handler_context); + let task = SchedulingStateMachine::create_task(tx, 0, &mut |_| UsageQueue::default()); + DefaultTaskHandler::handle(result, timings, scheduling_context, &task, handler_context); assert_matches!(result, Err(TransactionError::AccountLoadedTwice)); } } diff --git a/validator/Cargo.toml b/validator/Cargo.toml index 4cd77b0b1c8..c8d7969ab71 100644 --- a/validator/Cargo.toml +++ b/validator/Cargo.toml @@ -27,6 +27,7 @@ jsonrpc-ipc-server = { workspace = true } lazy_static = { workspace = true } libloading = { workspace = true } log = { workspace = true } +mimalloc = { workspace = true } num_cpus = { workspace = true } rand = { workspace = true } rayon = { workspace = true } @@ -77,9 +78,6 @@ solana-runtime = { workspace = true, features = ["dev-context-only-utils"] } spl-token-2022 = { workspace = true, features = ["no-entrypoint"] } tempfile = { workspace = true } -[target.'cfg(not(any(target_env = "msvc", target_os = "freebsd")))'.dependencies] -jemallocator = { workspace = true } - [target."cfg(unix)".dependencies] libc = { workspace = true } signal-hook = { workspace = true } diff --git a/validator/src/cli.rs b/validator/src/cli.rs index 19a2b7f77e7..4669dcf52d7 100644 --- a/validator/src/cli.rs +++ b/validator/src/cli.rs @@ -1595,6 +1595,15 @@ pub fn app<'a>(version: &'a str, default_args: &'a DefaultArgs) -> App<'a, 'a> { .validator(|s| is_within_range(s, 1..)) .help(DefaultSchedulerPool::cli_message()), ) + .arg( + Arg::with_name("enable_experimental_block_production_method") + .long("enable-experimental-block-production-method") + .takes_value(false) + .help( + "Accept unified-scheduler to be used as an experimental block \ + production method", + ), + ) .arg( Arg::with_name("wen_restart") .long("wen-restart") diff --git a/validator/src/main.rs b/validator/src/main.rs index bee65e487ba..24f0717ae21 100644 --- a/validator/src/main.rs +++ b/validator/src/main.rs @@ -1,6 +1,4 @@ #![allow(clippy::arithmetic_side_effects)] -#[cfg(not(any(target_env = "msvc", target_os = "freebsd")))] -use jemallocator::Jemalloc; use { agave_validator::{ admin_rpc_service, @@ -87,9 +85,8 @@ use { }, }; -#[cfg(not(any(target_env = "msvc", target_os = "freebsd")))] #[global_allocator] -static GLOBAL: Jemalloc = Jemalloc; +static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc; #[derive(Debug, PartialEq, Eq)] enum Operation { @@ -1845,6 +1842,17 @@ pub fn main() { "block_production_method", BlockProductionMethod ) + .inspect(|method| { + if matches!(method, BlockProductionMethod::UnifiedScheduler) + && !matches.is_present("enable_experimental_block_production_method") + { + eprintln!( + "Currently, the unified-scheduler method is experimental for block-production. \ + Explicitly pass --enable-experimental-block-production-method to use it." + ); + exit(1); + } + }) .unwrap_or_default(); validator_config.enable_block_production_forwarding = staked_nodes_overrides_path.is_some(); validator_config.unified_scheduler_handler_threads =