diff --git a/Cargo.lock b/Cargo.lock index 08c2fa97d31ef7..b9e63af2280fcb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -820,7 +820,7 @@ dependencies = [ "lazy_static", "lazycell", "peeking_take_while", - "prettyplease 0.2.4", + "prettyplease 0.2.16", "proc-macro2", "quote", "regex", @@ -829,6 +829,29 @@ dependencies = [ "syn 2.0.52", ] +[[package]] +name = "bindgen" +version = "0.69.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4c69fae65a523209d34240b60abe0c42d33d1045d445c0839d8a4894a736e2d" +dependencies = [ + "bitflags 2.4.2", + "cexpr", + "clang-sys", + "lazy_static", + "lazycell", + "log", + "peeking_take_while", + "prettyplease 0.2.16", + "proc-macro2", + "quote", + "regex", + "rustc-hash", + "shlex", + "syn 2.0.52", + "which", +] + [[package]] name = "bit-set" version = "0.5.2" @@ -1450,7 +1473,7 @@ version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c278839b831783b70278b14df4d45e1beb1aad306c07bb796637de9a0e323e8e" dependencies = [ - "crossbeam-utils", + "crossbeam-utils 0.8.18", ] [[package]] @@ -1546,6 +1569,16 @@ dependencies = [ "winapi 0.2.8", ] +[[package]] +name = "cpu-time" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e9e393a7668fe1fad3075085b86c781883000b4ede868f43627b34a87c8b7ded" +dependencies = [ + "libc", + "winapi 0.3.9", +] + [[package]] name = "cpufeatures" version = "0.2.7" @@ -1615,11 +1648,10 @@ dependencies = [ [[package]] name = "crossbeam-channel" -version = "0.5.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ab3db02a9c5b5121e1e42fbdb1aeb65f5e02624cc58c43f2884c6ccac0b82f95" +version = "0.5.11" +source = "git+https://github.com/ryoqun/crossbeam?rev=438ec7cdaf6c6a8f593e50344c725fef8a13c7a5#438ec7cdaf6c6a8f593e50344c725fef8a13c7a5" dependencies = [ - "crossbeam-utils", + "crossbeam-utils 0.8.19", ] [[package]] @@ -1630,7 +1662,7 @@ checksum = "6455c0ca19f0d2fbf751b908d5c55c1f5cbc65e03c4225427254b46890bdde1e" dependencies = [ "cfg-if 1.0.0", "crossbeam-epoch", - "crossbeam-utils", + "crossbeam-utils 0.8.18", ] [[package]] @@ -1639,7 +1671,7 @@ version = "0.9.5" source = "git+https://github.com/solana-labs/crossbeam?rev=fd279d707025f0e60951e429bf778b4813d1b6bf#fd279d707025f0e60951e429bf778b4813d1b6bf" dependencies = [ "cfg-if 1.0.0", - "crossbeam-utils", + "crossbeam-utils 0.8.18", "lazy_static", "memoffset 0.6.4", "scopeguard", @@ -1654,6 +1686,11 @@ dependencies = [ "cfg-if 1.0.0", ] +[[package]] +name = "crossbeam-utils" +version = "0.8.19" +source = "git+https://github.com/ryoqun/crossbeam?rev=438ec7cdaf6c6a8f593e50344c725fef8a13c7a5#438ec7cdaf6c6a8f593e50344c725fef8a13c7a5" + [[package]] name = "crunchy" version = "0.2.2" @@ -1720,6 +1757,12 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "cty" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b365fabc795046672053e29c954733ec3b05e4be654ab130fe8f1f94d7051f35" + [[package]] name = "curve25519-dalek" version = "3.2.1" @@ -1802,6 +1845,15 @@ dependencies = [ "rusticata-macros", ] +[[package]] +name = "deranged" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0f32d04922c60427da6f9fef14d042d9edddef64cb9d4ce0d64d0685fbeb1fd3" +dependencies = [ + "powerfmt", +] + [[package]] name = "derivation-path" version = "0.2.0" @@ -2798,6 +2850,43 @@ dependencies = [ "tokio-native-tls", ] +[[package]] +name = "iai-callgrind" +version = "0.10.2" +source 
= "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e99bf26f496b13ac6273014f40afda46a233fbfb0289ce50fb4daaad2f2ffc80" +dependencies = [ + "bincode", + "bindgen 0.69.2", + "cc", + "cfg-if 1.0.0", + "cty", + "iai-callgrind-macros", + "iai-callgrind-runner", + "regex", +] + +[[package]] +name = "iai-callgrind-macros" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2a4bb39225592c0a28cfca6f70af52ebd8da23f533c2cdd0a3329c1fa252d56" +dependencies = [ + "proc-macro-error", + "proc-macro2", + "quote", + "syn 2.0.52", +] + +[[package]] +name = "iai-callgrind-runner" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c23a951b9eccaa1e38556d27473d1462a9c247a27961812edcaac156af861282" +dependencies = [ + "serde", +] + [[package]] name = "iana-time-zone" version = "0.1.46" @@ -3177,7 +3266,7 @@ version = "0.11.0+8.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d3386f101bcb4bd252d8e9d2fb41ec3b0862a15a62b478c355b2982efa469e3e" dependencies = [ - "bindgen", + "bindgen 0.65.1", "bzip2-sys", "cc", "glob", @@ -3665,15 +3754,6 @@ dependencies = [ "syn 2.0.52", ] -[[package]] -name = "num_threads" -version = "0.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97ba99ba6393e2c3734791401b66902d981cb03bf190af674ca69949b6d5fb15" -dependencies = [ - "libc", -] - [[package]] name = "number_prefix" version = "0.4.0" @@ -4076,6 +4156,12 @@ version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dc59d1bcc64fc5d021d67521f818db868368028108d37f0e98d74e33f68297b5" +[[package]] +name = "powerfmt" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391" + [[package]] name = "ppv-lite86" version = "0.2.15" @@ -4130,9 +4216,9 @@ dependencies = [ [[package]] name = "prettyplease" -version = "0.2.4" +version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ceca8aaf45b5c46ec7ed39fff75f57290368c1846d33d24a122ca81416ab058" +checksum = "a41cf62165e97c7f814d2221421dbb9afcbcdb0a88068e5ea206e19951c2cbb5" dependencies = [ "proc-macro2", "syn 2.0.52", @@ -4195,6 +4281,32 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "procfs" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "731e0d9356b0c25f16f33b5be79b1c57b562f141ebfcdb0ad8ac2c13a24293b4" +dependencies = [ + "bitflags 2.4.2", + "chrono", + "flate2", + "hex", + "lazy_static", + "procfs-core", + "rustix", +] + +[[package]] +name = "procfs-core" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d3554923a69f4ce04c4a754260c338f505ce22642d3830e049a399fc2059a29" +dependencies = [ + "bitflags 2.4.2", + "chrono", + "hex", +] + [[package]] name = "proptest" version = "1.4.0" @@ -4503,7 +4615,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2" dependencies = [ "crossbeam-deque", - "crossbeam-utils", + "crossbeam-utils 0.8.18", ] [[package]] @@ -4752,9 +4864,9 @@ dependencies = [ [[package]] name = "rustix" -version = "0.38.31" +version = "0.38.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ea3e1a662af26cd7a3ba09c0297a31af215563ecf42817c98df621387f4e949" +checksum = 
"65e04861e65f21776e67888bfbea442b3642beaa0138fdb1dd7a84a52dffdb89" dependencies = [ "bitflags 2.4.2", "errno", @@ -5020,8 +5132,13 @@ version = "2.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "07ff71d2c147a7b57362cead5e22f772cd52f6ab31cfcd9edcd7f6aeb2a0afbe" dependencies = [ + "base64 0.13.1", + "chrono", + "hex", "serde", + "serde_json", "serde_with_macros", + "time", ] [[package]] @@ -7559,8 +7676,12 @@ name = "solana-unified-scheduler-logic" version = "2.0.0" dependencies = [ "assert_matches", + "iai-callgrind", + "qualifier_attr", "solana-sdk", + "solana-unified-scheduler-logic", "static_assertions", + "triomphe", ] [[package]] @@ -7568,18 +7689,30 @@ name = "solana-unified-scheduler-pool" version = "2.0.0" dependencies = [ "assert_matches", + "bincode", + "cpu-time", + "criterion", "crossbeam-channel", "dashmap", "derivative", "log", + "procfs", "qualifier_attr", + "rand 0.8.5", + "rustix", + "serde_json", "solana-ledger", "solana-logger", + "solana-measure", + "solana-metrics", + "solana-nohash-hasher", "solana-program-runtime", "solana-runtime", "solana-sdk", "solana-unified-scheduler-logic", + "solana-unified-scheduler-pool", "solana-vote", + "tikv-jemallocator", ] [[package]] @@ -8020,6 +8153,12 @@ dependencies = [ "spl-program-error", ] +[[package]] +name = "stable_deref_trait" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" + [[package]] name = "static_assertions" version = "1.1.0" @@ -8374,21 +8513,32 @@ dependencies = [ [[package]] name = "time" -version = "0.3.9" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c2702e08a7a860f005826c6815dcac101b19b5eb330c27fe4a5928fec1d20ddd" +checksum = "c4a34ab300f2dee6e562c10a046fc05e358b29f9bf92277f30c3c8d82275f6f5" dependencies = [ + "deranged", "itoa", - "libc", - "num_threads", + "powerfmt", + "serde", + "time-core", "time-macros", ] +[[package]] +name = "time-core" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef927ca75afb808a4d64dd374f00a2adf8d0fcff8e7b184af886c3c87ec4a3f3" + [[package]] name = "time-macros" -version = "0.2.4" +version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "42657b1a6f4d817cda8e7a0ace261fe0cc946cf3a80314390b22cc61ae080792" +checksum = "4ad70d68dba9e1f8aceda7aa6711965dfec1cac869f311a51bd08b3a2ccbce20" +dependencies = [ + "time-core", +] [[package]] name = "tiny-bip39" @@ -8758,6 +8908,16 @@ version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0de5f738ceab88e2491a94ddc33c3feeadfa95fedc60363ef110845df12f3878" +[[package]] +name = "triomphe" +version = "0.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "859eb650cfee7434994602c3a68b25d77ad9e68c8a6cd491616ef86661382eb3" +dependencies = [ + "serde", + "stable_deref_trait", +] + [[package]] name = "try-lock" version = "0.2.3" diff --git a/Cargo.toml b/Cargo.toml index f96d072d931633..95814a448d6683 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -181,9 +181,10 @@ console_error_panic_hook = "0.1.7" console_log = "0.2.2" const_format = "0.2.32" core_affinity = "0.5.10" +cpu-time = "1.0.0" criterion = "0.5.1" criterion-stats = "0.3.0" -crossbeam-channel = "0.5.12" +crossbeam-channel = "0.5.11" csv = "1.3.0" ctrlc = "3.4.4" curve25519-dalek = "3.2.1" @@ -269,6 +270,7 @@ predicates = "2.1" pretty-hex = "0.3.0" 
prio-graph = "0.2.1" proc-macro2 = "1.0.79" +procfs = "0.16.0" proptest = "1.4" prost = "0.11.9" prost-build = "0.11.9" @@ -289,6 +291,7 @@ reqwest = { version = "0.11.23", default-features = false } rolling-file = "0.2.0" rpassword = "7.3" rustc_version = "0.4" +rustix = "0.38.32" rustls = { version = "0.21.11", default-features = false, features = ["quic"] } rustversion = "1.0.14" scopeguard = "1.2.0" @@ -443,6 +446,8 @@ zstd = "0.11.2" # for details, see https://github.com/solana-labs/crossbeam/commit/fd279d707025f0e60951e429bf778b4813d1b6bf crossbeam-epoch = { git = "https://github.com/solana-labs/crossbeam", rev = "fd279d707025f0e60951e429bf778b4813d1b6bf" } +crossbeam-channel = { git = "https://github.com/ryoqun/crossbeam", rev = "438ec7cdaf6c6a8f593e50344c725fef8a13c7a5" } + # We include the following crates as our dependencies above from crates.io: # # * spl-associated-token-account diff --git a/ci/test-bench.sh b/ci/test-bench.sh index aacc82cffbb0a6..1444405bcccf5c 100755 --- a/ci/test-bench.sh +++ b/ci/test-bench.sh @@ -56,6 +56,10 @@ _ $cargoNightly bench --manifest-path gossip/Cargo.toml ${V:+--verbose} \ _ $cargoNightly bench --manifest-path poh/Cargo.toml ${V:+--verbose} \ -- -Z unstable-options --format=json | tee -a "$BENCH_FILE" +# Run scheduler-pool benches +_ $cargoNightly bench --manifest-path scheduler-pool/Cargo.toml ${V:+--verbose} \ + -- -Z unstable-options --format=json | tee -a "$BENCH_FILE" + # Run core benches _ $cargoNightly bench --manifest-path core/Cargo.toml ${V:+--verbose} \ -- -Z unstable-options --format=json | tee -a "$BENCH_FILE" diff --git a/core/src/validator.rs b/core/src/validator.rs index 47153dd2706dd9..17a277c041e201 100644 --- a/core/src/validator.rs +++ b/core/src/validator.rs @@ -149,8 +149,8 @@ const WAIT_FOR_WEN_RESTART_SUPERMAJORITY_THRESHOLD_PERCENT: u64 = #[derive(Clone, EnumString, EnumVariantNames, Default, IntoStaticStr, Display)] #[strum(serialize_all = "kebab-case")] pub enum BlockVerificationMethod { - #[default] BlockstoreProcessor, + #[default] UnifiedScheduler, } @@ -1462,8 +1462,10 @@ impl Validator { // Used for notifying many nodes in parallel to exit pub fn exit(&mut self) { + info!("exit1"); self.validator_exit.write().unwrap().exit(); + info!("exit2"); // drop all signals in blockstore self.blockstore.drop_signal(); } @@ -1499,24 +1501,29 @@ impl Validator { } pub fn join(self) { - drop(self.bank_forks); + info!("join1"); drop(self.cluster_info); + info!("join2"); self.poh_service.join().expect("poh_service"); drop(self.poh_recorder); + info!("join3"); if let Some(json_rpc_service) = self.json_rpc_service { json_rpc_service.join().expect("rpc_service"); } + info!("join4"); if let Some(pubsub_service) = self.pubsub_service { pubsub_service.join().expect("pubsub_service"); } + info!("join5"); self.rpc_completed_slots_service .join() .expect("rpc_completed_slots_service"); + info!("join6"); if let Some(optimistically_confirmed_bank_tracker) = self.optimistically_confirmed_bank_tracker { @@ -1525,96 +1532,126 @@ impl Validator { .expect("optimistically_confirmed_bank_tracker"); } + info!("join7"); if let Some(transaction_status_service) = self.transaction_status_service { transaction_status_service .join() .expect("transaction_status_service"); } + info!("join8"); if let Some(rewards_recorder_service) = self.rewards_recorder_service { rewards_recorder_service .join() .expect("rewards_recorder_service"); } + info!("join9"); if let Some(cache_block_meta_service) = self.cache_block_meta_service { cache_block_meta_service .join() 
.expect("cache_block_meta_service"); } + info!("join10"); if let Some(system_monitor_service) = self.system_monitor_service { system_monitor_service .join() .expect("system_monitor_service"); } + info!("join11"); if let Some(sample_performance_service) = self.sample_performance_service { sample_performance_service .join() .expect("sample_performance_service"); } + info!("join12"); if let Some(entry_notifier_service) = self.entry_notifier_service { entry_notifier_service .join() .expect("entry_notifier_service"); } + info!("join13"); if let Some(s) = self.snapshot_packager_service { s.join().expect("snapshot_packager_service"); } + info!("join14"); self.gossip_service.join().expect("gossip_service"); if let Some(repair_quic_endpoint) = &self.repair_quic_endpoint { repair::quic_endpoint::close_quic_endpoint(repair_quic_endpoint); } + info!("join15"); self.serve_repair_service .join() .expect("serve_repair_service"); + info!("join15"); if let Some(repair_quic_endpoint_join_handle) = self.repair_quic_endpoint_join_handle { self.repair_quic_endpoint_runtime .map(|runtime| runtime.block_on(repair_quic_endpoint_join_handle)) .transpose() .unwrap(); }; + info!("join16"); self.stats_reporter_service .join() .expect("stats_reporter_service"); + info!("join17"); self.blockstore_metric_report_service .join() .expect("ledger_metric_report_service"); + info!("join18"); self.accounts_background_service .join() .expect("accounts_background_service"); + info!("join19"); self.accounts_hash_verifier .join() .expect("accounts_hash_verifier"); + info!("join20"); if let Some(turbine_quic_endpoint) = &self.turbine_quic_endpoint { solana_turbine::quic_endpoint::close_quic_endpoint(turbine_quic_endpoint); } + info!("join21"); self.tpu.join().expect("tpu"); + info!("join22"); self.tvu.join().expect("tvu"); + info!("join23"); if let Some(turbine_quic_endpoint_join_handle) = self.turbine_quic_endpoint_join_handle { self.turbine_quic_endpoint_runtime .map(|runtime| runtime.block_on(turbine_quic_endpoint_join_handle)) .transpose() .unwrap(); } + info!("join24"); self.completed_data_sets_service .join() .expect("completed_data_sets_service"); + info!("join25"); if let Some(ip_echo_server) = self.ip_echo_server { ip_echo_server.shutdown_background(); } + info!("join26"); if let Some(geyser_plugin_service) = self.geyser_plugin_service { geyser_plugin_service.join().expect("geyser_plugin_service"); } + info!("join27"); self.poh_timing_report_service .join() .expect("poh_timing_report_service"); + info!("join28"); + self.bank_forks.write().unwrap().prepare_to_drop(); + let sc = Arc::strong_count(&self.bank_forks); + if let Some(bank_forks) = Arc::into_inner(self.bank_forks) { + drop::(bank_forks.into_inner().unwrap()); + } else { + warn!("seems bankforks are leaking...{}:", sc); + } } } diff --git a/core/tests/unified_scheduler.rs b/core/tests/unified_scheduler.rs index fae6f3cccfe698..fa79cbcfb030c3 100644 --- a/core/tests/unified_scheduler.rs +++ b/core/tests/unified_scheduler.rs @@ -18,7 +18,8 @@ use { solana_program_runtime::timings::ExecuteTimings, solana_runtime::{ accounts_background_service::AbsRequestSender, bank::Bank, bank_forks::BankForks, - genesis_utils::GenesisConfigInfo, prioritization_fee_cache::PrioritizationFeeCache, + genesis_utils::GenesisConfigInfo, installed_scheduler_pool::DefaultScheduleExecutionArg, + prioritization_fee_cache::PrioritizationFeeCache, }, solana_sdk::{ hash::Hash, @@ -27,11 +28,14 @@ use { transaction::{Result, SanitizedTransaction}, }, solana_unified_scheduler_pool::{ - 
DefaultTaskHandler, HandlerContext, PooledScheduler, SchedulerPool, TaskHandler, + DefaultTaskHandler, HandlerContext, PooledScheduler, SchedulerPool, SpawnableScheduler, + TaskHandler, }, std::{ collections::HashMap, sync::{Arc, Mutex}, + thread::sleep, + time::Duration, }, }; @@ -41,10 +45,11 @@ fn test_scheduler_waited_by_drop_bank_service() { static LOCK_TO_STALL: Mutex<()> = Mutex::new(()); - #[derive(Debug)] + #[derive(Clone, Debug)] struct StallingHandler; - impl TaskHandler for StallingHandler { + impl TaskHandler for StallingHandler { fn handle( + &self, result: &mut Result<()>, timings: &mut ExecuteTimings, bank: &Arc, @@ -55,10 +60,24 @@ fn test_scheduler_waited_by_drop_bank_service() { info!("Stalling at StallingHandler::handle()..."); *LOCK_TO_STALL.lock().unwrap(); // Wait a bit for the replay stage to prune banks - std::thread::sleep(std::time::Duration::from_secs(3)); + sleep(Duration::from_secs(3)); info!("Now entering into DefaultTaskHandler::handle()..."); - DefaultTaskHandler::handle(result, timings, bank, transaction, index, handler_context); + >::handle( + &DefaultTaskHandler, + result, + timings, + bank, + transaction, + index, + handler_context, + ); + } + + fn create>( + _pool: &SchedulerPool, + ) -> Self { + Self } } @@ -72,7 +91,7 @@ fn test_scheduler_waited_by_drop_bank_service() { let genesis_bank = Bank::new_for_tests(&genesis_config); let bank_forks = BankForks::new_rw_arc(genesis_bank); let ignored_prioritization_fee_cache = Arc::new(PrioritizationFeeCache::new(0u64)); - let pool_raw = SchedulerPool::, _>::new( + let pool_raw = SchedulerPool::, _, _>::new( None, None, None, @@ -107,7 +126,9 @@ fn test_scheduler_waited_by_drop_bank_service() { // Delay transaction execution to ensure transaction execution happens after termintion has // been started let lock_to_stall = LOCK_TO_STALL.lock().unwrap(); - pruned_bank.schedule_transaction_executions([(&tx, &0)].into_iter()); + pruned_bank + .schedule_transaction_executions([(&tx, &0)].into_iter()) + .unwrap(); drop(pruned_bank); assert_eq!(pool_raw.pooled_scheduler_count(), 0); drop(lock_to_stall); diff --git a/ledger-tool/src/main.rs b/ledger-tool/src/main.rs index 814c30a4f7b384..236c9f9f060d5c 100644 --- a/ledger-tool/src/main.rs +++ b/ledger-tool/src/main.rs @@ -864,7 +864,6 @@ fn main() { .takes_value(true) .possible_values(BlockVerificationMethod::cli_names()) .global(true) - .hidden(hidden_unless_forced()) .help(BlockVerificationMethod::cli_message()), ) .arg( @@ -874,7 +873,6 @@ fn main() { .takes_value(true) .validator(|s| is_within_range(s, 1..)) .global(true) - .hidden(hidden_unless_forced()) .help(DefaultSchedulerPool::cli_message()), ) .arg( @@ -1823,6 +1821,8 @@ fn main() { exit_signal.store(true, Ordering::Relaxed); system_monitor_service.join().unwrap(); + bank_forks.write().unwrap().prepare_to_drop(); + drop::(Arc::into_inner(bank_forks).unwrap().into_inner().unwrap()); } ("graph", Some(arg_matches)) => { let output_file = value_t_or_exit!(arg_matches, "graph_filename", String); diff --git a/ledger/src/blockstore_processor.rs b/ledger/src/blockstore_processor.rs index bc2a60efb2de87..bdd7a188b3df12 100644 --- a/ledger/src/blockstore_processor.rs +++ b/ledger/src/blockstore_processor.rs @@ -341,8 +341,7 @@ fn process_batches( // scheduling always succeeds here without being blocked on actual transaction executions. // The transaction execution errors will be collected via the blocking fn called // BankWithScheduler::wait_for_completed_scheduler(), if any. 
- schedule_batches_for_execution(bank, batches); - Ok(()) + schedule_batches_for_execution(bank, batches) } else { debug!( "process_batches()/rebatch_and_execute_batches({} batches)", @@ -364,7 +363,7 @@ fn process_batches( fn schedule_batches_for_execution( bank: &BankWithScheduler, batches: &[TransactionBatchWithIndexes], -) { +) -> Result<()> { for TransactionBatchWithIndexes { batch, transaction_indexes, @@ -375,8 +374,9 @@ fn schedule_batches_for_execution( .sanitized_transactions() .iter() .zip(transaction_indexes.iter()), - ); + )?; } + Ok(()) } fn rebatch_transactions<'a>( @@ -445,9 +445,7 @@ fn rebatch_and_execute_batches( { let mut cost_tracker = bank.write_cost_tracker().unwrap(); for tx_cost in &tx_costs { - cost_tracker - .try_add(tx_cost) - .map_err(TransactionError::from)?; + cost_tracker.try_add(tx_cost)?; } } @@ -2149,6 +2147,7 @@ pub mod tests { instruction::{Instruction, InstructionError}, native_token::LAMPORTS_PER_SOL, pubkey::Pubkey, + scheduling::SchedulingMode, signature::{Keypair, Signer}, system_instruction::SystemError, system_transaction, @@ -4751,7 +4750,7 @@ pub mod tests { .. } = create_genesis_config_with_leader(500, &dummy_leader_pubkey, 100); let bank = Arc::new(Bank::new_for_tests(&genesis_config)); - let context = SchedulingContext::new(bank.clone()); + let context = SchedulingContext::new(SchedulingMode::BlockVerification, bank.clone()); let txs = create_test_transactions(&mint_keypair, &genesis_config.hash()); @@ -4766,7 +4765,7 @@ pub mod tests { mocked_scheduler .expect_schedule_execution() .times(txs.len()) - .returning(|_| ()); + .returning(|_| Ok(())); mocked_scheduler .expect_wait_for_termination() .with(mockall::predicate::eq(true)) diff --git a/local-cluster/tests/local_cluster.rs b/local-cluster/tests/local_cluster.rs index 3ddd2aa19dcaa8..36b2379e77033a 100644 --- a/local-cluster/tests/local_cluster.rs +++ b/local-cluster/tests/local_cluster.rs @@ -4,7 +4,7 @@ use { crossbeam_channel::{unbounded, Receiver}, gag::BufferRedirect, log::*, - rand::seq::IteratorRandom, + rand::seq::SliceRandom, serial_test::serial, solana_accounts_db::{ hardened_unpack::open_genesis_config, utils::create_accounts_run_and_snapshot_dirs, @@ -5675,12 +5675,14 @@ fn test_randomly_mixed_block_verification_methods_between_bootstrap_and_not() { ); // Randomly switch to use unified scheduler - config - .validator_configs - .iter_mut() - .choose(&mut rand::thread_rng()) - .unwrap() - .block_verification_method = BlockVerificationMethod::UnifiedScheduler; + let mut methods = [ + BlockVerificationMethod::UnifiedScheduler, + BlockVerificationMethod::BlockstoreProcessor, + ]; + methods.shuffle(&mut rand::thread_rng()); + for (validator_config, method) in config.validator_configs.iter_mut().zip(methods) { + validator_config.block_verification_method = method; + } let local = LocalCluster::new(&mut config, SocketAddrSpace::Unspecified); cluster_tests::spend_and_verify_all_nodes( diff --git a/metrics/src/datapoint.rs b/metrics/src/datapoint.rs index e2740ce3aecc47..8a13a112da0636 100644 --- a/metrics/src/datapoint.rs +++ b/metrics/src/datapoint.rs @@ -60,6 +60,15 @@ impl DataPoint { } } + pub fn at(timestamp: SystemTime, name: &'static str) -> Self { + DataPoint { + name, + timestamp, + tags: vec![], + fields: vec![], + } + } + pub fn add_tag(&mut self, name: &'static str, value: &str) -> &mut Self { self.tags.push((name, value.to_string())); self @@ -160,6 +169,56 @@ macro_rules! create_datapoint { }; } +#[macro_export] +macro_rules! 
create_datapoint_at { + (@field $point:ident $name:expr, $string:expr, String) => { + $point.add_field_str($name, &$string); + }; + (@field $point:ident $name:expr, $value:expr, i64) => { + $point.add_field_i64($name, $value as i64); + }; + (@field $point:ident $name:expr, $value:expr, f64) => { + $point.add_field_f64($name, $value as f64); + }; + (@field $point:ident $name:expr, $value:expr, bool) => { + $point.add_field_bool($name, $value as bool); + }; + (@tag $point:ident $tag_name:expr, $tag_value:expr) => { + $point.add_tag($tag_name, &$tag_value); + }; + + (@fields $point:ident) => {}; + + // process tags + (@fields $point:ident $tag_name:expr => $tag_value:expr, $($rest:tt)*) => { + $crate::create_datapoint!(@tag $point $tag_name, $tag_value); + $crate::create_datapoint!(@fields $point $($rest)*); + }; + (@fields $point:ident $tag_name:expr => $tag_value:expr) => { + $crate::create_datapoint!(@tag $point $tag_name, $tag_value); + }; + + // process fields + (@fields $point:ident ($name:expr, $value:expr, $type:ident) , $($rest:tt)*) => { + $crate::create_datapoint!(@field $point $name, $value, $type); + $crate::create_datapoint!(@fields $point $($rest)*); + }; + (@fields $point:ident ($name:expr, $value:expr, $type:ident)) => { + $crate::create_datapoint!(@field $point $name, $value, $type); + }; + + (@point $name:expr, $at:expr, $($fields:tt)+) => { + { + let mut point = $crate::datapoint::DataPoint::at($at, &$name); + $crate::create_datapoint!(@fields point $($fields)+); + point + } + }; + (@point $name:expr, $at:expr) => { + $crate::datapoint::DataPoint::at($at, &$name) + }; +} + #[macro_export] macro_rules! datapoint { ($level:expr, $name:expr, $($fields:tt)+) => { @@ -168,6 +227,21 @@ macro_rules! datapoint { } }; } + +#[macro_export] +macro_rules! datapoint_at { + ($level:expr, $at:expr, $name:expr) => { + if log::log_enabled!($level) { + $crate::submit($crate::create_datapoint_at!(@point $name, $at), $level); + } + }; + ($level:expr, $at:expr, $name:expr, $($fields:tt)+) => { + if log::log_enabled!($level) { + $crate::submit($crate::create_datapoint_at!(@point $name, $at, $($fields)+), $level); + } + }; +} + #[macro_export] macro_rules! datapoint_error { ($name:expr, $($fields:tt)+) => { @@ -189,6 +263,16 @@ macro_rules! datapoint_info { }; } +#[macro_export] +macro_rules! datapoint_info_at { + ($at:expr, $name:expr) => { + $crate::datapoint_at!(log::Level::Info, $at, $name); + }; + ($at:expr, $name:expr, $($fields:tt)+) => { + $crate::datapoint_at!(log::Level::Info, $at, $name, $($fields)+); + }; +} + #[macro_export] macro_rules! 
datapoint_debug { ($name:expr, $($fields:tt)+) => { diff --git a/metrics/src/metrics.rs b/metrics/src/metrics.rs index aae2dabb364077..069aa2531a74ba 100644 --- a/metrics/src/metrics.rs +++ b/metrics/src/metrics.rs @@ -181,7 +181,7 @@ impl Default for MetricsAgent { Self::new( Arc::new(InfluxDbMetricsWriter::new()), - Duration::from_secs(10), + Duration::from_secs(1), max_points_per_sec, ) } diff --git a/program-runtime/src/loaded_programs.rs b/program-runtime/src/loaded_programs.rs index e5ee034e753a2d..dde03acb6da83f 100644 --- a/program-runtime/src/loaded_programs.rs +++ b/program-runtime/src/loaded_programs.rs @@ -791,6 +791,10 @@ impl ProgramCache { self.fork_graph = Some(fork_graph); } + pub fn unset_fork_graph(&mut self) { + self.fork_graph = None; + } + /// Returns the current environments depending on the given epoch pub fn get_environments_for_epoch(&self, epoch: Epoch) -> &ProgramRuntimeEnvironments { if epoch != self.latest_root_epoch { diff --git a/programs/sbf/Cargo.lock b/programs/sbf/Cargo.lock index 2ccdc186b0eb6d..432beb9053dabf 100644 --- a/programs/sbf/Cargo.lock +++ b/programs/sbf/Cargo.lock @@ -1113,7 +1113,7 @@ version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c278839b831783b70278b14df4d45e1beb1aad306c07bb796637de9a0e323e8e" dependencies = [ - "crossbeam-utils", + "crossbeam-utils 0.8.18", ] [[package]] @@ -1189,6 +1189,16 @@ dependencies = [ "winapi 0.2.8", ] +[[package]] +name = "cpu-time" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e9e393a7668fe1fad3075085b86c781883000b4ede868f43627b34a87c8b7ded" +dependencies = [ + "libc", + "winapi 0.3.9", +] + [[package]] name = "cpufeatures" version = "0.2.7" @@ -1209,11 +1219,10 @@ dependencies = [ [[package]] name = "crossbeam-channel" -version = "0.5.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ab3db02a9c5b5121e1e42fbdb1aeb65f5e02624cc58c43f2884c6ccac0b82f95" +version = "0.5.11" +source = "git+https://github.com/ryoqun/crossbeam?rev=438ec7cdaf6c6a8f593e50344c725fef8a13c7a5#438ec7cdaf6c6a8f593e50344c725fef8a13c7a5" dependencies = [ - "crossbeam-utils", + "crossbeam-utils 0.8.19", ] [[package]] @@ -1224,7 +1233,7 @@ checksum = "6455c0ca19f0d2fbf751b908d5c55c1f5cbc65e03c4225427254b46890bdde1e" dependencies = [ "cfg-if 1.0.0", "crossbeam-epoch", - "crossbeam-utils", + "crossbeam-utils 0.8.18", ] [[package]] @@ -1234,7 +1243,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4ec02e091aa634e2c3ada4a392989e7c3116673ef0ac5b72232439094d73b7fd" dependencies = [ "cfg-if 1.0.0", - "crossbeam-utils", + "crossbeam-utils 0.8.18", "lazy_static", "memoffset 0.6.4", "scopeguard", @@ -1249,6 +1258,11 @@ dependencies = [ "cfg-if 1.0.0", ] +[[package]] +name = "crossbeam-utils" +version = "0.8.19" +source = "git+https://github.com/ryoqun/crossbeam?rev=438ec7cdaf6c6a8f593e50344c725fef8a13c7a5#438ec7cdaf6c6a8f593e50344c725fef8a13c7a5" + [[package]] name = "crunchy" version = "0.2.2" @@ -1367,6 +1381,15 @@ dependencies = [ "rusticata-macros", ] +[[package]] +name = "deranged" +version = "0.3.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8eb30d70a07a3b04884d2677f06bec33509dc67ca60d92949e5535352d3191dc" +dependencies = [ + "powerfmt", +] + [[package]] name = "derivation-path" version = "0.2.0" @@ -2116,6 +2139,12 @@ version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"fed44880c466736ef9a5c5b5facefb5ed0785676d0c02d612db14e54f0d84286" +[[package]] +name = "hex" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" + [[package]] name = "histogram" version = "0.6.9" @@ -3180,15 +3209,6 @@ dependencies = [ "syn 2.0.52", ] -[[package]] -name = "num_threads" -version = "0.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aba1801fb138d8e85e11d0fc70baf4fe1cdfffda7c6cd34a854905df588e5ed0" -dependencies = [ - "libc", -] - [[package]] name = "number_prefix" version = "0.4.0" @@ -3550,6 +3570,12 @@ version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dc59d1bcc64fc5d021d67521f818db868368028108d37f0e98d74e33f68297b5" +[[package]] +name = "powerfmt" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391" + [[package]] name = "ppv-lite86" version = "0.2.8" @@ -3670,6 +3696,32 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "procfs" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "731e0d9356b0c25f16f33b5be79b1c57b562f141ebfcdb0ad8ac2c13a24293b4" +dependencies = [ + "bitflags 2.4.2", + "chrono", + "flate2", + "hex", + "lazy_static", + "procfs-core", + "rustix", +] + +[[package]] +name = "procfs-core" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d3554923a69f4ce04c4a754260c338f505ce22642d3830e049a399fc2059a29" +dependencies = [ + "bitflags 2.4.2", + "chrono", + "hex", +] + [[package]] name = "prost" version = "0.11.9" @@ -3907,7 +3959,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2" dependencies = [ "crossbeam-deque", - "crossbeam-utils", + "crossbeam-utils 0.8.18", ] [[package]] @@ -4134,9 +4186,9 @@ dependencies = [ [[package]] name = "rustix" -version = "0.38.31" +version = "0.38.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ea3e1a662af26cd7a3ba09c0297a31af215563ecf42817c98df621387f4e949" +checksum = "65e04861e65f21776e67888bfbea442b3642beaa0138fdb1dd7a84a52dffdb89" dependencies = [ "bitflags 2.4.2", "errno", @@ -4360,8 +4412,13 @@ version = "2.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "07ff71d2c147a7b57362cead5e22f772cd52f6ab31cfcd9edcd7f6aeb2a0afbe" dependencies = [ + "base64 0.13.1", + "chrono", + "hex", "serde", + "serde_json", "serde_with_macros", + "time", ] [[package]] @@ -6538,6 +6595,7 @@ name = "solana-unified-scheduler-logic" version = "2.0.0" dependencies = [ "assert_matches", + "qualifier_attr", "solana-sdk", "static_assertions", ] @@ -6547,17 +6605,24 @@ name = "solana-unified-scheduler-pool" version = "2.0.0" dependencies = [ "assert_matches", + "cpu-time", "crossbeam-channel", "dashmap", "derivative", "log", + "procfs", "qualifier_attr", + "rustix", + "serde_json", "solana-ledger", + "solana-measure", + "solana-metrics", "solana-program-runtime", "solana-runtime", "solana-sdk", "solana-unified-scheduler-logic", "solana-vote", + "tikv-jemallocator", ] [[package]] @@ -7210,21 +7275,32 @@ dependencies = [ [[package]] name = "time" -version = "0.3.9" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"c2702e08a7a860f005826c6815dcac101b19b5eb330c27fe4a5928fec1d20ddd" +checksum = "f657ba42c3f86e7680e53c8cd3af8abbe56b5491790b46e22e19c0d57463583e" dependencies = [ + "deranged", "itoa", - "libc", - "num_threads", + "powerfmt", + "serde", + "time-core", "time-macros", ] +[[package]] +name = "time-core" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef927ca75afb808a4d64dd374f00a2adf8d0fcff8e7b184af886c3c87ec4a3f3" + [[package]] name = "time-macros" -version = "0.2.4" +version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "42657b1a6f4d817cda8e7a0ace261fe0cc946cf3a80314390b22cc61ae080792" +checksum = "26197e33420244aeb70c3e8c78376ca46571bc4e701e4791c2cd9f57dcb3a43f" +dependencies = [ + "time-core", +] [[package]] name = "tiny-bip39" diff --git a/programs/sbf/Cargo.toml b/programs/sbf/Cargo.toml index 830b57d8e5359b..349a0e4867c853 100644 --- a/programs/sbf/Cargo.toml +++ b/programs/sbf/Cargo.toml @@ -166,6 +166,8 @@ members = [ targets = ["x86_64-unknown-linux-gnu"] [patch.crates-io] +crossbeam-channel = { git = "https://github.com/ryoqun/crossbeam", rev = "438ec7cdaf6c6a8f593e50344c725fef8a13c7a5" } + # We include the following crates as our dependencies from crates.io: # # * spl-associated-token-account diff --git a/runtime/src/bank.rs b/runtime/src/bank.rs index 33e4fa63bc0d19..1332e8acf490b6 100644 --- a/runtime/src/bank.rs +++ b/runtime/src/bank.rs @@ -823,7 +823,7 @@ pub struct Bank { epoch_reward_status: EpochRewardStatus, - transaction_processor: TransactionBatchProcessor, + pub transaction_processor: TransactionBatchProcessor, check_program_modification_slot: bool, diff --git a/runtime/src/bank_forks.rs b/runtime/src/bank_forks.rs index 46d9516d9e7b2e..6752b3b5e84636 100644 --- a/runtime/src/bank_forks.rs +++ b/runtime/src/bank_forks.rs @@ -5,7 +5,8 @@ use { accounts_background_service::{AbsRequestSender, SnapshotRequest, SnapshotRequestKind}, bank::{epoch_accounts_hash_utils, Bank, SquashTiming}, installed_scheduler_pool::{ - BankWithScheduler, InstalledSchedulerPoolArc, SchedulingContext, + BankWithScheduler, DefaultScheduleExecutionArg, InstalledSchedulerPoolArc, + SchedulingContext, }, snapshot_config::SnapshotConfig, }, @@ -16,6 +17,7 @@ use { solana_sdk::{ clock::{Epoch, Slot}, hash::Hash, + scheduling::SchedulingMode, timing, }, std::{ @@ -81,7 +83,13 @@ pub struct BankForks { last_accounts_hash_slot: Slot, in_vote_only_mode: Arc, highest_slot_at_startup: Slot, - scheduler_pool: Option, + scheduler_pool: Option>, +} + +impl Drop for BankForks { + fn drop(&mut self) { + info!("BankForks::drop(): successfully dropped"); + } } impl Index for BankForks { @@ -215,7 +223,10 @@ impl BankForks { self[self.root()].clone() } - pub fn install_scheduler_pool(&mut self, pool: InstalledSchedulerPoolArc) { + pub fn install_scheduler_pool( + &mut self, + pool: InstalledSchedulerPoolArc, + ) { info!("Installed new scheduler_pool into bank_forks: {:?}", pool); assert!( self.scheduler_pool.replace(pool).is_none(), @@ -223,6 +234,27 @@ impl BankForks { ); } + pub fn uninstall_scheduler_pool(&mut self) { + // hint scheduler pool to cut circular references of Arc + if let Some(sp) = self.scheduler_pool.take() { + sp.uninstalled_from_bank_forks(); + } + } + + pub fn prepare_to_drop(&mut self) { + let root_bank = self.root_bank(); + // drop all non root BankWithScheduler, which causes all schedulers wind down. 
+ self.banks.clear(); + self.uninstall_scheduler_pool(); + // this cuts circular references of BankForks... + root_bank + .transaction_processor + .program_cache + .write() + .unwrap() + .unset_fork_graph(); + } + pub fn insert(&mut self, mut bank: Bank) -> BankWithScheduler { if self.root.load(Ordering::Relaxed) < self.highest_slot_at_startup { bank.check_program_modification_slot(); @@ -230,7 +262,7 @@ impl BankForks { let bank = Arc::new(bank); let bank = if let Some(scheduler_pool) = &self.scheduler_pool { - let context = SchedulingContext::new(bank.clone()); + let context = SchedulingContext::new(SchedulingMode::BlockVerification, bank.clone()); let scheduler = scheduler_pool.take_scheduler(context); BankWithScheduler::new(bank, Some(scheduler)) } else { diff --git a/runtime/src/installed_scheduler_pool.rs b/runtime/src/installed_scheduler_pool.rs index aaf3ea98f1b9aa..6dde48f1e424b3 100644 --- a/runtime/src/installed_scheduler_pool.rs +++ b/runtime/src/installed_scheduler_pool.rs @@ -25,21 +25,25 @@ use { log::*, solana_program_runtime::timings::ExecuteTimings, solana_sdk::{ + clock::Slot, hash::Hash, - slot_history::Slot, + scheduling::{SchedulingMode, WithSchedulingMode}, transaction::{Result, SanitizedTransaction}, }, std::{ + borrow::Borrow, fmt::Debug, ops::Deref, sync::{Arc, RwLock}, + thread, }, }; #[cfg(feature = "dev-context-only-utils")] use {mockall::automock, qualifier_attr::qualifiers}; -pub trait InstalledSchedulerPool: Send + Sync + Debug { - fn take_scheduler(&self, context: SchedulingContext) -> InstalledSchedulerBox; +pub trait InstalledSchedulerPool: Send + Sync + Debug { + fn take_scheduler(&self, context: SchedulingContext) -> Box>; + fn uninstalled_from_bank_forks(self: Arc); } #[cfg_attr(doc, aquamarine::aquamarine)] @@ -97,15 +101,15 @@ pub trait InstalledSchedulerPool: Send + Sync + Debug { feature = "dev-context-only-utils", allow(unused_attributes, clippy::needless_lifetimes) )] -pub trait InstalledScheduler: Send + Sync + Debug + 'static { +pub trait InstalledScheduler: Send + Sync + Debug + 'static { fn id(&self) -> SchedulerId; fn context(&self) -> &SchedulingContext; // Calling this is illegal as soon as wait_for_termination is called. fn schedule_execution<'a>( &'a self, - transaction_with_index: &'a (&'a SanitizedTransaction, usize), - ); + transaction_with_index: SEA::TransactionWithIndex<'a>, + ) -> Result<()>; /// Wait for a scheduler to terminate after processing. /// @@ -135,13 +139,47 @@ pub trait UninstalledScheduler: Send + Sync + Debug + 'static { fn return_to_pool(self: Box); } -pub type InstalledSchedulerBox = Box; +pub type InstalledSchedulerBox = Box>; pub type UninstalledSchedulerBox = Box; -pub type InstalledSchedulerPoolArc = Arc; +pub type InstalledSchedulerPoolArc = Arc>; pub type SchedulerId = u64; +pub trait WithTransactionAndIndex: Send + Sync + Debug { + fn with_transaction_and_index( + &self, + callback: impl FnOnce(&SanitizedTransaction, usize) -> R, + ) -> R; +} + +impl< + T: Send + Sync + Debug + Borrow, + U: Send + Sync + Debug + Borrow, + Z: Send + Sync + Debug + Deref, + > WithTransactionAndIndex for Z +{ + fn with_transaction_and_index( + &self, + callback: impl FnOnce(&SanitizedTransaction, usize) -> R, + ) -> R { + callback(self.0.borrow(), *self.1.borrow()) + } +} + +pub trait ScheduleExecutionArg: Send + Sync + Debug + 'static { + // GAT is used to make schedule_execution parametric even supporting references + // under the object-safety req. of InstalledScheduler trait... 
+ type TransactionWithIndex<'tx>: WithTransactionAndIndex; +} + +#[derive(Debug, Default, Clone)] +pub struct DefaultScheduleExecutionArg; + +impl ScheduleExecutionArg for DefaultScheduleExecutionArg { + type TransactionWithIndex<'tx> = &'tx (&'tx SanitizedTransaction, usize); +} + /// A small context to propagate a bank and its scheduling mode to the scheduler subsystem. /// /// Note that this isn't called `SchedulerContext` because the contexts aren't associated with @@ -153,13 +191,19 @@ pub type SchedulerId = u64; /// `SchedulingContext`s. #[derive(Clone, Debug)] pub struct SchedulingContext { - // mode: SchedulingMode, // this will be added later. + mode: SchedulingMode, bank: Arc, } +impl WithSchedulingMode for SchedulingContext { + fn mode(&self) -> SchedulingMode { + self.mode + } +} + impl SchedulingContext { - pub fn new(bank: Arc) -> Self { - Self { bank } + pub fn new(mode: SchedulingMode, bank: Arc) -> Self { + Self { mode, bank } } pub fn bank(&self) -> &Arc { @@ -246,9 +290,14 @@ impl BankWithScheduler { pub(crate) fn new(bank: Arc, scheduler: Option) -> Self { if let Some(bank_in_context) = scheduler .as_ref() - .map(|scheduler| scheduler.context().bank()) + .map(|scheduler| scheduler.context().bank().clone()) { - assert!(Arc::ptr_eq(&bank, bank_in_context)); + assert!( + Arc::ptr_eq(&bank, &bank_in_context), + "different bank!? {} {}", + bank.slot(), + bank_in_context.slot() + ); } Self { @@ -290,7 +339,7 @@ impl BankWithScheduler { pub fn schedule_transaction_executions<'a>( &self, transactions_with_indexes: impl ExactSizeIterator, - ) { + ) -> Result<()> { trace!( "schedule_transaction_executions(): {} txs", transactions_with_indexes.len() @@ -300,8 +349,10 @@ impl BankWithScheduler { let scheduler = scheduler_guard.as_ref().unwrap(); for (sanitized_transaction, &index) in transactions_with_indexes { - scheduler.schedule_execution(&(sanitized_transaction, index)); + scheduler.schedule_execution(&(sanitized_transaction, index))?; } + + Ok(()) } // take needless &mut only to communicate its semantic mutability to humans... 
@@ -356,7 +407,7 @@ impl BankWithSchedulerInner { "wait_for_scheduler_termination(slot: {}, reason: {:?}): started at {:?}...", bank.slot(), reason, - std::thread::current(), + thread::current(), ); let mut scheduler = scheduler.write().unwrap(); @@ -378,14 +429,14 @@ impl BankWithSchedulerInner { reason, was_noop, result_with_timings.as_ref().map(|(result, _)| result), - std::thread::current(), + thread::current(), ); result_with_timings } fn drop_scheduler(&self) { - if std::thread::panicking() { + if thread::panicking() { error!( "BankWithSchedulerInner::drop_scheduler(): slot: {} skipping due to already panicking...", self.bank.slot(), @@ -438,7 +489,7 @@ mod tests { fn setup_mocked_scheduler_with_extra( bank: Arc, is_dropped_flags: impl Iterator, - f: Option, + f: Option)>, ) -> InstalledSchedulerBox { let mut mock = MockInstalledScheduler::new(); let seq = Arc::new(Mutex::new(Sequence::new())); @@ -446,7 +497,10 @@ mod tests { mock.expect_context() .times(1) .in_sequence(&mut seq.lock().unwrap()) - .return_const(SchedulingContext::new(bank)); + .return_const(SchedulingContext::new( + SchedulingMode::BlockVerification, + bank, + )); for wait_reason in is_dropped_flags { let seq_cloned = seq.clone(); @@ -482,7 +536,7 @@ mod tests { setup_mocked_scheduler_with_extra( bank, is_dropped_flags, - None:: ()>, + None::) -> ()>, ) } @@ -538,12 +592,14 @@ mod tests { Some(setup_mocked_scheduler_with_extra( bank, [false].into_iter(), - Some(|mocked: &mut MockInstalledScheduler| { - mocked - .expect_pause_for_recent_blockhash() - .times(1) - .returning(|| ()); - }), + Some( + |mocked: &mut MockInstalledScheduler| { + mocked + .expect_pause_for_recent_blockhash() + .times(1) + .returning(|| ()); + }, + ), )), ); goto_end_of_slot_with_scheduler(&bank); @@ -569,15 +625,18 @@ mod tests { let mocked_scheduler = setup_mocked_scheduler_with_extra( bank.clone(), [true].into_iter(), - Some(|mocked: &mut MockInstalledScheduler| { - mocked - .expect_schedule_execution() - .times(1) - .returning(|(_, _)| ()); - }), + Some( + |mocked: &mut MockInstalledScheduler| { + mocked + .expect_schedule_execution() + .times(1) + .returning(|(_, _)| Ok(())); + }, + ), ); let bank = BankWithScheduler::new(bank, Some(mocked_scheduler)); - bank.schedule_transaction_executions([(&tx0, &0)].into_iter()); + bank.schedule_transaction_executions([(&tx0, &0)].into_iter()) + .unwrap(); } } diff --git a/sdk/Cargo.toml b/sdk/Cargo.toml index 034a98623419a8..a81c62115a6935 100644 --- a/sdk/Cargo.toml +++ b/sdk/Cargo.toml @@ -69,7 +69,7 @@ serde = { workspace = true } serde_bytes = { workspace = true } serde_derive = { workspace = true } serde_json = { workspace = true, optional = true } -serde_with = { workspace = true, features = ["macros"] } +serde_with = { workspace = true, features = ["macros", "alloc"] } sha2 = { workspace = true } sha3 = { workspace = true, optional = true } siphasher = { workspace = true } diff --git a/sdk/src/lib.rs b/sdk/src/lib.rs index 12cc8ac7a232bc..64079236c1eb35 100644 --- a/sdk/src/lib.rs +++ b/sdk/src/lib.rs @@ -98,6 +98,7 @@ pub mod reserved_account_keys; pub mod reward_info; pub mod reward_type; pub mod rpc_port; +pub mod scheduling; pub mod secp256k1_instruction; pub mod shred_version; pub mod signature; diff --git a/sdk/src/scheduling.rs b/sdk/src/scheduling.rs new file mode 100644 index 00000000000000..aa39f7a8b08e8d --- /dev/null +++ b/sdk/src/scheduling.rs @@ -0,0 +1,11 @@ +//! 
Primitive types relevant to transaction scheduling +#![cfg(feature = "full")] + +#[derive(Debug, Clone, Copy)] +pub enum SchedulingMode { + BlockVerification, +} + +pub trait WithSchedulingMode { + fn mode(&self) -> SchedulingMode; +} diff --git a/sdk/src/transaction/sanitized.rs b/sdk/src/transaction/sanitized.rs index fe951c7ff57147..d01e56ed5be8d7 100644 --- a/sdk/src/transaction/sanitized.rs +++ b/sdk/src/transaction/sanitized.rs @@ -38,11 +38,17 @@ pub struct SanitizedTransaction { } /// Set of accounts that must be locked for safe transaction processing -#[derive(Debug, Clone, Default, Eq, PartialEq)] +use serde_with::serde_as; +use serde_with::DisplayFromStr; + +#[serde_as] +#[derive(Debug, Clone, Default, Eq, PartialEq, Serialize)] pub struct TransactionAccountLocks<'a> { /// List of readonly account key locks + #[serde_as(as = "Vec")] pub readonly: Vec<&'a Pubkey>, /// List of writable account key locks + #[serde_as(as = "Vec")] pub writable: Vec<&'a Pubkey>, } diff --git a/unified-scheduler-logic/Cargo.toml b/unified-scheduler-logic/Cargo.toml index b05cec41a7c862..e1dd176a2bd510 100644 --- a/unified-scheduler-logic/Cargo.toml +++ b/unified-scheduler-logic/Cargo.toml @@ -11,5 +11,25 @@ edition = { workspace = true } [dependencies] assert_matches = { workspace = true } +qualifier_attr = { workspace = true } solana-sdk = { workspace = true } static_assertions = { workspace = true } +#[[bench]] +#name = "bench-with-iai-callgrind" +#harness = false + +[dev-dependencies] +# See order-crates-for-publishing.py for using this unusual `path = "."` +solana-unified-scheduler-logic = { path = ".", features = ["dev-context-only-utils"] } +triomphe = { version = "0.1.11" } + +[target."cfg(target_os = \"linux\")".dev-dependencies] +iai-callgrind = { version = "0.10.2", features = [ + "client_requests" +] } + +[target."cfg(not(target_os = \"linux\"))".dev-dependencies] +iai-callgrind = { version = "0.10.2" } + +[features] +dev-context-only-utils = [] diff --git a/unified-scheduler-logic/benches/bench-with-iai-callgrind.rs b/unified-scheduler-logic/benches/bench-with-iai-callgrind.rs new file mode 100644 index 00000000000000..a267874ef51cbb --- /dev/null +++ b/unified-scheduler-logic/benches/bench-with-iai-callgrind.rs @@ -0,0 +1,668 @@ +#![cfg(feature = "dummy")] +#![allow(clippy::arithmetic_side_effects)] + +#[global_allocator] +static GLOBAL: B = B; + +struct A(T); + +unsafe impl std::marker::Sync for A {} + +static LOCAL_ALLOCATOR: A> = A(std::cell::UnsafeCell::new(BL::new())); + +struct BL { + cursor: *mut u8, + limit: *mut u8, + bytes: [u8; Self::BLOCK_SIZE], +} + +impl BL { + const BLOCK_SIZE: usize = 100_000_000; + + const fn new() -> Self { + Self { + cursor: usize::max_value() as _, + limit: usize::max_value() as _, + bytes: [0; Self::BLOCK_SIZE], + } + } + + #[inline(always)] + pub fn alloc2(&mut self, bytes: usize) -> *mut u8 { + loop { + self.cursor = unsafe { (((self.cursor.sub(bytes)) as usize) & !15) as _ }; + if self.cursor >= self.limit { + return self.cursor; + } else if self.limit == usize::max_value() as _ { + self.limit = self.bytes.as_mut_ptr(); + self.cursor = unsafe { self.limit.add(Self::BLOCK_SIZE) }; + continue; + } else { + panic!("out of memory form BL"); + } + } + } +} + +use std::{ + alloc::{GlobalAlloc, Layout}, + hint::black_box, +}; + +struct B; + +unsafe impl GlobalAlloc for B { + #[inline(always)] + unsafe fn alloc(&self, layout: Layout) -> *mut u8 { + (*LOCAL_ALLOCATOR.0.get()).alloc2(layout.size()) + } + + #[inline(always)] + unsafe fn dealloc(&self, _ptr: 
*mut u8, _layout: Layout) {} +} + +use { + assert_matches::assert_matches, + iai_callgrind::{ + client_requests::callgrind::toggle_collect, library_benchmark, library_benchmark_group, + main, + }, + solana_sdk::{ + instruction::{AccountMeta, Instruction}, + message::Message, + pubkey::Pubkey, + signature::Signer, + signer::keypair::Keypair, + transaction::{SanitizedTransaction, Transaction}, + }, + solana_unified_scheduler_logic::{SchedulingStateMachine, UsageQueue}, +}; + +#[library_benchmark] +#[bench::min(0)] +#[bench::one(1)] +#[bench::two(2)] +#[bench::three(3)] +#[bench::normal(32)] +#[bench::large(64)] +#[bench::max(128)] +fn bench_schedule_task(account_count: usize) { + toggle_collect(); + let mut accounts = vec![]; + for i in 0..account_count { + if i % 2 == 0 { + accounts.push(AccountMeta::new(Keypair::new().pubkey(), true)); + } else { + accounts.push(AccountMeta::new_readonly(Keypair::new().pubkey(), true)); + } + } + + let payer = Keypair::new(); + let memo_ix = Instruction { + program_id: Pubkey::default(), + accounts, + data: vec![0x00], + }; + let mut ixs = vec![]; + for _ in 0..1 { + ixs.push(memo_ix.clone()); + } + let msg = Message::new(&ixs, Some(&payer.pubkey())); + let txn = Transaction::new_unsigned(msg); + //panic!("{:?}", txn); + //assert_eq!(wire_txn.len(), 3); + let tx0 = SanitizedTransaction::from_transaction_for_tests(txn); + let task = SchedulingStateMachine::create_task(tx0, 0, &mut |_| UsageQueue::default()); + let mut scheduler = + unsafe { SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling() }; + toggle_collect(); + let task = scheduler.schedule_task(task); + toggle_collect(); + task.unwrap(); +} + +#[library_benchmark] +#[bench::min(0)] +#[bench::one(1)] +#[bench::two(2)] +#[bench::three(3)] +#[bench::normal(32)] +#[bench::large(64)] +#[bench::max(128)] +fn bench_drop_task(account_count: usize) { + toggle_collect(); + let mut accounts = vec![]; + for _ in 0..account_count { + accounts.push(AccountMeta::new(Keypair::new().pubkey(), true)); + } + + let payer = Keypair::new(); + let memo_ix = Instruction { + program_id: Pubkey::default(), + accounts, + data: vec![0x00], + }; + let mut ixs = vec![]; + for _ in 0..1 { + ixs.push(memo_ix.clone()); + } + let msg = Message::new(&ixs, Some(&payer.pubkey())); + let txn = Transaction::new_unsigned(msg); + //panic!("{:?}", txn); + //assert_eq!(wire_txn.len(), 3); + let tx0 = SanitizedTransaction::from_transaction_for_tests(txn); + let task = SchedulingStateMachine::create_task(tx0, 0, &mut |_| UsageQueue::default()); + + toggle_collect(); + drop(task); + toggle_collect(); +} + +#[library_benchmark] +#[bench::one(1)] +fn bench_insert_task(account_count: usize) { + toggle_collect(); + let mut accounts = vec![]; + for _ in 0..account_count { + accounts.push(AccountMeta::new(Keypair::new().pubkey(), true)); + } + + let payer = Keypair::new(); + let memo_ix = Instruction { + program_id: Pubkey::default(), + accounts, + data: vec![0x00], + }; + let mut ixs = vec![]; + for _ in 0..1 { + ixs.push(memo_ix.clone()); + } + let msg = Message::new(&ixs, Some(&payer.pubkey())); + let txn = Transaction::new_unsigned(msg); + //panic!("{:?}", txn); + //assert_eq!(wire_txn.len(), 3); + let tx0 = SanitizedTransaction::from_transaction_for_tests(txn); + let task = SchedulingStateMachine::create_task(tx0, 0, &mut |_| UsageQueue::default()); + + let mut b = std::collections::BTreeMap::new(); + toggle_collect(); + b.insert(task.index, task.clone()); + b.insert(task.index + 1, task.clone()); + 
b.remove(&task.index); + b.remove(&(task.index + 1)); + //b.insert(task.index + 4, task); + toggle_collect(); + drop(b); +} + +#[library_benchmark] +#[bench::arc_new(1)] +#[bench::arc_new_and_clone(2)] +#[bench::rc_new(3)] +#[bench::rc_new_and_clone(4)] +fn bench_arc(account_count: usize) { + toggle_collect(); + + { + let b; + match account_count { + 1 => { + toggle_collect(); + b = black_box(std::sync::Arc::new(black_box(3_u32))); + } + 2 => { + b = black_box(std::sync::Arc::new(black_box(3_u32))); + toggle_collect(); + std::mem::forget(black_box(b.clone())); + } + _ => { + let b; + match account_count { + 3 => { + toggle_collect(); + b = black_box(std::rc::Rc::new(black_box(3_u32))); + } + 4 => { + toggle_collect(); + b = black_box(std::rc::Rc::new(black_box(3_u32))); + black_box(b.clone()); + } + _ => panic!(), + } + toggle_collect(); + drop(b); + return; + } + } + toggle_collect(); + drop(b); + } +} + +#[library_benchmark] +#[bench::arc_new(1)] +#[bench::arc_new_and_clone(2)] +#[bench::rc_new(3)] +#[bench::rc_new_and_clone(4)] +fn bench_triomphe_arc(account_count: usize) { + toggle_collect(); + + { + let b; + match account_count { + 1 => { + toggle_collect(); + b = black_box(triomphe::Arc::new(black_box(3_u32))); + } + 2 => { + b = black_box(triomphe::Arc::new(black_box(3_u32))); + toggle_collect(); + std::mem::forget(black_box(b.clone())); + } + _ => { + let b; + match account_count { + 3 => { + toggle_collect(); + b = black_box(std::rc::Rc::new(black_box(3_u32))); + } + 4 => { + toggle_collect(); + b = black_box(std::rc::Rc::new(black_box(3_u32))); + black_box(b.clone()); + } + _ => panic!(), + } + toggle_collect(); + drop(b); + return; + } + } + toggle_collect(); + drop(b); + } +} + +#[library_benchmark] +#[bench::one(1)] +fn bench_heaviest_task(account_count: usize) { + toggle_collect(); + let mut accounts = vec![]; + for _ in 0..account_count { + accounts.push(AccountMeta::new(Keypair::new().pubkey(), true)); + } + + let payer = Keypair::new(); + let memo_ix = Instruction { + program_id: Pubkey::default(), + accounts, + data: vec![0x00], + }; + let mut ixs = vec![]; + for _ in 0..1 { + ixs.push(memo_ix.clone()); + } + let msg = Message::new(&ixs, Some(&payer.pubkey())); + let txn = Transaction::new_unsigned(msg); + //panic!("{:?}", txn); + //assert_eq!(wire_txn.len(), 3); + let tx0 = SanitizedTransaction::from_transaction_for_tests(txn); + let task = SchedulingStateMachine::create_task(tx0, 0, &mut |_| UsageQueue::default()); + + let mut b = std::collections::BTreeMap::new(); + b.insert(task.index, task.clone()); + b.insert(task.index + 1, task.clone()); + b.insert(task.index + 2, task.clone()); + let mut c = std::collections::BTreeMap::new(); + c.insert(task.index + 3, task.clone()); + c.insert(task.index + 4, task.clone()); + c.insert(task.index + 5, task.clone()); + + toggle_collect(); + let d = b.first_key_value(); + let e = c.first_key_value(); + let f = std::cmp::min_by(d, e, |x, y| x.map(|x| x.0).cmp(&y.map(|y| y.0))).map(|x| x.1); + assert_matches!(f.map(|f| f.task_index()), Some(0)); + toggle_collect(); + dbg!(f); + + drop(b); +} + +#[library_benchmark] +#[bench::min(0)] +#[bench::one(1)] +#[bench::two(2)] +#[bench::three(3)] +#[bench::normal(32)] +#[bench::large(64)] +#[bench::max(128)] +fn bench_schedule_task_conflicting(account_count: usize) { + toggle_collect(); + let mut accounts = vec![]; + for _ in 0..account_count { + accounts.push(AccountMeta::new(Keypair::new().pubkey(), true)); + } + + let payer = Keypair::new(); + let memo_ix = Instruction { + program_id: 
Pubkey::default(), + accounts, + data: vec![0x00], + }; + let mut ixs = vec![]; + for _ in 0..1 { + ixs.push(memo_ix.clone()); + } + let msg = Message::new(&ixs, Some(&payer.pubkey())); + let txn = Transaction::new_unsigned(msg); + //panic!("{:?}", txn); + //assert_eq!(wire_txn.len(), 3); + let tx0 = SanitizedTransaction::from_transaction_for_tests(txn); + let task = SchedulingStateMachine::create_task(tx0, 0, &mut |_| UsageQueue::default()); + let mut scheduler = + unsafe { SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling() }; + let task = scheduler.schedule_task(task).unwrap(); + let task2 = task.clone(); + toggle_collect(); + assert_matches!(scheduler.schedule_task(task2), None); + toggle_collect(); + drop(task); +} + +#[library_benchmark] +#[bench::min(3, 0)] +#[bench::one(3, 1)] +#[bench::two(2, 2)] +#[bench::three(3, 3)] +#[bench::normal(3, 32)] +#[bench::large(3, 64)] +#[bench::large2(3, 128)] +#[bench::large3(3, 256)] +#[bench::large4(3, 1024)] +#[bench::large5(3, 2048)] +fn bench_schedule_task_conflicting_hot(account_count: usize, task_count: usize) { + toggle_collect(); + let mut accounts = vec![]; + for _ in 0..account_count { + accounts.push(AccountMeta::new(Keypair::new().pubkey(), true)); + } + + let payer = Keypair::new(); + let memo_ix = Instruction { + program_id: Pubkey::default(), + accounts, + data: vec![0x00], + }; + let mut ixs = vec![]; + for _ in 0..1 { + ixs.push(memo_ix.clone()); + } + let msg = Message::new(&ixs, Some(&payer.pubkey())); + let txn = Transaction::new_unsigned(msg); + //panic!("{:?}", txn); + //assert_eq!(wire_txn.len(), 3); + let tx0 = SanitizedTransaction::from_transaction_for_tests(txn); + + let mut scheduler = + unsafe { SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling() }; + + let mut usage_queues: std::collections::HashMap = + std::collections::HashMap::new(); + let task = SchedulingStateMachine::create_task(tx0.clone(), 0, &mut |address| { + usage_queues.entry(address).or_default().clone() + }); + scheduler.schedule_task(task).unwrap(); + for i in 1..=task_count { + let task = SchedulingStateMachine::create_task(tx0.clone(), i, &mut |address| { + usage_queues.entry(address).or_default().clone() + }); + assert_matches!(scheduler.schedule_task(task), None); + } + + let task = SchedulingStateMachine::create_task(tx0.clone(), task_count + 1, &mut |address| { + usage_queues.entry(address).or_default().clone() + }); + let task2 = task.clone(); + + toggle_collect(); + assert_matches!(scheduler.schedule_task(task2), None); + toggle_collect(); + + drop(task); +} + +#[library_benchmark] +#[bench::min(0)] +#[bench::one(1)] +#[bench::two(2)] +#[bench::three(3)] +#[bench::normal(32)] +#[bench::large(64)] +#[bench::max(128)] +fn bench_deschedule_task_conflicting(account_count: usize) { + toggle_collect(); + let mut accounts = vec![]; + for _ in 0..account_count { + accounts.push(AccountMeta::new(Keypair::new().pubkey(), true)); + } + + let payer = Keypair::new(); + let memo_ix = Instruction { + program_id: Pubkey::default(), + accounts, + data: vec![0x00], + }; + let mut ixs = vec![]; + for _ in 0..1 { + ixs.push(memo_ix.clone()); + } + let msg = Message::new(&ixs, Some(&payer.pubkey())); + let txn = Transaction::new_unsigned(msg); + //panic!("{:?}", txn); + //assert_eq!(wire_txn.len(), 3); + let tx0 = SanitizedTransaction::from_transaction_for_tests(txn); + let task = SchedulingStateMachine::create_task(tx0, 0, &mut |_| UsageQueue::default()); + let mut scheduler = + unsafe { 
SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling() }; + let task = scheduler.schedule_task(task).unwrap(); + assert_matches!(scheduler.schedule_task(task.clone()), None); + + toggle_collect(); + scheduler.deschedule_task(&task); + toggle_collect(); + + drop(task); +} + +#[library_benchmark] +#[bench::min(0)] +#[bench::one(1)] +#[bench::two(2)] +#[bench::three(3)] +#[bench::normal(32)] +#[bench::large(64)] +#[bench::max(128)] +fn bench_schedule_unblocked_task(account_count: usize) { + toggle_collect(); + let mut accounts = vec![]; + for _ in 0..account_count { + accounts.push(AccountMeta::new(Keypair::new().pubkey(), true)); + } + + let payer = Keypair::new(); + let memo_ix = Instruction { + program_id: Pubkey::default(), + accounts, + data: vec![0x00], + }; + let mut ixs = vec![]; + for _ in 0..1 { + ixs.push(memo_ix.clone()); + } + let msg = Message::new(&ixs, Some(&payer.pubkey())); + let txn = Transaction::new_unsigned(msg); + //panic!("{:?}", txn); + //assert_eq!(wire_txn.len(), 3); + let tx0 = SanitizedTransaction::from_transaction_for_tests(txn); + let mut usage_queues: std::collections::HashMap = + std::collections::HashMap::new(); + let task = SchedulingStateMachine::create_task(tx0.clone(), 0, &mut |address| { + usage_queues.entry(address).or_default().clone() + }); + let task2 = SchedulingStateMachine::create_task(tx0, 1, &mut |address| { + usage_queues.entry(address).or_default().clone() + }); + let mut scheduler = + unsafe { SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling() }; + let task = scheduler.schedule_task(task).unwrap(); + assert_matches!(scheduler.schedule_task(task2), None); + scheduler.deschedule_task(&task); + toggle_collect(); + let retried_task = scheduler.schedule_next_unblocked_task(); + toggle_collect(); + let retried_task = retried_task.unwrap(); + assert_eq!(task.transaction(), retried_task.transaction()); + drop(task); +} + +#[library_benchmark] +#[bench::min(0)] +#[bench::one(1)] +#[bench::two(2)] +#[bench::three(3)] +#[bench::small(16)] +#[bench::normal(32)] +#[bench::large(64)] +//#[bench::max(128)] +fn bench_end_to_end_worst(account_count: usize) { + toggle_collect(); + let mut accounts = vec![]; + for _ in 0..account_count { + accounts.push(AccountMeta::new(Keypair::new().pubkey(), true)); + } + + let payer = Keypair::new(); + let memo_ix = Instruction { + program_id: Pubkey::default(), + accounts, + data: vec![0x00], + }; + let mut ixs = vec![]; + for _ in 0..1 { + ixs.push(memo_ix.clone()); + } + let msg = Message::new(&ixs, Some(&payer.pubkey())); + let txn = Transaction::new_unsigned(msg); + //panic!("{:?}", txn); + //assert_eq!(wire_txn.len(), 3); + let tx0 = SanitizedTransaction::from_transaction_for_tests(txn); + let mut usage_queues: std::collections::HashMap = + std::collections::HashMap::new(); + let task = SchedulingStateMachine::create_task(tx0.clone(), 0, &mut |address| { + usage_queues.entry(address).or_default().clone() + }); + let mut scheduler = + unsafe { SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling() }; + + let task = scheduler.schedule_task(task).unwrap(); + for i in 1..account_count { + let mut accounts = vec![memo_ix.accounts[i].clone()]; + //let mut accounts = vec![AccountMeta::new(Keypair::new().pubkey(), true)]; + for _ in 0..account_count { + accounts.push(AccountMeta::new(Keypair::new().pubkey(), true)); + } + + let payer = Keypair::new(); + let memo_ix = Instruction { + program_id: Pubkey::default(), + accounts, + data: vec![0x00], + }; + 
let ixs = vec![memo_ix]; + let msg = Message::new(&ixs, Some(&payer.pubkey())); + let txn = Transaction::new_unsigned(msg); + //panic!("{:?}", txn); + //assert_eq!(wire_txn.len(), 3); + let tx0 = SanitizedTransaction::from_transaction_for_tests(txn); + let task2 = SchedulingStateMachine::create_task(tx0, i, &mut |address| { + usage_queues.entry(address).or_default().clone() + }); + toggle_collect(); + let scheduled_task = scheduler.schedule_task(task2.clone()); + toggle_collect(); + drop(scheduled_task); + } + + toggle_collect(); + scheduler.deschedule_task(&task); + if let Some(_cc) = account_count.checked_sub(1) { + //assert_eq!(scheduler.unblocked_task_count(), cc); + //let mut c = 0; + while let Some(retried_task) = scheduler.schedule_next_unblocked_task() { + //c += 1; + //scheduler.deschedule_task(&retried_task); + toggle_collect(); + drop::(retried_task); + toggle_collect(); + } + //assert_eq!(c, cc); + } + toggle_collect(); + + //assert_eq!(task2.task_index(), retried_task.task_index()); + drop(task); +} + +#[library_benchmark] +#[bench::min(0)] +#[bench::one(1)] +#[bench::two(2)] +#[bench::three(3)] +#[bench::normal(32)] +#[bench::large(64)] +#[bench::max(128)] +fn bench_deschedule_task(account_count: usize) { + toggle_collect(); + let mut accounts = vec![]; + for i in 0..account_count { + if i % 2 == 0 { + accounts.push(AccountMeta::new(Keypair::new().pubkey(), true)); + } else { + accounts.push(AccountMeta::new_readonly(Keypair::new().pubkey(), true)); + } + } + + let payer = Keypair::new(); + let memo_ix = Instruction { + program_id: Pubkey::default(), + accounts, + data: vec![0x00], + }; + let mut ixs = vec![]; + for _ in 0..1 { + ixs.push(memo_ix.clone()); + } + let msg = Message::new(&ixs, Some(&payer.pubkey())); + let txn = Transaction::new_unsigned(msg); + //panic!("{:?}", txn); + //assert_eq!(wire_txn.len(), 3); + let tx0 = SanitizedTransaction::from_transaction_for_tests(txn); + let task = SchedulingStateMachine::create_task(tx0, 0, &mut |_| UsageQueue::default()); + let mut scheduler = + unsafe { SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling() }; + let task = scheduler.schedule_task(task).unwrap(); + toggle_collect(); + scheduler.deschedule_task(&task); + toggle_collect(); + drop(task); +} + +library_benchmark_group!( + name = bench_scheduling_state_machine; + benchmarks = bench_end_to_end_worst, bench_arc, bench_triomphe_arc, bench_drop_task, bench_insert_task, bench_heaviest_task, bench_schedule_task, bench_schedule_task_conflicting, bench_schedule_task_conflicting_hot, bench_deschedule_task, bench_deschedule_task_conflicting, bench_schedule_unblocked_task + //benchmarks = bench_arc, bench_triomphe_arc + //benchmarks = bench_end_to_end_worst +); + +main!(library_benchmark_groups = bench_scheduling_state_machine); diff --git a/unified-scheduler-logic/src/lib.rs b/unified-scheduler-logic/src/lib.rs index 2bae4f603582f5..2f82baf693acff 100644 --- a/unified-scheduler-logic/src/lib.rs +++ b/unified-scheduler-logic/src/lib.rs @@ -95,6 +95,8 @@ //! susceptible to the buffer bloat problem by itself as explained by the description and validated //! by the mentioned benchmark above. Thus, this should be solved elsewhere, specifically at the //! scheduler pool. +#[cfg(feature = "dev-context-only-utils")] +use qualifier_attr::field_qualifiers; use { crate::utils::{ShortCounter, Token, TokenCell}, assert_matches::assert_matches, @@ -105,6 +107,8 @@ use { /// Internal utilities. Namely this contains [`ShortCounter`] and [`TokenCell`]. 
mod utils { + #[cfg(feature = "dev-context-only-utils")] + use qualifier_attr::qualifiers; use std::{ any::{self, TypeId}, cell::{RefCell, UnsafeCell}, @@ -116,6 +120,7 @@ mod utils { /// A really tiny counter to hide `.checked_{add,sub}` all over the place. /// /// It's caller's reponsibility to ensure this (backed by [`u32`]) never overflow. + #[cfg_attr(feature = "dev-context-only-utils", qualifiers(pub))] #[derive(Debug, Clone, Copy)] pub(super) struct ShortCounter(u32); @@ -249,6 +254,7 @@ mod utils { /// existence of mutable access over them by requiring the token itself to be mutably borrowed /// to get a mutable reference to the internal value of `TokenCell`. // *mut is used to make this type !Send and !Sync + #[cfg_attr(feature = "dev-context-only-utils", qualifiers(pub))] pub(super) struct Token(PhantomData<*mut V>); impl Token { @@ -411,6 +417,7 @@ type BlockedUsageCountToken = Token; const_assert_eq!(mem::size_of::(), 0); /// Internal scheduling data about a particular task. +#[cfg_attr(feature = "dev-context-only-utils", field_qualifiers(index(pub)))] #[derive(Debug)] pub struct TaskInner { transaction: SanitizedTransaction, @@ -614,6 +621,7 @@ const_assert_eq!(mem::size_of::(), 8); /// A high-level `struct`, managing the overall scheduling of [tasks](Task), to be used by /// `solana-unified-scheduler-pool`. +#[cfg_attr(feature = "dev-context-only-utils", field_qualifiers(count_token(pub)))] pub struct SchedulingStateMachine { unblocked_task_queue: VecDeque, active_task_count: ShortCounter, diff --git a/unified-scheduler-pool/Cargo.toml b/unified-scheduler-pool/Cargo.toml index 1d57a9307f7a47..a0ae095ec8b3d4 100644 --- a/unified-scheduler-pool/Cargo.toml +++ b/unified-scheduler-pool/Cargo.toml @@ -11,22 +11,43 @@ edition = { workspace = true } [dependencies] assert_matches = { workspace = true } +cpu-time = { workspace = true } crossbeam-channel = { workspace = true } dashmap = { workspace = true } derivative = { workspace = true } log = { workspace = true } qualifier_attr = { workspace = true } +rustix = { workspace = true } +serde_json = { workspace = true } solana-ledger = { workspace = true } +solana-measure = { workspace = true } +solana-metrics = { workspace = true } solana-program-runtime = { workspace = true } solana-runtime = { workspace = true } solana-sdk = { workspace = true } solana-unified-scheduler-logic = { workspace = true } solana-vote = { workspace = true } +[target."cfg(target_os = \"linux\")".dependencies] +procfs = { workspace = true } + [dev-dependencies] -assert_matches = { workspace = true } +bincode = { workspace = true } +criterion = "0.5.1" +log = { workspace = true } +rand = { workspace = true } solana-logger = { workspace = true } +solana-nohash-hasher = { workspace = true } solana-runtime = { workspace = true, features = ["dev-context-only-utils"] } +# See order-crates-for-publishing.py for using this unusual `path = "."` +solana-unified-scheduler-pool = { path = ".", features = ["dev-context-only-utils"] } + +[target.'cfg(not(target_env = "msvc"))'.dependencies] +jemallocator = { workspace = true } + +[[bench]] +name = "lib" +harness = false [features] dev-context-only-utils = [] diff --git a/unified-scheduler-pool/benches/lib.rs b/unified-scheduler-pool/benches/lib.rs new file mode 100644 index 00000000000000..7284eec8b03065 --- /dev/null +++ b/unified-scheduler-pool/benches/lib.rs @@ -0,0 +1,217 @@ +#![allow(unused_imports, dead_code)] +#![feature(test)] + +extern crate test; + +#[cfg(not(target_env = "msvc"))] +use jemallocator::Jemalloc; + 
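+// Presumably, jemalloc is wired in as the global allocator below so that allocation-heavy bench +// iterations see more consistent timings than they would with the system allocator.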
+#[cfg(not(target_env = "msvc"))] +#[global_allocator] +static GLOBAL: Jemalloc = Jemalloc; + +use { + solana_program_runtime::timings::ExecuteTimings, + solana_runtime::{ + bank::Bank, + bank_forks::BankForks, + genesis_utils::{create_genesis_config, GenesisConfigInfo}, + installed_scheduler_pool::{ + DefaultScheduleExecutionArg, InstalledScheduler, SchedulingContext, + }, + prioritization_fee_cache::PrioritizationFeeCache, + }, + solana_sdk::{ + scheduling::SchedulingMode, + transaction::{Result, SanitizedTransaction}, + }, + solana_unified_scheduler_logic::{SchedulingStateMachine, UsageQueue}, + solana_unified_scheduler_pool::{ + HandlerContext, PooledScheduler, SchedulerPool, SpawnableScheduler, TaskHandler, + }, + std::sync::Arc, +}; + +#[derive(Debug, Clone)] +struct DummyTaskHandler; + +impl TaskHandler for DummyTaskHandler { + fn handle( + &self, + _result: &mut Result<()>, + _timings: &mut ExecuteTimings, + _bank: &Arc, + _transaction: &SanitizedTransaction, + _index: usize, + _handler_context: &HandlerContext, + ) { + } + + fn create>( + _pool: &SchedulerPool, + ) -> Self { + Self + } +} + +fn setup_dummy_fork_graph(bank: Bank) -> Arc { + let slot = bank.slot(); + let bank_fork = BankForks::new_rw_arc(bank); + let bank = bank_fork.read().unwrap().get(slot).unwrap(); + bank.transaction_processor + .program_cache + .write() + .unwrap() + .set_fork_graph(bank_fork); + bank +} + +use solana_sdk::{ + instruction::{AccountMeta, Instruction}, + message::Message, + pubkey::Pubkey, + signature::Signer, + signer::keypair::Keypair, + transaction::Transaction, +}; + +fn do_bench_tx_throughput(label: &str, bencher: &mut Criterion) { + solana_logger::setup(); + + /* + let GenesisConfigInfo { + genesis_config, + .. + } = create_genesis_config(10_000); + */ + let payer = Keypair::new(); + + let mut accounts = vec![]; + for i in 0..100 { + if i % 2 == 0 { + accounts.push(AccountMeta::new(Keypair::new().pubkey(), true)); + } else { + accounts.push(AccountMeta::new_readonly(Keypair::new().pubkey(), true)); + } + } + + let memo_ix = Instruction { + program_id: Pubkey::default(), + accounts, + data: vec![0x00], + }; + let mut ixs = vec![]; + for _ in 0..1 { + ixs.push(memo_ix.clone()); + } + let msg = Message::new(&ixs, Some(&payer.pubkey())); + let txn = Transaction::new_unsigned(msg); + //assert_eq!(wire_txn.len(), 3); + let tx0 = SanitizedTransaction::from_transaction_for_tests(txn); + /* + let bank = Bank::new_for_tests(&genesis_config); + let bank = setup_dummy_fork_graph(bank); + let ignored_prioritization_fee_cache = Arc::new(PrioritizationFeeCache::new(0u64)); + let pool = SchedulerPool::, _, _>::new( + None, + None, + None, + ignored_prioritization_fee_cache, + ); + let context = SchedulingContext::new(SchedulingMode::BlockVerification, bank.clone()); + */ + + let (s, r) = crossbeam_channel::bounded(1000); + + use std::sync::atomic::AtomicUsize; + let i = Arc::new(AtomicUsize::default()); + use std::sync::Mutex; + let usage_queues: Arc< + Mutex>, + > = Arc::new(Mutex::new(std::collections::HashMap::new())); + /* + for _ in 0..5 { + std::thread::Builder::new() + .name("solScGen".to_owned()) + .spawn({ + let usage_queues = usage_queues.clone(); + let i = i.clone(); + let tx1 = tx0.clone(); + let s = s.clone(); + move || loop { + let tasks = std::iter::repeat_with(|| SchedulingStateMachine::create_task(tx1.clone(), i.fetch_add(1, std::sync::atomic::Ordering::Relaxed), &mut |address| { + usage_queues.lock().unwrap().entry(address).or_default().clone() + })).take(100).collect::>(); + if 
s.send(tasks).is_err() { + break; + } + } + }) + .unwrap(); + } + std::thread::sleep(std::time::Duration::from_secs(5)); + */ + + //assert_eq!(bank.transaction_count(), 0); + //let mut scheduler = pool.do_take_scheduler(context); + + let mut scheduler = + unsafe { SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling() }; + + let tasks = std::iter::repeat_with(|| { + SchedulingStateMachine::create_task( + tx0.clone(), + i.fetch_add(1, std::sync::atomic::Ordering::Relaxed), + &mut |address| { + usage_queues + .lock() + .unwrap() + .entry(address) + .or_default() + .clone() + }, + ) + }) + .take(100) + .collect::>(); + s.send(tasks).unwrap(); + + bencher.bench_function(label, |b| { + b.iter(|| { + for _ in 0..600 { + let mut first_task = None; + let tt = r.recv().unwrap(); + let mut new_tasks = Vec::with_capacity(tt.len()); + for t in tt { + /* + scheduler.schedule_task(t); + */ + if let Some(task) = scheduler.schedule_task(t) { + first_task = Some(task); + } + } + scheduler.deschedule_task(first_task.as_ref().unwrap()); + new_tasks.push(first_task.unwrap()); + while let Some(unblocked_task) = scheduler.schedule_next_unblocked_task() { + scheduler.deschedule_task(&unblocked_task); + new_tasks.push(unblocked_task); + } + assert!(scheduler.has_no_active_task()); + s.send(new_tasks).unwrap(); + } + /* + scheduler.pause_for_recent_blockhash(); + scheduler.clear_session_result_with_timings(); + scheduler.restart_session(); + */ + }) + }); +} + +fn bench_entrypoint(bencher: &mut Criterion) { + do_bench_tx_throughput("bench_tx_throughput", bencher) +} + +use criterion::{criterion_group, criterion_main, Criterion}; +criterion_group!(benches, bench_entrypoint); +criterion_main!(benches); diff --git a/unified-scheduler-pool/benches/scheduler.rs b/unified-scheduler-pool/benches/scheduler.rs new file mode 100644 index 00000000000000..aab32811352bea --- /dev/null +++ b/unified-scheduler-pool/benches/scheduler.rs @@ -0,0 +1,923 @@ +#![cfg(feature = "dummy")] +#![feature(test)] +#![allow(clippy::arithmetic_side_effects)] + +#[cfg(not(target_env = "msvc"))] +#[global_allocator] +static GLOBAL: jemallocator::Jemalloc = jemallocator::Jemalloc; + +extern crate test; + +use { + assert_matches::assert_matches, + log::*, + rand::{thread_rng, Rng}, + solana_program_runtime::timings::ExecuteTimings, + solana_runtime::{ + bank::Bank, + genesis_utils::{create_genesis_config, GenesisConfigInfo}, + installed_scheduler_pool::{ + InstalledScheduler, ResultWithTimings, ScheduleExecutionArg, SchedulerId, + SchedulingContext, SchedulingMode, WithTransactionAndIndex, + }, + prioritization_fee_cache::PrioritizationFeeCache, + }, + solana_sdk::{ + scheduling::SchedulingMode, + system_transaction, + transaction::{Result, SanitizedTransaction}, + }, + solana_unified_scheduler_pool::{ + PooledScheduler, SchedulerPool, SpawnableScheduler, TaskHandler, + }, + std::{ + fmt::Debug, + marker::{PhantomData, Send, Sync}, + mem, + sync::Arc, + }, + test::Bencher, +}; + +const TX_COUNT: usize = 10_000; + +#[derive(Debug, Default, Clone)] +struct ScheduleExecutionArgForBench; + +// use Arc-ed transaction for very cheap .clone() so that the consumer is never starved for +// incoming transactions. 
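+// As a rough illustration: cloning an Arc<(SanitizedTransaction, usize)> only bumps a reference count, +// so the producing side can cheaply re-send the same transaction without rebuilding or re-sanitizing it.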
+type TransactionWithIndexForBench = Arc<(SanitizedTransaction, usize)>; + +impl ScheduleExecutionArg for ScheduleExecutionArgForBench { + type TransactionWithIndex<'_tx> = TransactionWithIndexForBench; +} + +#[derive(Debug, Default, Clone)] +struct BenchFriendlyHandler( + PhantomData, +); + +impl TaskHandler + for BenchFriendlyHandler +{ + fn create>(_pool: &SchedulerPool) -> Self { + Self(PhantomData) + } + + fn handle>( + &self, + _result: &mut Result<()>, + _timings: &mut ExecuteTimings, + bank: &Arc, + transaction: &SanitizedTransaction, + _index: usize, + _pool: &SchedulerPool, + ) { + //std::hint::black_box(bank.clone()); + let mut i = 0; + for _ in 0..10 { + if MUTATE_ARC { + //for _ in 0..2 { + std::hint::black_box((Arc::downgrade(bank)).upgrade().unwrap()); + //} + } + // call a random one of Bank's lightweight-and-very-multi-threaded-friendly methods which takes a + // transaction inside this artificial tight loop. + i += bank.get_fee_for_message_with_lamports_per_signature(transaction.message(), i) + } + std::hint::black_box(i); + } +} + +type BenchFriendlyHandlerWithArcMutation = BenchFriendlyHandler; +type BenchFriendlyHandlerWithoutArcMutation = + BenchFriendlyHandler; + +fn run_bench< + F: FnOnce(Arc>, SchedulingContext) -> I, + I: SpawnableScheduler, + TH: TaskHandler, +>( + bencher: &mut Bencher, + create_scheduler: F, +) { + solana_logger::setup(); + + let GenesisConfigInfo { + genesis_config, + mint_keypair, + .. + } = create_genesis_config(1_000_000_000); + let bank = &Arc::new(Bank::new_for_tests(&genesis_config)); + let ignored_prioritization_fee_cache = Arc::new(PrioritizationFeeCache::new(0u64)); + let pool = SchedulerPool::new(None, None, None, ignored_prioritization_fee_cache); + let context = SchedulingContext::new(SchedulingMode::BlockVerification, bank.clone()); + + let mut scheduler = create_scheduler(pool, context.clone()); + let tx0 = &SanitizedTransaction::from_transaction_for_tests(system_transaction::transfer( + &mint_keypair, + &solana_sdk::pubkey::new_rand(), + 2, + genesis_config.hash(), + )); + let tx_with_index = TransactionWithIndexForBench::new((tx0.clone(), 0)); + bencher.iter(|| { + for _ in 0..TX_COUNT { + scheduler.schedule_execution(tx_with_index.clone()); + } + assert_matches!(scheduler.wait_for_termination(false), Some((Ok(()), _))); + scheduler.replace_context(context.clone()); + }); +} + +mod blocking_ref { + use {super::*, solana_runtime::installed_scheduler_pool::DefaultScheduleExecutionArg}; + + #[bench] + fn bench_without_arc_mutation(bencher: &mut Bencher) { + solana_logger::setup(); + + let GenesisConfigInfo { + genesis_config, + mint_keypair, + ..
+ } = create_genesis_config(1_000_000_000); + let bank = &Arc::new(Bank::new_for_tests(&genesis_config)); + let ignored_prioritization_fee_cache = Arc::new(PrioritizationFeeCache::new(0u64)); + let pool = SchedulerPool::new(None, None, None, ignored_prioritization_fee_cache); + let context = SchedulingContext::new(SchedulingMode::BlockVerification, bank.clone()); + + let mut scheduler = PooledScheduler::<_, DefaultScheduleExecutionArg>::do_spawn( + pool, + context.clone(), + BenchFriendlyHandler::<_, false>::default(), + ); + let tx0 = &SanitizedTransaction::from_transaction_for_tests(system_transaction::transfer( + &mint_keypair, + &solana_sdk::pubkey::new_rand(), + 2, + genesis_config.hash(), + )); + let tx_with_index = &(tx0, 0); + bencher.iter(|| { + for _ in 0..TX_COUNT { + scheduler.schedule_execution(tx_with_index); + } + assert_matches!(scheduler.wait_for_termination(false), Some((Ok(()), _))); + scheduler.replace_context(context.clone()); + }); + } +} + +mod blocking { + use super::*; + + type BlockingScheduler = PooledScheduler; + + #[bench] + fn bench_with_arc_mutation(bencher: &mut Bencher) { + run_bench(bencher, |pool, context| { + BlockingScheduler::do_spawn( + pool, + context, + BenchFriendlyHandlerWithArcMutation::default(), + ) + }); + } + + #[bench] + fn bench_without_arc_mutation(bencher: &mut Bencher) { + run_bench(bencher, |pool, context| { + BlockingScheduler::do_spawn( + pool, + context, + BenchFriendlyHandlerWithoutArcMutation::default(), + ) + }); + } +} + +mod nonblocking { + use super::*; + + #[derive(Debug)] + pub(super) struct NonblockingScheduler + Clone> { + id: SchedulerId, + pub(crate) pool: Arc>, + transaction_sender: crossbeam_channel::Sender, + result_receiver: crossbeam_channel::Receiver<(Result<()>, ExecuteTimings, usize)>, + lane_count: usize, + context: SchedulingContext, + _phantom: PhantomData, + } + + enum ChainedChannel { + Payload(TransactionWithIndexForBench), + NextContext(SchedulingContext), + NextChannel(Box), + } + + type ChannelPair = ( + crossbeam_channel::Receiver, + crossbeam_channel::Sender<(Result<()>, ExecuteTimings, usize)>, + ); + + trait WithChannelPair { + fn unwrap_channel_pair(&mut self) -> ChannelPair; + } + + struct ChannelPairOption(Option); + + impl WithChannelPair for ChannelPairOption { + fn unwrap_channel_pair(&mut self) -> ChannelPair { + self.0.take().unwrap() + } + } + + impl + Clone> + SpawnableScheduler for NonblockingScheduler + { + fn spawn( + _pool: Arc>, + _initial_context: SchedulingContext, + _handler: H, + ) -> Self { + unimplemented!(); + } + + fn retire_if_stale(&mut self) -> bool { + unimplemented!(); + } + } + + impl + Clone> NonblockingScheduler { + pub(super) fn spawn( + pool: Arc>, + initial_context: SchedulingContext, + lane_count: usize, + handler: H, + ) -> Self { + let (transaction_sender, transaction_receiver) = + crossbeam_channel::unbounded::(); + let (result_sender, result_receiver) = crossbeam_channel::unbounded(); + + for _ in 0..lane_count { + let mut bank = Arc::clone(initial_context.bank()); + let mut transaction_receiver = transaction_receiver.clone(); + let mut result_sender = result_sender.clone(); + std::thread::spawn({ + let pool = pool.clone(); + let handler = handler.clone(); + move || { + let mut result = Ok(()); + let mut timings = ExecuteTimings::default(); + let mut count = 0; + while let Ok(message) = transaction_receiver.recv() { + match message { + ChainedChannel::Payload(with_transaction_and_index) => { + count += 1; + with_transaction_and_index.with_transaction_and_index( + 
|transaction, index| { + H::handle( + &handler, + &mut result, + &mut timings, + &bank, + transaction, + index, + &pool, + ); + }, + ); + } + ChainedChannel::NextContext(next_context) => { + bank = next_context.bank().clone(); + } + ChainedChannel::NextChannel(mut next_receiver_box) => { + result_sender + .send(( + mem::replace(&mut result, Ok(())), + mem::take(&mut timings), + mem::take(&mut count), + )) + .unwrap(); + (transaction_receiver, result_sender) = + next_receiver_box.unwrap_channel_pair(); + } + } + } + } + }); + } + + Self { + id: thread_rng().gen::(), + pool, + transaction_sender, + result_receiver, + lane_count, + context: initial_context, + _phantom: PhantomData, + } + } + } + impl + Clone> + InstalledScheduler for NonblockingScheduler + { + fn id(&self) -> SchedulerId { + self.id + } + + fn context(&self) -> &SchedulingContext { + &self.context + } + + fn schedule_execution(&self, transaction_with_index: TransactionWithIndexForBench) { + self.transaction_sender + .send(ChainedChannel::Payload(transaction_with_index)) + .unwrap(); + } + + fn wait_for_termination(&mut self, _is_dropped: bool) -> Option { + let (next_transaction_sender, next_transaction_receiver) = + crossbeam_channel::unbounded::(); + let (next_result_sender, next_result_receiver) = crossbeam_channel::unbounded(); + for _ in 0..self.lane_count { + let (next_transaction_receiver, next_result_sender) = ( + next_transaction_receiver.clone(), + next_result_sender.clone(), + ); + self.transaction_sender + .send(ChainedChannel::NextChannel(Box::new(ChannelPairOption( + Some((next_transaction_receiver, next_result_sender)), + )))) + .unwrap(); + } + self.transaction_sender = next_transaction_sender; + + let mut overall_result = Ok(()); + let mut overall_timings = ExecuteTimings::default(); + + while let Ok((result, timings, count)) = self.result_receiver.recv() { + match result { + Ok(()) => {} + Err(e) => overall_result = Err(e), + } + overall_timings.accumulate(&timings); + trace!("received: {count:?}"); + } + self.result_receiver = next_result_receiver; + + Some((overall_result, overall_timings)) + } + + /* + fn return_to_pool(self: Box) { + self.pool.clone().return_scheduler(self) + } + */ + fn pause_for_recent_blockhash(&mut self) { + todo!() + } + } + + #[bench] + fn bench_with_01_thread_with_arc_mutation(bencher: &mut Bencher) { + run_bench(bencher, |pool, context| { + NonblockingScheduler::spawn( + pool, + context, + 1, + BenchFriendlyHandlerWithArcMutation::default(), + ) + }); + } + + #[bench] + fn bench_with_01_thread_without_arc_mutation(bencher: &mut Bencher) { + run_bench(bencher, |pool, context| { + NonblockingScheduler::spawn( + pool, + context, + 1, + BenchFriendlyHandlerWithoutArcMutation::default(), + ) + }); + } + + #[bench] + fn bench_with_04_threads_with_arc_mutation(bencher: &mut Bencher) { + run_bench(bencher, |pool, context| { + NonblockingScheduler::spawn( + pool, + context, + 4, + BenchFriendlyHandlerWithArcMutation::default(), + ) + }); + } + + #[bench] + fn bench_with_04_threads_without_arc_mutation(bencher: &mut Bencher) { + run_bench(bencher, |pool, context| { + NonblockingScheduler::spawn( + pool, + context, + 4, + BenchFriendlyHandlerWithoutArcMutation::default(), + ) + }); + } + + #[bench] + fn bench_with_08_threads_with_arc_mutation(bencher: &mut Bencher) { + run_bench(bencher, |pool, context| { + NonblockingScheduler::spawn( + pool, + context, + 8, + BenchFriendlyHandlerWithArcMutation::default(), + ) + }); + } + + #[bench] + fn bench_with_08_threads_without_arc_mutation(bencher: 
&mut Bencher) { + run_bench(bencher, |pool, context| { + NonblockingScheduler::spawn( + pool, + context, + 8, + BenchFriendlyHandlerWithoutArcMutation::default(), + ) + }); + } + + #[bench] + fn bench_with_16_threads_with_arc_mutation(bencher: &mut Bencher) { + run_bench(bencher, |pool, context| { + NonblockingScheduler::spawn( + pool, + context, + 16, + BenchFriendlyHandlerWithArcMutation::default(), + ) + }); + } + + #[bench] + fn bench_with_16_threads_without_arc_mutation(bencher: &mut Bencher) { + run_bench(bencher, |pool, context| { + NonblockingScheduler::spawn( + pool, + context, + 16, + BenchFriendlyHandlerWithoutArcMutation::default(), + ) + }); + } +} + +// demonstrate a meaningfully differing performance profile regarding multi worker thread utilization +// with saturated transaction execution for each bench scenario, with/without the existence of +// artificial and needless synchronizations. +// conversely, the whole InstallableScheduler machinery can be justified as it can cut these +// synchronizations down to the bare minimum (i.e. bank freeze). +#[cfg(feature = "dummy")] +mod thread_utilization { + use { + super::*, + crate::nonblocking::NonblockingScheduler, + solana_nohash_hasher::IntSet, + solana_sdk::{ + signature::Signature, signer::keypair::Keypair, + system_instruction::SystemInstruction::Transfer, transaction::TransactionAccountLocks, + }, + std::{collections::HashMap, sync::Mutex, thread::sleep, time::Duration}, + }; + + #[derive(Debug, Clone)] + struct SleepyHandler; + + impl TaskHandler for SleepyHandler { + fn create>(_pool: &SchedulerPool) -> Self { + Self + } + + fn handle>( + &self, + _result: &mut Result<()>, + _timings: &mut ExecuteTimings, + _bank: &Arc, + transaction: &SanitizedTransaction, + _index: usize, + _pool: &SchedulerPool, + ) { + let Ok(Transfer { lamports: sleep_ms }) = + bincode::deserialize(&transaction.message().instructions()[0].data) + else { + panic!() + }; + + sleep(Duration::from_millis(sleep_ms)); + } + } + + enum Step { + Batch(Vec), + // mimic periodic or contention-induced synchronization with this artificial blocking + MaySynchronize, + } + + const WORKER_THREAD_COUNT: usize = 10; + + fn simulate_synchronization_point>( + scheduler: &mut T, + context: SchedulingContext, + ) { + assert_matches!(scheduler.wait_for_termination(false), Some((Ok(()), _))); + scheduler.replace_context(context); + } + + fn run_scenario_and_finalize>( + bencher: &mut Bencher, + really_synchronize: bool, + scheduler: &mut T, + context: SchedulingContext, + create_scenario: impl Fn() -> Vec, + ) { + let scenario = &create_scenario(); + bencher.iter(|| { + for step in scenario { + match step { + Step::Batch(txes) => { + for tx in txes { + scheduler.schedule_execution(tx.clone()); + } + } + Step::MaySynchronize => { + if really_synchronize { + simulate_synchronization_point(scheduler, context.clone()); + } + } + } + } + simulate_synchronization_point(scheduler, context.clone()); + }) + } + + // frequent synchronization creates non-zero idling time among some of the worker threads, given + // batches with mixed transactions. Then, it adds up as these kinds of synchronizations occur over + // the course of processing + fn bench_random_execution_durations(bencher: &mut Bencher, really_synchronize: bool) { + let GenesisConfigInfo { + genesis_config, + mint_keypair, + ..
+ } = create_genesis_config(1_000_000_000); + let bank = &Arc::new(Bank::new_for_tests(&genesis_config)); + + let create_tx_with_index = |index| { + let tx0 = + SanitizedTransaction::from_transaction_for_tests(system_transaction::transfer( + &mint_keypair, + &solana_sdk::pubkey::new_rand(), + // simulate somewhat realistic work load; txes finish at different timings + thread_rng().gen_range(1..10), + genesis_config.hash(), + )); + TransactionWithIndexForBench::new((tx0, index)) + }; + + let ignored_prioritization_fee_cache = Arc::new(PrioritizationFeeCache::new(0u64)); + let pool = SchedulerPool::new(None, None, None, ignored_prioritization_fee_cache); + let context = SchedulingContext::new(SchedulingMode::BlockVerification, bank.clone()); + let mut scheduler = + NonblockingScheduler::spawn(pool, context.clone(), WORKER_THREAD_COUNT, SleepyHandler); + + run_scenario_and_finalize(bencher, really_synchronize, &mut scheduler, context, || { + const TX_PER_BATCH: usize = 20; + const SYNCHRONIZATION_PER_BENCH_ITER: usize = 10; + + (0..SYNCHRONIZATION_PER_BENCH_ITER) + .flat_map(|_| { + [ + Step::Batch((0..TX_PER_BATCH).map(create_tx_with_index).collect()), + Step::MaySynchronize, + ] + }) + .collect() + }); + } + + #[bench] + fn bench_random_execution_durations_with_interleaved_synchronization(bencher: &mut Bencher) { + bench_random_execution_durations(bencher, true); + } + + #[bench] + fn bench_random_execution_durations_without_interleaved_synchronization(bencher: &mut Bencher) { + bench_random_execution_durations(bencher, false); + } + + #[derive(Debug, Clone)] + struct SleepyHandlerWithCompletionSignal(crossbeam_channel::Sender); + + impl TaskHandler for SleepyHandlerWithCompletionSignal { + fn create>(_pool: &SchedulerPool) -> Self { + // not needed for bench... + unimplemented!(); + } + + fn handle>( + &self, + _result: &mut Result<()>, + _timings: &mut ExecuteTimings, + _bank: &Arc, + transaction: &SanitizedTransaction, + _index: usize, + _pool: &SchedulerPool, + ) { + let Ok(Transfer { lamports: sleep_ms }) = + bincode::deserialize(&transaction.message().instructions()[0].data) + else { + panic!() + }; + + sleep(Duration::from_millis(sleep_ms)); + + self.0.send(*transaction.signature()).unwrap(); + } + } + + // a wrapper InstallableScheduler to integrate with dep graph scheduling logic + #[derive(Debug)] + struct NonblockingSchedulerWithDepGraph { + inner_scheduler: NonblockingScheduler, + pending_transactions: Mutex>, + completion_receiver: crossbeam_channel::Receiver, + } + + impl InstalledScheduler for NonblockingSchedulerWithDepGraph { + fn id(&self) -> SchedulerId { + self.inner_scheduler.id() + } + + fn context(&self) -> &SchedulingContext { + self.inner_scheduler.context() + } + + fn schedule_execution(&self, transaction_with_index: TransactionWithIndexForBench) { + // just buffer all the txes to work with the dep graph outer loop nicely, which needs + // some buffering to schedule efficiently + // note taht the prompt execution as soon as entering into schedule_execution() isn't + // needed for these particular bench purposes. so, buffering is okay in that regard. + self.pending_transactions + .lock() + .unwrap() + .push(transaction_with_index.0.clone()); + } + + fn wait_for_termination(&mut self, is_dropped: bool) -> Option { + // execute all the pending transactions now! 
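+        // (Roughly, as implemented in execute_batches() below: the transactions buffered by +        // schedule_execution() are drained, a dependency graph over their account locks is built, and +        // each one is handed to the inner scheduler only after all of its dependencies have signaled +        // completion through the completion channel.)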
+ self.execute_batches( + self.context().bank(), + &std::mem::take(&mut *self.pending_transactions.lock().unwrap()), + &self.completion_receiver, + ) + .unwrap(); + + self.inner_scheduler.wait_for_termination(is_dropped) + } + + /* + fn return_to_pool(self: Box) { + Box::new(self.inner_scheduler).return_to_pool() + } + */ + } + + /* + impl InstallableScheduler for NonblockingSchedulerWithDepGraph { + fn replace_context(&mut self, context: SchedulingContext) { + self.inner_scheduler.replace_context(context) + } + } + */ + + // adapted from https://github.com/jito-foundation/jito-solana/pull/294; retained to be as-is + // as much as possible by the use of some wrapper type hackery. + impl NonblockingSchedulerWithDepGraph { + // for each index, builds a transaction dependency graph of indices that need to execute before + // the current one. + // The returned Vec> is a 1:1 mapping for the indices that need to be executed + // before that index can be executed + fn build_dependency_graph( + tx_account_locks: &[TransactionAccountLocks], + ) -> Vec> { + // build a map whose key is a pubkey + value is a sorted vector of all indices that + // lock that account + let mut indices_read_locking_account = HashMap::new(); + let mut indicies_write_locking_account = HashMap::new(); + tx_account_locks + .iter() + .enumerate() + .for_each(|(idx, tx_account_locks)| { + for account in &tx_account_locks.readonly { + indices_read_locking_account + .entry(**account) + .and_modify(|indices: &mut Vec| indices.push(idx)) + .or_insert_with(|| vec![idx]); + } + for account in &tx_account_locks.writable { + indicies_write_locking_account + .entry(**account) + .and_modify(|indices: &mut Vec| indices.push(idx)) + .or_insert_with(|| vec![idx]); + } + }); + + tx_account_locks + .iter() + .enumerate() + .map(|(idx, account_locks)| { + let mut dep_graph: IntSet = IntSet::default(); + + let readlock_conflict_accs = account_locks.writable.iter(); + let writelock_conflict_accs = account_locks + .readonly + .iter() + .chain(account_locks.writable.iter()); + + for acc in readlock_conflict_accs { + if let Some(indices) = indices_read_locking_account.get(acc) { + dep_graph.extend(indices.iter().take_while(|l_idx| **l_idx < idx)); + } + } + + for acc in writelock_conflict_accs { + if let Some(indices) = indicies_write_locking_account.get(acc) { + dep_graph.extend(indices.iter().take_while(|l_idx| **l_idx < idx)); + } + } + dep_graph + }) + .collect() + } + + fn execute_batches( + &self, + bank: &Arc, + pending_transactions: &[SanitizedTransaction], + receiver: &crossbeam_channel::Receiver, + ) -> Result<()> { + if pending_transactions.is_empty() { + return Ok(()); + } + + let mut tx_account_locks: Vec<_> = Vec::with_capacity(pending_transactions.len()); + for tx in pending_transactions { + tx_account_locks + .push(tx.get_account_locks(bank.get_transaction_account_lock_limit())?); + } + + // the dependency graph contains the indices that must be executed (marked with + // State::Done) before they can be executed + let dependency_graph = Self::build_dependency_graph(&tx_account_locks); + + #[derive(Clone)] + enum State { + Blocked, + Processing, + Done, + } + + let mut processing_states: Vec = vec![State::Blocked; dependency_graph.len()]; + let mut signature_indices: HashMap<&Signature, usize> = + HashMap::with_capacity(dependency_graph.len()); + signature_indices.extend( + pending_transactions + .iter() + .enumerate() + .map(|(idx, tx)| (tx.signature(), idx)), + ); + + loop { + let mut is_done = true; + for idx in 
0..processing_states.len() { + match processing_states[idx] { + State::Blocked => { + is_done = false; + + // if all the dependent txs are executed, this transaction can be + // scheduled for execution. + if dependency_graph[idx] + .iter() + .all(|idx| matches!(processing_states[*idx], State::Done)) + { + self.inner_scheduler.schedule_execution(Arc::new(( + pending_transactions[idx].clone(), + idx, + ))); + // this idx can be scheduled and moved to processing + processing_states[idx] = State::Processing; + } + } + State::Processing => { + is_done = false; + } + State::Done => {} + } + } + + if is_done { + break; + } + + let mut executor_responses: Vec<_> = vec![receiver.recv().unwrap()]; + executor_responses.extend(receiver.try_iter()); + for r in &executor_responses { + processing_states[*signature_indices.get(r).unwrap()] = State::Done; + } + } + Ok(()) + } + } + + // frequent synchronizations hamper efficient (= parallelizable) scheduling of several chunks + // of txes which are tied together by common account locks. Ideally those independent chunks can be + // executed in parallel, each consuming one worker thread as a form of serialized runs + // of processing. However, should a synchronization occur between the boundaries of those chunks' + // arrivals, it cannot schedule the later-coming one because it first flushes out the first + // one. + // in other words, this is just a re-manifestation of the perf. issue coming from write barriers in + // general. + fn bench_long_serialized_runs(bencher: &mut Bencher, really_synchronize: bool) { + let GenesisConfigInfo { genesis_config, .. } = create_genesis_config(1_000_000_000); + let bank = &Arc::new(Bank::new_for_tests(&genesis_config)); + let (kp1, kp2) = (Keypair::new(), Keypair::new()); + + let create_tx_of_serialized_run1 = || { + let tx0 = + SanitizedTransaction::from_transaction_for_tests(system_transaction::transfer( + &kp1, + &solana_sdk::pubkey::new_rand(), + 10, + genesis_config.hash(), + )); + TransactionWithIndexForBench::new((tx0, 0)) + }; + let create_tx_of_serialized_run2 = || { + let tx0 = + SanitizedTransaction::from_transaction_for_tests(system_transaction::transfer( + &kp2, + &solana_sdk::pubkey::new_rand(), + 10, + genesis_config.hash(), + )); + TransactionWithIndexForBench::new((tx0, 0)) + }; + + let ignored_prioritization_fee_cache = Arc::new(PrioritizationFeeCache::new(0u64)); + let pool = SchedulerPool::new(None, None, None, ignored_prioritization_fee_cache); + let context = SchedulingContext::new(SchedulingMode::BlockVerification, bank.clone()); + let (completion_sender, completion_receiver) = crossbeam_channel::unbounded(); + let handler = SleepyHandlerWithCompletionSignal(completion_sender); + let tx_lock_ignoring_scheduler = + NonblockingScheduler::spawn(pool, context.clone(), WORKER_THREAD_COUNT, handler); + let tx_lock_adhering_scheduler = NonblockingSchedulerWithDepGraph { + inner_scheduler: tx_lock_ignoring_scheduler, + pending_transactions: Mutex::new(Vec::default()), + completion_receiver, + }; + let mut scheduler = tx_lock_adhering_scheduler; + run_scenario_and_finalize(bencher, really_synchronize, &mut scheduler, context, || { + (0..1) + .flat_map(|_| { + [ + Step::Batch(vec![create_tx_of_serialized_run1()]), + Step::Batch(vec![create_tx_of_serialized_run1()]), + Step::Batch(vec![create_tx_of_serialized_run1()]), + Step::Batch(vec![create_tx_of_serialized_run1()]), + Step::MaySynchronize, + Step::Batch(vec![create_tx_of_serialized_run2()]), + Step::Batch(vec![create_tx_of_serialized_run2()]), + 
Step::Batch(vec![create_tx_of_serialized_run2()]), + Step::Batch(vec![create_tx_of_serialized_run2()]), + Step::MaySynchronize, + ] + }) + .collect() + }); + } + + #[bench] + fn bench_long_serialized_runs_with_interleaved_synchronization(bencher: &mut Bencher) { + bench_long_serialized_runs(bencher, true); + } + + #[bench] + fn bench_long_serialized_runs_without_interleaved_synchronization(bencher: &mut Bencher) { + bench_long_serialized_runs(bencher, false); + } +} diff --git a/unified-scheduler-pool/src/lib.rs b/unified-scheduler-pool/src/lib.rs index 0b7c5495b0accc..1ef0c5b053c1c4 100644 --- a/unified-scheduler-pool/src/lib.rs +++ b/unified-scheduler-pool/src/lib.rs @@ -1,8 +1,3 @@ -//! NOTE: While the unified scheduler is fully functional and moderately performant even with -//! mainnet-beta, it has known resource-exhaustion related security issues for replaying -//! specially-crafted blocks produced by malicious leaders. Thus, this experimental and -//! nondefault functionality is exempt from the bug bounty program for now. -//! //! Transaction scheduling code. //! //! This crate implements 3 solana-runtime traits (`InstalledScheduler`, `UninstalledScheduler` and @@ -17,37 +12,47 @@ use qualifier_attr::qualifiers; use { assert_matches::assert_matches, - crossbeam_channel::{self, never, select, Receiver, RecvError, SendError, Sender}, + cpu_time::ThreadTime, + crossbeam_channel::{ + self, disconnected, never, select_biased, Receiver, RecvError, RecvTimeoutError, SendError, + Sender, TryRecvError, + }, dashmap::DashMap, derivative::Derivative, log::*, solana_ledger::blockstore_processor::{ execute_batch, TransactionBatchWithIndexes, TransactionStatusSender, }, + solana_measure::measure::Measure, + solana_metrics::datapoint_info_at, solana_program_runtime::timings::ExecuteTimings, solana_runtime::{ bank::Bank, + compute_budget_details::GetComputeBudgetDetails, installed_scheduler_pool::{ - InstalledScheduler, InstalledSchedulerBox, InstalledSchedulerPool, - InstalledSchedulerPoolArc, ResultWithTimings, SchedulerId, SchedulingContext, - UninstalledScheduler, UninstalledSchedulerBox, + DefaultScheduleExecutionArg, InstalledScheduler, InstalledSchedulerPool, + InstalledSchedulerPoolArc, ResultWithTimings, ScheduleExecutionArg, SchedulerId, + SchedulingContext, UninstalledScheduler, UninstalledSchedulerBox, + WithTransactionAndIndex, }, prioritization_fee_cache::PrioritizationFeeCache, }, solana_sdk::{ + clock::Slot, pubkey::Pubkey, - transaction::{Result, SanitizedTransaction}, + transaction::{Result, SanitizedTransaction, TransactionError}, }, solana_unified_scheduler_logic::{SchedulingStateMachine, Task, UsageQueue}, solana_vote::vote_sender_types::ReplayVoteSender, std::{ + env, fmt::Debug, - marker::PhantomData, sync::{ atomic::{AtomicU64, Ordering::Relaxed}, - Arc, Mutex, OnceLock, Weak, + Arc, Mutex, OnceLock, RwLock, RwLockReadGuard, Weak, }, thread::{self, JoinHandle}, + time::{Duration, Instant, SystemTime}, }, }; @@ -57,7 +62,12 @@ type AtomicSchedulerId = AtomicU64; // contains some internal fields, whose types aren't available in solana-runtime (currently // TransactionStatusSender; also, PohRecorder in the future)... #[derive(Debug)] -pub struct SchedulerPool, TH: TaskHandler> { +pub struct SchedulerPool +where + S: SpawnableScheduler, + TH: TaskHandler, + SEA: ScheduleExecutionArg, +{ scheduler_inners: Mutex>, handler_count: usize, handler_context: HandlerContext, @@ -73,7 +83,11 @@ pub struct SchedulerPool, TH: TaskHandler> { // memory increase. 
weak_self: Weak, next_scheduler_id: AtomicSchedulerId, - _phantom: PhantomData, + // prune schedulers, stop idling scheduler's threads, sanity check on the + // usage queue loader after scheduler is returned. + cleaner_sender: Sender>>>, + cleaner_exit_signal_sender: Sender<()>, + cleaner_thread: Mutex>>, } #[derive(Debug)] @@ -84,13 +98,117 @@ pub struct HandlerContext { prioritization_fee_cache: Arc, } -pub type DefaultSchedulerPool = - SchedulerPool, DefaultTaskHandler>; +pub type DefaultSchedulerPool = SchedulerPool< + PooledScheduler, + DefaultTaskHandler, + DefaultScheduleExecutionArg, +>; + +struct WatchedThreadManager +where + S: SpawnableScheduler, + TH: TaskHandler, + SEA: ScheduleExecutionArg, +{ + thread_manager: Weak>>, + #[cfg(target_os = "linux")] + tick: u64, + #[cfg(target_os = "linux")] + updated_at: Instant, +} -impl SchedulerPool +impl WatchedThreadManager where - S: SpawnableScheduler, - TH: TaskHandler, + S: SpawnableScheduler, + TH: TaskHandler, + SEA: ScheduleExecutionArg, +{ + fn new(thread_manager: Weak>>) -> Self { + Self { + thread_manager, + #[cfg(target_os = "linux")] + tick: 0, + #[cfg(target_os = "linux")] + updated_at: Instant::now(), + } + } + + fn retire_if_stale(&mut self) -> bool { + #[cfg_attr(not(target_os = "linux"), allow(unused_variables))] + let Some(thread_manager) = self.thread_manager.upgrade() else { + return false; + }; + + // The following linux-only code implements an eager native thread reclaiming, which is + // only useful if the solana-validator sees many unrooted forks. Such hostile situations + // should NEVER happen on remotely-uncontrollable ledgers created by solana-test-validator. + // And it's generally not expected mainnet-beta validators (or any live clusters for that + // matter) to be run on non-linux OSes at all. + // + // Thus, this OS-specific implementation can be justified because this enables the hot-path + // (the scheduler main thread) to omit VDSO calls and timed-out futex syscalls by relying on + // this out-of-bound cleaner for a defensive thread reclaiming. 
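+        // In short, as implemented below: the cleaner samples the scheduler thread's cumulative +        // utime + stime ticks via procfs; if the tick count stops advancing for the idle duration, +        // the thread manager is suspended so its native threads can be reclaimed.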
+ #[cfg(target_os = "linux")] + { + let Some(tid) = thread_manager.read().unwrap().active_tid_if_not_primary() else { + self.tick = 0; + self.updated_at = Instant::now(); + return true; + }; + + let pid = std::process::id(); + let task = procfs::process::Process::new(pid.try_into().unwrap()) + .unwrap() + .task_from_tid(tid) + .unwrap(); + let stat = task.stat().unwrap(); + let current_tick = stat.utime.checked_add(stat.stime).unwrap(); + if current_tick > self.tick { + self.tick = current_tick; + self.updated_at = Instant::now(); + } else { + // 5x of 400ms block time + const IDLE_DURATION_FOR_EAGER_THREAD_RECLAIM: Duration = Duration::from_secs(2); + + let elapsed = self.updated_at.elapsed(); + if elapsed > IDLE_DURATION_FOR_EAGER_THREAD_RECLAIM { + const BITS_PER_HEX_DIGIT: usize = 4; + let thread_manager = &mut thread_manager.write().unwrap(); + info!( + "[sch_{:0width$x}]: cleaner: retire_if_stale(): stopping thread manager ({tid}/{} <= {}/{:?})...", + thread_manager.scheduler_id, + current_tick, + self.tick, + elapsed, + width = SchedulerId::BITS as usize / BITS_PER_HEX_DIGIT, + ); + thread_manager.suspend(); + self.tick = 0; + self.updated_at = Instant::now(); + } + } + } + + true + } +} + +impl Drop for SchedulerPool +where + S: SpawnableScheduler, + TH: TaskHandler, + SEA: ScheduleExecutionArg, +{ + fn drop(&mut self) { + info!("SchedulerPool::drop() is successfully called"); + } +} + +impl SchedulerPool +where + S: SpawnableScheduler, + TH: TaskHandler, + SEA: ScheduleExecutionArg, { // Some internal impl and test code want an actual concrete type, NOT the // `dyn InstalledSchedulerPool`. So don't merge this into `Self::new_dyn()`. @@ -105,7 +223,64 @@ where let handler_count = handler_count.unwrap_or(Self::default_handler_count()); assert!(handler_count >= 1); - Arc::new_cyclic(|weak_self| Self { + let (scheduler_pool_sender, scheduler_pool_receiver) = crossbeam_channel::bounded(1); + let (cleaner_sender, cleaner_receiver) = crossbeam_channel::unbounded(); + let (cleaner_exit_signal_sender, cleaner_exit_signal_receiver) = + crossbeam_channel::unbounded(); + + let cleaner_main_loop = || { + move || { + let scheduler_pool: Arc = scheduler_pool_receiver.recv().unwrap(); + drop(scheduler_pool_receiver); + + let mut thread_managers: Vec> = vec![]; + + 'outer: loop { + let mut schedulers = scheduler_pool.scheduler_inners.lock().unwrap(); + let schedulers_len_pre_retain = schedulers.len(); + schedulers.retain_mut(|scheduler| scheduler.retire_if_stale()); + let schedulers_len_post_retain = schedulers.len(); + drop(schedulers); + + let thread_manager_len_pre_retain = thread_managers.len(); + thread_managers.retain_mut(|thread_manager| thread_manager.retire_if_stale()); + + let thread_manager_len_pre_push = thread_managers.len(); + 'inner: loop { + match cleaner_receiver.try_recv() { + Ok(thread_manager) => { + thread_managers.push(WatchedThreadManager::new(thread_manager)) + } + Err(TryRecvError::Disconnected) => break 'outer, + Err(TryRecvError::Empty) => break 'inner, + } + } + + info!( + "cleaner: unused schedulers in the pool: {} => {}, all thread managers: {} => {} => {}", + schedulers_len_pre_retain, + schedulers_len_post_retain, + thread_manager_len_pre_retain, + thread_manager_len_pre_push, + thread_managers.len(), + ); + // wait for signal with timeout here instead of recv_timeout() to write all the + // preceeding logs at once. 
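+                    // Note: the 1-second timeout below also sets the polling cadence of the retain +                    // passes above.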
+ match cleaner_exit_signal_receiver.recv_timeout(Duration::from_secs(1)) { + Ok(()) | Err(RecvTimeoutError::Disconnected) => break 'outer, + Err(RecvTimeoutError::Timeout) => continue, + } + } + info!("cleaner thread terminating!"); + } + }; + + let cleaner_thread = thread::Builder::new() + .name("solScCleaner".to_owned()) + .spawn(cleaner_main_loop()) + .unwrap(); + + let scheduler_pool = Arc::new_cyclic(|weak_self| Self { scheduler_inners: Mutex::default(), handler_count, handler_context: HandlerContext { @@ -115,9 +290,13 @@ where prioritization_fee_cache, }, weak_self: weak_self.clone(), - next_scheduler_id: AtomicSchedulerId::default(), - _phantom: PhantomData, - }) + next_scheduler_id: AtomicSchedulerId::new(PRIMARY_SCHEDULER_ID), + cleaner_thread: Mutex::new(Some(cleaner_thread)), + cleaner_sender, + cleaner_exit_signal_sender, + }); + scheduler_pool_sender.send(scheduler_pool.clone()).unwrap(); + scheduler_pool } // This apparently-meaningless wrapper is handy, because some callers explicitly want @@ -128,7 +307,7 @@ where transaction_status_sender: Option, replay_vote_sender: Option, prioritization_fee_cache: Arc, - ) -> InstalledSchedulerPoolArc { + ) -> InstalledSchedulerPoolArc { Self::new( handler_count, log_messages_bytes_limit, @@ -156,16 +335,21 @@ where .push(scheduler); } + #[cfg_attr(feature = "dev-context-only-utils", qualifiers(pub))] fn do_take_scheduler(&self, context: SchedulingContext) -> S { // pop is intentional for filo, expecting relatively warmed-up scheduler due to having been // returned recently - if let Some(inner) = self.scheduler_inners.lock().expect("not poisoned").pop() { - S::from_inner(inner, context) + if let Some(pooled_inner) = self.scheduler_inners.lock().expect("not poisoned").pop() { + S::from_inner(pooled_inner, context) } else { - S::spawn(self.self_arc(), context) + S::spawn(self.self_arc(), context, TH::create(self)) } } + fn register_to_cleaner(&self, thread_manager: Weak>>) { + self.cleaner_sender.send(thread_manager).unwrap(); + } + #[cfg(feature = "dev-context-only-utils")] pub fn pooled_scheduler_count(&self) -> usize { self.scheduler_inners.lock().expect("not poisoned").len() @@ -203,18 +387,41 @@ where } } -impl InstalledSchedulerPool for SchedulerPool +impl InstalledSchedulerPool for SchedulerPool where - S: SpawnableScheduler, - TH: TaskHandler, + S: SpawnableScheduler, + TH: TaskHandler, + SEA: ScheduleExecutionArg, { - fn take_scheduler(&self, context: SchedulingContext) -> InstalledSchedulerBox { + fn take_scheduler(&self, context: SchedulingContext) -> Box> { Box::new(self.do_take_scheduler(context)) } + + fn uninstalled_from_bank_forks(self: Arc) { + self.scheduler_inners.lock().unwrap().clear(); + self.cleaner_exit_signal_sender.send(()).unwrap(); + let () = self + .cleaner_thread + .lock() + .unwrap() + .take() + .unwrap() + .join() + .unwrap(); + info!( + "SchedulerPool::uninstalled_from_bank_forks(): joined cleaner thread at {:?}...", + thread::current() + ); + } } -pub trait TaskHandler: Send + Sync + Debug + Sized + 'static { +pub trait TaskHandler: + Send + Sync + Debug + Sized + Clone + 'static +{ + fn create>(pool: &SchedulerPool) -> Self; + fn handle( + &self, result: &mut Result<()>, timings: &mut ExecuteTimings, bank: &Arc, @@ -224,11 +431,16 @@ pub trait TaskHandler: Send + Sync + Debug + Sized + 'static { ); } -#[derive(Debug)] +#[derive(Clone, Debug)] pub struct DefaultTaskHandler; -impl TaskHandler for DefaultTaskHandler { +impl TaskHandler for DefaultTaskHandler { + fn create>(_pool: &SchedulerPool) -> Self { 
+ Self + } + fn handle( + &self, result: &mut Result<()>, timings: &mut ExecuteTimings, bank: &Arc, @@ -259,15 +471,31 @@ impl TaskHandler for DefaultTaskHandler { struct ExecutedTask { task: Task, result_with_timings: ResultWithTimings, + slot: Slot, + thx: usize, + handler_timings: Option, +} + +pub struct HandlerTimings { + finish_time: SystemTime, + execution_us: u64, + execution_cpu_us: u128, } impl ExecutedTask { - fn new_boxed(task: Task) -> Box { + fn new_boxed(task: Task, thx: usize, slot: Slot) -> Box { Box::new(Self { task, result_with_timings: initialized_result_with_timings(), + slot, + thx, + handler_timings: None, }) } + + fn is_err(&self) -> bool { + self.result_with_timings.0.is_err() + } } // A very tiny generic message type to signal about opening and closing of subchannels, which are @@ -283,6 +511,7 @@ enum SubchanneledPayload { } type NewTaskPayload = SubchanneledPayload; +type RetiredTaskPayload = SubchanneledPayload, ()>; // A tiny generic message type to synchronize multiple threads everytime some contextual data needs // to be switched (ie. SchedulingContext), just using a single communication channel. @@ -370,6 +599,14 @@ mod chained_channel { self.aux_sender = chained_aux_sender; Ok(()) } + + pub(super) fn len(&self) -> usize { + self.sender.len() + } + + pub(super) fn aux_len(&self) -> usize { + self.aux_sender.len() + } } // P doesn't need to be `: Clone`, yet rustc derive can't handle it. @@ -447,14 +684,14 @@ impl UsageQueueLoader { pub fn load(&self, address: Pubkey) -> UsageQueue { self.usage_queues.entry(address).or_default().clone() } -} -// (this is slow needing atomic mem reads. However, this can be turned into a lot faster -// optimizer-friendly version as shown in this crossbeam pr: -// https://github.com/crossbeam-rs/crossbeam/pull/1047) -fn disconnected() -> Receiver { - // drop the sender residing at .0, returning an always-disconnected receiver. - crossbeam_channel::unbounded().1 + pub fn usage_queue_count(&self) -> usize { + self.usage_queues.len() + } + + pub fn clear(&self) { + self.usage_queues.clear(); + } } fn initialized_result_with_timings() -> ResultWithTimings { @@ -462,49 +699,174 @@ fn initialized_result_with_timings() -> ResultWithTimings { } #[derive(Debug)] -pub struct PooledScheduler { - inner: PooledSchedulerInner, +pub struct PooledScheduler +where + TH: TaskHandler, + SEA: ScheduleExecutionArg, +{ + inner: PooledSchedulerInner, context: SchedulingContext, } #[derive(Debug)] -pub struct PooledSchedulerInner, TH: TaskHandler> { - thread_manager: ThreadManager, +pub struct PooledSchedulerInner +where + S: SpawnableScheduler, + TH: TaskHandler, + SEA: ScheduleExecutionArg, +{ + thread_manager: Arc>>, usage_queue_loader: UsageQueueLoader, + pooled_at: Instant, +} + +impl PooledSchedulerInner +where + S: SpawnableScheduler, + TH: TaskHandler, + SEA: ScheduleExecutionArg, +{ + fn pooled_since(&self) -> Duration { + self.pooled_at.elapsed() + } + + fn suspend_thread_manager(&mut self) { + debug!("suspend_thread_manager()"); + self.thread_manager.write().unwrap().suspend(); + } + + fn id(&self) -> SchedulerId { + self.thread_manager.read().unwrap().scheduler_id + } } +type Tid = i32; +// The linux's tid (essentially is in the pid name space) is guaranteed to be non-zero; so +// using 0 for special purpose at user-land is totally safe. 
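`HandlerTimings` above records both wall-clock and per-thread CPU time for a handled task. Below is a small sketch of capturing those two measurements with the `cpu-time` crate this change adds to the lockfile; `busy_work` stands in for the real transaction execution and is not part of the patch:

```rust
use {
    cpu_time::ThreadTime,
    std::time::{Instant, SystemTime},
};

fn busy_work() -> u64 {
    (0..1_000_000u64).fold(0, |acc, i| acc.wrapping_add(i))
}

fn main() {
    let wall = Instant::now();
    let cpu = ThreadTime::now();

    std::hint::black_box(busy_work());

    // Wall time includes blocking and descheduling, so it is normally >= CPU time.
    let execution_us = wall.elapsed().as_micros() as u64;
    let execution_cpu_us = cpu.elapsed().as_micros();
    let finish_time = SystemTime::now();

    println!("{finish_time:?}: wall={execution_us}us cpu={execution_cpu_us}us");
}
```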
+#[cfg_attr(target_os = "linux", allow(dead_code))] +const DUMMY_TID: Tid = 0; + +#[derive(Default)] +struct LogInterval(usize); + +impl LogInterval { + fn increment(&mut self) -> bool { + let should_log = self.0 % 1000 == 0; + self.0 = self.0.checked_add(1).unwrap(); + should_log + } +} + +const PRIMARY_SCHEDULER_ID: SchedulerId = 0; + // This type manages the OS threads for scheduling and executing transactions. The term // `session` is consistently used to mean a group of Tasks scoped under a single SchedulingContext. // This is equivalent to a particular bank for block verification. However, new terms is introduced // here to mean some continuous time over multiple continuous banks/slots for the block production, // which is planned to be implemented in the future. #[derive(Debug)] -struct ThreadManager, TH: TaskHandler> { +struct ThreadManager +where + S: SpawnableScheduler, + TH: TaskHandler, + SEA: ScheduleExecutionArg, +{ scheduler_id: SchedulerId, - pool: Arc>, + pool: Arc>, + handler: TH, new_task_sender: Sender, - new_task_receiver: Receiver, + new_task_receiver: Option>, session_result_sender: Sender>, session_result_receiver: Receiver>, session_result_with_timings: Option, - scheduler_thread: Option>, + scheduler_thread_and_tid: Option<(JoinHandle>, Tid)>, handler_threads: Vec>, + accumulator_thread: Option>, } -impl PooledScheduler { - fn do_spawn(pool: Arc>, initial_context: SchedulingContext) -> Self { +impl PooledScheduler +where + TH: TaskHandler, + SEA: ScheduleExecutionArg, +{ + fn do_spawn( + pool: Arc>, + initial_context: SchedulingContext, + handler: TH, + ) -> Self { Self::from_inner( - PooledSchedulerInner:: { - thread_manager: ThreadManager::new(pool), + PooledSchedulerInner { + thread_manager: Arc::new(RwLock::new(ThreadManager::new(pool.clone(), handler))), usage_queue_loader: UsageQueueLoader::default(), + pooled_at: Instant::now(), }, initial_context, ) } + + #[cfg(feature = "dev-context-only-utils")] + pub fn clear_session_result_with_timings(&mut self) { + assert_matches!( + self.inner + .thread_manager + .write() + .unwrap() + .take_session_result_with_timings(), + (Ok(_), _) + ); + } + + #[cfg(feature = "dev-context-only-utils")] + pub fn restart_session(&mut self) { + self.inner + .thread_manager + .write() + .unwrap() + .start_session(&self.context); + } + + #[cfg(feature = "dev-context-only-utils")] + pub fn schedule_task(&self, task: Task) { + self.inner.thread_manager.read().unwrap().send_task(task); + } + + fn ensure_thread_manager_resumed( + &self, + context: &SchedulingContext, + ) -> std::result::Result>, TransactionError> + { + let mut was_already_active = false; + loop { + let read = self.inner.thread_manager.read().unwrap(); + if !read.is_suspended() { + debug!( + "{}", + if was_already_active { + "ensure_thread_manager_resumed(): was already active." + } else { + "ensure_thread_manager_resumed(): wasn't already active..." 
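`LogInterval` above is a tiny throttle: every call increments a counter, but only every 1000th call (including the very first) tells the caller to emit the scheduler status log line. A standalone restatement with a usage check:

```rust
#[derive(Default)]
struct LogInterval(usize);

impl LogInterval {
    fn increment(&mut self) -> bool {
        // Log on counts 0, 1000, 2000, ...; checked_add turns overflow into a panic
        // instead of silently wrapping.
        let should_log = self.0 % 1000 == 0;
        self.0 = self.0.checked_add(1).unwrap();
        should_log
    }
}

fn main() {
    let mut interval = LogInterval::default();
    let logged = (0..2500).filter(|_| interval.increment()).count();
    assert_eq!(logged, 3); // events 0, 1000 and 2000
}
```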
+ } + ); + return Ok(read); + } else { + debug!("ensure_thread_manager_resumed(): will start threads..."); + drop(read); + let mut write = self.inner.thread_manager.write().unwrap(); + write.start_or_try_resume_threads(context)?; + drop(write); + was_already_active = false; + } + } + } } -impl, TH: TaskHandler> ThreadManager { - fn new(pool: Arc>) -> Self { +impl ThreadManager +where + S: SpawnableScheduler, + TH: TaskHandler, + SEA: ScheduleExecutionArg, +{ + fn new(pool: Arc>, handler: TH) -> Self { let (new_task_sender, new_task_receiver) = crossbeam_channel::unbounded(); let (session_result_sender, session_result_receiver) = crossbeam_channel::unbounded(); let handler_count = pool.handler_count; @@ -512,23 +874,40 @@ impl, TH: TaskHandler> ThreadManager { Self { scheduler_id: pool.new_scheduler_id(), pool, + handler, new_task_sender, - new_task_receiver, + new_task_receiver: Some(new_task_receiver), session_result_sender, session_result_receiver, session_result_with_timings: None, - scheduler_thread: None, + scheduler_thread_and_tid: None, handler_threads: Vec::with_capacity(handler_count), + accumulator_thread: None, } } + fn is_suspended(&self) -> bool { + self.scheduler_thread_and_tid.is_none() + } + + pub fn take_scheduler_thread(&mut self) -> Option>> { + self.scheduler_thread_and_tid + .take() + .map(|(thread, _tid)| thread) + } + fn execute_task_with_handler( + handler: &TH, bank: &Arc, executed_task: &mut Box, handler_context: &HandlerContext, + send_metrics: bool, ) { + let handler_timings = + send_metrics.then_some((Measure::start("process_message_time"), ThreadTime::now())); debug!("handling task at {:?}", thread::current()); TH::handle( + handler, &mut executed_task.result_with_timings.0, &mut executed_task.result_with_timings.1, bank, @@ -536,30 +915,76 @@ impl, TH: TaskHandler> ThreadManager { executed_task.task.task_index(), handler_context, ); + if let Some((mut wall_time, cpu_time)) = handler_timings { + executed_task.handler_timings = Some(HandlerTimings { + finish_time: SystemTime::now(), + execution_cpu_us: cpu_time.elapsed().as_micros(), + execution_us: { + // make wall time is longer than cpu time, always + wall_time.stop(); + wall_time.as_us() + }, + }); + } } fn accumulate_result_with_timings( - (result, timings): &mut ResultWithTimings, + (_result, timings): &mut ResultWithTimings, executed_task: Box, ) { - match executed_task.result_with_timings.0 { - Ok(()) => {} - Err(error) => { - error!("error is detected while accumulating....: {error:?}"); - // Override errors intentionally for simplicity, not retaining the - // first error unlike the block verification in the - // blockstore_processor. This will be addressed with more - // full-fledged impl later. 
- *result = Err(error); - } + assert_matches!(executed_task.result_with_timings.0, Ok(())); + + if let Some(handler_timings) = &executed_task.handler_timings { + let thread = format!("solScExLane{:02}", executed_task.thx); + let signature = executed_task.task.transaction().signature().to_string(); + let account_locks_in_json = serde_json::to_string( + &executed_task + .task + .transaction() + .get_account_locks_unchecked(), + ) + .unwrap(); + let status = format!("{:?}", executed_task.result_with_timings.0); + let compute_unit_price = executed_task + .task + .transaction() + .get_compute_budget_details(false) + .map(|d| d.compute_unit_price) + .unwrap_or_default(); + + datapoint_info_at!( + handler_timings.finish_time, + "transaction_timings", + ("slot", executed_task.slot, i64), + ("index", executed_task.task.task_index(), i64), + ("thread", thread, String), + ("signature", signature, String), + ("account_locks_in_json", account_locks_in_json, String), + ("status", status, String), + ("duration", handler_timings.execution_us, i64), + ("cpu_duration", handler_timings.execution_cpu_us, i64), + ("compute_units", 0 /*task.cu*/, i64), + ("priority", compute_unit_price, i64), // old name is kept for compat... + ); } timings.accumulate(&executed_task.result_with_timings.1); + drop(executed_task); } fn take_session_result_with_timings(&mut self) -> ResultWithTimings { self.session_result_with_timings.take().unwrap() } + fn reset_session_on_error(&mut self) -> Result<()> { + let err = self + .session_result_with_timings + .replace(initialized_result_with_timings()) + .unwrap() + .0; + assert_matches!(err, Err(_)); + err + } + fn put_session_result_with_timings(&mut self, result_with_timings: ResultWithTimings) { assert_matches!( self.session_result_with_timings @@ -568,7 +993,24 @@ impl, TH: TaskHandler> ThreadManager { ); } - fn start_threads(&mut self, context: &SchedulingContext) { + fn start_or_try_resume_threads(&mut self, context: &SchedulingContext) -> Result<()> { + if !self.is_suspended() { + // this can't be promoted to panic! as read => write upgrade isn't completely + // race-free in ensure_thread_manager_resumed()... + warn!("try_resume(): already resumed"); + return Ok(()); + } else if self + .session_result_with_timings + .as_ref() + .map(|(result, _)| result.is_err()) + .unwrap_or(false) + { + warn!("try_resume(): skipping resuming due to err, while resetting session result"); + return self.reset_session_on_error(); + } + debug!("try_resume(): doing now"); + + let send_metrics = env::var("SOLANA_TRANSACTION_TIMINGS").is_ok(); // Firstly, setup bi-directional messaging between the scheduler and handlers to pass // around tasks, by creating 2 channels (one for to-be-handled tasks from the scheduler to // the handlers and the other for finished tasks from the handlers to the scheduler). @@ -660,7 +1102,14 @@ impl, TH: TaskHandler> ThreadManager { let (finished_idle_task_sender, finished_idle_task_receiver) = crossbeam_channel::unbounded::>(); - let mut result_with_timings = self.session_result_with_timings.take(); + let (retired_task_sender, retired_task_receiver) = + crossbeam_channel::unbounded::(); + let (accumulated_result_sender, accumulated_result_receiver) = + crossbeam_channel::unbounded::>(); + + let scheduler_id = self.scheduler_id; + let mut slot = context.bank().slot(); + let (tid_sender, tid_receiver) = crossbeam_channel::bounded(1); // High-level flow of new tasks: // 1. the replay stage thread send a new task. 
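`start_or_try_resume_threads()` above also has the freshly spawned scheduler thread report its OS thread id back over a bounded(1) channel before entering its main loop, falling back to `DUMMY_TID` (0) off Linux. A sketch of that one-shot handshake, assuming a Linux-only `rustix` dependency as in the patch:

```rust
use {crossbeam_channel::bounded, std::thread};

fn main() {
    let (tid_sender, tid_receiver) = bounded::<i32>(1);

    let worker = thread::spawn(move || {
        #[cfg(target_os = "linux")]
        let tid = rustix::thread::gettid().as_raw_nonzero().get();
        #[cfg(not(target_os = "linux"))]
        let tid = 0; // stand-in for DUMMY_TID

        // Report the tid exactly once, before doing any real work, so the
        // spawner never observes a half-initialized thread.
        tid_sender.send(tid).unwrap();
        // ... main loop would run here ...
    });

    let tid = tid_receiver.recv().unwrap();
    println!("worker thread id: {tid}");
    worker.join().unwrap();
}
```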
@@ -669,12 +1118,16 @@ impl, TH: TaskHandler> ThreadManager { // 4. the handler thread processes the dispatched task. // 5. the handler thread reply back to the scheduler thread as an executed task. // 6. the scheduler thread post-processes the executed task. + // 7. the scheduler thread send the executed task to the accumulator thread. + // 8. the accumulator thread examines the executed task's result and accumulate its timing, + // finally dropping the transaction inside the executed task. let scheduler_main_loop = || { let handler_count = self.pool.handler_count; let session_result_sender = self.session_result_sender.clone(); - let new_task_receiver = self.new_task_receiver.clone(); + let mut new_task_receiver = self.new_task_receiver.take().unwrap(); let mut session_ending = false; + let mut thread_suspending = false; // Now, this is the main loop for the scheduler thread, which is a special beast. // @@ -724,24 +1177,43 @@ impl, TH: TaskHandler> ThreadManager { } }; + const BITS_PER_HEX_DIGIT: usize = 4; let mut state_machine = unsafe { SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling() }; - - loop { - if let Ok(NewTaskPayload::OpenSubchannel(context)) = new_task_receiver.recv() { - // signal about new SchedulingContext to handler threads - runnable_task_sender - .send_chained_channel(context, handler_count) - .unwrap(); - assert_matches!( - result_with_timings.replace(initialized_result_with_timings()), - None + let mut log_interval = LogInterval::default(); + // hint compiler about inline[never] and unlikely? + macro_rules! log_scheduler { + ($prefix:tt) => { + info!( + "[sch_{:0width$x}]: slot: {}[{:12}]({}{}): state_machine(({}(+{})=>{})/{}|{}) channels(<{} >{}+{} <{}+{})", + scheduler_id, slot, + (if ($prefix) == "step" { "interval" } else { $prefix }), + (if session_ending {"S"} else {"-"}), (if thread_suspending {"T"} else {"-"}), + state_machine.active_task_count(), state_machine.unblocked_task_queue_count(), state_machine.handled_task_count(), + state_machine.total_task_count(), + state_machine.unblocked_task_count(), + new_task_receiver.len(), + runnable_task_sender.len(), runnable_task_sender.aux_len(), + finished_blocked_task_receiver.len(), finished_idle_task_receiver.len(), + width = SchedulerId::BITS as usize / BITS_PER_HEX_DIGIT, ); - } else { - unreachable!(); - } + }; + } + trace!("solScheduler thread is running at: {:?}", thread::current()); + tid_sender + .send({ + #[cfg(not(target_os = "linux"))] + let tid = DUMMY_TID; + #[cfg(target_os = "linux")] + let tid = rustix::thread::gettid().as_raw_nonzero().get(); + tid + }) + .unwrap(); + log_scheduler!("T:started"); + + while !thread_suspending { let mut is_finished = false; while !is_finished { // ALL recv selectors are eager-evaluated ALWAYS by current crossbeam impl, @@ -751,9 +1223,6 @@ impl, TH: TaskHandler> ThreadManager { let dummy_unblocked_task_receiver = dummy_receiver(state_machine.has_unblocked_task()); - // (Assume this is biased; i.e. select_biased! in this crossbeam pr: - // https://github.com/rust-lang/futures-rs/pull/1976) - // // There's something special called dummy_unblocked_task_receiver here. // This odd pattern was needed to react to newly unblocked tasks from // _not-crossbeam-channel_ event sources, precisely at the specified @@ -764,13 +1233,25 @@ impl, TH: TaskHandler> ThreadManager { // consistent. Note that unified scheduler will go // into busy looping to seek lowest latency eventually. 
However, not now, // to measure _actual_ cpu usage easily with the select approach. - select! { + let state_change = select_biased! { recv(finished_blocked_task_receiver) -> executed_task => { let executed_task = executed_task.unwrap(); - state_machine.deschedule_task(&executed_task.task); - let result_with_timings = result_with_timings.as_mut().unwrap(); - Self::accumulate_result_with_timings(result_with_timings, executed_task); + if executed_task.is_err() { + log_scheduler!("S+T:aborted"); + // MUST: clear the usage queue loader before reusing this scheduler + // ... + session_result_sender.send(None).unwrap(); + // be explicit about specifically dropping this receiver + drop(new_task_receiver); + // this timings aren't for the accumulated one. but + // caller doesn't care. + return Some(executed_task.result_with_timings); + } else { + state_machine.deschedule_task(&executed_task.task); + retired_task_sender.send_buffered(RetiredTaskPayload::Payload(executed_task)).unwrap(); + } + "step" }, recv(dummy_unblocked_task_receiver) -> dummy => { assert_matches!(dummy, Err(RecvError)); @@ -779,89 +1260,229 @@ impl, TH: TaskHandler> ThreadManager { .schedule_next_unblocked_task() .expect("unblocked task"); runnable_task_sender.send_payload(task).unwrap(); + "step" }, recv(new_task_receiver) -> message => { - assert!(!session_ending); + assert!(message.is_err() || (!session_ending && !thread_suspending)); - match message.unwrap() { - NewTaskPayload::Payload(task) => { + match message { + Ok(NewTaskPayload::Payload(task)) => { if let Some(task) = state_machine.schedule_task(task) { runnable_task_sender.send_aux_payload(task).unwrap(); } + "step" } - NewTaskPayload::CloseSubchannel => { + Ok(NewTaskPayload::CloseSubchannel) => { session_ending = true; + "S:ending" } - NewTaskPayload::OpenSubchannel(_context) => { + Ok(NewTaskPayload::OpenSubchannel(_context)) => { unreachable!(); } + Err(_) => { + assert!(!thread_suspending); + thread_suspending = true; + + // Err(_) on new_task_receiver guarantees + // that there's no live sender and no messages to be + // received anymore; so dropping by overriding it with + // never() should pose no possibility of missed messages. + new_task_receiver = never(); + + "T:suspending" + } } }, recv(finished_idle_task_receiver) -> executed_task => { let executed_task = executed_task.unwrap(); - state_machine.deschedule_task(&executed_task.task); - let result_with_timings = result_with_timings.as_mut().unwrap(); - Self::accumulate_result_with_timings(result_with_timings, executed_task); + if executed_task.is_err() { + log_scheduler!("S+T:aborted"); + session_result_sender.send(None).unwrap(); + // be explicit about specifically dropping this receiver + drop(new_task_receiver); + // this timings aren't for the accumulated one. but + // caller doesn't care. 
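When the new-task channel disconnects, the scheduler loop above swaps the receiver for `crossbeam_channel::never()` so the remaining channels keep being serviced without repeatedly hitting `Err(Disconnected)`. A sketch of the same idea with the stock `select!` macro (the patch itself uses the fork's `select_biased!`); the channel names here are illustrative:

```rust
use crossbeam_channel::{never, select, unbounded};

fn main() {
    let (task_sender, mut task_receiver) = unbounded::<u32>();
    let (result_sender, result_receiver) = unbounded::<u32>();

    task_sender.send(7).unwrap();
    drop(task_sender); // simulate the task-sending side going away
    result_sender.send(99).unwrap();

    let mut seen = Vec::new();
    while seen.len() < 2 {
        let task_channel_disconnected = select! {
            recv(task_receiver) -> message => {
                match message {
                    Ok(task) => {
                        seen.push(task);
                        false
                    }
                    Err(_) => true,
                }
            },
            recv(result_receiver) -> message => {
                if let Ok(result) = message {
                    seen.push(result);
                }
                false
            },
        };
        if task_channel_disconnected {
            // Stop selecting on the dead channel; never() is never ready.
            task_receiver = never();
        }
    }
    assert!(seen.contains(&7) && seen.contains(&99));
}
```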
+ return Some(executed_task.result_with_timings); + } else { + state_machine.deschedule_task(&executed_task.task); + retired_task_sender.send_buffered(RetiredTaskPayload::Payload(executed_task)).unwrap(); + } + "step" }, }; + if state_change != "step" || log_interval.increment() { + log_scheduler!(state_change); + } - is_finished = session_ending && state_machine.has_no_active_task(); + is_finished = (session_ending || thread_suspending) + && state_machine.has_no_active_task(); } if session_ending { + log_scheduler!("S:ended"); state_machine.reinitialize(); + log_interval = LogInterval::default(); + retired_task_sender + .send(RetiredTaskPayload::CloseSubchannel) + .unwrap(); session_result_sender .send(Some( - result_with_timings - .take() + accumulated_result_receiver + .recv() + .unwrap() .unwrap_or_else(initialized_result_with_timings), )) .unwrap(); - session_ending = false; + if !thread_suspending { + session_ending = false; + } + } + + if !thread_suspending { + match new_task_receiver.recv() { + Ok(NewTaskPayload::OpenSubchannel(context)) => { + slot = context.bank().slot(); + // signal about new SchedulingContext to handler threads + runnable_task_sender + .send_chained_channel(context, handler_count) + .unwrap(); + retired_task_sender + .send(RetiredTaskPayload::OpenSubchannel(())) + .unwrap(); + log_scheduler!("S:started"); + } + Err(_) => { + assert!(!thread_suspending); + thread_suspending = true; + log_scheduler!("T:suspending"); + continue; + } + Ok(_) => { + unreachable!(); + } + } } } + + log_scheduler!("T:suspended"); + let scheduler_result_with_timings = if session_ending { + None + } else { + retired_task_sender + .send(RetiredTaskPayload::CloseSubchannel) + .unwrap(); + accumulated_result_receiver.recv().unwrap() + }; + trace!( + "solScheduler thread is terminating at: {:?}", + thread::current() + ); + scheduler_result_with_timings } }; - let handler_main_loop = || { + let handler_main_loop = |thx| { let pool = self.pool.clone(); + let handler = self.handler.clone(); let mut runnable_task_receiver = runnable_task_receiver.clone(); let finished_blocked_task_sender = finished_blocked_task_sender.clone(); let finished_idle_task_sender = finished_idle_task_sender.clone(); - move || loop { - let (task, sender) = select! { - recv(runnable_task_receiver.for_select()) -> message => { - if let Some(task) = runnable_task_receiver.after_select(message.unwrap()) { - (task, &finished_blocked_task_sender) - } else { - continue; - } - }, - recv(runnable_task_receiver.aux_for_select()) -> task => { - if let Ok(task) = task { - (task, &finished_idle_task_sender) - } else { - continue; - } - }, - }; - let mut task = ExecutedTask::new_boxed(task); - Self::execute_task_with_handler( - runnable_task_receiver.context().bank(), - &mut task, - &pool.handler_context, + move || { + trace!( + "solScHandler{:02} thread is running at: {:?}", + thx, + thread::current() ); - sender.send(task).unwrap(); + loop { + let (task, sender) = select_biased! 
{ + recv(runnable_task_receiver.for_select()) -> message => { + match message { + Ok(message) => { + if let Some(task) = runnable_task_receiver.after_select(message) { + (task, &finished_blocked_task_sender) + } else { + continue; + } + }, + Err(_) => break, + } + }, + recv(runnable_task_receiver.aux_for_select()) -> task => { + if let Ok(task) = task { + (task, &finished_idle_task_sender) + } else { + continue; + } + }, + }; + let bank = runnable_task_receiver.context().bank(); + let mut task = ExecutedTask::new_boxed(task, thx, bank.slot()); + Self::execute_task_with_handler( + &handler, + bank, + &mut task, + &pool.handler_context, + send_metrics, + ); + if sender.send(task).is_err() { + break; + } + } + trace!( + "solScHandler{:02} thread is terminating at: {:?}", + thx, + thread::current() + ); + } + }; + + let mut accumulator_result_with_timings = self.session_result_with_timings.take(); + + let accumulator_main_loop = || { + move || 'outer: loop { + match retired_task_receiver.recv_timeout(Duration::from_millis(40)) { + Ok(RetiredTaskPayload::Payload(executed_task)) => { + Self::accumulate_result_with_timings( + accumulator_result_with_timings.as_mut().unwrap(), + executed_task, + ); + } + Ok(RetiredTaskPayload::OpenSubchannel(())) => { + assert_matches!( + accumulator_result_with_timings + .replace(initialized_result_with_timings()), + None + ); + } + Ok(RetiredTaskPayload::CloseSubchannel) => { + if accumulated_result_sender + .send(accumulator_result_with_timings.take()) + .is_err() + { + break 'outer; + } + } + Err(RecvTimeoutError::Disconnected) => break 'outer, + Err(RecvTimeoutError::Timeout) => continue, + } } }; - self.scheduler_thread = Some( + self.scheduler_thread_and_tid = Some(( thread::Builder::new() .name("solScheduler".to_owned()) .spawn(scheduler_main_loop()) .unwrap(), + tid_receiver.recv().unwrap(), + )); + + self.accumulator_thread = Some( + thread::Builder::new() + .name("solScAccmltr".to_owned()) + .spawn(accumulator_main_loop()) + .unwrap(), ); self.handler_threads = (0..self.pool.handler_count) @@ -869,97 +1490,194 @@ impl, TH: TaskHandler> ThreadManager { |thx| { thread::Builder::new() .name(format!("solScHandler{:02}", thx)) - .spawn(handler_main_loop()) + .spawn(handler_main_loop(thx)) .unwrap() } }) .collect(); + Ok(()) } - fn send_task(&self, task: Task) { + fn send_task(&self, task: Task) -> bool { debug!("send_task()"); self.new_task_sender .send(NewTaskPayload::Payload(task)) - .unwrap() + .is_err() } fn end_session(&mut self) { - if self.session_result_with_timings.is_some() { + debug!("end_session(): will end session..."); + if self.is_suspended() { + debug!("end_session(): no threads.."); + assert_matches!(self.session_result_with_timings, Some(_)); + return; + } else if self.session_result_with_timings.is_some() { debug!("end_session(): already result resides within thread manager.."); return; } - debug!("end_session(): will end session..."); - self.new_task_sender + let mut abort_detected = self + .new_task_sender .send(NewTaskPayload::CloseSubchannel) - .unwrap(); + .is_err(); if let Some(result_with_timings) = self.session_result_receiver.recv().unwrap() { + assert!(!abort_detected); self.put_session_result_with_timings(result_with_timings); + } else { + abort_detected = true; + } + + if abort_detected { + self.suspend(); } } fn start_session(&mut self, context: &SchedulingContext) { - assert_matches!(self.session_result_with_timings, None); - self.new_task_sender - .send(NewTaskPayload::OpenSubchannel(context.clone())) - .unwrap(); + if 
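The new accumulator thread speaks the same single-channel subchannel protocol as `NewTaskPayload`/`RetiredTaskPayload` above: an open message resets the per-session state, payloads are folded in one by one, and a close message ships the session result back. A reduced sketch of that protocol with plain integers standing in for executed tasks (the simplified `OpenSubchannel` variant here carries no context, unlike the real type):

```rust
use {crossbeam_channel::unbounded, std::thread};

enum SubchanneledPayload<P> {
    Payload(P),
    OpenSubchannel,
    CloseSubchannel,
}

fn main() {
    let (subchannel_sender, subchannel_receiver) = unbounded::<SubchanneledPayload<u64>>();
    let (total_sender, total_receiver) = unbounded::<u64>();

    let accumulator = thread::Builder::new()
        .name("accumulator-example".to_owned())
        .spawn(move || {
            let mut total = None;
            while let Ok(message) = subchannel_receiver.recv() {
                match message {
                    SubchanneledPayload::OpenSubchannel => {
                        // A session must not already be open.
                        assert!(total.replace(0).is_none());
                    }
                    SubchanneledPayload::Payload(value) => {
                        *total.as_mut().unwrap() += value;
                    }
                    SubchanneledPayload::CloseSubchannel => {
                        total_sender.send(total.take().unwrap()).unwrap();
                    }
                }
            }
        })
        .unwrap();

    subchannel_sender.send(SubchanneledPayload::OpenSubchannel).unwrap();
    for value in [1, 2, 3] {
        subchannel_sender.send(SubchanneledPayload::Payload(value)).unwrap();
    }
    subchannel_sender.send(SubchanneledPayload::CloseSubchannel).unwrap();
    assert_eq!(total_receiver.recv().unwrap(), 6);

    drop(subchannel_sender); // disconnect ends the accumulator loop
    accumulator.join().unwrap();
}
```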
!self.is_suspended() { + assert_matches!(self.session_result_with_timings, None); + self.new_task_sender + .send(NewTaskPayload::OpenSubchannel(context.clone())) + .unwrap(); + } else { + self.put_session_result_with_timings(initialized_result_with_timings()); + assert_matches!(self.start_or_try_resume_threads(context), Ok(())); + } + } + + fn suspend(&mut self) { + let Some(scheduler_thread) = self.take_scheduler_thread() else { + warn!("suspend(): already suspended..."); + return; + }; + debug!("suspend(): terminating threads by {:?}", thread::current()); + + let (s, r) = crossbeam_channel::unbounded(); + (self.new_task_sender, self.new_task_receiver) = (s, Some(r)); + + let () = self.accumulator_thread.take().unwrap().join().unwrap(); + for thread in self.handler_threads.drain(..) { + debug!("joining...: {:?}", thread); + () = thread.join().unwrap(); + } + if let Some(result_with_timings) = scheduler_thread.join().unwrap() { + self.put_session_result_with_timings(result_with_timings); + } + + debug!( + "suspend(): successfully suspended threads by {:?}", + thread::current() + ); + } + + fn is_primary(&self) -> bool { + self.scheduler_id == PRIMARY_SCHEDULER_ID + } + + #[cfg(target_os = "linux")] + fn active_tid_if_not_primary(&self) -> Option { + if self.is_primary() { + // always exempt from cleaner... + None + } else { + self.scheduler_thread_and_tid.as_ref().map(|&(_, tid)| tid) + } } } -pub trait SpawnableScheduler: InstalledScheduler { - type Inner: Debug + Send + Sync; +pub trait SpawnableScheduler: InstalledScheduler +where + TH: TaskHandler, + SEA: ScheduleExecutionArg, +{ + type Inner: Debug + Send + Sync + RetirableSchedulerInner; fn into_inner(self) -> (ResultWithTimings, Self::Inner); fn from_inner(inner: Self::Inner, context: SchedulingContext) -> Self; - fn spawn(pool: Arc>, initial_context: SchedulingContext) -> Self + fn spawn( + pool: Arc>, + initial_context: SchedulingContext, + handler: TH, + ) -> Self where Self: Sized; } -impl SpawnableScheduler for PooledScheduler { - type Inner = PooledSchedulerInner; +pub trait RetirableSchedulerInner { + fn retire_if_stale(&mut self) -> bool; +} - fn into_inner(mut self) -> (ResultWithTimings, Self::Inner) { +impl SpawnableScheduler for PooledScheduler +where + TH: TaskHandler, + SEA: ScheduleExecutionArg, +{ + type Inner = PooledSchedulerInner; + + fn into_inner(self) -> (ResultWithTimings, Self::Inner) { let result_with_timings = { - let manager = &mut self.inner.thread_manager; + let manager = &mut self.inner.thread_manager.write().unwrap(); manager.end_session(); manager.take_session_result_with_timings() }; (result_with_timings, self.inner) } - fn from_inner(mut inner: Self::Inner, context: SchedulingContext) -> Self { - inner.thread_manager.start_session(&context); + fn from_inner(inner: Self::Inner, context: SchedulingContext) -> Self { + inner + .thread_manager + .write() + .unwrap() + .start_session(&context); Self { inner, context } } - fn spawn(pool: Arc>, initial_context: SchedulingContext) -> Self { - let mut scheduler = Self::do_spawn(pool, initial_context); - scheduler - .inner - .thread_manager - .start_threads(&scheduler.context); + fn spawn( + pool: Arc>, + initial_context: SchedulingContext, + handler: TH, + ) -> Self { + let scheduler = Self::do_spawn(pool.clone(), initial_context, handler); + pool.register_to_cleaner(Arc::downgrade(&scheduler.inner.thread_manager)); scheduler } } -impl InstalledScheduler for PooledScheduler { +impl InstalledScheduler for PooledScheduler +where + TH: TaskHandler, + SEA: 
ScheduleExecutionArg, +{ fn id(&self) -> SchedulerId { - self.inner.thread_manager.scheduler_id + self.inner.id() } fn context(&self) -> &SchedulingContext { &self.context } - fn schedule_execution(&self, &(transaction, index): &(&SanitizedTransaction, usize)) { - let task = SchedulingStateMachine::create_task(transaction.clone(), index, &mut |pubkey| { - self.inner.usage_queue_loader.load(pubkey) - }); - self.inner.thread_manager.send_task(task); + fn schedule_execution( + &self, + transaction_with_index: SEA::TransactionWithIndex<'_>, + ) -> Result<()> { + transaction_with_index.with_transaction_and_index(|transaction, index| { + let task = + SchedulingStateMachine::create_task(transaction.clone(), index, &mut |pubkey| { + self.inner.usage_queue_loader.load(pubkey) + }); + let abort_detected = self + .ensure_thread_manager_resumed(&self.context)? + .send_task(task); + if abort_detected { + let thread_manager = &mut self.inner.thread_manager.write().unwrap(); + thread_manager.suspend(); + thread_manager.reset_session_on_error() + } else { + Ok(()) + } + }) } fn wait_for_termination( @@ -971,17 +1689,78 @@ impl InstalledScheduler for PooledScheduler { } fn pause_for_recent_blockhash(&mut self) { - self.inner.thread_manager.end_session(); + self.inner.thread_manager.write().unwrap().end_session(); + } +} + +impl UninstalledScheduler for PooledSchedulerInner +where + S: SpawnableScheduler>, + TH: TaskHandler, + SEA: ScheduleExecutionArg, +{ + fn return_to_pool(mut self: Box) { + let pool = self.thread_manager.write().unwrap().pool.clone(); + self.pooled_at = Instant::now(); + pool.return_scheduler(*self) } } -impl UninstalledScheduler for PooledSchedulerInner +impl RetirableSchedulerInner for PooledSchedulerInner where - S: SpawnableScheduler>, - TH: TaskHandler, + S: SpawnableScheduler>, + TH: TaskHandler, + SEA: ScheduleExecutionArg, { - fn return_to_pool(self: Box) { - self.thread_manager.pool.clone().return_scheduler(*self) + fn retire_if_stale(&mut self) -> bool { + // reap threads after 10mins of inactivity for any pooled (idle) schedulers. The primary + // scheduler is special-cased to empty its usage queue loader book instead, for easier + // monitoring to accumulate os-level thread metrics. The duration is chosen based on the + // rough estimation from the frequency of short-lived forks on the mainnet-beta, with + // consideration of some increased forking at epoch boundaries. 
+ const IDLE_DURATION_FOR_LAZY_THREAD_RECLAIM: Duration = Duration::from_secs(600); + + const BITS_PER_HEX_DIGIT: usize = 4; + let usage_queue_count = self.usage_queue_loader.usage_queue_count(); + if usage_queue_count < 200_000 { + info!( + "[sch_{:0width$x}]: cleaner: usage queue loader book size: {usage_queue_count}...", + self.id(), + width = SchedulerId::BITS as usize / BITS_PER_HEX_DIGIT, + ); + } else if self.thread_manager.read().unwrap().is_primary() { + info!( + "[sch_{:0width$x}]: cleaner: too big usage queue loader book size: {usage_queue_count}...; emptying the primary scheduler", + self.id(), + width = SchedulerId::BITS as usize / BITS_PER_HEX_DIGIT, + ); + self.usage_queue_loader.clear(); + return true; + } else { + info!( + "[sch_{:0width$x}]: cleaner: too big usage queue loader book size: {usage_queue_count}...; retiring scheduler", + self.id(), + width = SchedulerId::BITS as usize / BITS_PER_HEX_DIGIT, + ); + self.suspend_thread_manager(); + return false; + } + + let pooled_duration = self.pooled_since(); + if pooled_duration <= IDLE_DURATION_FOR_LAZY_THREAD_RECLAIM { + true + } else if !self.thread_manager.read().unwrap().is_primary() { + info!( + "[sch_{:0width$x}]: cleaner: retiring unused scheduler after {:?}...", + self.id(), + pooled_duration, + width = SchedulerId::BITS as usize / BITS_PER_HEX_DIGIT, + ); + self.suspend_thread_manager(); + false + } else { + true + } } } @@ -989,7 +1768,6 @@ where mod tests { use { super::*, - assert_matches::assert_matches, solana_runtime::{ bank::Bank, bank_forks::BankForks, @@ -1000,11 +1778,17 @@ mod tests { solana_sdk::{ clock::{Slot, MAX_PROCESSING_AGE}, pubkey::Pubkey, + scheduling::SchedulingMode, signer::keypair::Keypair, system_transaction, transaction::{SanitizedTransaction, TransactionError}, }, - std::{sync::Arc, thread::JoinHandle}, + std::{ + mem, + sync::Arc, + thread::{self, sleep, JoinHandle}, + time::Duration, + }, }; #[test] @@ -1017,7 +1801,10 @@ mod tests { // this indirectly proves that there should be circular link because there's only one Arc // at this moment now - assert_eq!((Arc::strong_count(&pool), Arc::weak_count(&pool)), (1, 1)); + assert_eq!( + (Arc::strong_count(&pool), Arc::weak_count(&pool)), + (1 + 1 /* todo */, 1) + ); let debug = format!("{pool:#?}"); assert!(!debug.is_empty()); } @@ -1030,7 +1817,7 @@ mod tests { let pool = DefaultSchedulerPool::new_dyn(None, None, None, None, ignored_prioritization_fee_cache); let bank = Arc::new(Bank::default_for_tests()); - let context = SchedulingContext::new(bank); + let context = SchedulingContext::new(SchedulingMode::BlockVerification, bank); let scheduler = pool.take_scheduler(context); let debug = format!("{scheduler:#?}"); @@ -1045,7 +1832,7 @@ mod tests { let pool = DefaultSchedulerPool::new(None, None, None, None, ignored_prioritization_fee_cache); let bank = Arc::new(Bank::default_for_tests()); - let context = &SchedulingContext::new(bank); + let context = &SchedulingContext::new(SchedulingMode::BlockVerification, bank); let scheduler1 = pool.do_take_scheduler(context.clone()); let scheduler_id1 = scheduler1.id(); @@ -1074,7 +1861,7 @@ mod tests { let pool = DefaultSchedulerPool::new(None, None, None, None, ignored_prioritization_fee_cache); let bank = Arc::new(Bank::default_for_tests()); - let context = &SchedulingContext::new(bank); + let context = &SchedulingContext::new(SchedulingMode::BlockVerification, bank); let mut scheduler = pool.do_take_scheduler(context.clone()); // should never panic. 
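`retire_if_stale()` above keeps a scheduler pooled unless its usage-queue book has grown past 200,000 entries (the primary scheduler merely empties its book instead of retiring) or it has sat idle for more than ten minutes. A pure-function restatement of that decision; the `MAX_USAGE_QUEUE_COUNT` name is mine, the thresholds are the patch's:

```rust
use std::time::Duration;

const IDLE_DURATION_FOR_LAZY_THREAD_RECLAIM: Duration = Duration::from_secs(600);
const MAX_USAGE_QUEUE_COUNT: usize = 200_000;

/// true = keep the scheduler pooled, false = retire (suspend) it.
fn should_keep(usage_queue_count: usize, is_primary: bool, pooled_for: Duration) -> bool {
    if usage_queue_count >= MAX_USAGE_QUEUE_COUNT {
        // The primary scheduler only empties its oversized usage-queue book;
        // any other scheduler with an oversized book is retired outright.
        return is_primary;
    }
    // Small book: retire non-primary schedulers that sat idle for too long.
    is_primary || pooled_for <= IDLE_DURATION_FOR_LAZY_THREAD_RECLAIM
}

fn main() {
    assert!(should_keep(10, false, Duration::from_secs(30)));
    assert!(should_keep(10, true, Duration::from_secs(601)));
    assert!(should_keep(300_000, true, Duration::from_secs(30)));
    assert!(!should_keep(300_000, false, Duration::from_secs(30)));
    assert!(!should_keep(10, false, Duration::from_secs(601)));
}
```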
@@ -1096,8 +1883,10 @@ mod tests { let new_bank = &Arc::new(Bank::default_for_tests()); assert!(!Arc::ptr_eq(old_bank, new_bank)); - let old_context = &SchedulingContext::new(old_bank.clone()); - let new_context = &SchedulingContext::new(new_bank.clone()); + let old_context = + &SchedulingContext::new(SchedulingMode::BlockVerification, old_bank.clone()); + let new_context = + &SchedulingContext::new(SchedulingMode::BlockVerification, new_bank.clone()); let scheduler = pool.do_take_scheduler(old_context.clone()); let scheduler_id = scheduler.id(); @@ -1114,11 +1903,14 @@ mod tests { let bank = Bank::default_for_tests(); let bank_forks = BankForks::new_rw_arc(bank); - let mut bank_forks = bank_forks.write().unwrap(); + let mut bank_forks_write = bank_forks.write().unwrap(); let ignored_prioritization_fee_cache = Arc::new(PrioritizationFeeCache::new(0u64)); let pool = DefaultSchedulerPool::new_dyn(None, None, None, None, ignored_prioritization_fee_cache); - bank_forks.install_scheduler_pool(pool); + bank_forks_write.install_scheduler_pool(pool); + bank_forks_write.prepare_to_drop(); + drop(bank_forks_write); + drop::(Arc::into_inner(bank_forks).unwrap().into_inner().unwrap()); } #[test] @@ -1182,11 +1974,11 @@ mod tests { let ignored_prioritization_fee_cache = Arc::new(PrioritizationFeeCache::new(0u64)); let pool = DefaultSchedulerPool::new_dyn(None, None, None, None, ignored_prioritization_fee_cache); - let context = SchedulingContext::new(bank.clone()); + let context = SchedulingContext::new(SchedulingMode::BlockVerification, bank.clone()); assert_eq!(bank.transaction_count(), 0); let scheduler = pool.take_scheduler(context); - scheduler.schedule_execution(&(tx0, 0)); + scheduler.schedule_execution(&(tx0, 0)).unwrap(); let bank = BankWithScheduler::new(bank, Some(scheduler)); assert_matches!(bank.wait_for_completed_scheduler(), Some((Ok(()), _))); assert_eq!(bank.transaction_count(), 1); @@ -1207,7 +1999,7 @@ mod tests { let ignored_prioritization_fee_cache = Arc::new(PrioritizationFeeCache::new(0u64)); let pool = DefaultSchedulerPool::new_dyn(None, None, None, None, ignored_prioritization_fee_cache); - let context = SchedulingContext::new(bank.clone()); + let context = SchedulingContext::new(SchedulingMode::BlockVerification, bank.clone()); let mut scheduler = pool.take_scheduler(context); let unfunded_keypair = Keypair::new(); @@ -1219,9 +2011,9 @@ mod tests { genesis_config.hash(), )); assert_eq!(bank.transaction_count(), 0); - scheduler.schedule_execution(&(bad_tx, 0)); + scheduler.schedule_execution(&(bad_tx, 0)).unwrap(); // simulate the task-sending thread is stalled for some reason. - std::thread::sleep(std::time::Duration::from_secs(1)); + sleep(Duration::from_secs(1)); assert_eq!(bank.transaction_count(), 0); let good_tx_after_bad_tx = @@ -1237,25 +2029,25 @@ mod tests { .result, Ok(_) ); - scheduler.schedule_execution(&(good_tx_after_bad_tx, 1)); + sleep(Duration::from_secs(3)); + scheduler + .schedule_execution(&(good_tx_after_bad_tx, 1)) + .unwrap_err(); + error!("last pause!"); scheduler.pause_for_recent_blockhash(); // transaction_count should remain same as scheduler should be bailing out. // That's because we're testing the serialized failing execution case in this test. - // However, currently threaded impl can't properly abort in this situtation.. - // so, 1 should be observed, intead of 0. 
// Also note that bank.transaction_count() is generally racy by nature, because // blockstore_processor and unified_scheduler both tend to process non-conflicting batches // in parallel as part of the normal operation. - assert_eq!(bank.transaction_count(), 1); + assert_eq!(bank.transaction_count(), 0); let bank = BankWithScheduler::new(bank, Some(scheduler)); assert_matches!( bank.wait_for_completed_scheduler(), - Some(( - Err(solana_sdk::transaction::TransactionError::AccountNotFound), - _timings - )) + Some((Ok(()), _timings)) ); + pool.uninstalled_from_bank_forks(); } #[test] @@ -1266,10 +2058,17 @@ mod tests { const BLOCKED_TRANSACTION_INDEX: usize = 1; static LOCK_TO_STALL: Mutex<()> = Mutex::new(()); - #[derive(Debug)] + #[derive(Debug, Clone)] struct StallingHandler; - impl TaskHandler for StallingHandler { + impl TaskHandler for StallingHandler { + fn create>( + _pool: &SchedulerPool, + ) -> Self { + Self + } + fn handle( + &self, result: &mut Result<()>, timings: &mut ExecuteTimings, bank: &Arc, @@ -1282,7 +2081,8 @@ mod tests { BLOCKED_TRANSACTION_INDEX => {} _ => unreachable!(), }; - DefaultTaskHandler::handle( + >::handle( + &DefaultTaskHandler, result, timings, bank, @@ -1316,25 +2116,27 @@ mod tests { let bank = Bank::new_for_tests(&genesis_config); let bank = setup_dummy_fork_graph(bank); let ignored_prioritization_fee_cache = Arc::new(PrioritizationFeeCache::new(0u64)); - let pool = SchedulerPool::, _>::new_dyn( - None, - None, - None, - None, - ignored_prioritization_fee_cache, - ); - let context = SchedulingContext::new(bank.clone()); + let pool = SchedulerPool::< + PooledScheduler, + _, + _, + >::new_dyn(None, None, None, None, ignored_prioritization_fee_cache); + let context = SchedulingContext::new(SchedulingMode::BlockVerification, bank.clone()); assert_eq!(bank.transaction_count(), 0); let scheduler = pool.take_scheduler(context); // Stall handling tx0 and tx1 let lock_to_stall = LOCK_TO_STALL.lock().unwrap(); - scheduler.schedule_execution(&(tx0, STALLED_TRANSACTION_INDEX)); - scheduler.schedule_execution(&(tx1, BLOCKED_TRANSACTION_INDEX)); + scheduler + .schedule_execution(&(tx0, STALLED_TRANSACTION_INDEX)) + .unwrap(); + scheduler + .schedule_execution(&(tx1, BLOCKED_TRANSACTION_INDEX)) + .unwrap(); // Wait a bit for the scheduler thread to decide to block tx1 - std::thread::sleep(std::time::Duration::from_secs(1)); + sleep(Duration::from_secs(1)); // Resume handling by unlocking LOCK_TO_STALL drop(lock_to_stall); @@ -1347,10 +2149,17 @@ mod tests { fn test_scheduler_mismatched_scheduling_context_race() { solana_logger::setup(); - #[derive(Debug)] + #[derive(Debug, Clone)] struct TaskAndContextChecker; - impl TaskHandler for TaskAndContextChecker { + impl TaskHandler for TaskAndContextChecker { + fn create>( + _pool: &SchedulerPool, + ) -> Self { + Self + } + fn handle( + &self, _result: &mut Result<()>, _timings: &mut ExecuteTimings, bank: &Arc, @@ -1379,7 +2188,11 @@ mod tests { )); let ignored_prioritization_fee_cache = Arc::new(PrioritizationFeeCache::new(0u64)); - let pool = SchedulerPool::, _>::new( + let pool = SchedulerPool::< + PooledScheduler, + _, + _, + >::new( Some(4), // spawn 4 threads None, None, @@ -1395,8 +2208,8 @@ mod tests { 2, genesis_config.hash(), )); - let context0 = &SchedulingContext::new(bank0.clone()); - let context1 = &SchedulingContext::new(bank1.clone()); + let context0 = &SchedulingContext::new(SchedulingMode::BlockVerification, bank0.clone()); + let context1 = &SchedulingContext::new(SchedulingMode::BlockVerification, 
bank1.clone()); // Exercise the scheduler by busy-looping to expose the race condition for (context, index) in [(context0, 0), (context1, 1)] @@ -1405,7 +2218,7 @@ mod tests { .take(10000) { let scheduler = pool.take_scheduler(context.clone()); - scheduler.schedule_execution(&(dummy_tx, index)); + scheduler.schedule_execution(&(dummy_tx, index)).unwrap(); scheduler.wait_for_termination(false).1.return_to_pool(); } } @@ -1415,7 +2228,7 @@ mod tests { Mutex, Mutex>>, SchedulingContext, - Arc>, + Arc>, ); impl AsyncScheduler { @@ -1434,7 +2247,7 @@ mod tests { } } - impl InstalledScheduler + impl InstalledScheduler for AsyncScheduler { fn id(&self) -> SchedulerId { @@ -1445,20 +2258,24 @@ mod tests { &self.2 } - fn schedule_execution(&self, &(transaction, index): &(&SanitizedTransaction, usize)) { + fn schedule_execution( + &self, + &(transaction, index): &(&SanitizedTransaction, usize), + ) -> Result<()> { let transaction_and_index = (transaction.clone(), index); let context = self.context().clone(); let pool = self.3.clone(); - self.1.lock().unwrap().push(std::thread::spawn(move || { + self.1.lock().unwrap().push(thread::spawn(move || { // intentionally sleep to simulate race condition where register_recent_blockhash // is handle before finishing executing scheduled transactions - std::thread::sleep(std::time::Duration::from_secs(1)); + sleep(Duration::from_secs(1)); let mut result = Ok(()); let mut timings = ExecuteTimings::default(); - ::handle( + >::handle( + &DefaultTaskHandler, &mut result, &mut timings, context.bank(), @@ -1468,6 +2285,8 @@ mod tests { ); (result, timings) })); + + Ok(()) } fn wait_for_termination( @@ -1475,7 +2294,7 @@ mod tests { _is_dropped: bool, ) -> (ResultWithTimings, UninstalledSchedulerBox) { self.do_wait(); - let result_with_timings = std::mem::replace( + let result_with_timings = mem::replace( &mut *self.0.lock().unwrap(), initialized_result_with_timings(), ); @@ -1500,7 +2319,8 @@ mod tests { } } - impl SpawnableScheduler + impl + SpawnableScheduler for AsyncScheduler { // well, i wish i can use ! (never type)..... 
@@ -1515,8 +2335,9 @@ mod tests { } fn spawn( - pool: Arc>, + pool: Arc>, initial_context: SchedulingContext, + _handler: DefaultTaskHandler, ) -> Self { AsyncScheduler::( Mutex::new(initialized_result_with_timings()), @@ -1527,6 +2348,14 @@ mod tests { } } + impl RetirableSchedulerInner + for AsyncScheduler + { + fn retire_if_stale(&mut self) -> bool { + unimplemented!(); + } + } + fn do_test_scheduler_schedule_execution_recent_blockhash_edge_case< const TRIGGER_RACE_CONDITION: bool, >() { @@ -1556,24 +2385,24 @@ mod tests { ); } let bank = setup_dummy_fork_graph(bank); - let context = SchedulingContext::new(bank.clone()); + let context = SchedulingContext::new(SchedulingMode::BlockVerification, bank.clone()); let ignored_prioritization_fee_cache = Arc::new(PrioritizationFeeCache::new(0u64)); - let pool = - SchedulerPool::, DefaultTaskHandler>::new_dyn( - None, - None, - None, - None, - ignored_prioritization_fee_cache, - ); + let pool = SchedulerPool::, _, _>::new_dyn( + None, + None, + None, + None, + ignored_prioritization_fee_cache, + ); let scheduler = pool.take_scheduler(context); let bank = BankWithScheduler::new(bank, Some(scheduler)); assert_eq!(bank.transaction_count(), 0); // schedule but not immediately execute transaction - bank.schedule_transaction_executions([(&very_old_valid_tx, &0)].into_iter()); + bank.schedule_transaction_executions([(&very_old_valid_tx, &0)].into_iter()) + .unwrap(); // this calls register_recent_blockhash internally bank.fill_bank_with_ticks_for_tests(); @@ -1648,7 +2477,15 @@ mod tests { prioritization_fee_cache, }; - DefaultTaskHandler::handle(result, timings, bank, tx, 0, handler_context); + >::handle( + &DefaultTaskHandler, + result, + timings, + bank, + tx, + 0, + handler_context, + ); assert_matches!(result, Err(TransactionError::AccountLoadedTwice)); } } diff --git a/validator/src/cli.rs b/validator/src/cli.rs index 0eed324a9a9d0c..948c32f0e6cb3f 100644 --- a/validator/src/cli.rs +++ b/validator/src/cli.rs @@ -1514,7 +1514,6 @@ pub fn app<'a>(version: &'a str, default_args: &'a DefaultArgs) -> App<'a, 'a> { .arg( Arg::with_name("block_verification_method") .long("block-verification-method") - .hidden(hidden_unless_forced()) .value_name("METHOD") .takes_value(true) .possible_values(BlockVerificationMethod::cli_names()) @@ -1531,7 +1530,6 @@ pub fn app<'a>(version: &'a str, default_args: &'a DefaultArgs) -> App<'a, 'a> { .arg( Arg::with_name("unified_scheduler_handler_threads") .long("unified-scheduler-handler-threads") - .hidden(hidden_unless_forced()) .value_name("COUNT") .takes_value(true) .validator(|s| is_within_range(s, 1..))