From 23adc1d3cfa34f387fa4c5973252f54b4ca53a2c Mon Sep 17 00:00:00 2001 From: AvhiMaz Date: Wed, 12 Nov 2025 16:53:12 +0530 Subject: [PATCH 1/4] fix: re-enable test_scheduler_drop_idle by resolving race condition MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove #[ignore] attribute and fix timing to eliminate race condition in test_scheduler_drop_idle, which was disabled in PR #8278. The test verifies the scheduler pool's cleaner thread correctly removes idle schedulers while preserving recently-pooled ones. Root cause: The original test used a 100ms idle threshold with 1000ms sleep, but timing was still unreliable due to system variations. The race occurred when old_scheduler and new_scheduler had unclear age differences. Fix: - Use explicit 300ms idle threshold for this test (instead of 100ms) - Sleep 350ms before returning new_scheduler (provides 50ms+ safety margin) - This guarantees old_scheduler is idle while new_scheduler is not Result: - old_scheduler: 350ms old (> 300ms threshold) → definitely idle → removed - new_scheduler: ~0ms old (< 300ms threshold) → definitely not idle → kept - Test is now deterministic and matches expected checkpoint sequence Fixes #8279 Signed-off-by: AvhiMaz --- unified-scheduler-pool/src/lib.rs | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/unified-scheduler-pool/src/lib.rs b/unified-scheduler-pool/src/lib.rs index 49926bdd53b0da..a94d448a2a9f76 100644 --- a/unified-scheduler-pool/src/lib.rs +++ b/unified-scheduler-pool/src/lib.rs @@ -2872,7 +2872,6 @@ mod tests { const SHORTENED_MAX_POOLING_DURATION: Duration = Duration::from_millis(100); #[test] - #[ignore] fn test_scheduler_drop_idle() { agave_logger::setup(); @@ -2884,6 +2883,9 @@ mod tests { ]); let ignored_prioritization_fee_cache = Arc::new(PrioritizationFeeCache::new(0u64)); + // Use a longer pooling duration for this test to avoid race conditions. + // old_scheduler and new_scheduler need to have a clear age difference. + let test_max_pooling_duration = Duration::from_millis(300); let pool_raw = DefaultSchedulerPool::do_new( None, None, @@ -2891,7 +2893,7 @@ mod tests { None, ignored_prioritization_fee_cache, SHORTENED_POOL_CLEANER_INTERVAL, - SHORTENED_MAX_POOLING_DURATION, + test_max_pooling_duration, DEFAULT_MAX_USAGE_QUEUE_COUNT, DEFAULT_TIMEOUT_DURATION, ); @@ -2905,8 +2907,10 @@ mod tests { let new_scheduler_id = new_scheduler.id(); Box::new(old_scheduler.into_inner().1).return_to_pool(); - // sleepless_testing can't be used; wait a bit here to see real progress of wall time... - sleep(SHORTENED_MAX_POOLING_DURATION * 10); + // Wait for old_scheduler to be considered idle by the cleaner (> 300ms old). + // This ensures when the cleaner runs, old_scheduler will definitely be idle. + sleep(Duration::from_millis(350)); + Box::new(new_scheduler.into_inner().1).return_to_pool(); // Block solScCleaner until we see returned schedlers... @@ -2916,12 +2920,9 @@ mod tests { // See the old (= idle) scheduler gone only after solScCleaner did its job... sleepless_testing::at(&TestCheckPoint::AfterIdleSchedulerCleaned); - // The following assertion is racy. - // - // We need to make sure new_scheduler isn't treated as idle up to now since being returned - // to the pool after sleep(SHORTENED_MAX_POOLING_DURATION * 10). - // Removing only old_scheduler is the expected behavior. So, make - // SHORTENED_MAX_POOLING_DURATION rather long... + // After the cleaner finishes, only the new_scheduler should remain. + // The old_scheduler is idle (pooled > 100ms ago) so it gets removed. + // The new_scheduler was just pooled so it's not idle yet. assert_eq!(pool_raw.scheduler_inners.lock().unwrap().len(), 1); assert_eq!( pool_raw From 5781563deac55a49bc1867edcf7b7e72b4d7c881 Mon Sep 17 00:00:00 2001 From: AvhiMaz Date: Mon, 17 Nov 2025 12:51:49 +0530 Subject: [PATCH 2/4] Address all reviewer feedback on test_scheduler_drop_idle - Move SHORTENED_MAX_POOLING_DURATION constant to test_scheduler_drop_stale where it's used - Convert test_max_pooling_duration from let to const TEST_MAX_POOLING_DURATION following conventions - Create TEST_WAIT_FOR_IDLE constant (500ms) with explicit safety margin tied to TEST_MAX_POOLING_DURATION - Increase safety margin from 50ms to 200ms for CI reliability - Add detailed comments explaining 300ms pooling duration tradeoff (speed vs race condition window) - Update assertion comment with explicit timing guarantees: old_scheduler ~500ms old (exceeds 300ms threshold), new_scheduler ~50ms old (below threshold) Signed-off-by: AvhiMaz --- unified-scheduler-pool/src/lib.rs | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/unified-scheduler-pool/src/lib.rs b/unified-scheduler-pool/src/lib.rs index a94d448a2a9f76..69d067d5a04538 100644 --- a/unified-scheduler-pool/src/lib.rs +++ b/unified-scheduler-pool/src/lib.rs @@ -2869,7 +2869,6 @@ mod tests { } const SHORTENED_POOL_CLEANER_INTERVAL: Duration = Duration::from_millis(1); - const SHORTENED_MAX_POOLING_DURATION: Duration = Duration::from_millis(100); #[test] fn test_scheduler_drop_idle() { @@ -2882,10 +2881,15 @@ mod tests { &TestCheckPoint::AfterIdleSchedulerCleaned, ]); + // Use 300ms pooling duration as a balance between: + // - Keeping the test fast (as small as possible) + // - Providing a large enough window to avoid the race condition where the second + // scheduler could also be freed before we can assert only one remains + const TEST_MAX_POOLING_DURATION: Duration = Duration::from_millis(300); + // Add a generous safety margin to avoid flakiness in CI. We need old_scheduler to be + // reliably older than TEST_MAX_POOLING_DURATION when the cleaner runs. + const TEST_WAIT_FOR_IDLE: Duration = Duration::from_millis(500); let ignored_prioritization_fee_cache = Arc::new(PrioritizationFeeCache::new(0u64)); - // Use a longer pooling duration for this test to avoid race conditions. - // old_scheduler and new_scheduler need to have a clear age difference. - let test_max_pooling_duration = Duration::from_millis(300); let pool_raw = DefaultSchedulerPool::do_new( None, None, @@ -2893,7 +2897,7 @@ mod tests { None, ignored_prioritization_fee_cache, SHORTENED_POOL_CLEANER_INTERVAL, - test_max_pooling_duration, + TEST_MAX_POOLING_DURATION, DEFAULT_MAX_USAGE_QUEUE_COUNT, DEFAULT_TIMEOUT_DURATION, ); @@ -2907,9 +2911,8 @@ mod tests { let new_scheduler_id = new_scheduler.id(); Box::new(old_scheduler.into_inner().1).return_to_pool(); - // Wait for old_scheduler to be considered idle by the cleaner (> 300ms old). - // This ensures when the cleaner runs, old_scheduler will definitely be idle. - sleep(Duration::from_millis(350)); + // Wait for old_scheduler to be considered idle by the cleaner. + sleep(TEST_WAIT_FOR_IDLE); Box::new(new_scheduler.into_inner().1).return_to_pool(); @@ -2920,9 +2923,8 @@ mod tests { // See the old (= idle) scheduler gone only after solScCleaner did its job... sleepless_testing::at(&TestCheckPoint::AfterIdleSchedulerCleaned); - // After the cleaner finishes, only the new_scheduler should remain. - // The old_scheduler is idle (pooled > 100ms ago) so it gets removed. - // The new_scheduler was just pooled so it's not idle yet. + // Only new_scheduler should remain. old_scheduler (~500ms old) exceeds the + // 300ms idle threshold, while new_scheduler (~50ms old) does not. assert_eq!(pool_raw.scheduler_inners.lock().unwrap().len(), 1); assert_eq!( pool_raw @@ -3013,6 +3015,7 @@ mod tests { #[test] fn test_scheduler_drop_stale() { + const SHORTENED_MAX_POOLING_DURATION: Duration = Duration::from_millis(100); agave_logger::setup(); let _progress = sleepless_testing::setup(&[ From af5aaaa1ba11e448872fe40ad81845a811972053 Mon Sep 17 00:00:00 2001 From: AvhiMaz Date: Mon, 17 Nov 2025 23:58:09 +0530 Subject: [PATCH 3/4] refactor: tie test duration constants together and improve comment accuracy Signed-off-by: AvhiMaz --- unified-scheduler-pool/src/lib.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/unified-scheduler-pool/src/lib.rs b/unified-scheduler-pool/src/lib.rs index 69d067d5a04538..6fe33857e6475e 100644 --- a/unified-scheduler-pool/src/lib.rs +++ b/unified-scheduler-pool/src/lib.rs @@ -2885,10 +2885,10 @@ mod tests { // - Keeping the test fast (as small as possible) // - Providing a large enough window to avoid the race condition where the second // scheduler could also be freed before we can assert only one remains - const TEST_MAX_POOLING_DURATION: Duration = Duration::from_millis(300); - // Add a generous safety margin to avoid flakiness in CI. We need old_scheduler to be - // reliably older than TEST_MAX_POOLING_DURATION when the cleaner runs. - const TEST_WAIT_FOR_IDLE: Duration = Duration::from_millis(500); + const TEST_MAX_POOLING_DURATION_MS: u64 = 300; + const TEST_MAX_POOLING_DURATION: Duration = Duration::from_millis(TEST_MAX_POOLING_DURATION_MS); + const TEST_WAIT_FOR_IDLE_MS: u64 = TEST_MAX_POOLING_DURATION_MS + 200; + const TEST_WAIT_FOR_IDLE: Duration = Duration::from_millis(TEST_WAIT_FOR_IDLE_MS); let ignored_prioritization_fee_cache = Arc::new(PrioritizationFeeCache::new(0u64)); let pool_raw = DefaultSchedulerPool::do_new( None, @@ -2923,8 +2923,8 @@ mod tests { // See the old (= idle) scheduler gone only after solScCleaner did its job... sleepless_testing::at(&TestCheckPoint::AfterIdleSchedulerCleaned); - // Only new_scheduler should remain. old_scheduler (~500ms old) exceeds the - // 300ms idle threshold, while new_scheduler (~50ms old) does not. + // Only new_scheduler should remain. old_scheduler exceeds the + // TEST_MAX_POOLING_DURATION idle threshold, while new_scheduler does not. assert_eq!(pool_raw.scheduler_inners.lock().unwrap().len(), 1); assert_eq!( pool_raw From a037011751b2ac2478dfc12268bddcf0bae294a0 Mon Sep 17 00:00:00 2001 From: AvhiMaz Date: Wed, 19 Nov 2025 13:23:51 +0530 Subject: [PATCH 4/4] fix: cargo fmt Signed-off-by: AvhiMaz --- unified-scheduler-pool/src/lib.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/unified-scheduler-pool/src/lib.rs b/unified-scheduler-pool/src/lib.rs index 6fe33857e6475e..3e8f6532983812 100644 --- a/unified-scheduler-pool/src/lib.rs +++ b/unified-scheduler-pool/src/lib.rs @@ -2886,7 +2886,8 @@ mod tests { // - Providing a large enough window to avoid the race condition where the second // scheduler could also be freed before we can assert only one remains const TEST_MAX_POOLING_DURATION_MS: u64 = 300; - const TEST_MAX_POOLING_DURATION: Duration = Duration::from_millis(TEST_MAX_POOLING_DURATION_MS); + const TEST_MAX_POOLING_DURATION: Duration = + Duration::from_millis(TEST_MAX_POOLING_DURATION_MS); const TEST_WAIT_FOR_IDLE_MS: u64 = TEST_MAX_POOLING_DURATION_MS + 200; const TEST_WAIT_FOR_IDLE: Duration = Duration::from_millis(TEST_WAIT_FOR_IDLE_MS); let ignored_prioritization_fee_cache = Arc::new(PrioritizationFeeCache::new(0u64));