diff --git a/include/envoy/event/dispatcher.h b/include/envoy/event/dispatcher.h index a23872326c271..1e0b52a10f270 100644 --- a/include/envoy/event/dispatcher.h +++ b/include/envoy/event/dispatcher.h @@ -157,7 +157,14 @@ class Dispatcher { * called) or non-blocking mode where only active events will be executed and then * run() will return. */ - enum class RunType { Block, NonBlock }; + enum class RunType { + Block, // Executes any events that have been activated, then exit. + NonBlock, // Waits for any pending events to activate, executes them, + // then exits. Exits immediately if there are no pending or + // active events. + RunUntilExit // Runs the event-loop until loopExit() is called, blocking + // until there are pending or active events. + }; virtual void run(RunType type) PURE; /** diff --git a/source/common/event/BUILD b/source/common/event/BUILD index a07cf826d7349..478fc28eb4c87 100644 --- a/source/common/event/BUILD +++ b/source/common/event/BUILD @@ -98,6 +98,7 @@ envoy_cc_library( deps = [ ":libevent_lib", ":timer_lib", + "//include/envoy/event:dispatcher_interface", "//include/envoy/event:timer_interface", "//source/common/common:assert_lib", ], diff --git a/source/common/event/dispatcher_impl.cc b/source/common/event/dispatcher_impl.cc index 995e551b83519..8e737de4de17f 100644 --- a/source/common/event/dispatcher_impl.cc +++ b/source/common/event/dispatcher_impl.cc @@ -164,12 +164,7 @@ void DispatcherImpl::run(RunType type) { // not guarantee that events are run in any particular order. So even if we post() and call // event_base_once() before some other event, the other event might get called first. runPostCallbacks(); - - if (type == RunType::NonBlock) { - base_scheduler_.nonBlockingLoop(); - } else { - base_scheduler_.blockingLoop(); - } + base_scheduler_.run(type); } void DispatcherImpl::runPostCallbacks() { diff --git a/source/common/event/libevent_scheduler.cc b/source/common/event/libevent_scheduler.cc index ab28dc2a6f246..5b35ffd18447e 100644 --- a/source/common/event/libevent_scheduler.cc +++ b/source/common/event/libevent_scheduler.cc @@ -15,21 +15,30 @@ TimerPtr LibeventScheduler::createTimer(const TimerCb& cb) { return std::make_unique(libevent_, cb); }; -void LibeventScheduler::nonBlockingLoop() { +void LibeventScheduler::run(Dispatcher::RunType mode) { + int flag = 0; + switch (mode) { + case Dispatcher::RunType::NonBlock: + flag = EVLOOP_NONBLOCK; #ifdef WIN32 - // On Windows, EVLOOP_NONBLOCK will cause the libevent event_base_loop to run forever. - // This is because libevent only supports level triggering on Windows, and so the write - // event callbacks will trigger every time through the loop. Adding EVLOOP_ONCE ensures the - // loop will run at most once - const int flag = EVLOOP_NONBLOCK | EVLOOP_ONCE; -#else - const int flag = EVLOOP_NONBLOCK; + // On Windows, EVLOOP_NONBLOCK will cause the libevent event_base_loop to run forever. + // This is because libevent only supports level triggering on Windows, and so the write + // event callbacks will trigger every time through the loop. Adding EVLOOP_ONCE ensures the + // loop will run at most once + flag |= EVLOOP_NONBLOCK | EVLOOP_ONCE; #endif + break; + case Dispatcher::RunType::Block: + // The default flags have 'block' behavior. See + // http://www.wangafu.net/~nickm/libevent-book/Ref3_eventloop.html + break; + case Dispatcher::RunType::RunUntilExit: + flag = EVLOOP_NO_EXIT_ON_EMPTY; + break; + } event_base_loop(libevent_.get(), flag); } -void LibeventScheduler::blockingLoop() { event_base_loop(libevent_.get(), 0); } - void LibeventScheduler::loopExit() { event_base_loopexit(libevent_.get(), nullptr); } } // namespace Event diff --git a/source/common/event/libevent_scheduler.h b/source/common/event/libevent_scheduler.h index b15dc26ef0551..5a41e1ccf6c4f 100644 --- a/source/common/event/libevent_scheduler.h +++ b/source/common/event/libevent_scheduler.h @@ -1,5 +1,6 @@ #pragma once +#include "envoy/event/dispatcher.h" #include "envoy/event/timer.h" #include "common/event/libevent.h" @@ -18,14 +19,11 @@ class LibeventScheduler : public Scheduler { TimerPtr createTimer(const TimerCb& cb) override; /** - * Runs the libevent loop once, without blocking. - */ - void nonBlockingLoop(); - - /** - * Runs the libevent loop once, with block. + * Runs the event loop. + * + * @param mode The mode in which to run the event loop. */ - void blockingLoop(); + void run(Dispatcher::RunType mode); /** * Exits the libevent loop. diff --git a/source/server/guarddog_impl.cc b/source/server/guarddog_impl.cc index ae75f884afd0d..67b55ca008401 100644 --- a/source/server/guarddog_impl.cc +++ b/source/server/guarddog_impl.cc @@ -17,10 +17,10 @@ namespace Envoy { namespace Server { GuardDogImpl::GuardDogImpl(Stats::Scope& stats_scope, const Server::Configuration::Main& config, - Api::Api& api) - : time_source_(api.timeSource()), miss_timeout_(config.wdMissTimeout()), - megamiss_timeout_(config.wdMegaMissTimeout()), kill_timeout_(config.wdKillTimeout()), - multi_kill_timeout_(config.wdMultiKillTimeout()), + Api::Api& api, std::unique_ptr&& test_interlock) + : test_interlock_hook_(std::move(test_interlock)), time_source_(api.timeSource()), + miss_timeout_(config.wdMissTimeout()), megamiss_timeout_(config.wdMegaMissTimeout()), + kill_timeout_(config.wdKillTimeout()), multi_kill_timeout_(config.wdMultiKillTimeout()), loop_interval_([&]() -> std::chrono::milliseconds { // The loop interval is simply the minimum of all specified intervals, // but we must account for the 0=disabled case. This lambda takes care @@ -32,15 +32,28 @@ GuardDogImpl::GuardDogImpl(Stats::Scope& stats_scope, const Server::Configuratio }()), watchdog_miss_counter_(stats_scope.counter("server.watchdog_miss")), watchdog_megamiss_counter_(stats_scope.counter("server.watchdog_mega_miss")), - run_thread_(true) { + dispatcher_(api.allocateDispatcher()), + loop_timer_(dispatcher_->createTimer([this]() { step(); })), run_thread_(true) { start(api); } +GuardDogImpl::GuardDogImpl(Stats::Scope& stats_scope, const Server::Configuration::Main& config, + Api::Api& api) + : GuardDogImpl(stats_scope, config, api, std::make_unique()) {} + GuardDogImpl::~GuardDogImpl() { stop(); } -void GuardDogImpl::threadRoutine() { - do { - const auto now = time_source_.monotonicTime(); +void GuardDogImpl::step() { + { + Thread::LockGuard guard(mutex_); + if (!run_thread_) { + return; + } + } + + const auto now = time_source_.monotonicTime(); + + { bool seen_one_multi_timeout(false); Thread::LockGuard guard(wd_lock_); for (auto& watched_dog : watched_dogs_) { @@ -79,7 +92,15 @@ void GuardDogImpl::threadRoutine() { } } } - } while (waitOrDetectStop()); + } + + { + Thread::LockGuard guard(mutex_); + test_interlock_hook_->signalFromImpl(now); + if (run_thread_) { + loop_timer_->enableTimer(loop_interval_); + } + } } WatchDogSharedPtr GuardDogImpl::createWatchDog(Thread::ThreadIdPtr&& thread_id) { @@ -111,41 +132,19 @@ void GuardDogImpl::stopWatching(WatchDogSharedPtr wd) { } } -bool GuardDogImpl::waitOrDetectStop() { - force_checked_event_.notifyAll(); - Thread::LockGuard guard(exit_lock_); - // Spurious wakeups are OK without explicit handling. We'll just check - // earlier than strictly required for that round. - - // Preferably, we should be calling - // time_system_.waitFor(exit_lock_, exit_event_, loop_interval_); - // here, but that makes GuardDogMissTest.* very flaky. The reason that - // directly calling condvar waitFor works is that it doesn't advance - // simulated time, which the test is carefully controlling. - // - // One alternative approach that would be easier to test is to use a private - // dispatcher and a TimerCB to execute the loop body of threadRoutine(). In - // this manner, the same dynamics would occur in production, with added - // overhead from libevent, But then the unit-test would purely control the - // advancement of time, and thus be more robust. Another variation would be - // to run this watchdog on the main-thread dispatcher, though such an approach - // could not detect when the main-thread was stuck. - exit_event_.waitFor(exit_lock_, loop_interval_); // NO_CHECK_FORMAT(real_time) - - return run_thread_; -} - void GuardDogImpl::start(Api::Api& api) { - run_thread_ = true; - thread_ = api.threadFactory().createThread([this]() -> void { threadRoutine(); }); + Thread::LockGuard guard(mutex_); + thread_ = api.threadFactory().createThread( + [this]() -> void { dispatcher_->run(Event::Dispatcher::RunType::RunUntilExit); }); + loop_timer_->enableTimer(std::chrono::milliseconds(0)); } void GuardDogImpl::stop() { { - Thread::LockGuard guard(exit_lock_); + Thread::LockGuard guard(mutex_); run_thread_ = false; - exit_event_.notifyAll(); } + dispatcher_->exit(); if (thread_) { thread_->join(); thread_.reset(); diff --git a/source/server/guarddog_impl.h b/source/server/guarddog_impl.h index c01cbda1bbc57..7f07ba898ce71 100644 --- a/source/server/guarddog_impl.h +++ b/source/server/guarddog_impl.h @@ -32,13 +32,40 @@ namespace Server { */ class GuardDogImpl : public GuardDog { public: + /** + * Defines a test interlock hook to enable tests to synchronize the guard-dog + * execution so they can probe current counter values. The default + * implementation that runs in production has empty methods, which are + * overridden in the implementation used during tests. + */ + class TestInterlockHook { + public: + virtual ~TestInterlockHook() = default; + + /** + * Called from GuardDogImpl to indicate that it has evaluated all watch-dogs + * up to a particular point in time. + */ + virtual void signalFromImpl(MonotonicTime) {} + + /** + * Called from GuardDog tests to block until the implementation has reached + * the desired point in time. + */ + virtual void waitFromTest(Thread::MutexBasicLockable&, MonotonicTime) {} + }; + /** * @param stats_scope Statistics scope to write watchdog_miss and * watchdog_mega_miss events into. * @param config Configuration object. + * @param api API object. + * @param test_interlock a hook for enabling interlock with unit tests. * * See the configuration documentation for details on the timeout settings. */ + GuardDogImpl(Stats::Scope& stats_scope, const Server::Configuration::Main& config, Api::Api& api, + std::unique_ptr&& test_interlock); GuardDogImpl(Stats::Scope& stats_scope, const Server::Configuration::Main& config, Api::Api& api); ~GuardDogImpl(); @@ -46,10 +73,17 @@ class GuardDogImpl : public GuardDog { * Exposed for testing purposes only (but harmless to call): */ int loopIntervalForTest() const { return loop_interval_.count(); } + + /** + * Test hook to force a step() to catch up with the current simulated + * time. This is inlined so that it does not need to be present in the + * production binary. + */ void forceCheckForTest() { - exit_event_.notifyAll(); - Thread::LockGuard guard(exit_lock_); - force_checked_event_.wait(exit_lock_); + Thread::LockGuard guard(mutex_); + MonotonicTime now = time_source_.monotonicTime(); + loop_timer_->enableTimer(std::chrono::milliseconds(0)); + test_interlock_hook_->waitFromTest(mutex_, now); } // Server::GuardDog @@ -57,12 +91,8 @@ class GuardDogImpl : public GuardDog { void stopWatching(WatchDogSharedPtr wd) override; private: - void threadRoutine(); - /** - * @return True if we should continue, false if signalled to stop. - */ - bool waitOrDetectStop(); - void start(Api::Api& api) EXCLUSIVE_LOCKS_REQUIRED(exit_lock_); + void start(Api::Api& api); + void step(); void stop(); // Per the C++ standard it is OK to use these in ctor initializer as long as // it is after kill and multikill timeout values are initialized. @@ -76,6 +106,7 @@ class GuardDogImpl : public GuardDog { bool megamiss_alerted_{}; }; + std::unique_ptr test_interlock_hook_; TimeSource& time_source_; const std::chrono::milliseconds miss_timeout_; const std::chrono::milliseconds megamiss_timeout_; @@ -87,10 +118,10 @@ class GuardDogImpl : public GuardDog { std::vector watched_dogs_ GUARDED_BY(wd_lock_); Thread::MutexBasicLockable wd_lock_; Thread::ThreadPtr thread_; - Thread::MutexBasicLockable exit_lock_; - Thread::CondVar exit_event_; - bool run_thread_ GUARDED_BY(exit_lock_); - Thread::CondVar force_checked_event_; + Event::DispatcherPtr dispatcher_; + Event::TimerPtr loop_timer_; + Thread::MutexBasicLockable mutex_; + bool run_thread_ GUARDED_BY(mutex_); }; } // namespace Server diff --git a/test/server/BUILD b/test/server/BUILD index 1c64006680419..7256645a22b35 100644 --- a/test/server/BUILD +++ b/test/server/BUILD @@ -96,6 +96,7 @@ envoy_cc_test( deps = [ "//include/envoy/common:time_interface", "//source/common/api:api_lib", + "//source/common/common:macros", "//source/common/common:utility_lib", "//source/common/stats:stats_lib", "//source/server:guarddog_lib", diff --git a/test/server/guarddog_impl_test.cc b/test/server/guarddog_impl_test.cc index 3e3f2ccd0ff75..2a398ed31530b 100644 --- a/test/server/guarddog_impl_test.cc +++ b/test/server/guarddog_impl_test.cc @@ -5,6 +5,7 @@ #include "envoy/common/time.h" #include "common/api/api_impl.h" +#include "common/common/macros.h" #include "common/common/utility.h" #include "server/guarddog_impl.h" @@ -13,6 +14,7 @@ #include "test/mocks/server/mocks.h" #include "test/mocks/stats/mocks.h" #include "test/test_common/simulated_time_system.h" +#include "test/test_common/test_time.h" #include "test/test_common/utility.h" #include "gmock/gmock.h" @@ -25,15 +27,58 @@ namespace Envoy { namespace Server { namespace { -class GuardDogTestBase : public testing::Test { +class DebugTestInterlock : public GuardDogImpl::TestInterlockHook { +public: + // GuardDogImpl::TestInterlockHook + virtual void signalFromImpl(MonotonicTime time) { + impl_reached_ = time; + impl_.notifyAll(); + } + + virtual void waitFromTest(Thread::MutexBasicLockable& mutex, MonotonicTime time) + EXCLUSIVE_LOCKS_REQUIRED(mutex) { + while (impl_reached_ < time) { + impl_.wait(mutex); + } + } + +private: + Thread::CondVar impl_; + MonotonicTime impl_reached_; +}; + +// We want to make sure guard-dog is tested with both simulated time and real +// time, to ensure that it works in production, and that it works in the context +// of integration tests which are much easier to control with simulated time. +enum class TimeSystemType { Real, Simulated }; + +class GuardDogTestBase : public testing::TestWithParam { protected: - GuardDogTestBase() : api_(Api::createApiForTest(stats_store_, time_system_)) {} + GuardDogTestBase() + : time_system_(makeTimeSystem()), api_(Api::createApiForTest(stats_store_, *time_system_)) {} + + static std::unique_ptr makeTimeSystem() { + if (GetParam() == TimeSystemType::Real) { + return std::make_unique(); + } + ASSERT(GetParam() == TimeSystemType::Simulated); + return std::make_unique(); + } + + void initGuardDog(Stats::Scope& stats_scope, const Server::Configuration::Main& config) { + guard_dog_ = std::make_unique(stats_scope, config, *api_, + std::make_unique()); + } - Event::SimulatedTimeSystem time_system_; + std::unique_ptr time_system_; Stats::IsolatedStoreImpl stats_store_; Api::ApiPtr api_; + std::unique_ptr guard_dog_; }; +INSTANTIATE_TEST_SUITE_P(TimeSystemType, GuardDogTestBase, + testing::ValuesIn({TimeSystemType::Real, TimeSystemType::Simulated})); + /** * Death test caveat: Because of the way we die gcov doesn't receive coverage * information from the forked process that is checked for successful death. @@ -52,10 +97,10 @@ class GuardDogDeathTest : public GuardDogTestBase { */ void SetupForDeath() { InSequence s; - guard_dog_ = std::make_unique(fakestats_, config_kill_, *api_); + initGuardDog(fakestats_, config_kill_); unpet_dog_ = guard_dog_->createWatchDog(api_->threadFactory().currentThreadId()); guard_dog_->forceCheckForTest(); - time_system_.sleep(std::chrono::milliseconds(500)); + time_system_->sleep(std::chrono::milliseconds(99)); // 1 ms shy of death. } /** @@ -64,18 +109,17 @@ class GuardDogDeathTest : public GuardDogTestBase { */ void SetupForMultiDeath() { InSequence s; - guard_dog_ = std::make_unique(fakestats_, config_multikill_, *api_); + initGuardDog(fakestats_, config_multikill_); auto unpet_dog_ = guard_dog_->createWatchDog(api_->threadFactory().currentThreadId()); guard_dog_->forceCheckForTest(); auto second_dog_ = guard_dog_->createWatchDog(api_->threadFactory().currentThreadId()); guard_dog_->forceCheckForTest(); - time_system_.sleep(std::chrono::milliseconds(501)); + time_system_->sleep(std::chrono::milliseconds(499)); // 1 ms shy of multi-death. } NiceMock config_kill_; NiceMock config_multikill_; NiceMock fakestats_; - std::unique_ptr guard_dog_; WatchDogSharedPtr unpet_dog_; WatchDogSharedPtr second_dog_; }; @@ -84,54 +128,57 @@ class GuardDogDeathTest : public GuardDogTestBase { // a different name. class GuardDogAlmostDeadTest : public GuardDogDeathTest {}; -TEST_F(GuardDogDeathTest, KillDeathTest) { +TEST_P(GuardDogDeathTest, KillDeathTest) { // Is it German for "The Function"? Almost... auto die_function = [&]() -> void { SetupForDeath(); + time_system_->sleep(std::chrono::milliseconds(401)); // 400 ms past death. guard_dog_->forceCheckForTest(); }; + // Why do it this way? Any threads must be started inside the death test // statement and this is the easiest way to accomplish that. EXPECT_DEATH(die_function(), ""); } -TEST_F(GuardDogAlmostDeadTest, KillNoFinalCheckTest) { - // This does everything the death test does except the final force check that - // should actually result in dying. The death test does not verify that there +TEST_P(GuardDogAlmostDeadTest, KillNoFinalCheckTest) { + // This does everything the death test does, except allow enough time to + // expire to reach the death panic. The death test does not verify that there // was not a crash *before* the expected line, so this test checks that. SetupForDeath(); } -TEST_F(GuardDogDeathTest, MultiKillDeathTest) { +TEST_P(GuardDogDeathTest, MultiKillDeathTest) { auto die_function = [&]() -> void { SetupForMultiDeath(); + time_system_->sleep(std::chrono::milliseconds(2)); // 1 ms past multi-death. guard_dog_->forceCheckForTest(); }; EXPECT_DEATH(die_function(), ""); } -TEST_F(GuardDogAlmostDeadTest, MultiKillNoFinalCheckTest) { - // This does everything the death test does except the final force check that +TEST_P(GuardDogAlmostDeadTest, MultiKillNoFinalCheckTest) { + // This does everything the death test does not except the final force check that // should actually result in dying. The death test does not verify that there // was not a crash *before* the expected line, so this test checks that. SetupForMultiDeath(); } -TEST_F(GuardDogAlmostDeadTest, NearDeathTest) { +TEST_P(GuardDogAlmostDeadTest, NearDeathTest) { // This ensures that if only one thread surpasses the multiple kill threshold // there is no death. The positive case is covered in MultiKillDeathTest. InSequence s; - GuardDogImpl gd(fakestats_, config_multikill_, *api_); - auto unpet_dog = gd.createWatchDog(api_->threadFactory().currentThreadId()); - auto pet_dog = gd.createWatchDog(api_->threadFactory().currentThreadId()); + initGuardDog(fakestats_, config_multikill_); + auto unpet_dog = guard_dog_->createWatchDog(api_->threadFactory().currentThreadId()); + auto pet_dog = guard_dog_->createWatchDog(api_->threadFactory().currentThreadId()); // This part "waits" 600 milliseconds while one dog is touched every 100, and // the other is not. 600ms is over the threshold of 500ms for multi-kill but // only one is nonresponsive, so there should be no kill (single kill // threshold of 1s is not reached). for (int i = 0; i < 6; i++) { - time_system_.sleep(std::chrono::milliseconds(100)); + time_system_->sleep(std::chrono::milliseconds(100)); pet_dog->touch(); - gd.forceCheckForTest(); + guard_dog_->forceCheckForTest(); } } @@ -143,113 +190,113 @@ class GuardDogMissTest : public GuardDogTestBase { NiceMock config_mega_; }; -TEST_F(GuardDogMissTest, MissTest) { +TEST_P(GuardDogMissTest, MissTest) { // This test checks the actual collected statistics after doing some timer // advances that should and shouldn't increment the counters. - GuardDogImpl gd(stats_store_, config_miss_, *api_); + initGuardDog(stats_store_, config_miss_); // We'd better start at 0: EXPECT_EQ(0UL, stats_store_.counter("server.watchdog_miss").value()); - auto unpet_dog = gd.createWatchDog(api_->threadFactory().currentThreadId()); + auto unpet_dog = guard_dog_->createWatchDog(api_->threadFactory().currentThreadId()); // At 300ms we shouldn't have hit the timeout yet: - time_system_.sleep(std::chrono::milliseconds(300)); - gd.forceCheckForTest(); + time_system_->sleep(std::chrono::milliseconds(300)); + guard_dog_->forceCheckForTest(); EXPECT_EQ(0UL, stats_store_.counter("server.watchdog_miss").value()); // This should push it past the 500ms limit: - time_system_.sleep(std::chrono::milliseconds(250)); - gd.forceCheckForTest(); + time_system_->sleep(std::chrono::milliseconds(250)); + guard_dog_->forceCheckForTest(); EXPECT_EQ(1UL, stats_store_.counter("server.watchdog_miss").value()); - gd.stopWatching(unpet_dog); + guard_dog_->stopWatching(unpet_dog); unpet_dog = nullptr; } -TEST_F(GuardDogMissTest, MegaMissTest) { +TEST_P(GuardDogMissTest, MegaMissTest) { // This test checks the actual collected statistics after doing some timer // advances that should and shouldn't increment the counters. - GuardDogImpl gd(stats_store_, config_mega_, *api_); - auto unpet_dog = gd.createWatchDog(api_->threadFactory().currentThreadId()); + initGuardDog(stats_store_, config_mega_); + auto unpet_dog = guard_dog_->createWatchDog(api_->threadFactory().currentThreadId()); // We'd better start at 0: EXPECT_EQ(0UL, stats_store_.counter("server.watchdog_mega_miss").value()); // This shouldn't be enough to increment the stat: - time_system_.sleep(std::chrono::milliseconds(499)); - gd.forceCheckForTest(); + time_system_->sleep(std::chrono::milliseconds(499)); + guard_dog_->forceCheckForTest(); EXPECT_EQ(0UL, stats_store_.counter("server.watchdog_mega_miss").value()); // Just 2ms more will make it greater than 500ms timeout: - time_system_.sleep(std::chrono::milliseconds(2)); - gd.forceCheckForTest(); + time_system_->sleep(std::chrono::milliseconds(2)); + guard_dog_->forceCheckForTest(); EXPECT_EQ(1UL, stats_store_.counter("server.watchdog_mega_miss").value()); - gd.stopWatching(unpet_dog); + guard_dog_->stopWatching(unpet_dog); unpet_dog = nullptr; } -TEST_F(GuardDogMissTest, MissCountTest) { +TEST_P(GuardDogMissTest, MissCountTest) { // This tests a flake discovered in the MissTest where real timeout or // spurious condition_variable wakeup causes the counter to get incremented // more than it should be. - GuardDogImpl gd(stats_store_, config_miss_, *api_); - auto sometimes_pet_dog = gd.createWatchDog(api_->threadFactory().currentThreadId()); + initGuardDog(stats_store_, config_miss_); + auto sometimes_pet_dog = guard_dog_->createWatchDog(api_->threadFactory().currentThreadId()); // These steps are executed once without ever touching the watchdog. // Then the last step is to touch the watchdog and repeat the steps. // This verifies that the behavior is reset back to baseline after a touch. for (unsigned long i = 0; i < 2; i++) { EXPECT_EQ(i, stats_store_.counter("server.watchdog_miss").value()); // This shouldn't be enough to increment the stat: - time_system_.sleep(std::chrono::milliseconds(499)); - gd.forceCheckForTest(); + time_system_->sleep(std::chrono::milliseconds(499)); + guard_dog_->forceCheckForTest(); EXPECT_EQ(i, stats_store_.counter("server.watchdog_miss").value()); // And if we force re-execution of the loop it still shouldn't be: - gd.forceCheckForTest(); + guard_dog_->forceCheckForTest(); EXPECT_EQ(i, stats_store_.counter("server.watchdog_miss").value()); // Just 2ms more will make it greater than 500ms timeout: - time_system_.sleep(std::chrono::milliseconds(2)); - gd.forceCheckForTest(); + time_system_->sleep(std::chrono::milliseconds(2)); + guard_dog_->forceCheckForTest(); EXPECT_EQ(i + 1, stats_store_.counter("server.watchdog_miss").value()); // Spurious wakeup, we should still only have one miss counted. - gd.forceCheckForTest(); + guard_dog_->forceCheckForTest(); EXPECT_EQ(i + 1, stats_store_.counter("server.watchdog_miss").value()); // When we finally touch the dog we should get one more increment once the // timeout value expires: sometimes_pet_dog->touch(); } - time_system_.sleep(std::chrono::milliseconds(1000)); + time_system_->sleep(std::chrono::milliseconds(1000)); sometimes_pet_dog->touch(); // Make sure megamiss still works: EXPECT_EQ(0UL, stats_store_.counter("server.watchdog_mega_miss").value()); - time_system_.sleep(std::chrono::milliseconds(1500)); - gd.forceCheckForTest(); + time_system_->sleep(std::chrono::milliseconds(1500)); + guard_dog_->forceCheckForTest(); EXPECT_EQ(1UL, stats_store_.counter("server.watchdog_mega_miss").value()); - gd.stopWatching(sometimes_pet_dog); + guard_dog_->stopWatching(sometimes_pet_dog); sometimes_pet_dog = nullptr; } -TEST_F(GuardDogTestBase, StartStopTest) { +TEST_P(GuardDogTestBase, StartStopTest) { NiceMock stats; NiceMock config(0, 0, 0, 0); - GuardDogImpl gd(stats, config, *api_); + initGuardDog(stats, config); } -TEST_F(GuardDogTestBase, LoopIntervalNoKillTest) { +TEST_P(GuardDogTestBase, LoopIntervalNoKillTest) { NiceMock stats; NiceMock config(40, 50, 0, 0); - GuardDogImpl gd(stats, config, *api_); - EXPECT_EQ(gd.loopIntervalForTest(), 40); + initGuardDog(stats, config); + EXPECT_EQ(guard_dog_->loopIntervalForTest(), 40); } -TEST_F(GuardDogTestBase, LoopIntervalTest) { +TEST_P(GuardDogTestBase, LoopIntervalTest) { NiceMock stats; NiceMock config(100, 90, 1000, 500); - GuardDogImpl gd(stats, config, *api_); - EXPECT_EQ(gd.loopIntervalForTest(), 90); + initGuardDog(stats, config); + EXPECT_EQ(guard_dog_->loopIntervalForTest(), 90); } -TEST_F(GuardDogTestBase, WatchDogThreadIdTest) { +TEST_P(GuardDogTestBase, WatchDogThreadIdTest) { NiceMock stats; NiceMock config(100, 90, 1000, 500); - GuardDogImpl gd(stats, config, *api_); - auto watched_dog = gd.createWatchDog(api_->threadFactory().currentThreadId()); + initGuardDog(stats, config); + auto watched_dog = guard_dog_->createWatchDog(api_->threadFactory().currentThreadId()); EXPECT_EQ(watched_dog->threadId().debugString(), api_->threadFactory().currentThreadId()->debugString()); - gd.stopWatching(watched_dog); + guard_dog_->stopWatching(watched_dog); } // If this test fails it is because the std::chrono::steady_clock::duration type has become @@ -258,7 +305,7 @@ TEST_F(GuardDogTestBase, WatchDogThreadIdTest) { // // The WatchDog/GuardDog relies on this being a lock free atomic for perf reasons so some workaround // will be required if this test starts failing. -TEST_F(GuardDogTestBase, AtomicIsAtomicTest) { +TEST_P(GuardDogTestBase, AtomicIsAtomicTest) { std::atomic atomic_time; ASSERT_EQ(atomic_time.is_lock_free(), true); } diff --git a/test/test_common/simulated_time_system.cc b/test/test_common/simulated_time_system.cc index 00d239479d148..6727634b5f8d5 100644 --- a/test/test_common/simulated_time_system.cc +++ b/test/test_common/simulated_time_system.cc @@ -25,44 +25,64 @@ class SimulatedTimeSystemHelper::Alarm : public Timer { // Timer void disableTimer() override; void enableTimer(const std::chrono::milliseconds& duration) override; - bool enabled() override { return armed_; } + bool enabled() override { + Thread::LockGuard lock(time_system_.mutex_); + return armed_; + } + + void disableTimerLockHeld() EXCLUSIVE_LOCKS_REQUIRED(time_system_.mutex_); - void setTime(MonotonicTime time) { time_ = time; } + void setTimeLockHeld(MonotonicTime time) EXCLUSIVE_LOCKS_REQUIRED(time_system_.mutex_) { + time_ = time; + } /** * Activates the timer so it will be run the next time the libevent loop is run, * typically via Dispatcher::run(). */ - void activate() { + void activateLockHeld() EXCLUSIVE_LOCKS_REQUIRED(time_system_.mutex_) { + ASSERT(armed_); armed_ = false; - std::chrono::milliseconds duration = std::chrono::milliseconds::zero(); time_system_.incPending(); + + // We don't want to activate the alarm under lock, as it will make a libevent call, + // and libevent itself uses locks: + // https://github.com/libevent/libevent/blob/29cc8386a2f7911eaa9336692a2c5544d8b4734f/event.c#L1917 + time_system_.mutex_.unlock(); + std::chrono::milliseconds duration = std::chrono::milliseconds::zero(); base_timer_->enableTimer(duration); + time_system_.mutex_.lock(); } - MonotonicTime time() const { + MonotonicTime time() const EXCLUSIVE_LOCKS_REQUIRED(time_system_.mutex_) { ASSERT(armed_); return time_; } + SimulatedTimeSystemHelper& timeSystem() { return time_system_; } uint64_t index() const { return index_; } private: + friend SimulatedTimeSystemHelper::CompareAlarms; + void runAlarm(TimerCb cb) { - time_system_.decPending(); + // Capture time_system_ in a local in case the alarm gets deleted in the callback. + SimulatedTimeSystemHelper& time_system = time_system_; cb(); + time_system.decPending(); } TimerPtr base_timer_; SimulatedTimeSystemHelper& time_system_; - MonotonicTime time_; - uint64_t index_; - bool armed_; + MonotonicTime time_ GUARDED_BY(time_system_.mutex_); + const uint64_t index_; + bool armed_ GUARDED_BY(time_system_.mutex_); }; // Compare two alarms, based on wakeup time and insertion order. Returns true if // a comes before b. -bool SimulatedTimeSystemHelper::CompareAlarms::operator()(const Alarm* a, const Alarm* b) const { +bool SimulatedTimeSystemHelper::CompareAlarms::operator()(const Alarm* a, const Alarm* b) const + EXCLUSIVE_LOCKS_REQUIRED(a->time_system_.mutex_, b->time_system_.mutex_) { if (a != b) { if (a->time() < b->time()) { return true; @@ -97,20 +117,26 @@ SimulatedTimeSystemHelper::Alarm::Alarm::~Alarm() { } void SimulatedTimeSystemHelper::Alarm::Alarm::disableTimer() { + Thread::LockGuard lock(time_system_.mutex_); + disableTimerLockHeld(); +} + +void SimulatedTimeSystemHelper::Alarm::Alarm::disableTimerLockHeld() { if (armed_) { - time_system_.removeAlarm(this); + time_system_.removeAlarmLockHeld(this); armed_ = false; } } void SimulatedTimeSystemHelper::Alarm::Alarm::enableTimer( const std::chrono::milliseconds& duration) { - disableTimer(); + Thread::LockGuard lock(time_system_.mutex_); + disableTimerLockHeld(); armed_ = true; if (duration.count() == 0) { - activate(); + activateLockHeld(); } else { - time_system_.addAlarm(this, duration); + time_system_.addAlarmLockHeld(this, duration); } } @@ -152,9 +178,9 @@ void SimulatedTimeSystemHelper::sleep(const Duration& duration) { setMonotonicTimeAndUnlock(monotonic_time); } -Thread::CondVar::WaitStatus SimulatedTimeSystemHelper::waitFor(Thread::MutexBasicLockable& mutex, - Thread::CondVar& condvar, - const Duration& duration) noexcept { +Thread::CondVar::WaitStatus SimulatedTimeSystemHelper::waitFor( + Thread::MutexBasicLockable& mutex, Thread::CondVar& condvar, + const Duration& duration) noexcept EXCLUSIVE_LOCKS_REQUIRED(mutex) { const Duration real_time_poll_delay( std::min(std::chrono::duration_cast(std::chrono::milliseconds(50)), duration)); const MonotonicTime end_time = monotonicTime() + duration; @@ -177,7 +203,8 @@ Thread::CondVar::WaitStatus SimulatedTimeSystemHelper::waitFor(Thread::MutexBasi setMonotonicTimeAndUnlock(end_time); } else { // If there's another alarm pending, sleep forward to it. - MonotonicTime next_wakeup = (*alarms_.begin())->time(); + Alarm* alarm = (*alarms_.begin()); + MonotonicTime next_wakeup = alarmTimeLockHeld(alarm); setMonotonicTimeAndUnlock(std::min(next_wakeup, end_time)); } } else { @@ -189,21 +216,33 @@ Thread::CondVar::WaitStatus SimulatedTimeSystemHelper::waitFor(Thread::MutexBasi return Thread::CondVar::WaitStatus::Timeout; } +MonotonicTime SimulatedTimeSystemHelper::alarmTimeLockHeld(Alarm* alarm) NO_THREAD_SAFETY_ANALYSIS { + // We disable thread-safety analysis as the compiler can't detect that + // alarm_->timeSystem() == this, so we must be holding the right mutex. + ASSERT(&(alarm->timeSystem()) == this); + return alarm->time(); +} + +void SimulatedTimeSystemHelper::alarmActivateLockHeld(Alarm* alarm) NO_THREAD_SAFETY_ANALYSIS { + // We disable thread-safety analysis as the compiler can't detect that + // alarm_->timeSystem() == this, so we must be holding the right mutex. + ASSERT(&(alarm->timeSystem()) == this); + alarm->activateLockHeld(); +} + int64_t SimulatedTimeSystemHelper::nextIndex() { Thread::LockGuard lock(mutex_); return index_++; } -void SimulatedTimeSystemHelper::addAlarm(Alarm* alarm, const std::chrono::milliseconds& duration) { - Thread::LockGuard lock(mutex_); - alarm->setTime(monotonic_time_ + duration); +void SimulatedTimeSystemHelper::addAlarmLockHeld( + Alarm* alarm, const std::chrono::milliseconds& duration) NO_THREAD_SAFETY_ANALYSIS { + ASSERT(&(alarm->timeSystem()) == this); + alarm->setTimeLockHeld(monotonic_time_ + duration); alarms_.insert(alarm); } -void SimulatedTimeSystemHelper::removeAlarm(Alarm* alarm) { - Thread::LockGuard lock(mutex_); - alarms_.erase(alarm); -} +void SimulatedTimeSystemHelper::removeAlarmLockHeld(Alarm* alarm) { alarms_.erase(alarm); } SchedulerPtr SimulatedTimeSystemHelper::createScheduler(Scheduler& base_scheduler) { return std::make_unique(*this, base_scheduler); @@ -223,20 +262,16 @@ void SimulatedTimeSystemHelper::setMonotonicTimeAndUnlock(const MonotonicTime& m while (!alarms_.empty()) { AlarmSet::iterator pos = alarms_.begin(); Alarm* alarm = *pos; - if (alarm->time() > monotonic_time) { + MonotonicTime alarm_time = alarmTimeLockHeld(alarm); + if (alarm_time > monotonic_time) { break; } - ASSERT(alarm->time() >= monotonic_time_); + ASSERT(alarm_time >= monotonic_time_); system_time_ += - std::chrono::duration_cast(alarm->time() - monotonic_time_); - monotonic_time_ = alarm->time(); + std::chrono::duration_cast(alarm_time - monotonic_time_); + monotonic_time_ = alarm_time; alarms_.erase(pos); - mutex_.unlock(); - // We don't want to activate the alarm under lock, as it will make a libevent call, - // and libevent itself uses locks: - // https://github.com/libevent/libevent/blob/29cc8386a2f7911eaa9336692a2c5544d8b4734f/event.c#L1917 - alarm->activate(); - mutex_.lock(); + alarmActivateLockHeld(alarm); } system_time_ += std::chrono::duration_cast(monotonic_time - monotonic_time_); diff --git a/test/test_common/simulated_time_system.h b/test/test_common/simulated_time_system.h index 413f1485c3eb8..11ccad33a8e87 100644 --- a/test/test_common/simulated_time_system.h +++ b/test/test_common/simulated_time_system.h @@ -61,6 +61,7 @@ class SimulatedTimeSystemHelper : public TestTimeSystem { private: class SimulatedScheduler; class Alarm; + friend class Alarm; // Needed to reference mutex for thread annotations. struct CompareAlarms { bool operator()(const Alarm* a, const Alarm* b) const; }; @@ -76,13 +77,17 @@ class SimulatedTimeSystemHelper : public TestTimeSystem { */ void setMonotonicTimeAndUnlock(const MonotonicTime& monotonic_time) UNLOCK_FUNCTION(mutex_); + MonotonicTime alarmTimeLockHeld(Alarm* alarm) EXCLUSIVE_LOCKS_REQUIRED(mutex_); + void alarmActivateLockHeld(Alarm* alarm) EXCLUSIVE_LOCKS_REQUIRED(mutex_); + // The simulation keeps a unique ID for each alarm to act as a deterministic // tie-breaker for alarm-ordering. int64_t nextIndex(); // Adds/removes an alarm. - void addAlarm(Alarm*, const std::chrono::milliseconds& duration); - void removeAlarm(Alarm*); + void addAlarmLockHeld(Alarm*, const std::chrono::milliseconds& duration) + EXCLUSIVE_LOCKS_REQUIRED(mutex_); + void removeAlarmLockHeld(Alarm*) EXCLUSIVE_LOCKS_REQUIRED(mutex_); // Keeps track of how many alarms have been activated but not yet called, // which helps waitFor() determine when to give up and declare a timeout. diff --git a/test/test_common/simulated_time_system_test.cc b/test/test_common/simulated_time_system_test.cc index 84091269d45d1..4f1fdf6023b0c 100644 --- a/test/test_common/simulated_time_system_test.cc +++ b/test/test_common/simulated_time_system_test.cc @@ -33,12 +33,12 @@ class SimulatedTimeSystemTest : public testing::Test { void sleepMsAndLoop(int64_t delay_ms) { time_system_.sleep(std::chrono::milliseconds(delay_ms)); - base_scheduler_.nonBlockingLoop(); + base_scheduler_.run(Dispatcher::RunType::NonBlock); } void advanceSystemMsAndLoop(int64_t delay_ms) { time_system_.setSystemTime(time_system_.systemTime() + std::chrono::milliseconds(delay_ms)); - base_scheduler_.nonBlockingLoop(); + base_scheduler_.run(Dispatcher::RunType::NonBlock); } LibeventScheduler base_scheduler_; @@ -66,7 +66,7 @@ TEST_F(SimulatedTimeSystemTest, WaitFor) { std::atomic done(false); auto thread = Thread::threadFactoryForTest().createThread([this, &done]() { while (!done) { - base_scheduler_.blockingLoop(); + base_scheduler_.run(Dispatcher::RunType::Block); } }); Thread::CondVar condvar; diff --git a/tools/check_format.py b/tools/check_format.py index 7983dd5822aec..4be8bd4e6999b 100755 --- a/tools/check_format.py +++ b/tools/check_format.py @@ -388,7 +388,8 @@ def checkSourceLine(line, file_path, reportError): # legitimately show up in comments, for example this one. reportError("Don't use , use absl::Mutex for reader/writer locks.") if not whitelistedForRealTime(file_path) and not 'NO_CHECK_FORMAT(real_time)' in line: - if 'RealTimeSource' in line or 'RealTimeSystem' in line or \ + if 'RealTimeSource' in line or \ + ('RealTimeSystem' in line and not 'TestRealTimeSystem' in line) or \ 'std::chrono::system_clock::now' in line or 'std::chrono::steady_clock::now' in line or \ 'std::this_thread::sleep_for' in line or hasCondVarWaitFor(line): reportError("Don't reference real-world time sources from production code; use injection")