envoyproxy · jmarantz · Mar 27, 2019 · Mar 22, 2019 · Mar 23, 2019 · Mar 23, 2019
diff --git a/include/envoy/event/dispatcher.h b/include/envoy/event/dispatcher.h
@@ -157,7 +157,14 @@ class Dispatcher {
    *              called) or non-blocking mode where only active events will be executed and then
    *              run() will return.
    */
-  enum class RunType { Block, NonBlock };
+  enum class RunType {
+    Block,       // Executes any events that have been activated, then exit.
+    NonBlock,    // Waits for any pending events to activate, executes them,
+                 // then exits. Exits immediately if there are no pending or
+                 // active events.
+    RunUntilExit // Runs the event-loop until loopExit() is called, blocking
+                 // until there are pending or active events.
+  };
   virtual void run(RunType type) PURE;
 
   /**

diff --git a/source/common/event/BUILD b/source/common/event/BUILD
@@ -98,6 +98,7 @@ envoy_cc_library(
     deps = [
         ":libevent_lib",
         ":timer_lib",
+        "//include/envoy/event:dispatcher_interface",
         "//include/envoy/event:timer_interface",
         "//source/common/common:assert_lib",
     ],

diff --git a/source/common/event/dispatcher_impl.cc b/source/common/event/dispatcher_impl.cc
@@ -164,12 +164,7 @@ void DispatcherImpl::run(RunType type) {
   // not guarantee that events are run in any particular order. So even if we post() and call
   // event_base_once() before some other event, the other event might get called first.
   runPostCallbacks();
-
-  if (type == RunType::NonBlock) {
-    base_scheduler_.nonBlockingLoop();
-  } else {
-    base_scheduler_.blockingLoop();
-  }
+  base_scheduler_.run(type);
 }
 
 void DispatcherImpl::runPostCallbacks() {

diff --git a/source/common/event/libevent_scheduler.cc b/source/common/event/libevent_scheduler.cc
@@ -15,21 +15,30 @@ TimerPtr LibeventScheduler::createTimer(const TimerCb& cb) {
   return std::make_unique<TimerImpl>(libevent_, cb);
 };
 
-void LibeventScheduler::nonBlockingLoop() {
+void LibeventScheduler::run(Dispatcher::RunType mode) {
+  int flag = 0;
+  switch (mode) {
+  case Dispatcher::RunType::NonBlock:
+    flag = EVLOOP_NONBLOCK;
 #ifdef WIN32
-  // On Windows, EVLOOP_NONBLOCK will cause the libevent event_base_loop to run forever.
-  // This is because libevent only supports level triggering on Windows, and so the write
-  // event callbacks will trigger every time through the loop. Adding EVLOOP_ONCE ensures the
-  // loop will run at most once
-  const int flag = EVLOOP_NONBLOCK | EVLOOP_ONCE;
-#else
-  const int flag = EVLOOP_NONBLOCK;
+    // On Windows, EVLOOP_NONBLOCK will cause the libevent event_base_loop to run forever.
+    // This is because libevent only supports level triggering on Windows, and so the write
+    // event callbacks will trigger every time through the loop. Adding EVLOOP_ONCE ensures the
+    // loop will run at most once
+    flag |= EVLOOP_NONBLOCK | EVLOOP_ONCE;
 #endif
+    break;
+  case Dispatcher::RunType::Block:
+    // The default flags have 'block' behavior. See
+    // http://www.wangafu.net/~nickm/libevent-book/Ref3_eventloop.html
+    break;
+  case Dispatcher::RunType::RunUntilExit:
+    flag = EVLOOP_NO_EXIT_ON_EMPTY;
+    break;
+  }
   event_base_loop(libevent_.get(), flag);
 }
 
-void LibeventScheduler::blockingLoop() { event_base_loop(libevent_.get(), 0); }
-
 void LibeventScheduler::loopExit() { event_base_loopexit(libevent_.get(), nullptr); }
 
 } // namespace Event

diff --git a/source/common/event/libevent_scheduler.h b/source/common/event/libevent_scheduler.h
@@ -1,5 +1,6 @@
 #pragma once
 
+#include "envoy/event/dispatcher.h"
 #include "envoy/event/timer.h"
 
 #include "common/event/libevent.h"
@@ -18,14 +19,11 @@ class LibeventScheduler : public Scheduler {
   TimerPtr createTimer(const TimerCb& cb) override;
 
   /**
-   * Runs the libevent loop once, without blocking.
-   */
-  void nonBlockingLoop();
-
-  /**
-   * Runs the libevent loop once, with block.
+   * Runs the event loop.
+   *
+   * @param mode The mode in which to run the event loop.
    */
-  void blockingLoop();
+  void run(Dispatcher::RunType mode);
 
   /**
    * Exits the libevent loop.

diff --git a/source/server/guarddog_impl.cc b/source/server/guarddog_impl.cc
@@ -17,10 +17,10 @@ namespace Envoy {
 namespace Server {
 
 GuardDogImpl::GuardDogImpl(Stats::Scope& stats_scope, const Server::Configuration::Main& config,
-                           Api::Api& api)
-    : time_source_(api.timeSource()), miss_timeout_(config.wdMissTimeout()),
-      megamiss_timeout_(config.wdMegaMissTimeout()), kill_timeout_(config.wdKillTimeout()),
-      multi_kill_timeout_(config.wdMultiKillTimeout()),
+                           Api::Api& api, std::unique_ptr<TestInterlockHook>&& test_interlock)
+    : test_interlock_hook_(std::move(test_interlock)), time_source_(api.timeSource()),
+      miss_timeout_(config.wdMissTimeout()), megamiss_timeout_(config.wdMegaMissTimeout()),
+      kill_timeout_(config.wdKillTimeout()), multi_kill_timeout_(config.wdMultiKillTimeout()),
       loop_interval_([&]() -> std::chrono::milliseconds {
         // The loop interval is simply the minimum of all specified intervals,
         // but we must account for the 0=disabled case. This lambda takes care
@@ -32,15 +32,28 @@ GuardDogImpl::GuardDogImpl(Stats::Scope& stats_scope, const Server::Configuratio
       }()),
       watchdog_miss_counter_(stats_scope.counter("server.watchdog_miss")),
       watchdog_megamiss_counter_(stats_scope.counter("server.watchdog_mega_miss")),
-      run_thread_(true) {
+      dispatcher_(api.allocateDispatcher()),
+      loop_timer_(dispatcher_->createTimer([this]() { step(); })), run_thread_(true) {
   start(api);
 }
 
+GuardDogImpl::GuardDogImpl(Stats::Scope& stats_scope, const Server::Configuration::Main& config,
+                           Api::Api& api)
+    : GuardDogImpl(stats_scope, config, api, std::make_unique<TestInterlockHook>()) {}
+
 GuardDogImpl::~GuardDogImpl() { stop(); }
 
-void GuardDogImpl::threadRoutine() {
-  do {
-    const auto now = time_source_.monotonicTime();
+void GuardDogImpl::step() {
+  {
+    Thread::LockGuard guard(mutex_);
+    if (!run_thread_) {
+      return;
+    }
+  }
+
+  const auto now = time_source_.monotonicTime();
+
+  {
     bool seen_one_multi_timeout(false);
     Thread::LockGuard guard(wd_lock_);
     for (auto& watched_dog : watched_dogs_) {
@@ -79,7 +92,15 @@ void GuardDogImpl::threadRoutine() {
         }
       }
     }
-  } while (waitOrDetectStop());
+  }
+
+  {
+    Thread::LockGuard guard(mutex_);
+    test_interlock_hook_->signalFromImpl(now);
+    if (run_thread_) {
+      loop_timer_->enableTimer(loop_interval_);
+    }
+  }
 }
 
 WatchDogSharedPtr GuardDogImpl::createWatchDog(Thread::ThreadIdPtr&& thread_id) {
@@ -111,41 +132,19 @@ void GuardDogImpl::stopWatching(WatchDogSharedPtr wd) {
   }
 }
 
-bool GuardDogImpl::waitOrDetectStop() {
-  force_checked_event_.notifyAll();
-  Thread::LockGuard guard(exit_lock_);
-  // Spurious wakeups are OK without explicit handling. We'll just check
-  // earlier than strictly required for that round.
-
-  // Preferably, we should be calling
-  //   time_system_.waitFor(exit_lock_, exit_event_, loop_interval_);
-  // here, but that makes GuardDogMissTest.* very flaky. The reason that
-  // directly calling condvar waitFor works is that it doesn't advance
-  // simulated time, which the test is carefully controlling.
-  //
-  // One alternative approach that would be easier to test is to use a private
-  // dispatcher and a TimerCB to execute the loop body of threadRoutine(). In
-  // this manner, the same dynamics would occur in production, with added
-  // overhead from libevent, But then the unit-test would purely control the
-  // advancement of time, and thus be more robust. Another variation would be
-  // to run this watchdog on the main-thread dispatcher, though such an approach
-  // could not detect when the main-thread was stuck.
-  exit_event_.waitFor(exit_lock_, loop_interval_); // NO_CHECK_FORMAT(real_time)
-
-  return run_thread_;
-}
-
 void GuardDogImpl::start(Api::Api& api) {
-  run_thread_ = true;
-  thread_ = api.threadFactory().createThread([this]() -> void { threadRoutine(); });
+  Thread::LockGuard guard(mutex_);
+  thread_ = api.threadFactory().createThread(
+      [this]() -> void { dispatcher_->run(Event::Dispatcher::RunType::RunUntilExit); });
+  loop_timer_->enableTimer(std::chrono::milliseconds(0));
 }
 
 void GuardDogImpl::stop() {
   {
-    Thread::LockGuard guard(exit_lock_);
+    Thread::LockGuard guard(mutex_);
     run_thread_ = false;
-    exit_event_.notifyAll();
   }
+  dispatcher_->exit();
   if (thread_) {
     thread_->join();
     thread_.reset();

diff --git a/source/server/guarddog_impl.h b/source/server/guarddog_impl.h
@@ -32,37 +32,67 @@ namespace Server {
  */
 class GuardDogImpl : public GuardDog {
 public:
+  /**
+   * Defines a test interlock hook to enable tests to synchronize the guard-dog
+   * execution so they can probe current counter values. The default
+   * implementation that runs in production has empty methods, which are
+   * overridden in the implementation used during tests.
+   */
+  class TestInterlockHook {
+  public:
+    virtual ~TestInterlockHook() = default;
+
+    /**
+     * Called from GuardDogImpl to indicate that it has evaluated all watch-dogs
+     * up to a particular point in time.
+     */
+    virtual void signalFromImpl(MonotonicTime) {}
+
+    /**
+     * Called from GuardDog tests to block until the implementation has reached
+     * the desired point in time.
+     */
+    virtual void waitFromTest(Thread::MutexBasicLockable&, MonotonicTime) {}
+  };
+
   /**
    * @param stats_scope Statistics scope to write watchdog_miss and
    * watchdog_mega_miss events into.
    * @param config Configuration object.
+   * @param api API object.
+   * @param test_interlock a hook for enabling interlock with unit tests.
    *
    * See the configuration documentation for details on the timeout settings.
    */
+  GuardDogImpl(Stats::Scope& stats_scope, const Server::Configuration::Main& config, Api::Api& api,
+               std::unique_ptr<TestInterlockHook>&& test_interlock);
   GuardDogImpl(Stats::Scope& stats_scope, const Server::Configuration::Main& config, Api::Api& api);
   ~GuardDogImpl();
 
   /**
    * Exposed for testing purposes only (but harmless to call):
    */
   int loopIntervalForTest() const { return loop_interval_.count(); }
+
+  /**
+   * Test hook to force a step() to catch up with the current simulated
+   * time. This is inlined so that it does not need to be present in the
+   * production binary.
+   */
   void forceCheckForTest() {
-    exit_event_.notifyAll();
-    Thread::LockGuard guard(exit_lock_);
-    force_checked_event_.wait(exit_lock_);
+    Thread::LockGuard guard(mutex_);
+    MonotonicTime now = time_source_.monotonicTime();
+    loop_timer_->enableTimer(std::chrono::milliseconds(0));
+    test_interlock_hook_->waitFromTest(mutex_, now);
   }
 
   // Server::GuardDog
   WatchDogSharedPtr createWatchDog(Thread::ThreadIdPtr&& thread_id) override;
   void stopWatching(WatchDogSharedPtr wd) override;
 
 private:
-  void threadRoutine();
-  /**
-   * @return True if we should continue, false if signalled to stop.
-   */
-  bool waitOrDetectStop();
-  void start(Api::Api& api) EXCLUSIVE_LOCKS_REQUIRED(exit_lock_);
+  void start(Api::Api& api);
+  void step();
   void stop();
   // Per the C++ standard it is OK to use these in ctor initializer as long as
   // it is after kill and multikill timeout values are initialized.
@@ -76,6 +106,7 @@ class GuardDogImpl : public GuardDog {
     bool megamiss_alerted_{};
   };
 
+  std::unique_ptr<TestInterlockHook> test_interlock_hook_;
   TimeSource& time_source_;
   const std::chrono::milliseconds miss_timeout_;
   const std::chrono::milliseconds megamiss_timeout_;
@@ -87,10 +118,10 @@ class GuardDogImpl : public GuardDog {
   std::vector<WatchedDog> watched_dogs_ GUARDED_BY(wd_lock_);
   Thread::MutexBasicLockable wd_lock_;
   Thread::ThreadPtr thread_;
-  Thread::MutexBasicLockable exit_lock_;
-  Thread::CondVar exit_event_;
-  bool run_thread_ GUARDED_BY(exit_lock_);
-  Thread::CondVar force_checked_event_;
+  Event::DispatcherPtr dispatcher_;
+  Event::TimerPtr loop_timer_;
+  Thread::MutexBasicLockable mutex_;
+  bool run_thread_ GUARDED_BY(mutex_);
 };
 
 } // namespace Server

diff --git a/test/server/BUILD b/test/server/BUILD
@@ -96,6 +96,7 @@ envoy_cc_test(
     deps = [
         "//include/envoy/common:time_interface",
         "//source/common/api:api_lib",
+        "//source/common/common:macros",
         "//source/common/common:utility_lib",
         "//source/common/stats:stats_lib",
         "//source/server:guarddog_lib",