Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion include/envoy/event/dispatcher.h
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,14 @@ class Dispatcher {
* called) or non-blocking mode where only active events will be executed and then
* run() will return.
*/
enum class RunType { Block, NonBlock };
enum class RunType {
Block, // Executes any events that have been activated, then exit.
NonBlock, // Waits for any pending events to activate, executes them,
// then exits. Exits immediately if there are no pending or
// active events.
RunUntilExit // Runs the event-loop until loopExit() is called, blocking
// until there are pending or active events.
};
virtual void run(RunType type) PURE;

/**
Expand Down
1 change: 1 addition & 0 deletions source/common/event/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,7 @@ envoy_cc_library(
deps = [
":libevent_lib",
":timer_lib",
"//include/envoy/event:dispatcher_interface",
"//include/envoy/event:timer_interface",
"//source/common/common:assert_lib",
],
Expand Down
7 changes: 1 addition & 6 deletions source/common/event/dispatcher_impl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -164,12 +164,7 @@ void DispatcherImpl::run(RunType type) {
// not guarantee that events are run in any particular order. So even if we post() and call
// event_base_once() before some other event, the other event might get called first.
runPostCallbacks();

if (type == RunType::NonBlock) {
base_scheduler_.nonBlockingLoop();
} else {
base_scheduler_.blockingLoop();
}
base_scheduler_.run(type);
}

void DispatcherImpl::runPostCallbacks() {
Expand Down
29 changes: 19 additions & 10 deletions source/common/event/libevent_scheduler.cc
Original file line number Diff line number Diff line change
Expand Up @@ -15,21 +15,30 @@ TimerPtr LibeventScheduler::createTimer(const TimerCb& cb) {
return std::make_unique<TimerImpl>(libevent_, cb);
};

void LibeventScheduler::nonBlockingLoop() {
void LibeventScheduler::run(Dispatcher::RunType mode) {
int flag = 0;
switch (mode) {
case Dispatcher::RunType::NonBlock:
flag = EVLOOP_NONBLOCK;
#ifdef WIN32
// On Windows, EVLOOP_NONBLOCK will cause the libevent event_base_loop to run forever.
// This is because libevent only supports level triggering on Windows, and so the write
// event callbacks will trigger every time through the loop. Adding EVLOOP_ONCE ensures the
// loop will run at most once
const int flag = EVLOOP_NONBLOCK | EVLOOP_ONCE;
#else
const int flag = EVLOOP_NONBLOCK;
// On Windows, EVLOOP_NONBLOCK will cause the libevent event_base_loop to run forever.
// This is because libevent only supports level triggering on Windows, and so the write
// event callbacks will trigger every time through the loop. Adding EVLOOP_ONCE ensures the
// loop will run at most once
flag |= EVLOOP_NONBLOCK | EVLOOP_ONCE;
#endif
break;
case Dispatcher::RunType::Block:
// The default flags have 'block' behavior. See
// http://www.wangafu.net/~nickm/libevent-book/Ref3_eventloop.html
break;
case Dispatcher::RunType::RunUntilExit:
flag = EVLOOP_NO_EXIT_ON_EMPTY;
break;
}
event_base_loop(libevent_.get(), flag);
}

void LibeventScheduler::blockingLoop() { event_base_loop(libevent_.get(), 0); }

void LibeventScheduler::loopExit() { event_base_loopexit(libevent_.get(), nullptr); }

} // namespace Event
Expand Down
12 changes: 5 additions & 7 deletions source/common/event/libevent_scheduler.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#pragma once

#include "envoy/event/dispatcher.h"
#include "envoy/event/timer.h"

#include "common/event/libevent.h"
Expand All @@ -18,14 +19,11 @@ class LibeventScheduler : public Scheduler {
TimerPtr createTimer(const TimerCb& cb) override;

/**
* Runs the libevent loop once, without blocking.
*/
void nonBlockingLoop();

/**
* Runs the libevent loop once, with block.
* Runs the event loop.
*
* @param mode The mode in which to run the event loop.
*/
void blockingLoop();
void run(Dispatcher::RunType mode);

/**
* Exits the libevent loop.
Expand Down
73 changes: 36 additions & 37 deletions source/server/guarddog_impl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,10 @@ namespace Envoy {
namespace Server {

GuardDogImpl::GuardDogImpl(Stats::Scope& stats_scope, const Server::Configuration::Main& config,
Api::Api& api)
: time_source_(api.timeSource()), miss_timeout_(config.wdMissTimeout()),
megamiss_timeout_(config.wdMegaMissTimeout()), kill_timeout_(config.wdKillTimeout()),
multi_kill_timeout_(config.wdMultiKillTimeout()),
Api::Api& api, std::unique_ptr<TestInterlockHook>&& test_interlock)
: test_interlock_hook_(std::move(test_interlock)), time_source_(api.timeSource()),
miss_timeout_(config.wdMissTimeout()), megamiss_timeout_(config.wdMegaMissTimeout()),
kill_timeout_(config.wdKillTimeout()), multi_kill_timeout_(config.wdMultiKillTimeout()),
loop_interval_([&]() -> std::chrono::milliseconds {
// The loop interval is simply the minimum of all specified intervals,
// but we must account for the 0=disabled case. This lambda takes care
Expand All @@ -32,15 +32,28 @@ GuardDogImpl::GuardDogImpl(Stats::Scope& stats_scope, const Server::Configuratio
}()),
watchdog_miss_counter_(stats_scope.counter("server.watchdog_miss")),
watchdog_megamiss_counter_(stats_scope.counter("server.watchdog_mega_miss")),
run_thread_(true) {
dispatcher_(api.allocateDispatcher()),
loop_timer_(dispatcher_->createTimer([this]() { step(); })), run_thread_(true) {
start(api);
}

GuardDogImpl::GuardDogImpl(Stats::Scope& stats_scope, const Server::Configuration::Main& config,
Api::Api& api)
: GuardDogImpl(stats_scope, config, api, std::make_unique<TestInterlockHook>()) {}

GuardDogImpl::~GuardDogImpl() { stop(); }

void GuardDogImpl::threadRoutine() {
do {
const auto now = time_source_.monotonicTime();
void GuardDogImpl::step() {
{
Thread::LockGuard guard(mutex_);
if (!run_thread_) {
return;
}
}

const auto now = time_source_.monotonicTime();

{
bool seen_one_multi_timeout(false);
Thread::LockGuard guard(wd_lock_);
for (auto& watched_dog : watched_dogs_) {
Expand Down Expand Up @@ -79,7 +92,15 @@ void GuardDogImpl::threadRoutine() {
}
}
}
} while (waitOrDetectStop());
}

{
Thread::LockGuard guard(mutex_);
test_interlock_hook_->signalFromImpl(now);
if (run_thread_) {
loop_timer_->enableTimer(loop_interval_);
}
}
}

WatchDogSharedPtr GuardDogImpl::createWatchDog(Thread::ThreadIdPtr&& thread_id) {
Expand Down Expand Up @@ -111,41 +132,19 @@ void GuardDogImpl::stopWatching(WatchDogSharedPtr wd) {
}
}

bool GuardDogImpl::waitOrDetectStop() {
force_checked_event_.notifyAll();
Thread::LockGuard guard(exit_lock_);
// Spurious wakeups are OK without explicit handling. We'll just check
// earlier than strictly required for that round.

// Preferably, we should be calling
// time_system_.waitFor(exit_lock_, exit_event_, loop_interval_);
// here, but that makes GuardDogMissTest.* very flaky. The reason that
// directly calling condvar waitFor works is that it doesn't advance
// simulated time, which the test is carefully controlling.
//
// One alternative approach that would be easier to test is to use a private
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

note: this is exactly what I did in this PR.

// dispatcher and a TimerCB to execute the loop body of threadRoutine(). In
// this manner, the same dynamics would occur in production, with added
// overhead from libevent, But then the unit-test would purely control the
// advancement of time, and thus be more robust. Another variation would be
// to run this watchdog on the main-thread dispatcher, though such an approach
// could not detect when the main-thread was stuck.
exit_event_.waitFor(exit_lock_, loop_interval_); // NO_CHECK_FORMAT(real_time)
Copy link
Copy Markdown
Contributor Author

@jmarantz jmarantz Mar 26, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's probably worth pointing out here that this condvar-wait is entirely replaced by libevent in the new impl, via the use of a new private dispatcher owned by the guard-dog. We still need a condvar to avoid spinning over dispatcher-run while waiting for loop_interval_, but it's an untimed condvar wait, signaled by the firing of a timer.

It occurs to me that I could alternatively implement the Event::Scheduler interface with a new class CondvarScheduler which would the boil down to the exact same system-call we have here in the current version of guarddog_impl.cc. The benefit of this would be that we'd avoid carrying a dispatcher and a libevent base-ptr in the guarddog when there are no network or file-events to watch, simplifying the logic in operation. The cost is that implementing CondvarScheduler in general would add complexity to the PR, probably refactoring some of the SimulatedTimeSystem implementation of the alarm-set into something that would be used at runtime. As this functionality is already in libevent it seems simpler to re-use that.


return run_thread_;
}

void GuardDogImpl::start(Api::Api& api) {
run_thread_ = true;
thread_ = api.threadFactory().createThread([this]() -> void { threadRoutine(); });
Thread::LockGuard guard(mutex_);
thread_ = api.threadFactory().createThread(
[this]() -> void { dispatcher_->run(Event::Dispatcher::RunType::RunUntilExit); });
loop_timer_->enableTimer(std::chrono::milliseconds(0));
}

void GuardDogImpl::stop() {
{
Thread::LockGuard guard(exit_lock_);
Thread::LockGuard guard(mutex_);
run_thread_ = false;
exit_event_.notifyAll();
}
dispatcher_->exit();
if (thread_) {
thread_->join();
thread_.reset();
Expand Down
57 changes: 44 additions & 13 deletions source/server/guarddog_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,37 +32,67 @@ namespace Server {
*/
class GuardDogImpl : public GuardDog {
public:
/**
* Defines a test interlock hook to enable tests to synchronize the guard-dog
* execution so they can probe current counter values. The default
* implementation that runs in production has empty methods, which are
* overridden in the implementation used during tests.
*/
class TestInterlockHook {
public:
virtual ~TestInterlockHook() = default;

/**
* Called from GuardDogImpl to indicate that it has evaluated all watch-dogs
* up to a particular point in time.
*/
virtual void signalFromImpl(MonotonicTime) {}

/**
* Called from GuardDog tests to block until the implementation has reached
* the desired point in time.
*/
virtual void waitFromTest(Thread::MutexBasicLockable&, MonotonicTime) {}
};

/**
* @param stats_scope Statistics scope to write watchdog_miss and
* watchdog_mega_miss events into.
* @param config Configuration object.
* @param api API object.
* @param test_interlock a hook for enabling interlock with unit tests.
*
* See the configuration documentation for details on the timeout settings.
*/
GuardDogImpl(Stats::Scope& stats_scope, const Server::Configuration::Main& config, Api::Api& api,
std::unique_ptr<TestInterlockHook>&& test_interlock);
GuardDogImpl(Stats::Scope& stats_scope, const Server::Configuration::Main& config, Api::Api& api);
~GuardDogImpl();

/**
* Exposed for testing purposes only (but harmless to call):
*/
int loopIntervalForTest() const { return loop_interval_.count(); }

/**
* Test hook to force a step() to catch up with the current simulated
* time. This is inlined so that it does not need to be present in the
* production binary.
*/
void forceCheckForTest() {
exit_event_.notifyAll();
Thread::LockGuard guard(exit_lock_);
force_checked_event_.wait(exit_lock_);
Thread::LockGuard guard(mutex_);
MonotonicTime now = time_source_.monotonicTime();
loop_timer_->enableTimer(std::chrono::milliseconds(0));
test_interlock_hook_->waitFromTest(mutex_, now);
}

// Server::GuardDog
WatchDogSharedPtr createWatchDog(Thread::ThreadIdPtr&& thread_id) override;
void stopWatching(WatchDogSharedPtr wd) override;

private:
void threadRoutine();
/**
* @return True if we should continue, false if signalled to stop.
*/
bool waitOrDetectStop();
void start(Api::Api& api) EXCLUSIVE_LOCKS_REQUIRED(exit_lock_);
void start(Api::Api& api);
void step();
void stop();
// Per the C++ standard it is OK to use these in ctor initializer as long as
// it is after kill and multikill timeout values are initialized.
Expand All @@ -76,6 +106,7 @@ class GuardDogImpl : public GuardDog {
bool megamiss_alerted_{};
};

std::unique_ptr<TestInterlockHook> test_interlock_hook_;
TimeSource& time_source_;
const std::chrono::milliseconds miss_timeout_;
const std::chrono::milliseconds megamiss_timeout_;
Expand All @@ -87,10 +118,10 @@ class GuardDogImpl : public GuardDog {
std::vector<WatchedDog> watched_dogs_ GUARDED_BY(wd_lock_);
Thread::MutexBasicLockable wd_lock_;
Thread::ThreadPtr thread_;
Thread::MutexBasicLockable exit_lock_;
Thread::CondVar exit_event_;
bool run_thread_ GUARDED_BY(exit_lock_);
Thread::CondVar force_checked_event_;
Event::DispatcherPtr dispatcher_;
Event::TimerPtr loop_timer_;
Thread::MutexBasicLockable mutex_;
bool run_thread_ GUARDED_BY(mutex_);
};

} // namespace Server
Expand Down
1 change: 1 addition & 0 deletions test/server/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ envoy_cc_test(
deps = [
"//include/envoy/common:time_interface",
"//source/common/api:api_lib",
"//source/common/common:macros",
"//source/common/common:utility_lib",
"//source/common/stats:stats_lib",
"//source/server:guarddog_lib",
Expand Down
Loading