Skip to content

Commit

Permalink
[syncd] Add event name to timer watchdog (sonic-net#868)
Browse files Browse the repository at this point in the history
When watchdog thread will exceed 30sec it will print currently executing event like create/remove/get/set. This data can be also found in sairedis.rec, but this PR will also print this in syslog for convenience.
  • Loading branch information
kcudnik authored Aug 6, 2021
1 parent 3830b97 commit 73714ec
Show file tree
Hide file tree
Showing 7 changed files with 103 additions and 20 deletions.
1 change: 1 addition & 0 deletions syncd/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ endif

noinst_LIBRARIES = libSyncd.a libSyncdRequestShutdown.a
libSyncd_a_SOURCES = \
WatchdogScope.cpp \
SaiSwitchInterface.cpp \
ZeroMQSelectableChannel.cpp \
RedisSelectableChannel.cpp \
Expand Down
29 changes: 17 additions & 12 deletions syncd/Syncd.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
#include "RedisSelectableChannel.h"
#include "ZeroMQSelectableChannel.h"
#include "PerformanceIntervalTimer.h"
#include "TimerWatchdog.h"
#include "WatchdogScope.h"

#include "sairediscommon.h"

Expand Down Expand Up @@ -50,7 +50,8 @@ Syncd::Syncd(
m_asicInitViewMode(false), // by default we are in APPLY view mode
m_vendorSai(vendorSai),
m_veryFirstRun(false),
m_enableSyncMode(false)
m_enableSyncMode(false),
m_timerWatchdog(30 * 1000000) // watch for executions over 30 seconds
{
SWSS_LOG_ENTER();

Expand Down Expand Up @@ -317,6 +318,8 @@ sai_status_t Syncd::processSingleEvent(
return SAI_STATUS_SUCCESS;
}

WatchdogScope ws(m_timerWatchdog, op + ":" + key);

if (op == REDIS_ASIC_STATE_COMMAND_CREATE)
return processQuadEvent(SAI_COMMON_API_CREATE, kco);

Expand Down Expand Up @@ -1956,6 +1959,8 @@ void Syncd::processFlexCounterGroupEvent( // TODO must be moved to go via ASIC c
auto& op = kfvOp(kco);
auto& values = kfvFieldsValues(kco);

WatchdogScope ws(m_timerWatchdog, op + ":" + groupName);

if (op == SET_COMMAND)
{
m_manager->addCounterPlugin(groupName, values);
Expand Down Expand Up @@ -1984,6 +1989,8 @@ void Syncd::processFlexCounterEvent( // TODO must be moved to go via ASIC channe
auto& key = kfvKey(kco);
auto& op = kfvOp(kco);

WatchdogScope ws(m_timerWatchdog, op + ":" + key);

auto delimiter = key.find_first_of(":");

if (delimiter == std::string::npos)
Expand Down Expand Up @@ -4372,20 +4379,16 @@ void Syncd::run()
runMainLoop = false;
}

TimerWatchdog twd(30 * 1000000); // watch for executions over 30 seconds
m_timerWatchdog.setCallback(timerWatchdogCallback);

twd.setCallback(timerWatchdogCallback);

while(runMainLoop)
while (runMainLoop)
{
try
{
swss::Selectable *sel = NULL;

int result = s->select(&sel);

twd.setStartTime();

if (sel == m_restartQuery.get())
{
/*
Expand All @@ -4405,6 +4408,8 @@ void Syncd::run()

SWSS_LOG_NOTICE("drained queue");

WatchdogScope ws(m_timerWatchdog, "restart query");

shutdownType = handleRestartQuery(*m_restartQuery);

if (shutdownType != SYNCD_RESTART_TYPE_PRE_SHUTDOWN)
Expand Down Expand Up @@ -4474,8 +4479,6 @@ void Syncd::run()
{
SWSS_LOG_ERROR("select failed: %d", result);
}

twd.setEndTime();
}
catch(const std::exception &e)
{
Expand All @@ -4492,11 +4495,11 @@ void Syncd::run()

// make sure that if second exception will arise, then we break the loop
m_commandLineOptions->m_disableExitSleep = true;

twd.setEndTime();
}
}

WatchdogScope ws(m_timerWatchdog, "shutting down syncd");

if (shutdownType == SYNCD_RESTART_TYPE_WARM)
{
const char *warmBootWriteFile = profileGetValue(0, SAI_KEY_WARM_BOOT_WRITE_FILE);
Expand Down Expand Up @@ -4572,6 +4575,8 @@ syncd_restart_type_t Syncd::handleRestartQuery(

restartQuery.pop(op, data, values);

m_timerWatchdog.setEventName(op + ":" + data);

SWSS_LOG_NOTICE("received %s switch shutdown event", op.c_str());

return RequestShutdownCommandLineOptions::stringToRestartType(op);
Expand Down
3 changes: 3 additions & 0 deletions syncd/Syncd.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#include "BreakConfig.h"
#include "NotificationProducerBase.h"
#include "SelectableChannel.h"
#include "TimerWatchdog.h"

#include "meta/SaiAttributeList.h"

Expand Down Expand Up @@ -489,5 +490,7 @@ namespace syncd
std::shared_ptr<sairedis::ContextConfig> m_contextConfig;

std::shared_ptr<BreakConfig> m_breakConfig;

TimerWatchdog m_timerWatchdog;
};
}
39 changes: 31 additions & 8 deletions syncd/TimerWatchdog.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@

#include <chrono>

#define MUTEX std::lock_guard<std::mutex> _lock(m_mutex);

using namespace syncd;

TimerWatchdog::TimerWatchdog(
Expand Down Expand Up @@ -35,30 +37,54 @@ TimerWatchdog::~TimerWatchdog()

void TimerWatchdog::setStartTime()
{
MUTEX;
SWSS_LOG_ENTER();

do
{
m_startTimestamp = getTimeSinceEpoch();
}
while (m_startTimestamp <= m_endTimestamp); // make sure new start time is always past last end time

}

void TimerWatchdog::setEndTime()
{
MUTEX;
SWSS_LOG_ENTER();

do
{
m_endTimestamp = getTimeSinceEpoch();
}
while (m_endTimestamp <= m_startTimestamp); // make sure new end time is always past last start time

auto span = m_endTimestamp - m_startTimestamp;

if (span > m_warnTimespan)
{
SWSS_LOG_ERROR("event '%s' took %ld ms to execute", m_eventName.c_str(), span/1000);
}
else if (span > 50*1000)
{
SWSS_LOG_WARN("event '%s' took %ld ms to execute", m_eventName.c_str(), span/1000);
}

m_eventName = "";
}

void TimerWatchdog::setEventName(
_In_ const std::string& eventName)
{
MUTEX;
SWSS_LOG_ENTER();

m_eventName = eventName;
}

void TimerWatchdog::setCallback(
_In_ std::function<void(uint64_t)> callback)
{
MUTEX;
SWSS_LOG_ENTER();

m_callback = callback;
Expand All @@ -74,6 +100,8 @@ void TimerWatchdog::threadFunction()
{
std::this_thread::sleep_for(std::chrono::seconds(1));

MUTEX; // to protect event name

// we make local copies, since executing functions can be so fast that
// when we will read second time start timestamp it can be different
// than previous one
Expand All @@ -94,7 +122,7 @@ void TimerWatchdog::threadFunction()

span = now - start; // this must be always non negative

SWSS_LOG_NOTICE("time span %ld ms", span/1000);
SWSS_LOG_NOTICE("time span %ld ms for '%s'", span/1000, m_eventName.c_str()); // TODO remove this

if (span < 0)
SWSS_LOG_THROW("negative span 'now - start': %ld - %ld", now, start);
Expand All @@ -105,12 +133,7 @@ void TimerWatchdog::threadFunction()

// function probably hanged

SWSS_LOG_WARN("timespan WD exceeded %ld ms", span/1000);

auto callback = m_callback;

if (callback)
callback(span);
SWSS_LOG_ERROR("time span WD exceeded %ld ms for %s", span/1000, m_eventName.c_str());
}

continue;
Expand Down
8 changes: 8 additions & 0 deletions syncd/TimerWatchdog.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#include <thread>
#include <atomic>
#include <functional>
#include <mutex>

namespace syncd
{
Expand All @@ -23,6 +24,9 @@ namespace syncd

void setEndTime();

void setEventName(
_In_ const std::string& eventName);

void setCallback(
_In_ std::function<void(uint64_t)> callback);

Expand Down Expand Up @@ -51,5 +55,9 @@ namespace syncd
std::shared_ptr<std::thread> m_thread;

std::function<void(uint64_t)> m_callback;

std::string m_eventName;

std::mutex m_mutex;
};
}
22 changes: 22 additions & 0 deletions syncd/WatchdogScope.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#include "WatchdogScope.h"

using namespace syncd;

WatchdogScope::WatchdogScope(
_In_ TimerWatchdog& tw,
_In_ const std::string eventName):
m_tw(tw)
{
// SWSS_LOG_ENTER(); // disabled

m_tw.setStartTime();

m_tw.setEventName(eventName);
}

WatchdogScope::~WatchdogScope()
{
// SWSS_LOG_ENTER(); // disabled

m_tw.setEndTime();
}
21 changes: 21 additions & 0 deletions syncd/WatchdogScope.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#pragma once

#include "TimerWatchdog.h"

namespace syncd
{
class WatchdogScope
{
public:

WatchdogScope(
_In_ TimerWatchdog& tw,
_In_ const std::string eventName);

~WatchdogScope();

private:

TimerWatchdog& m_tw;
};
}

0 comments on commit 73714ec

Please sign in to comment.