From 9fda944cd601cd931976b8fbfca8b14087f4155c Mon Sep 17 00:00:00 2001 From: Jipan Yang Date: Fri, 14 Sep 2018 18:05:58 -0700 Subject: [PATCH] Warm reboot: Add support for orchagent pre-shutdown warm-restart state check (#562) * Add orchagent pre-warm-restart check mechanism * Add orchagent_restart_check options: --noFreeze & --skipPendingTaskCheck * Add waitTime option for response from orchagent * Fix build issue with latest master * adapt to new dvs.runcmd() signature * Move standard header before local headers --- .gitignore | 1 + orchagent/Makefile.am | 6 +- orchagent/orchagent_restart_check.cpp | 145 ++++++++++++++++++++++++++ orchagent/orchdaemon.cpp | 60 +++++++++++ orchagent/orchdaemon.h | 2 + orchagent/switchorch.cpp | 56 +++++++++- orchagent/switchorch.h | 20 ++++ tests/test_warm_reboot.py | 55 +++++++++- 8 files changed, 342 insertions(+), 3 deletions(-) create mode 100644 orchagent/orchagent_restart_check.cpp diff --git a/.gitignore b/.gitignore index 34034cfe0c06..8bb5f10449ae 100644 --- a/.gitignore +++ b/.gitignore @@ -52,6 +52,7 @@ neighsyncd/neighsyncd portsyncd/portsyncd orchagent/orchagent orchagent/routeresync +orchagent/orchagent_restart_check swssconfig/swssconfig swssconfig/swssplayer tests/tests diff --git a/orchagent/Makefile.am b/orchagent/Makefile.am index 7439f394c049..97d98cd692ca 100644 --- a/orchagent/Makefile.am +++ b/orchagent/Makefile.am @@ -10,7 +10,7 @@ dist_swss_DATA = \ pfc_detect_barefoot.lua \ pfc_restore.lua -bin_PROGRAMS = orchagent routeresync +bin_PROGRAMS = orchagent routeresync orchagent_restart_check if DEBUG DBGFLAGS = -ggdb -DDEBUG @@ -86,3 +86,7 @@ routeresync_SOURCES = routeresync.cpp routeresync_CFLAGS = $(DBGFLAGS) $(AM_CFLAGS) $(CFLAGS_COMMON) routeresync_CPPFLAGS = $(DBGFLAGS) $(AM_CFLAGS) $(CFLAGS_COMMON) routeresync_LDADD = -lswsscommon + +orchagent_restart_check_SOURCES = orchagent_restart_check.cpp +orchagent_restart_check_CPPFLAGS = $(DBGFLAGS) $(AM_CPPFLAGS) $(CFLAGS_COMMON) +orchagent_restart_check_LDADD = -lhiredis -lswsscommon -lpthread diff --git a/orchagent/orchagent_restart_check.cpp b/orchagent/orchagent_restart_check.cpp new file mode 100644 index 000000000000..b3d9227a0623 --- /dev/null +++ b/orchagent/orchagent_restart_check.cpp @@ -0,0 +1,145 @@ +#include +#include + +#include +#include + +#include "notificationproducer.h" +#include "notificationconsumer.h" +#include "select.h" +#include "logger.h" + + +void printUsage() +{ + SWSS_LOG_ENTER(); + + std::cout << "Usage: orchagent_restart_check [-s] " << std::endl; + std::cout << " -n --noFreeze" << std::endl; + std::cout << " Don't freeze orchagent even if check succeeded" << std::endl; + std::cout << " -s --skipPendingTaskCheck" << std::endl; + std::cout << " Skip pending task dependency check for orchagent" << std::endl; + std::cout << " -w --waitTime" << std::endl; + std::cout << " Wait time for response from orchagent, in milliseconds. Default value: 1000" << std::endl; + std::cout << " -h --help:" << std::endl; + std::cout << " Print out this message" << std::endl; +} + + +/* + * Before stopping orchagent for warm restart, basic state check is preferred to + * ensure orchagent is not in transient state, so a deterministic state may be restored after restart. + * + * Here is to implement orchagent_restart_check binary which may talk to orchagent and + * ask it to do self-check, return "READY " signal and freeze if everything is ok, + * otherwise "NOT_READY" signal should be returned. + * + * Optionally: + * if --noFreeze option is provided, orchagent won't freeze. + * if --skipPendingTaskCheck option is provided, orchagent won't use + * whether there is pending task existing as state check criterion. + */ +int main(int argc, char **argv) +{ + swss::Logger::getInstance().setMinPrio(swss::Logger::SWSS_INFO); + SWSS_LOG_ENTER(); + + std::string skipPendingTaskCheck = "fasle"; + std::string noFreeze = "fasle"; + /* Default wait time is 1000 millisecond */ + int waitTime = 1000; + + const char* const optstring = "nsw:"; + while(true) + { + static struct option long_options[] = + { + { "noFreeze", no_argument, 0, 'n' }, + { "skipPendingTaskCheck", no_argument, 0, 's' }, + { "waitTime", required_argument, 0, 'w' } + }; + + int option_index = 0; + + int c = getopt_long(argc, argv, optstring, long_options, &option_index); + + if (c == -1) + { + break; + } + + switch (c) + { + case 'n': + SWSS_LOG_NOTICE("Won't freeze orchagent even if check succeeded"); + noFreeze = "true"; + break; + case 's': + SWSS_LOG_NOTICE("Skipping pending task check for orchagent"); + skipPendingTaskCheck = "true"; + break; + case 'w': + SWSS_LOG_NOTICE("Wait time for response from orchagent set to %s milliseconds", optarg); + waitTime = atoi(optarg); + break; + case 'h': + printUsage(); + exit(EXIT_SUCCESS); + + case '?': + SWSS_LOG_WARN("unknown option %c", optopt); + printUsage(); + exit(EXIT_FAILURE); + + default: + SWSS_LOG_ERROR("getopt_long failure"); + exit(EXIT_FAILURE); + } + } + + swss::DBConnector db(APPL_DB, swss::DBConnector::DEFAULT_UNIXSOCKET, 0); + // Send warm restart query via "RESTARTCHECK" notification channel + swss::NotificationProducer restartQuery(&db, "RESTARTCHECK"); + // Will listen for the reply on "RESTARTCHECKREPLY" channel + swss::NotificationConsumer restartQueryReply(&db, "RESTARTCHECKREPLY"); + + std::vector values; + values.emplace_back("NoFreeze", noFreeze); + values.emplace_back("SkipPendingTaskCheck", skipPendingTaskCheck); + std::string op = "orchagent"; + SWSS_LOG_NOTICE("requested %s to do warm restart state check", op.c_str()); + restartQuery.send(op, op, values); + + + swss::Select s; + s.addSelectable(&restartQueryReply); + swss::Selectable *sel; + std::string op_ret, data; + values.clear(); + int result = s.select(&sel, waitTime); + if (result == swss::Select::OBJECT) + { + restartQueryReply.pop(op_ret, data, values); + if (data == "READY") + { + SWSS_LOG_NOTICE("RESTARTCHECK success, %s is frozen and ready for warm restart", op_ret.c_str()); + std::cout << "RESTARTCHECK succeeded" << std::endl; + return EXIT_SUCCESS; + } + else + { + SWSS_LOG_NOTICE("RESTARTCHECK failed, %s is not ready for warm restart with status %s", + op_ret.c_str(), data.c_str()); + } + } + else if (result == swss::Select::TIMEOUT) + { + SWSS_LOG_NOTICE("RESTARTCHECK for %s timed out", op_ret.c_str()); + } + else + { + SWSS_LOG_NOTICE("RESTARTCHECK for %s error", op_ret.c_str()); + } + std::cout << "RESTARTCHECK failed" << std::endl; + return EXIT_FAILURE; +} diff --git a/orchagent/orchdaemon.cpp b/orchagent/orchdaemon.cpp index 16bed5242247..eecd86146002 100644 --- a/orchagent/orchdaemon.cpp +++ b/orchagent/orchdaemon.cpp @@ -1,5 +1,6 @@ #include #include +#include #include "orchdaemon.h" #include "logger.h" #include @@ -343,6 +344,26 @@ void OrchDaemon::start() * is a good chance to flush the pipeline before next select happened. */ flush(); + + /* + * Asked to check warm restart readiness. + * Not doing this under Select::TIMEOUT condition because of + * the existence of finer granularity ExecutableTimer with select + */ + if (gSwitchOrch->checkRestartReady()) + { + bool ret = warmRestartCheck(); + if (ret) + { + // Orchagent is ready to perform warm restart, stop processing any new db data. + // Should sleep here or continue handling timers and etc.?? + if (!gSwitchOrch->checkRestartNoFreeze()) + { + SWSS_LOG_WARN("Orchagent is frozen for warm restart!"); + sleep(UINT_MAX); + } + } + } } } @@ -435,3 +456,42 @@ bool OrchDaemon::warmRestoreValidation() WarmStart::setWarmStartState("orchagent", WarmStart::RESTORED); return true; } + +/* + * Reply with "READY" notification if no pending tasks, and return true. + * Ortherwise reply with "NOT_READY" notification and return false. + * Further consideration is needed as to when orchagent is treated as warm restart ready. + * For now, no pending task should exist in any orch agent. + */ +bool OrchDaemon::warmRestartCheck() +{ + std::vector values; + std::string op = "orchagent"; + std::string data = "READY"; + bool ret = true; + + vector ts; + getTaskToSync(ts); + + if (ts.size() != 0) + { + SWSS_LOG_NOTICE("WarmRestart check found pending tasks: "); + for(auto &s : ts) + { + SWSS_LOG_NOTICE(" %s", s.c_str()); + } + if (!gSwitchOrch->skipPendingTaskCheck()) + { + data = "NOT_READY"; + ret = false; + } + else + { + SWSS_LOG_NOTICE("Orchagent objects dependency check skipped"); + } + } + + SWSS_LOG_NOTICE("Restart check result: %s", data.c_str()); + gSwitchOrch->restartCheckReply(op, data, values); + return ret; +} diff --git a/orchagent/orchdaemon.h b/orchagent/orchdaemon.h index ce5a353e3eb0..d9a8b07da931 100644 --- a/orchagent/orchdaemon.h +++ b/orchagent/orchdaemon.h @@ -39,6 +39,8 @@ class OrchDaemon bool warmRestoreAndSyncUp(); void getTaskToSync(vector &ts); bool warmRestoreValidation(); + + bool warmRestartCheck(); private: DBConnector *m_applDb; DBConnector *m_configDb; diff --git a/orchagent/switchorch.cpp b/orchagent/switchorch.cpp index e48b5c0f9be1..2a2c264cee06 100644 --- a/orchagent/switchorch.cpp +++ b/orchagent/switchorch.cpp @@ -2,6 +2,8 @@ #include "switchorch.h" #include "converter.h" +#include "notifier.h" +#include "notificationproducer.h" using namespace std; using namespace swss; @@ -27,8 +29,12 @@ const map packet_action_map = }; SwitchOrch::SwitchOrch(DBConnector *db, string tableName) : - Orch(db, tableName) + Orch(db, tableName), + m_db(db) { + m_restartCheckNotificationConsumer = new NotificationConsumer(db, "RESTARTCHECK"); + auto restartCheckNotifier = new Notifier(m_restartCheckNotificationConsumer, this, "RESTARTCHECK"); + Orch::addExecutor(restartCheckNotifier); } void SwitchOrch::doTask(Consumer &consumer) @@ -122,3 +128,51 @@ void SwitchOrch::doTask(Consumer &consumer) } } +void SwitchOrch::doTask(NotificationConsumer& consumer) +{ + SWSS_LOG_ENTER(); + + std::string op; + std::string data; + std::vector values; + + consumer.pop(op, data, values); + + if (&consumer != m_restartCheckNotificationConsumer) + { + return; + } + + m_warmRestartCheck.checkRestartReadyState = false; + m_warmRestartCheck.noFreeze = false; + m_warmRestartCheck.skipPendingTaskCheck = false; + + SWSS_LOG_NOTICE("RESTARTCHECK notification for %s ", op.c_str()); + if (op == "orchagent") + { + string s = op; + + m_warmRestartCheck.checkRestartReadyState = true; + for (auto &i : values) + { + s += "|" + fvField(i) + ":" + fvValue(i); + + if (fvField(i) == "NoFreeze" && fvValue(i) == "true") + { + m_warmRestartCheck.noFreeze = true; + } + if (fvField(i) == "SkipPendingTaskCheck" && fvValue(i) == "true") + { + m_warmRestartCheck.skipPendingTaskCheck = true; + } + } + SWSS_LOG_NOTICE("%s", s.c_str()); + } +} + +void SwitchOrch::restartCheckReply(const string &op, const string &data, std::vector &values) +{ + NotificationProducer restartRequestReply(m_db, "RESTARTCHECKREPLY"); + restartRequestReply.send(op, data, values); + checkRestartReadyDone(); +} diff --git a/orchagent/switchorch.h b/orchagent/switchorch.h index ac0daf67d887..d1742353cd78 100644 --- a/orchagent/switchorch.h +++ b/orchagent/switchorch.h @@ -2,11 +2,31 @@ #include "orch.h" +struct WarmRestartCheck +{ + bool checkRestartReadyState; + bool noFreeze; + bool skipPendingTaskCheck; +}; + class SwitchOrch : public Orch { public: SwitchOrch(DBConnector *db, string tableName); + bool checkRestartReady() { return m_warmRestartCheck.checkRestartReadyState; } + bool checkRestartNoFreeze() { return m_warmRestartCheck.noFreeze; } + bool skipPendingTaskCheck() { return m_warmRestartCheck.skipPendingTaskCheck; } + void checkRestartReadyDone() { m_warmRestartCheck.checkRestartReadyState = false; } + void restartCheckReply(const string &op, const string &data, std::vector &values); private: void doTask(Consumer &consumer); + + NotificationConsumer* m_restartCheckNotificationConsumer; + void doTask(NotificationConsumer& consumer); + DBConnector *m_db; + + // Information contained in the request from + // external program for orchagent pre-shutdown state check + WarmRestartCheck m_warmRestartCheck = {false, false, false}; }; diff --git a/tests/test_warm_reboot.py b/tests/test_warm_reboot.py index b6f2a75c928c..96cd2d26d21b 100644 --- a/tests/test_warm_reboot.py +++ b/tests/test_warm_reboot.py @@ -314,7 +314,6 @@ def test_VlanMgrdWarmRestart(dvs): swss_app_check_RestartCount_single(state_db, restart_count, "vlanmgrd") - # function to stop neighsyncd service and clear syslog and sairedis records def stop_neighsyncd_clear_syslog_sairedis(dvs, save_number): dvs.runcmd(['sh', '-c', 'pkill -x neighsyncd']) @@ -697,6 +696,60 @@ def test_swss_neighbor_syncup(dvs): # check restart Count swss_app_check_RestartCount_single(state_db, restart_count, "neighsyncd") + +# TODO: The condition of warm restart readiness check is still under discussion. +def test_OrchagentWarmRestartReadyCheck(dvs): + + # do a pre-cleanup + dvs.runcmd("ip -s -s neigh flush all") + time.sleep(1) + + # enable warm restart + # TODO: use cfg command to config it + conf_db = swsscommon.DBConnector(swsscommon.CONFIG_DB, dvs.redis_sock, 0) + + dvs.runcmd("ifconfig Ethernet0 10.0.0.0/31 up") + dvs.runcmd("ifconfig Ethernet4 10.0.0.2/31 up") + + dvs.servers[0].runcmd("ifconfig eth0 10.0.0.1/31") + dvs.servers[0].runcmd("ip route add default via 10.0.0.0") + + dvs.servers[1].runcmd("ifconfig eth0 10.0.0.3/31") + dvs.servers[1].runcmd("ip route add default via 10.0.0.2") + + + appl_db = swsscommon.DBConnector(swsscommon.APPL_DB, dvs.redis_sock, 0) + ps = swsscommon.ProducerStateTable(appl_db, swsscommon.APP_ROUTE_TABLE_NAME) + fvs = swsscommon.FieldValuePairs([("nexthop","10.0.0.1"), ("ifname", "Ethernet0")]) + + ps.set("2.2.2.0/24", fvs) + + time.sleep(1) + # Should fail, since neighbor for next 10.0.0.1 has not been not resolved yet + (exitcode, result) = dvs.runcmd("/usr/bin/orchagent_restart_check") + assert result == "RESTARTCHECK failed\n" + + # Should succeed, the option for skipPendingTaskCheck -s and noFreeze -n have been provided. + # Wait up to 500 milliseconds for response from orchagent. Default wait time is 1000 milliseconds. + (exitcode, result) = dvs.runcmd("/usr/bin/orchagent_restart_check -n -s -w 500") + assert result == "RESTARTCHECK succeeded\n" + + # get neighbor and arp entry + dvs.servers[1].runcmd("ping -c 1 10.0.0.1") + + time.sleep(1) + (exitcode, result) = dvs.runcmd("/usr/bin/orchagent_restart_check") + assert result == "RESTARTCHECK succeeded\n" + + # Should fail since orchagent has been frozen at last step. + (exitcode, result) = dvs.runcmd("/usr/bin/orchagent_restart_check -n -s -w 500") + assert result == "RESTARTCHECK failed\n" + + # recover for test cases after this one. + stop_swss(dvs) + start_swss(dvs) + time.sleep(5) + def test_swss_port_state_syncup(dvs): appl_db = swsscommon.DBConnector(swsscommon.APPL_DB, dvs.redis_sock, 0)