Skip to content

Commit bc4062b

Browse files
[mux]: Implement rollback for failed mux switchovers (sonic-net#2714)
- Make all SAI API operations needed for switchover idempotent
- Implement rollback when a switchover fails

Signed-off-by: Lawrence Lee <[email protected]>
1 parent 4c6fc31 commit bc4062b

10 files changed

+764
-51
lines changed

.gitignore

+4
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,8 @@ teamsyncd/teamsyncd
7676
tests/tests
7777
tests/mock_tests/tests_response_publisher
7878
tests/mock_tests/tests_fpmsyncd
79+
tests/mock_tests/tests_intfmgrd
80+
tests/mock_tests/tests_portsyncd
7981

8082

8183
# Test Files #
@@ -87,5 +89,7 @@ tests/mock_tests/tests.trs
8789
tests/test-suite.log
8890
tests/tests.log
8991
tests/tests.trs
92+
tests/mock_tests/**/*log
93+
tests/mock_tests/**/*trs
9094
orchagent/p4orch/tests/**/*gcda
9195
orchagent/p4orch/tests/**/*gcno

orchagent/aclorch.cpp

+11
Original file line numberDiff line numberDiff line change
@@ -1140,6 +1140,11 @@ bool AclRule::createRule()
11401140
status = sai_acl_api->create_acl_entry(&m_ruleOid, gSwitchId, (uint32_t)rule_attrs.size(), rule_attrs.data());
11411141
if (status != SAI_STATUS_SUCCESS)
11421142
{
1143+
if (status == SAI_STATUS_ITEM_ALREADY_EXISTS)
1144+
{
1145+
SWSS_LOG_NOTICE("ACL rule %s already exists", m_id.c_str());
1146+
return true;
1147+
}
11431148
SWSS_LOG_ERROR("Failed to create ACL rule %s, rv:%d",
11441149
m_id.c_str(), status);
11451150
AclRange::remove(range_objects, range_object_list.count);
@@ -1201,6 +1206,12 @@ bool AclRule::removeRule()
12011206
auto status = sai_acl_api->remove_acl_entry(m_ruleOid);
12021207
if (status != SAI_STATUS_SUCCESS)
12031208
{
1209+
if (status == SAI_STATUS_ITEM_NOT_FOUND)
1210+
{
1211+
SWSS_LOG_NOTICE("ACL rule already deleted");
1212+
m_ruleOid = SAI_NULL_OBJECT_ID;
1213+
return true;
1214+
}
12041215
SWSS_LOG_ERROR("Failed to delete ACL rule, status %s", sai_serialize_status(status).c_str());
12051216
return false;
12061217
}

orchagent/muxorch.cpp

+83-35
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,10 @@ static sai_status_t create_route(IpPrefix &pfx, sai_object_id_t nh)
116116
sai_status_t status = sai_route_api->create_route_entry(&route_entry, (uint32_t)attrs.size(), attrs.data());
117117
if (status != SAI_STATUS_SUCCESS)
118118
{
119+
if (status == SAI_STATUS_ITEM_ALREADY_EXISTS) {
120+
SWSS_LOG_NOTICE("Tunnel route to %s already exists", pfx.to_string().c_str());
121+
return SAI_STATUS_SUCCESS;
122+
}
119123
SWSS_LOG_ERROR("Failed to create tunnel route %s,nh %" PRIx64 " rv:%d",
120124
pfx.getIp().to_string().c_str(), nh, status);
121125
return status;
@@ -145,6 +149,10 @@ static sai_status_t remove_route(IpPrefix &pfx)
145149
sai_status_t status = sai_route_api->remove_route_entry(&route_entry);
146150
if (status != SAI_STATUS_SUCCESS)
147151
{
152+
if (status == SAI_STATUS_ITEM_NOT_FOUND) {
153+
SWSS_LOG_NOTICE("Tunnel route to %s already removed", pfx.to_string().c_str());
154+
return SAI_STATUS_SUCCESS;
155+
}
148156
SWSS_LOG_ERROR("Failed to remove tunnel route %s, rv:%d",
149157
pfx.getIp().to_string().c_str(), status);
150158
return status;
@@ -497,15 +505,15 @@ void MuxCable::setState(string new_state)
497505

498506
mux_cb_orch_->updateMuxMetricState(mux_name_, new_state, true);
499507

500-
MuxState state = state_;
508+
prev_state_ = state_;
501509
state_ = ns;
502510

503511
st_chg_in_progress_ = true;
504512

505513
if (!(this->*(state_machine_handlers_[it->second]))())
506514
{
507515
//Reset back to original state
508-
state_ = state;
516+
state_ = prev_state_;
509517
st_chg_in_progress_ = false;
510518
st_chg_failed_ = true;
511519
throw std::runtime_error("Failed to handle state transition");
@@ -521,6 +529,51 @@ void MuxCable::setState(string new_state)
521529
return;
522530
}
523531

532+
void MuxCable::rollbackStateChange()
533+
{
534+
if (prev_state_ == MuxState::MUX_STATE_FAILED || prev_state_ == MuxState::MUX_STATE_PENDING)
535+
{
536+
SWSS_LOG_ERROR("[%s] Rollback to %s not supported", mux_name_.c_str(),
537+
muxStateValToString.at(prev_state_).c_str());
538+
return;
539+
}
540+
SWSS_LOG_WARN("[%s] Rolling back state change to %s", mux_name_.c_str(),
541+
muxStateValToString.at(prev_state_).c_str());
542+
mux_cb_orch_->updateMuxMetricState(mux_name_, muxStateValToString.at(prev_state_), true);
543+
st_chg_in_progress_ = true;
544+
state_ = prev_state_;
545+
bool success = false;
546+
switch (prev_state_)
547+
{
548+
case MuxState::MUX_STATE_ACTIVE:
549+
success = stateActive();
550+
break;
551+
case MuxState::MUX_STATE_INIT:
552+
case MuxState::MUX_STATE_STANDBY:
553+
success = stateStandby();
554+
break;
555+
case MuxState::MUX_STATE_FAILED:
556+
case MuxState::MUX_STATE_PENDING:
557+
// Check at the start of the function means we will never reach here
558+
SWSS_LOG_ERROR("[%s] Rollback to %s not supported", mux_name_.c_str(),
559+
muxStateValToString.at(prev_state_).c_str());
560+
return;
561+
}
562+
st_chg_in_progress_ = false;
563+
if (success)
564+
{
565+
st_chg_failed_ = false;
566+
}
567+
else
568+
{
569+
st_chg_failed_ = true;
570+
SWSS_LOG_ERROR("[%s] Rollback to %s failed",
571+
mux_name_.c_str(), muxStateValToString.at(prev_state_).c_str());
572+
}
573+
mux_cb_orch_->updateMuxMetricState(mux_name_, muxStateValToString.at(state_), false);
574+
mux_cb_orch_->updateMuxState(mux_name_, muxStateValToString.at(state_));
575+
}
576+
524577
string MuxCable::getState()
525578
{
526579
SWSS_LOG_INFO("Get state request for %s, state %s",
@@ -838,8 +891,6 @@ void MuxNbrHandler::updateTunnelRoute(NextHopKey nh, bool add)
838891
}
839892
}
840893

841-
std::map<std::string, AclTable> MuxAclHandler::acl_table_;
842-
843894
MuxAclHandler::MuxAclHandler(sai_object_id_t port, string alias)
844895
{
845896
SWSS_LOG_ENTER();
@@ -857,32 +908,21 @@ MuxAclHandler::MuxAclHandler(sai_object_id_t port, string alias)
857908
port_ = port;
858909
alias_ = alias;
859910

860-
auto found = acl_table_.find(table_name);
861-
if (found == acl_table_.end())
862-
{
863-
SWSS_LOG_NOTICE("First time create for port %" PRIx64 "", port);
911+
// Always try to create the table first. If it already exists, function will return early.
912+
createMuxAclTable(port, table_name);
864913

865-
// First time handling of Mux Table, create ACL table, and bind
866-
createMuxAclTable(port, table_name);
914+
SWSS_LOG_NOTICE("Binding port %" PRIx64 "", port);
915+
916+
AclRule* rule = gAclOrch->getAclRule(table_name, rule_name);
917+
if (rule == nullptr)
918+
{
867919
shared_ptr<AclRulePacket> newRule =
868920
make_shared<AclRulePacket>(gAclOrch, rule_name, table_name, false /*no counters*/);
869921
createMuxAclRule(newRule, table_name);
870922
}
871923
else
872924
{
873-
SWSS_LOG_NOTICE("Binding port %" PRIx64 "", port);
874-
875-
AclRule* rule = gAclOrch->getAclRule(table_name, rule_name);
876-
if (rule == nullptr)
877-
{
878-
shared_ptr<AclRulePacket> newRule =
879-
make_shared<AclRulePacket>(gAclOrch, rule_name, table_name, false /*no counters*/);
880-
createMuxAclRule(newRule, table_name);
881-
}
882-
else
883-
{
884-
gAclOrch->updateAclRule(table_name, rule_name, MATCH_IN_PORTS, &port, RULE_OPER_ADD);
885-
}
925+
gAclOrch->updateAclRule(table_name, rule_name, MATCH_IN_PORTS, &port, RULE_OPER_ADD);
886926
}
887927
}
888928

@@ -915,23 +955,16 @@ void MuxAclHandler::createMuxAclTable(sai_object_id_t port, string strTable)
915955
{
916956
SWSS_LOG_ENTER();
917957

918-
auto inserted = acl_table_.emplace(piecewise_construct,
919-
std::forward_as_tuple(strTable),
920-
std::forward_as_tuple(gAclOrch, strTable));
921-
922-
assert(inserted.second);
923-
924-
AclTable& acl_table = inserted.first->second;
925-
926958
sai_object_id_t table_oid = gAclOrch->getTableById(strTable);
927959
if (table_oid != SAI_NULL_OBJECT_ID)
928960
{
929961
// DROP ACL table is already created
930-
SWSS_LOG_NOTICE("ACL table %s exists, reuse the same", strTable.c_str());
931-
acl_table = *(gAclOrch->getTableByOid(table_oid));
962+
SWSS_LOG_INFO("ACL table %s exists, reuse the same", strTable.c_str());
932963
return;
933964
}
934965

966+
SWSS_LOG_NOTICE("First time create for port %" PRIx64 "", port);
967+
AclTable acl_table(gAclOrch, strTable);
935968
auto dropType = gAclOrch->getAclTableType(TABLE_TYPE_DROP);
936969
assert(dropType);
937970
acl_table.validateAddType(*dropType);
@@ -1776,10 +1809,25 @@ bool MuxCableOrch::addOperation(const Request& request)
17761809
{
17771810
mux_obj->setState(state);
17781811
}
1779-
catch(const std::runtime_error& error)
1812+
catch(const std::runtime_error& e)
17801813
{
17811814
SWSS_LOG_ERROR("Mux Error setting state %s for port %s. Error: %s",
1782-
state.c_str(), port_name.c_str(), error.what());
1815+
state.c_str(), port_name.c_str(), e.what());
1816+
mux_obj->rollbackStateChange();
1817+
return true;
1818+
}
1819+
catch (const std::logic_error& e)
1820+
{
1821+
SWSS_LOG_ERROR("Logic error while setting state %s for port %s. Error: %s",
1822+
state.c_str(), port_name.c_str(), e.what());
1823+
mux_obj->rollbackStateChange();
1824+
return true;
1825+
}
1826+
catch (const std::exception& e)
1827+
{
1828+
SWSS_LOG_ERROR("Exception caught while setting state %s for port %s. Error: %s",
1829+
state.c_str(), port_name.c_str(), e.what());
1830+
mux_obj->rollbackStateChange();
17831831
return true;
17841832
}
17851833

orchagent/muxorch.h

+2-2
Original file line numberDiff line numberDiff line change
@@ -52,8 +52,6 @@ class MuxAclHandler
5252
void createMuxAclRule(shared_ptr<AclRulePacket> rule, string strTable);
5353
void bindAllPorts(AclTable &acl_table);
5454

55-
// class shared dict: ACL table name -> ACL table
56-
static std::map<std::string, AclTable> acl_table_;
5755
sai_object_id_t port_ = SAI_NULL_OBJECT_ID;
5856
bool is_ingress_acl_ = true;
5957
string alias_;
@@ -99,6 +97,7 @@ class MuxCable
9997
using state_machine_handlers = map<MuxStateChange, bool (MuxCable::*)()>;
10098

10199
void setState(string state);
100+
void rollbackStateChange();
102101
string getState();
103102
bool isStateChangeInProgress() { return st_chg_in_progress_; }
104103
bool isStateChangeFailed() { return st_chg_failed_; }
@@ -123,6 +122,7 @@ class MuxCable
123122
MuxCableType cable_type_;
124123

125124
MuxState state_ = MuxState::MUX_STATE_INIT;
125+
MuxState prev_state_;
126126
bool st_chg_in_progress_ = false;
127127
bool st_chg_failed_ = false;
128128

orchagent/neighorch.cpp

+21-14
Original file line numberDiff line numberDiff line change
@@ -255,6 +255,12 @@ bool NeighOrch::addNextHop(const NextHopKey &nh)
255255
sai_status_t status = sai_next_hop_api->create_next_hop(&next_hop_id, gSwitchId, (uint32_t)next_hop_attrs.size(), next_hop_attrs.data());
256256
if (status != SAI_STATUS_SUCCESS)
257257
{
258+
if (status == SAI_STATUS_ITEM_ALREADY_EXISTS)
259+
{
260+
SWSS_LOG_NOTICE("Next hop %s on %s already exists",
261+
nexthop.ip_address.to_string().c_str(), nexthop.alias.c_str());
262+
return true;
263+
}
258264
SWSS_LOG_ERROR("Failed to create next hop %s on %s, rv:%d",
259265
nexthop.ip_address.to_string().c_str(), nexthop.alias.c_str(), status);
260266
task_process_status handle_status = handleSaiCreateStatus(SAI_API_NEXT_HOP, status);
@@ -1014,7 +1020,7 @@ bool NeighOrch::removeNeighbor(const NeighborEntry &neighborEntry, bool disable)
10141020
/* When next hop is not found, we continue to remove neighbor entry. */
10151021
if (status == SAI_STATUS_ITEM_NOT_FOUND)
10161022
{
1017-
SWSS_LOG_ERROR("Failed to locate next hop %s on %s, rv:%d",
1023+
SWSS_LOG_NOTICE("Next hop %s on %s doesn't exist, rv:%d",
10181024
ip_address.to_string().c_str(), alias.c_str(), status);
10191025
}
10201026
else
@@ -1049,9 +1055,8 @@ bool NeighOrch::removeNeighbor(const NeighborEntry &neighborEntry, bool disable)
10491055
{
10501056
if (status == SAI_STATUS_ITEM_NOT_FOUND)
10511057
{
1052-
SWSS_LOG_ERROR("Failed to locate neighbor %s on %s, rv:%d",
1058+
SWSS_LOG_NOTICE("Neighbor %s on %s already removed, rv:%d",
10531059
m_syncdNeighbors[neighborEntry].mac.to_string().c_str(), alias.c_str(), status);
1054-
return true;
10551060
}
10561061
else
10571062
{
@@ -1064,22 +1069,24 @@ bool NeighOrch::removeNeighbor(const NeighborEntry &neighborEntry, bool disable)
10641069
}
10651070
}
10661071
}
1067-
1068-
if (neighbor_entry.ip_address.addr_family == SAI_IP_ADDR_FAMILY_IPV4)
1069-
{
1070-
gCrmOrch->decCrmResUsedCounter(CrmResourceType::CRM_IPV4_NEIGHBOR);
1071-
}
10721072
else
10731073
{
1074-
gCrmOrch->decCrmResUsedCounter(CrmResourceType::CRM_IPV6_NEIGHBOR);
1075-
}
1074+
if (neighbor_entry.ip_address.addr_family == SAI_IP_ADDR_FAMILY_IPV4)
1075+
{
1076+
gCrmOrch->decCrmResUsedCounter(CrmResourceType::CRM_IPV4_NEIGHBOR);
1077+
}
1078+
else
1079+
{
1080+
gCrmOrch->decCrmResUsedCounter(CrmResourceType::CRM_IPV6_NEIGHBOR);
1081+
}
10761082

1077-
removeNextHop(ip_address, alias);
1078-
m_intfsOrch->decreaseRouterIntfsRefCount(alias);
1083+
removeNextHop(ip_address, alias);
1084+
m_intfsOrch->decreaseRouterIntfsRefCount(alias);
1085+
SWSS_LOG_NOTICE("Removed neighbor %s on %s",
1086+
m_syncdNeighbors[neighborEntry].mac.to_string().c_str(), alias.c_str());
1087+
}
10791088
}
10801089

1081-
SWSS_LOG_NOTICE("Removed neighbor %s on %s",
1082-
m_syncdNeighbors[neighborEntry].mac.to_string().c_str(), alias.c_str());
10831090

10841091
/* Do not delete entry from cache if its disable request */
10851092
if (disable)

tests/mock_tests/Makefile.am

+1
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ tests_SOURCES = aclorch_ut.cpp \
5050
flowcounterrouteorch_ut.cpp \
5151
orchdaemon_ut.cpp \
5252
intfsorch_ut.cpp \
53+
mux_rollback_ut.cpp \
5354
warmrestartassist_ut.cpp \
5455
test_failure_handling.cpp \
5556
$(top_srcdir)/lib/gearboxutils.cpp \

tests/mock_tests/mock_orchagent_main.h

+3
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,8 @@ extern VRFOrch *gVrfOrch;
5656
extern NhgOrch *gNhgOrch;
5757
extern Srv6Orch *gSrv6Orch;
5858
extern BfdOrch *gBfdOrch;
59+
extern AclOrch *gAclOrch;
60+
extern PolicerOrch *gPolicerOrch;
5961
extern Directory<Orch*> gDirectory;
6062

6163
extern sai_acl_api_t *sai_acl_api;
@@ -70,6 +72,7 @@ extern sai_route_api_t *sai_route_api;
7072
extern sai_neighbor_api_t *sai_neighbor_api;
7173
extern sai_tunnel_api_t *sai_tunnel_api;
7274
extern sai_next_hop_api_t *sai_next_hop_api;
75+
extern sai_next_hop_group_api_t *sai_next_hop_group_api;
7376
extern sai_hostif_api_t *sai_hostif_api;
7477
extern sai_policer_api_t *sai_policer_api;
7578
extern sai_buffer_api_t *sai_buffer_api;

0 commit comments

Comments
 (0)