From 1d3388100ef7edbc275746136d8a1db0769e652e Mon Sep 17 00:00:00 2001 From: Shi Su <67605788+shi-su@users.noreply.github.com> Date: Wed, 24 Feb 2021 14:24:35 -0800 Subject: [PATCH] [synchronous mode] Add failure notification for SAI failures in synchronous mode (#1596) Add function to notify users in the presence of SAI failures by throwing exceptions and trigger swss restart in synchronous mode. And include this function in routeorch and neighorch. This is a part of the first step in the SAI failure handling in orchagent with synchronous mode: 1. Failure notification mechanism to ensure enough notifications in the presence of SAI failures and avoid running switches with unhandled failures. 2. Add general failure handling logic by status. 3. Develop fine-grain failure handling mechanism for each orch to properly handle different SAI failures. This function aims to ensure enough notifications in the presence of SAI failures and avoid running switches with unhandled failures (on-par with asynchronous mode). --- cfgmgr/Makefile.am | 27 ++++++++------- orchagent/neighorch.cpp | 12 +++---- orchagent/orch.cpp | 77 +++++++++++++++++++++++++++++++++++++++++ orchagent/orch.h | 5 +++ orchagent/routeorch.cpp | 29 ++++++++-------- 5 files changed, 117 insertions(+), 33 deletions(-) diff --git a/cfgmgr/Makefile.am b/cfgmgr/Makefile.am index 3321f82a4cdd..d1ea5978c12e 100644 --- a/cfgmgr/Makefile.am +++ b/cfgmgr/Makefile.am @@ -2,6 +2,7 @@ INCLUDES = -I$(top_srcdir)/lib -I $(top_srcdir) -I $(top_srcdir)/orchagent -I $( CFLAGS_SAI = -I /usr/include/sai LIBNL_CFLAGS = -I/usr/include/libnl3 LIBNL_LIBS = -lnl-genl-3 -lnl-route-3 -lnl-3 +SAIMETA_LIBS = -lsaimeta -lsaimetadata bin_PROGRAMS = vlanmgrd teammgrd portmgrd intfmgrd buffermgrd vrfmgrd nbrmgrd vxlanmgrd sflowmgrd natmgrd coppmgrd tunnelmgrd macsecmgrd @@ -24,64 +25,64 @@ endif vlanmgrd_SOURCES = vlanmgrd.cpp vlanmgr.cpp $(top_srcdir)/orchagent/orch.cpp $(top_srcdir)/orchagent/request_parser.cpp shellcmd.h vlanmgrd_CFLAGS = $(DBGFLAGS) $(AM_CFLAGS) $(CFLAGS_COMMON) $(CFLAGS_SAI) vlanmgrd_CPPFLAGS = $(DBGFLAGS) $(AM_CFLAGS) $(CFLAGS_COMMON) $(CFLAGS_SAI) -vlanmgrd_LDADD = -lswsscommon +vlanmgrd_LDADD = -lswsscommon $(SAIMETA_LIBS) teammgrd_SOURCES = teammgrd.cpp teammgr.cpp $(top_srcdir)/orchagent/orch.cpp $(top_srcdir)/orchagent/request_parser.cpp shellcmd.h teammgrd_CFLAGS = $(DBGFLAGS) $(AM_CFLAGS) $(CFLAGS_COMMON) $(CFLAGS_SAI) teammgrd_CPPFLAGS = $(DBGFLAGS) $(AM_CFLAGS) $(CFLAGS_COMMON) $(CFLAGS_SAI) -teammgrd_LDADD = -lswsscommon +teammgrd_LDADD = -lswsscommon $(SAIMETA_LIBS) portmgrd_SOURCES = portmgrd.cpp portmgr.cpp $(top_srcdir)/orchagent/orch.cpp $(top_srcdir)/orchagent/request_parser.cpp shellcmd.h portmgrd_CFLAGS = $(DBGFLAGS) $(AM_CFLAGS) $(CFLAGS_COMMON) $(CFLAGS_SAI) portmgrd_CPPFLAGS = $(DBGFLAGS) $(AM_CFLAGS) $(CFLAGS_COMMON) $(CFLAGS_SAI) -portmgrd_LDADD = -lswsscommon +portmgrd_LDADD = -lswsscommon $(SAIMETA_LIBS) intfmgrd_SOURCES = intfmgrd.cpp intfmgr.cpp $(top_srcdir)/orchagent/orch.cpp $(top_srcdir)/orchagent/request_parser.cpp shellcmd.h intfmgrd_CFLAGS = $(DBGFLAGS) $(AM_CFLAGS) $(CFLAGS_COMMON) $(CFLAGS_SAI) intfmgrd_CPPFLAGS = $(DBGFLAGS) $(AM_CFLAGS) $(CFLAGS_COMMON) $(CFLAGS_SAI) -intfmgrd_LDADD = -lswsscommon +intfmgrd_LDADD = -lswsscommon $(SAIMETA_LIBS) buffermgrd_SOURCES = buffermgrd.cpp buffermgr.cpp buffermgrdyn.cpp $(top_srcdir)/orchagent/orch.cpp $(top_srcdir)/orchagent/request_parser.cpp shellcmd.h buffermgrd_CFLAGS = $(DBGFLAGS) $(AM_CFLAGS) $(CFLAGS_COMMON) $(CFLAGS_SAI) buffermgrd_CPPFLAGS = $(DBGFLAGS) $(AM_CFLAGS) $(CFLAGS_COMMON) $(CFLAGS_SAI) -buffermgrd_LDADD = -lswsscommon +buffermgrd_LDADD = -lswsscommon $(SAIMETA_LIBS) vrfmgrd_SOURCES = vrfmgrd.cpp vrfmgr.cpp $(top_srcdir)/orchagent/orch.cpp $(top_srcdir)/orchagent/request_parser.cpp shellcmd.h vrfmgrd_CFLAGS = $(DBGFLAGS) $(AM_CFLAGS) $(CFLAGS_COMMON) $(CFLAGS_SAI) vrfmgrd_CPPFLAGS = $(DBGFLAGS) $(AM_CFLAGS) $(CFLAGS_COMMON) $(CFLAGS_SAI) -vrfmgrd_LDADD = -lswsscommon +vrfmgrd_LDADD = -lswsscommon $(SAIMETA_LIBS) nbrmgrd_SOURCES = nbrmgrd.cpp nbrmgr.cpp $(top_srcdir)/orchagent/orch.cpp $(top_srcdir)/orchagent/request_parser.cpp shellcmd.h nbrmgrd_CFLAGS = $(DBGFLAGS) $(AM_CFLAGS) $(CFLAGS_COMMON) $(CFLAGS_SAI) $(LIBNL_CFLAGS) nbrmgrd_CPPFLAGS = $(DBGFLAGS) $(AM_CFLAGS) $(CFLAGS_COMMON) $(CFLAGS_SAI) $(LIBNL_CPPFLAGS) -nbrmgrd_LDADD = -lswsscommon $(LIBNL_LIBS) +nbrmgrd_LDADD = -lswsscommon $(SAIMETA_LIBS) $(LIBNL_LIBS) vxlanmgrd_SOURCES = vxlanmgrd.cpp vxlanmgr.cpp $(top_srcdir)/orchagent/orch.cpp $(top_srcdir)/orchagent/request_parser.cpp shellcmd.h vxlanmgrd_CFLAGS = $(DBGFLAGS) $(AM_CFLAGS) $(CFLAGS_COMMON) $(CFLAGS_SAI) vxlanmgrd_CPPFLAGS = $(DBGFLAGS) $(AM_CFLAGS) $(CFLAGS_COMMON) $(CFLAGS_SAI) -vxlanmgrd_LDADD = -lswsscommon +vxlanmgrd_LDADD = -lswsscommon $(SAIMETA_LIBS) sflowmgrd_SOURCES = sflowmgrd.cpp sflowmgr.cpp $(top_srcdir)/orchagent/orch.cpp $(top_srcdir)/orchagent/request_parser.cpp shellcmd.h sflowmgrd_CFLAGS = $(DBGFLAGS) $(AM_CFLAGS) $(CFLAGS_COMMON) $(CFLAGS_SAI) sflowmgrd_CPPFLAGS = $(DBGFLAGS) $(AM_CFLAGS) $(CFLAGS_COMMON) $(CFLAGS_SAI) -sflowmgrd_LDADD = -lswsscommon +sflowmgrd_LDADD = -lswsscommon $(SAIMETA_LIBS) natmgrd_SOURCES = natmgrd.cpp natmgr.cpp $(top_srcdir)/orchagent/orch.cpp $(top_srcdir)/orchagent/request_parser.cpp shellcmd.h natmgrd_CFLAGS = $(DBGFLAGS) $(AM_CFLAGS) $(CFLAGS_COMMON) $(CFLAGS_SAI) natmgrd_CPPFLAGS = $(DBGFLAGS) $(AM_CFLAGS) $(CFLAGS_COMMON) $(CFLAGS_SAI) -natmgrd_LDADD = -lswsscommon +natmgrd_LDADD = -lswsscommon $(SAIMETA_LIBS) coppmgrd_SOURCES = coppmgrd.cpp coppmgr.cpp $(top_srcdir)/orchagent/orch.cpp $(top_srcdir)/orchagent/request_parser.cpp shellcmd.h coppmgrd_CFLAGS = $(DBGFLAGS) $(AM_CFLAGS) $(CFLAGS_COMMON) $(CFLAGS_SAI) coppmgrd_CPPFLAGS = $(DBGFLAGS) $(AM_CFLAGS) $(CFLAGS_COMMON) $(CFLAGS_SAI) -coppmgrd_LDADD = -lswsscommon +coppmgrd_LDADD = -lswsscommon $(SAIMETA_LIBS) tunnelmgrd_SOURCES = tunnelmgrd.cpp tunnelmgr.cpp $(top_srcdir)/orchagent/orch.cpp $(top_srcdir)/orchagent/request_parser.cpp shellcmd.h tunnelmgrd_CFLAGS = $(DBGFLAGS) $(AM_CFLAGS) $(CFLAGS_COMMON) $(CFLAGS_SAI) tunnelmgrd_CPPFLAGS = $(DBGFLAGS) $(AM_CFLAGS) $(CFLAGS_COMMON) $(CFLAGS_SAI) -tunnelmgrd_LDADD = -lswsscommon +tunnelmgrd_LDADD = -lswsscommon $(SAIMETA_LIBS) macsecmgrd_SOURCES = macsecmgrd.cpp macsecmgr.cpp $(top_srcdir)/orchagent/orch.cpp $(top_srcdir)/orchagent/request_parser.cpp shellcmd.h macsecmgrd_CFLAGS = $(DBGFLAGS) $(AM_CFLAGS) $(CFLAGS_COMMON) $(CFLAGS_SAI) macsecmgrd_CPPFLAGS = $(DBGFLAGS) $(AM_CFLAGS) $(CFLAGS_COMMON) $(CFLAGS_SAI) -macsecmgrd_LDADD = -lswsscommon +macsecmgrd_LDADD = -lswsscommon $(SAIMETA_LIBS) diff --git a/orchagent/neighorch.cpp b/orchagent/neighorch.cpp index c850bf0ee60d..cc3ee6440106 100644 --- a/orchagent/neighorch.cpp +++ b/orchagent/neighorch.cpp @@ -204,7 +204,7 @@ bool NeighOrch::addNextHop(const IpAddress &ipAddress, const string &alias) { SWSS_LOG_ERROR("Failed to create next hop %s on %s, rv:%d", ipAddress.to_string().c_str(), alias.c_str(), status); - return false; + return handleSaiCreateStatus(SAI_API_NEXT_HOP, status); } SWSS_LOG_NOTICE("Created next hop %s on %s", @@ -678,7 +678,7 @@ bool NeighOrch::addNeighbor(const NeighborEntry &neighborEntry, const MacAddress { SWSS_LOG_ERROR("Failed to create neighbor %s on %s, rv:%d", macAddress.to_string().c_str(), alias.c_str(), status); - return false; + return handleSaiCreateStatus(SAI_API_NEIGHBOR, status); } } SWSS_LOG_NOTICE("Created neighbor ip %s, %s on %s", ip_address.to_string().c_str(), @@ -701,7 +701,7 @@ bool NeighOrch::addNeighbor(const NeighborEntry &neighborEntry, const MacAddress { SWSS_LOG_ERROR("Failed to remove neighbor %s on %s, rv:%d", macAddress.to_string().c_str(), alias.c_str(), status); - return false; + return handleSaiRemoveStatus(SAI_API_NEIGHBOR, status); } m_intfsOrch->decreaseRouterIntfsRefCount(alias); @@ -725,7 +725,7 @@ bool NeighOrch::addNeighbor(const NeighborEntry &neighborEntry, const MacAddress { SWSS_LOG_ERROR("Failed to update neighbor %s on %s, rv:%d", macAddress.to_string().c_str(), alias.c_str(), status); - return false; + return handleSaiSetStatus(SAI_API_NEIGHBOR, status); } SWSS_LOG_NOTICE("Updated neighbor %s on %s", macAddress.to_string().c_str(), alias.c_str()); } @@ -798,7 +798,7 @@ bool NeighOrch::removeNeighbor(const NeighborEntry &neighborEntry, bool disable) { SWSS_LOG_ERROR("Failed to remove next hop %s on %s, rv:%d", ip_address.to_string().c_str(), alias.c_str(), status); - return false; + return handleSaiRemoveStatus(SAI_API_NEXT_HOP, status); } } @@ -830,7 +830,7 @@ bool NeighOrch::removeNeighbor(const NeighborEntry &neighborEntry, bool disable) { SWSS_LOG_ERROR("Failed to remove neighbor %s on %s, rv:%d", m_syncdNeighbors[neighborEntry].mac.to_string().c_str(), alias.c_str(), status); - return false; + return handleSaiRemoveStatus(SAI_API_NEIGHBOR, status); } } diff --git a/orchagent/orch.cpp b/orchagent/orch.cpp index ea3e8e78b26e..6de6f189f7c0 100644 --- a/orchagent/orch.cpp +++ b/orchagent/orch.cpp @@ -10,6 +10,7 @@ #include "tokenize.h" #include "logger.h" #include "consumerstatetable.h" +#include "sai_serialize.h" using namespace swss; @@ -685,6 +686,82 @@ Executor *Orch::getExecutor(string executorName) return NULL; } +bool Orch::handleSaiCreateStatus(sai_api_t api, sai_status_t status, void *context) +{ + /* + * This function aims to provide coarse handling of failures in sairedis create + * operation (i.e., notify users by throwing excepions when failures happen). + * Return value: true - Handled the status successfully. No need to retry this SAI operation. + * false - Cannot handle the status. Need to retry the SAI operation. + * TODO: 1. Add general handling logic for specific statuses (e.g., SAI_STATUS_ITEM_ALREADY_EXISTS) + * 2. Develop fine-grain failure handling mechanisms and replace this coarse handling + * in each orch. + * 3. Take the type of sai api into consideration. + */ + switch (status) + { + case SAI_STATUS_SUCCESS: + SWSS_LOG_WARN("SAI_STATUS_SUCCESS is not expected in handleSaiCreateStatus"); + return true; + default: + SWSS_LOG_ERROR("Encountered failure in create operation, exiting orchagent, SAI API: %s, status: %s", + sai_serialize_api(api).c_str(), sai_serialize_status(status).c_str()); + exit(EXIT_FAILURE); + } + return false; +} + +bool Orch::handleSaiSetStatus(sai_api_t api, sai_status_t status, void *context) +{ + /* + * This function aims to provide coarse handling of failures in sairedis set + * operation (i.e., notify users by throwing excepions when failures happen). + * Return value: true - Handled the status successfully. No need to retry this SAI operation. + * false - Cannot handle the status. Need to retry the SAI operation. + * TODO: 1. Add general handling logic for specific statuses + * 2. Develop fine-grain failure handling mechanisms and replace this coarse handling + * in each orch. + * 3. Take the type of sai api into consideration. + */ + switch (status) + { + case SAI_STATUS_SUCCESS: + SWSS_LOG_WARN("SAI_STATUS_SUCCESS is not expected in handleSaiSetStatus"); + return true; + default: + SWSS_LOG_ERROR("Encountered failure in set operation, exiting orchagent, SAI API: %s, status: %s", + sai_serialize_api(api).c_str(), sai_serialize_status(status).c_str()); + exit(EXIT_FAILURE); + } + return false; +} + +bool Orch::handleSaiRemoveStatus(sai_api_t api, sai_status_t status, void *context) +{ + /* + * This function aims to provide coarse handling of failures in sairedis remove + * operation (i.e., notify users by throwing excepions when failures happen). + * Return value: true - Handled the status successfully. No need to retry this SAI operation. + * false - Cannot handle the status. Need to retry the SAI operation. + * TODO: 1. Add general handling logic for specific statuses (e.g., SAI_STATUS_OBJECT_IN_USE, + * SAI_STATUS_ITEM_NOT_FOUND) + * 2. Develop fine-grain failure handling mechanisms and replace this coarse handling + * in each orch. + * 3. Take the type of sai api into consideration. + */ + switch (status) + { + case SAI_STATUS_SUCCESS: + SWSS_LOG_WARN("SAI_STATUS_SUCCESS is not expected in handleSaiRemoveStatus"); + return true; + default: + SWSS_LOG_ERROR("Encountered failure in remove operation, exiting orchagent, SAI API: %s, status: %s", + sai_serialize_api(api).c_str(), sai_serialize_status(status).c_str()); + exit(EXIT_FAILURE); + } + return false; +} + void Orch2::doTask(Consumer &consumer) { SWSS_LOG_ENTER(); diff --git a/orchagent/orch.h b/orchagent/orch.h index 1a7197dfa971..fcecf98a22d0 100644 --- a/orchagent/orch.h +++ b/orchagent/orch.h @@ -235,6 +235,11 @@ class Orch /* Note: consumer will be owned by this class */ void addExecutor(Executor* executor); Executor *getExecutor(std::string executorName); + + /* Handling SAI status*/ + virtual bool handleSaiCreateStatus(sai_api_t api, sai_status_t status, void *context = nullptr); + virtual bool handleSaiSetStatus(sai_api_t api, sai_status_t status, void *context = nullptr); + virtual bool handleSaiRemoveStatus(sai_api_t api, sai_status_t status, void *context = nullptr); private: void removeMeFromObjsReferencedByMe(type_map &type_maps, const std::string &table, const std::string &obj_name, const std::string &field, const std::string &old_referenced_obj_name); void addConsumer(swss::DBConnector *db, std::string tableName, int pri = default_orch_pri); diff --git a/orchagent/routeorch.cpp b/orchagent/routeorch.cpp index 3a88b5793cba..641f81589de6 100644 --- a/orchagent/routeorch.cpp +++ b/orchagent/routeorch.cpp @@ -331,7 +331,7 @@ bool RouteOrch::validnexthopinNextHopGroup(const NextHopKey &nexthop, uint32_t& { SWSS_LOG_ERROR("Failed to add next hop member to group %" PRIx64 ": %d\n", nhopgroup->second.next_hop_group_id, status); - return false; + return handleSaiCreateStatus(SAI_API_NEXT_HOP_GROUP, status); } ++count; @@ -371,7 +371,7 @@ bool RouteOrch::invalidnexthopinNextHopGroup(const NextHopKey &nexthop, uint32_t { SWSS_LOG_ERROR("Failed to remove next hop member %" PRIx64 " from group %" PRIx64 ": %d\n", nexthop_id, nhopgroup->second.next_hop_group_id, status); - return false; + return handleSaiRemoveStatus(SAI_API_NEXT_HOP_GROUP, status); } ++count; @@ -950,7 +950,7 @@ bool RouteOrch::createFineGrainedNextHopGroup(sai_object_id_t &next_hop_group_id if (status != SAI_STATUS_SUCCESS) { SWSS_LOG_ERROR("Failed to create next hop group rv:%d", status); - return false; + return handleSaiCreateStatus(SAI_API_NEXT_HOP_GROUP, status); } gCrmOrch->incCrmResUsedCounter(CrmResourceType::CRM_NEXTHOP_GROUP); @@ -968,7 +968,7 @@ bool RouteOrch::removeFineGrainedNextHopGroup(sai_object_id_t &next_hop_group_id { SWSS_LOG_ERROR("Failed to remove next hop group %" PRIx64 ", rv:%d", next_hop_group_id, status); - return false; + return handleSaiRemoveStatus(SAI_API_NEXT_HOP_GROUP, status); } gCrmOrch->decCrmResUsedCounter(CrmResourceType::CRM_NEXTHOP_GROUP); @@ -1033,7 +1033,7 @@ bool RouteOrch::addNextHopGroup(const NextHopGroupKey &nexthops) { SWSS_LOG_ERROR("Failed to create next hop group %s, rv:%d", nexthops.to_string().c_str(), status); - return false; + return handleSaiCreateStatus(SAI_API_NEXT_HOP_GROUP, status); } m_nextHopGroupCount ++; @@ -1150,7 +1150,7 @@ bool RouteOrch::removeNextHopGroup(const NextHopGroupKey &nexthops) { SWSS_LOG_ERROR("Failed to remove next hop group member[%zu] %" PRIx64 ", rv:%d", i, next_hop_ids[i], statuses[i]); - return false; + return handleSaiRemoveStatus(SAI_API_NEXT_HOP_GROUP, statuses[i]); } gCrmOrch->decCrmResUsedCounter(CrmResourceType::CRM_NEXTHOP_GROUP_MEMBER); @@ -1160,7 +1160,7 @@ bool RouteOrch::removeNextHopGroup(const NextHopGroupKey &nexthops) if (status != SAI_STATUS_SUCCESS) { SWSS_LOG_ERROR("Failed to remove next hop group %" PRIx64 ", rv:%d", next_hop_group_id, status); - return false; + return handleSaiRemoveStatus(SAI_API_NEXT_HOP_GROUP, status); } m_nextHopGroupCount --; @@ -1626,7 +1626,8 @@ bool RouteOrch::addRoutePost(const RouteBulkContext& ctx, const NextHopGroupKey } else if (it_route == m_syncdRoutes.at(vrf_id).end()) { - if (*it_status++ != SAI_STATUS_SUCCESS) + sai_status_t status = *it_status++; + if (status != SAI_STATUS_SUCCESS) { SWSS_LOG_ERROR("Failed to create route %s with next hop(s) %s", ipPrefix.to_string().c_str(), nextHops.to_string().c_str()); @@ -1635,7 +1636,7 @@ bool RouteOrch::addRoutePost(const RouteBulkContext& ctx, const NextHopGroupKey { removeNextHopGroup(nextHops); } - return false; + return handleSaiCreateStatus(SAI_API_ROUTE, status); } if (ipPrefix.isV4()) @@ -1665,7 +1666,7 @@ bool RouteOrch::addRoutePost(const RouteBulkContext& ctx, const NextHopGroupKey { SWSS_LOG_ERROR("Failed to set route %s with packet action forward, %d", ipPrefix.to_string().c_str(), status); - return false; + return handleSaiSetStatus(SAI_API_ROUTE, status); } } @@ -1674,7 +1675,7 @@ bool RouteOrch::addRoutePost(const RouteBulkContext& ctx, const NextHopGroupKey { SWSS_LOG_ERROR("Failed to set route %s with next hop(s) %s", ipPrefix.to_string().c_str(), nextHops.to_string().c_str()); - return false; + return handleSaiSetStatus(SAI_API_ROUTE, status); } /* Increase the ref_count for the next hop (group) entry */ @@ -1792,7 +1793,7 @@ bool RouteOrch::removeRoutePost(const RouteBulkContext& ctx) { SWSS_LOG_ERROR("Failed to set route %s packet action to drop, rv:%d", ipPrefix.to_string().c_str(), status); - return false; + return handleSaiSetStatus(SAI_API_ROUTE, status); } SWSS_LOG_INFO("Set route %s packet action to drop", ipPrefix.to_string().c_str()); @@ -1802,7 +1803,7 @@ bool RouteOrch::removeRoutePost(const RouteBulkContext& ctx) { SWSS_LOG_ERROR("Failed to set route %s next hop ID to NULL, rv:%d", ipPrefix.to_string().c_str(), status); - return false; + return handleSaiSetStatus(SAI_API_ROUTE, status); } SWSS_LOG_INFO("Set route %s next hop ID to NULL", ipPrefix.to_string().c_str()); @@ -1813,7 +1814,7 @@ bool RouteOrch::removeRoutePost(const RouteBulkContext& ctx) if (status != SAI_STATUS_SUCCESS) { SWSS_LOG_ERROR("Failed to remove route prefix:%s\n", ipPrefix.to_string().c_str()); - return false; + return handleSaiRemoveStatus(SAI_API_ROUTE, status); } if (ipPrefix.isV4())