Merge pull request #19778 from Pdoijode/pdoijode/evpn-gr-mh

Graceful restart for EVPN
Donatas Abraitis 2026-01-06 23:29:54 +02:00 committed by GitHub
commit c7f99abd3a
47 changed files with 3448 additions and 256 deletions


@ -1500,6 +1500,17 @@ int evpn_route_select_install(struct bgp *bgp, struct bgpevpn *vpn,
safi_t safi = SAFI_EVPN;
int ret = 0;
/* If the flag BGP_NODE_SELECT_DEFER is set, do not add route to
* the workqueue
*/
if (CHECK_FLAG(dest->flags, BGP_NODE_SELECT_DEFER)) {
if (BGP_DEBUG(graceful_restart, GRACEFUL_RESTART))
zlog_debug("%s: SELECT_DEFER flag set for EVPN route %pBD, dest %p",
bgp->name_pretty, dest, dest);
return ret;
}
first = bgp_dest_get_bgp_path_info(dest);
SET_FLAG(pi->flags, BGP_PATH_UNSORTED);
if (pi != first) {
@ -2366,14 +2377,14 @@ static int update_evpn_route(struct bgp *bgp, struct bgpevpn *vpn,
/* lock ri to prevent freeing in evpn_route_select_install */
bgp_path_info_lock(pi);
/* Perform route selection. Normally, the local route in the
* VNI is expected to win and be the best route. However, if
* there is a race condition where a host moved from local to
* remote and the remote route was received in BGP just prior
* to the local MACIP notification from zebra, the remote
* route would win, and we should evict the defunct local route
* and (re)install the remote route into zebra.
*/
evpn_route_select_install(bgp, vpn, dest, pi);
/*
* If the new local route was not selected evict it and tell zebra
@ -3272,6 +3283,8 @@ static int install_evpn_route_entry_in_vrf(struct bgp *bgp_vrf,
pi->uptime = monotime(NULL);
}
bgp_dest_set_defer_flag(dest, false);
/* Gateway IP nexthop should be resolved */
if (bre && bre->type == OVERLAY_INDEX_GATEWAY_IP) {
if (bgp_find_or_add_nexthop(bgp_vrf, bgp_vrf, afi, safi, pi, NULL, 0, NULL, NULL))
@ -3404,6 +3417,8 @@ static int install_evpn_route_entry_in_vni_common(
pi->uptime = monotime(NULL);
}
bgp_dest_set_defer_flag(dest, false);
/* Add this route to remote IP hashtable */
bgp_evpn_remote_ip_hash_add(vpn, pi);
@ -3676,7 +3691,7 @@ static int install_evpn_route_entry(struct bgp *bgp, struct bgpevpn *vpn,
char prefix_str[PREFIX2STR_BUFFER] = { 0 };
struct prefix tmp;
if (bgp_debug_update(parent_pi->peer, NULL, NULL, 1) || bgp_debug_zebra(NULL))
zlog_debug(
"%s (%u): Installing EVPN %pFX route in VNI %u IP/MAC table",
vrf_id_to_name(bgp->vrf_id), bgp->vrf_id, p, vpn->vni);
@ -5960,6 +5975,100 @@ void bgp_evpn_handle_autort_change(struct bgp *bgp)
update_autort_l3vni(bgp);
}
struct vni_gr_walk {
struct bgp *bgp;
uint16_t cnt;
};
/*
* Iterate over all the deferred prefixes in this table
* and calculate the bestpath.
*/
uint16_t bgp_deferred_path_selection(struct bgp *bgp, afi_t afi, safi_t safi,
struct bgp_table *table, uint16_t cnt, struct bgpevpn *vpn,
bool evpn_select)
{
struct bgp_dest *dest = NULL;
for (dest = bgp_table_top(table);
dest && bgp->gr_info[afi][safi].gr_deferred != 0 && cnt < BGP_MAX_BEST_ROUTE_SELECT;
dest = bgp_route_next(dest)) {
if (!CHECK_FLAG(dest->flags, BGP_NODE_SELECT_DEFER))
continue;
UNSET_FLAG(dest->flags, BGP_NODE_SELECT_DEFER);
bgp->gr_info[afi][safi].gr_deferred--;
if (evpn_select) {
struct bgp_path_info *pi = bgp_dest_get_bgp_path_info(dest);
/*
* Mark them all as unsorted and just pass
* the first one in to do work on. Clear
* everything since at this point it is
* unknown what was or was not done for
* all the deferred paths
*/
while (pi) {
SET_FLAG(pi->flags, BGP_PATH_UNSORTED);
pi = pi->next;
}
evpn_route_select_install(bgp, vpn, dest, bgp_dest_get_bgp_path_info(dest));
} else
bgp_process_main_one(bgp, dest, afi, safi);
cnt++;
}
/* If iteration stopped before the entire table was traversed then the
* node needs to be unlocked.
*/
if (dest) {
bgp_dest_unlock_node(dest);
dest = NULL;
}
return cnt;
}
static void bgp_evpn_handle_deferred_bestpath_per_vni(struct hash_bucket *bucket, void *arg)
{
struct bgpevpn *vpn = bucket->data;
struct vni_gr_walk *ctx = arg;
struct bgp *bgp = ctx->bgp;
afi_t afi = AFI_L2VPN;
safi_t safi = SAFI_EVPN;
/*
* Now, walk this VNI's MAC & IP route table and do deferred bestpath
* selection
*/
if (BGP_DEBUG(graceful_restart, GRACEFUL_RESTART))
zlog_debug("%s (%u): GR walking IP and MAC table for VNI %u. Deferred paths %d, batch cnt %d",
vrf_id_to_name(bgp->vrf_id), bgp->vrf_id, vpn->vni,
bgp->gr_info[afi][safi].gr_deferred, ctx->cnt);
if (!bgp->gr_info[afi][safi].gr_deferred || ctx->cnt >= BGP_MAX_BEST_ROUTE_SELECT)
return;
ctx->cnt += bgp_deferred_path_selection(bgp, afi, safi, vpn->mac_table, ctx->cnt, vpn,
true);
ctx->cnt += bgp_deferred_path_selection(bgp, afi, safi, vpn->ip_table, ctx->cnt, vpn, true);
}
void bgp_evpn_handle_deferred_bestpath_for_vnis(struct bgp *bgp, uint16_t cnt)
{
struct vni_gr_walk ctx;
ctx.bgp = bgp;
ctx.cnt = cnt;
hash_iterate(bgp->vnihash,
(void (*)(struct hash_bucket *,
void *))bgp_evpn_handle_deferred_bestpath_per_vni,
&ctx);
}
/*
* Handle change to export RT - update and advertise local routes.
*/


@ -95,6 +95,12 @@ static inline int is_route_parent_evpn(struct bgp_path_info *ri)
return 0;
}
#define IS_PATH_IMPORTED_FROM_EVPN_TABLE(pi) \
(pi->sub_type == BGP_ROUTE_IMPORTED && is_route_parent_evpn(pi))
#define IS_L2VPN_AFI_IN_NON_DEFAULT_VRF(bgp, afi, safi) \
(afi == AFI_L2VPN && safi == SAFI_EVPN && bgp != bgp_get_evpn())
/* Flag if the route path's family is EVPN. */
static inline bool is_pi_family_evpn(struct bgp_path_info *pi)
{


@ -786,4 +786,9 @@ extern void bgp_evpn_import_type2_route(struct bgp_path_info *pi, int import);
extern void bgp_evpn_xxport_delete_ecomm(void *val);
extern int bgp_evpn_route_target_cmp(struct ecommunity *ecom1,
struct ecommunity *ecom2);
extern void bgp_evpn_handle_deferred_bestpath_for_vnis(struct bgp *bgp, uint16_t cnt);
extern uint16_t bgp_deferred_path_selection(struct bgp *bgp, afi_t afi, safi_t safi,
struct bgp_table *table, uint16_t cnt,
struct bgpevpn *vpn, bool evpn_select);
#endif /* _BGP_EVPN_PRIVATE_H */


@ -84,9 +84,13 @@ static void bgp_start_timer(struct event *event);
static void bgp_connect_timer(struct event *event);
static void bgp_holdtime_timer(struct event *event);
static void bgp_delayopen_timer(struct event *event);
static void bgp_graceful_deferral_timer_expire(struct event *event);
static void bgp_start_deferral_timer(struct bgp *bgp, afi_t afi, safi_t safi,
struct graceful_restart_info *gr_info);
/* BGP GR functions. */
static bool bgp_gr_check_all_eors(struct bgp *bgp, afi_t afi, safi_t safi,
bool *multihop_eors_pending);
/* Register peer with NHT */
int bgp_peer_connection_reg_with_nht(struct peer_connection *connection)
@ -852,6 +856,49 @@ static void bgp_graceful_stale_timer_expire(struct event *event)
bgp_clear_stale_route(peer, afi, safi);
}
/*
* Start the tier-2 selection deferral timer thread for the specified AFI, SAFI,
* mark peers from whom we need an EOR
*/
void bgp_start_tier2_deferral_timer(struct bgp *bgp, afi_t afi, safi_t safi)
{
struct afi_safi_info *thread_info;
struct graceful_restart_info *gr_info = &(bgp->gr_info[afi][safi]);
/*
* tier-2 deferral timer is already running
*/
if (gr_info->t_select_deferral_tier2) {
if (BGP_DEBUG(graceful_restart, GRACEFUL_RESTART))
zlog_debug("%s: tier-2 path-select deferral timer for %s, duration %d is running",
bgp->name_pretty, get_afi_safi_str(afi, safi, false),
bgp->select_defer_time);
return;
}
/* Start the timer */
thread_info = XMALLOC(MTYPE_TMP, sizeof(struct afi_safi_info));
thread_info->afi = afi;
thread_info->safi = safi;
thread_info->bgp = bgp;
thread_info->tier2_gr = true;
event_add_timer(bm->master, bgp_graceful_deferral_timer_expire, thread_info,
bgp->select_defer_time, &gr_info->t_select_deferral_tier2);
gr_info->select_defer_tier2_required = true;
if (BGP_DEBUG(graceful_restart, GRACEFUL_RESTART))
zlog_debug("%s: Started tier-2 path-select deferral timer for %s, duration %ds",
bgp->name_pretty, get_afi_safi_str(afi, safi, false),
bgp->select_defer_time);
frrtrace(5, frr_bgp, gr_deferral_timer_start, bgp->name_pretty, afi, safi,
bgp->select_defer_time, 2);
}
/* Selection deferral timer processing function */
static void bgp_graceful_deferral_timer_expire(struct event *event)
{
@ -859,17 +906,26 @@ static void bgp_graceful_deferral_timer_expire(struct event *event)
afi_t afi;
safi_t safi;
struct bgp *bgp;
bool multihop_eors_pending = false;
info = EVENT_ARG(event);
afi = info->afi;
safi = info->safi;
bgp = info->bgp;
/*
* If the tier 2 timer expired then set
* select_defer_over_tier2 to true to indicate that
* BGP tier2 bestpath selection can be done now.
*/
if (info->tier2_gr)
bgp->gr_info[afi][safi].select_defer_over_tier2 = true;
else
bgp->gr_info[afi][safi].select_defer_over = true;
/* Check if graceful restart deferral completion is needed */
if (BGP_SUPPRESS_FIB_ENABLED(bgp) &&
bgp_gr_check_all_eors(bgp, afi, safi, &multihop_eors_pending) &&
!bgp->gr_info[afi][safi].gr_deferred && bgp->gr_route_sync_pending) {
if (BGP_DEBUG(graceful_restart, GRACEFUL_RESTART))
zlog_debug("%s: Triggering GR deferral completion from timer expiry for %s",
@ -880,9 +936,16 @@ static void bgp_graceful_deferral_timer_expire(struct event *event)
/* Best path selection */
if (BGP_DEBUG(graceful_restart, GRACEFUL_RESTART))
zlog_debug("%s: Starting deferred path selection for %s, #routes %d -- timeout",
bgp->name_pretty, get_afi_safi_str(afi, safi, false),
bgp->gr_info[afi][safi].gr_deferred);
zlog_debug("%s: Starting %s deferred path selection for %s, #routes %d -- timeout",
bgp->name_pretty, (info->tier2_gr) ? "2nd" : "1st",
get_afi_safi_str(afi, safi, false), bgp->gr_info[afi][safi].gr_deferred);
frrtrace(5, frr_bgp, gr_deferral_timer_expiry, bgp->name_pretty, info->tier2_gr, afi, safi,
bgp->gr_info[afi][safi].gr_deferred);
XFREE(MTYPE_TMP, info);
bgp_do_deferred_path_selection(bgp, afi, safi);
}
@ -1258,15 +1321,19 @@ static void bgp_update_delay_process_status_change(struct peer *peer)
}
}
static bool bgp_gr_check_all_eors(struct bgp *bgp, afi_t afi, safi_t safi,
bool *multihop_eors_pending)
{
struct listnode *node, *nnode;
struct peer *peer = NULL;
bool eor_rcvd_from_all_mh_peers = true;
if (BGP_DEBUG(graceful_restart, GRACEFUL_RESTART))
zlog_debug("%s: Checking all peers for EOR receipt for %s", bgp->name_pretty,
get_afi_safi_str(afi, safi, false));
frrtrace(4, frr_bgp, gr_eors, bgp->name_pretty, afi, safi, 1);
for (ALL_LIST_ELEMENTS(bgp->peer, node, nnode, peer)) {
if (BGP_DEBUG(graceful_restart, GRACEFUL_RESTART))
zlog_debug("....examining peer %s status %s flags 0x%" PRIx64
@ -1282,16 +1349,154 @@ static bool bgp_gr_check_all_eors(struct bgp *bgp, afi_t afi, safi_t safi)
continue;
if (!CHECK_FLAG(peer->af_sflags[afi][safi], PEER_STATUS_EOR_RECEIVED)) {
if (!bgp->gr_multihop_peer_exists) {
/*
* This instance doesn't have a mix of directly
* connected and multihop peers. So we don't
* need to do 2 level deferred bestpath
* calculation.
*/
if (BGP_DEBUG(graceful_restart, GRACEFUL_RESTART))
zlog_debug(".... EOR still awaited from this peer for %s",
get_afi_safi_str(afi, safi, false));
frrtrace(5, frr_bgp, gr_eor_peer, bgp->name_pretty, afi, safi,
peer->host, 1);
return false;
}
if (!peer->afc[afi][safi]) {
if (BGP_DEBUG(graceful_restart, GRACEFUL_RESTART))
zlog_debug(".... Ignoring EOR from %s. %s is not configured",
peer->host, get_afi_safi_str(afi, safi, false));
continue;
}
/*
* This afi-safi (v4-unicast or v6-unicast) has a mix
* of directly connected and multihop peers.
* Since multihop peers could depend on prefixes
* learnt from directly connected peers to form
* the BGP session, BGP needs to do 2 level
* deferred bestpath selection.
*
* 1st level of deferred bestpath selection will
* be done when EORs are received from all the
* directly connected peers, or when the
* select-deferral-timer expires. If the timer
* expired, it means that some of the directly
* connected peers didn't come up at all. So on
* timer expiry, BGP will check to see if
* there's a mix of directly-connected and
* multihop peers for that afi-safi. If yes,
* then BGP will start the tier2-select-deferral
* timer and wait for all the multihop peers to
* come up, send EORs and then do the 2nd deferred
* bestpath selection.
*
* After EORs are rcvd from all the directly
* connected peers, BGP will check if there are
* any multihop peers from whom we are yet to
* rcv EORs. If yes, BGP will start the
* tier2-select-deferral timer and wait for all
* the multihop peers to come up and send EORs.
*
* If all multihop peers send their EORs before
* the tier2-select-deferral timer expires, BGP
* will cancel the tier2 timer and do the 2nd
* deferred bestpath selection.
*
* Here, even if a peer has
* disable-connected-check configured, it will
* be treated as a directly connected peer. This
* is because that peer's session coming up
* wouldn't depend on some other BGP session
* coming up and learning prefixes.
*/
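/*
 * Illustrative timeline (an added example, not part of the
 * logic itself), assuming one directly connected peer D and
 * one multihop peer M, both GR-enabled for this afi-safi:
 * t0: BGP restarts; the tier1 select-deferral timer starts.
 * t1: EOR arrives from D; M's EOR is still pending, so the
 * tier2 timer is started and the 1st deferred bestpath
 * selection runs.
 * t2: EOR arrives from M before the tier2 timer expires; the
 * tier2 timer is cancelled and the 2nd deferred bestpath
 * selection runs (on tier2 timer expiry, the 2nd selection
 * runs anyway).
 */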
if (PEER_IS_MULTIHOP(peer)) {
/*
* If we have not received EOR from a
* multihop peer, start the tier2
* select-deferral-timer only if EORs
* are rcvd from all the directly
* connected peers
*/
eor_rcvd_from_all_mh_peers = false;
if (BGP_DEBUG(graceful_restart, GRACEFUL_RESTART))
zlog_debug(".... EOR still awaited from this multihop peer for %s",
get_afi_safi_str(afi, safi, false));
frrtrace(5, frr_bgp, gr_eor_peer, bgp->name_pretty, afi, safi,
peer->host, 3);
} else {
/*
* If EOR from directly connected peer
* is not rcvd even after tier1 timer
* expiry, then we are going to ignore
* this peer since this peer may not
* come up at all.
*/
if (bgp->gr_info[afi][safi].select_defer_over) {
if (BGP_DEBUG(graceful_restart, GRACEFUL_RESTART))
zlog_debug(".... Ignoring directly connected peer %s. Tier1 GR timer has expired already for %s",
peer->host,
get_afi_safi_str(afi, safi, false));
frrtrace(5, frr_bgp, gr_eor_peer, bgp->name_pretty, afi,
safi, peer->host, 2);
continue;
}
/*
* If this is a directly connected peer
* and if we haven't received EOR from
* this peer yet, then we will wait to
* do the 1st round of deferred
* bestpath.
*/
if (BGP_DEBUG(graceful_restart, GRACEFUL_RESTART))
zlog_debug(".... EOR still awaited from this directly connected peer for %s",
get_afi_safi_str(afi, safi, false));
frrtrace(5, frr_bgp, gr_eor_peer, bgp->name_pretty, afi, safi,
peer->host, 5);
return false;
}
}
}
if (BGP_DEBUG(graceful_restart, GRACEFUL_RESTART))
zlog_debug(".... EOR received from all expected peers for this %s",
zlog_debug(".... EOR received from all directly connected peers for %s",
get_afi_safi_str(afi, safi, false));
frrtrace(4, frr_bgp, gr_eors, bgp->name_pretty, afi, safi, 2);
/*
* EOR is rcvd from all the directly connected peers at this point.
*
* If EOR is not rcvd from all of the multihop (MH) peers
* for this AFI-SAFI then start the tier2-select-deferral-timer
* and wait for all MH peers to come up.
*
* Note that the tier2 GR select deferral timer is started only
* after EOR is rcvd from all the directly connected peers.
*
* If EORs are not rcvd from all the directly connected peers
* then the select deferral timer will expire. So when the select
* deferral timer expires, we will check if there are any multihop
* peers from whom we have not rcvd EOR yet. If we find any and if
* the tier2 timer has not been started yet, then we will start the timer there.
*/
if (!eor_rcvd_from_all_mh_peers) {
if (BGP_DEBUG(graceful_restart, GRACEFUL_RESTART))
zlog_debug(".... EOR NOT received from all multihop peers for %s",
get_afi_safi_str(afi, safi, false));
frrtrace(4, frr_bgp, gr_eors, bgp->name_pretty, afi, safi, 3);
bgp_start_tier2_deferral_timer(bgp, afi, safi);
*multihop_eors_pending = true;
} else {
if (BGP_DEBUG(graceful_restart, GRACEFUL_RESTART))
zlog_debug(".... EOR received from all expected peers for %s",
get_afi_safi_str(afi, safi, false));
frrtrace(4, frr_bgp, gr_eors, bgp->name_pretty, afi, safi, 4);
}
return true;
}
@ -1299,8 +1504,20 @@ static bool bgp_gr_check_all_eors(struct bgp *bgp, afi_t afi, safi_t safi)
void bgp_gr_check_path_select(struct bgp *bgp, afi_t afi, safi_t safi)
{
struct graceful_restart_info *gr_info;
bool multihop_eors_pending = false;
/*
* This function returns true if EORs are rcvd from all the
* directly connected peers for this AFI-SAFI in this BGP
* instance.
* OR
* If none of the directly connected peers have negotiated
* this AFI-SAFI.
*
* This function returns false if EORs are not rcvd for this AFI-SAFI
* from all the directly connected peers.
*/
if (bgp_gr_check_all_eors(bgp, afi, safi, &multihop_eors_pending)) {
gr_info = &(bgp->gr_info[afi][safi]);
if (!BGP_SUPPRESS_FIB_ENABLED(bgp)) {
if (gr_info->t_select_deferral) {
@ -1310,11 +1527,65 @@ void bgp_gr_check_path_select(struct bgp *bgp, afi_t afi, safi_t safi)
}
event_cancel(&gr_info->t_select_deferral);
}
/*
* If there are no pending EORs from multihop peers
* then cancel the timer.
*/
if (!multihop_eors_pending) {
if (gr_info->t_select_deferral_tier2) {
void *info = EVENT_ARG(gr_info->t_select_deferral_tier2);
XFREE(MTYPE_TMP, info);
}
event_cancel(&gr_info->t_select_deferral_tier2);
gr_info->select_defer_over_tier2 = true;
if (BGP_DEBUG(graceful_restart, GRACEFUL_RESTART))
zlog_debug("%s: No multihop EORs pending for %s, #routes %d -- EORs recvd",
bgp->name_pretty, get_afi_safi_str(afi, safi, false),
gr_info->gr_deferred);
frrtrace(4, frr_bgp, gr_eors, bgp->name_pretty, afi, safi, 5);
} else {
/*
* For an afi-safi like L2VPN EVPN, we are here because
* EORs were rcvd from all directly connected peers and
* BGP is waiting to rcv EORs from all multihop peers.
* Wait for all directly connected and multihop peers to
* send EORs before doing deferred bestpath selection.
*/
if (afi == AFI_L2VPN) {
gr_info->select_defer_over = true;
return;
}
/*
* For IPv4/IPv6 unicast, we are here either because
* EORs are rcvd for all directly connected peers or
* none of the directly connected peers have this
* AFI-SAFI negotiated.
*
* Return if we have already done the 1st round of deferred
* bestpath selection (either because the tier1
* select-deferral timer expired or EORs were rcvd from all
* the directly connected peers)
* and we are still waiting on all multihop BGP peers
* to come up.
*/
if (gr_info->select_defer_over)
return;
}
gr_info->select_defer_over = true;
if (BGP_DEBUG(graceful_restart, GRACEFUL_RESTART))
zlog_debug("%s: Starting deferred path selection for %s, #routes %d -- EORs recvd",
bgp->name_pretty, get_afi_safi_str(afi, safi, false),
gr_info->gr_deferred);
frrtrace(4, frr_bgp, gr_start_deferred_path_selection, bgp->name_pretty, afi, safi,
gr_info->gr_deferred);
bgp_do_deferred_path_selection(bgp, afi, safi);
}
}
@ -1340,6 +1611,51 @@ static void bgp_gr_mark_for_deferred_selection(struct bgp *bgp)
}
}
/*
* Evaluate if GR is enabled for a mix of directly-connected and
* multihop peers in ipv4-unicast and ipv6-unicast AFI-SAFI.
*/
static void bgp_gr_evaluate_mix_peer_type(struct bgp *bgp)
{
struct listnode *node, *nnode;
struct peer *peer;
afi_t afi;
safi_t safi;
bgp->gr_multihop_peer_exists = false;
FOREACH_AFI_SAFI_NSF (afi, safi) {
/*
* GR is not supported for this afi-safi
*/
if (!bgp_gr_supported_for_afi_safi(afi, safi))
continue;
for (ALL_LIST_ELEMENTS(bgp->peer, node, nnode, peer)) {
/*
* If this is not a config node or
* if this peer is admin shutdown or
* if GR is not enabled for the peer, then skip it
*/
if (!CHECK_FLAG(peer->flags, PEER_FLAG_CONFIG_NODE) ||
CHECK_FLAG(peer->flags, PEER_FLAG_SHUTDOWN) ||
!CHECK_FLAG(peer->flags, PEER_FLAG_GRACEFUL_RESTART))
continue;
/*
* If this is a multihop peer with a GR-supported
* afi-safi configured for it, then set the
* value to true.
*/
if (PEER_IS_MULTIHOP(peer) && peer->afc[afi][safi]) {
bgp->gr_multihop_peer_exists = true;
return;
}
}
}
}
/*
* Start the selection deferral timer thread for the specified AFI, SAFI,
* mark peers from whom we need an EOR and inform zebra
@ -1355,6 +1671,7 @@ static void bgp_start_deferral_timer(struct bgp *bgp, afi_t afi, safi_t safi,
thread_info->afi = afi;
thread_info->safi = safi;
thread_info->bgp = bgp;
thread_info->tier2_gr = false;
event_add_timer(bm->master, bgp_graceful_deferral_timer_expire, thread_info,
bgp->select_defer_time, &gr_info->t_select_deferral);
@ -1369,6 +1686,9 @@ static void bgp_start_deferral_timer(struct bgp *bgp, afi_t afi, safi_t safi,
zlog_debug("%s: Started path-select deferral timer for %s, duration %ds",
bgp->name_pretty, get_afi_safi_str(afi, safi, false),
bgp->select_defer_time);
frrtrace(5, frr_bgp, gr_deferral_timer_start, bgp->name_pretty, afi, safi,
bgp->select_defer_time, 1);
}
/*
@ -1431,7 +1751,7 @@ static void bgp_gr_process_peer_up_include(struct bgp *bgp, struct peer *peer)
} else {
SET_FLAG(peer->af_sflags[afi][safi], PEER_STATUS_GR_WAIT_EOR);
gr_info = &(bgp->gr_info[afi][safi]);
if (!gr_info->t_select_deferral && !gr_info->select_defer_over)
bgp_start_deferral_timer(bgp, afi, safi, gr_info);
}
}
@ -1444,13 +1764,18 @@ static void bgp_gr_process_peer_status_change(struct peer *peer)
safi_t safi;
bgp = peer->bgp;
if (peer_established(peer->connection)) {
/*
* If we haven't yet evaluated for path selection deferral,
* do it now.
*
* If we haven't yet evaluated the presence of both directly
* connected and multihop peers, do it now.
*/
if (!bgp->gr_select_defer_evaluated) {
bgp_gr_mark_for_deferred_selection(bgp);
bgp_gr_evaluate_mix_peer_type(bgp);
bgp->gr_select_defer_evaluated = true;
}
@ -1469,6 +1794,9 @@ static void bgp_gr_process_peer_status_change(struct peer *peer)
" restarted or GR not negotiated, check for path-selection",
bgp->name_pretty, peer->host, peer->cap, peer->flags);
frrtrace(4, frr_bgp, gr_peer_up_ignore, bgp->name_pretty, peer->host,
peer->cap, peer->flags);
bgp_gr_process_peer_up_ignore(bgp, peer);
} else {
bgp_gr_process_peer_up_include(bgp, peer);
@ -1488,7 +1816,6 @@ static bool gr_path_select_deferral_applicable(struct bgp *bgp)
{
afi_t afi;
safi_t safi;
/* True if BGP has (re)started gracefully (based on start
* settings and GR is not complete and path selection
@ -1496,12 +1823,12 @@ static bool gr_path_select_deferral_applicable(struct bgp *bgp)
*/
if (!bgp->t_startup && !bgp_in_graceful_restart())
return false;
FOREACH_AFI_SAFI_NSF (afi, safi) {
if (!bgp_gr_supported_for_afi_safi(afi, safi))
continue;
gr_info = &(bgp->gr_info[afi][safi]);
if (!BGP_GR_SELECT_DEFER_DONE(bgp, afi, safi))
return true;
}


@ -126,6 +126,7 @@ extern int bgp_fsm_error_subcode(int status);
extern enum bgp_fsm_state_progress
bgp_stop_with_notify(struct peer_connection *connection, uint8_t code,
uint8_t sub_code);
extern void bgp_start_tier2_deferral_timer(struct bgp *bgp, afi_t afi, safi_t safi);
/**
* Start the route advertisement timer (that honors MRAI) for all the


@ -26,6 +26,7 @@
#include "bgpd/bgp_aspath.h"
#include "bgpd/bgp_vty.h"
#include "bgpd/bgp_memory.h"
#include "bgpd/bgp_trace.h"
const struct message capcode_str[] = {
{ CAPABILITY_CODE_MP, "MultiProtocol Extensions" },
@ -1639,6 +1640,8 @@ static void bgp_peer_send_gr_capability(struct stream *s, struct peer *peer,
PEER_CAP_GRACEFUL_RESTART_N_BIT_ADV)
? "SET"
: "NOT-SET");
frrtrace(4, frr_bgp, gr_send_rbit_capability, bgp->name_pretty, peer->host,
bgp->restart_time, CHECK_FLAG(peer->cap, PEER_CAP_GRACEFUL_RESTART_R_BIT_ADV));
/* Send address-family specific graceful-restart capability
* only when GR config is present
@ -1667,6 +1670,8 @@ static void bgp_peer_send_gr_capability(struct stream *s, struct peer *peer,
f_bit ? "SET" : "NOT-SET",
get_afi_safi_str(afi, safi, false));
frrtrace(5, frr_bgp, gr_send_fbit_capability, bgp->name_pretty, peer->host,
afi, safi, f_bit);
stream_putc(s, f_bit ? GRACEFUL_RESTART_F_BIT : 0);
}
}


@ -476,7 +476,7 @@ void bgp_generate_updgrp_packets(struct event *event)
/* If a GR restarter, we have to wait till path-selection
* is complete.
*/
if (!peer->bgp->gr_multihop_peer_exists && bgp_in_graceful_restart())
return;
do {
@ -488,6 +488,22 @@ void bgp_generate_updgrp_packets(struct event *event)
afi = paf->afi;
safi = paf->safi;
/*
* L2VPN EVPN routes must be advertised to peers only if
* GR is done for all the VRFs. Waiting for overall GR
* to be done will ensure that all the routes from
* non-default VRFs will be exported to default VRF
* before sending updates and EOR to peers.
*/
if (safi != SAFI_UNICAST && bgp_in_graceful_restart())
continue;
if (peer->bgp->gr_multihop_peer_exists && bgp_in_graceful_restart() &&
peer->bgp->gr_info[afi][safi].af_enabled &&
!peer->bgp->gr_info[afi][safi].route_sync)
continue;
next_pkt = paf->next_pkt_to_send;
/*
@ -2260,15 +2276,19 @@ static void bgp_update_receive_eor(struct bgp *bgp, struct peer *peer, afi_t afi
/* graceful-restart related processing */
UNSET_FLAG(peer->af_sflags[afi][safi], PEER_STATUS_GR_WAIT_EOR);
if ((bgp->t_startup || bgp_in_graceful_restart() ||
BGP_MULTIHOP_GR_PENDING(bgp, afi, safi)) &&
bgp_gr_supported_for_afi_safi(afi, safi)) {
struct graceful_restart_info *gr_info;
gr_info = &(bgp->gr_info[afi][safi]);
if (!gr_info->select_defer_over ||
BGP_MULTIHOP_GR_PENDING(bgp, afi, safi)) {
if (BGP_DEBUG(graceful_restart, GRACEFUL_RESTART))
zlog_debug("%s: check for deferred path-selection",
bgp->name_pretty);
frrtrace(4, frr_bgp, gr_eors, bgp->name_pretty, afi, safi, 6);
bgp_gr_check_path_select(bgp, afi, safi);
}
}


@ -65,6 +65,7 @@
#include "bgpd/bgp_trace.h"
#include "bgpd/bgp_rpki.h"
#include "bgpd/bgp_srv6.h"
#include "bgpd/bgp_bfd.h"
#ifdef ENABLE_BGP_VNC
#include "bgpd/rfapi/rfapi_backend.h"
@ -124,6 +125,7 @@ static const struct message bgp_pmsi_tnltype_str[] = {
#define SOFT_RECONFIG_TASK_MAX_PREFIX 25000
static int clear_batch_rib_helper(struct bgp_clearing_info *cinfo);
static void bgp_gr_start_tier2_timer_if_required(struct bgp *bgp, afi_t afi, safi_t safi);
static inline char *bgp_route_dump_path_info_flags(struct bgp_path_info *pi,
char *buf, size_t len)
@ -431,10 +433,11 @@ bool bgp_path_info_nexthop_changed(struct bgp_path_info *pi, struct peer *to,
}
/* This function sets flag BGP_NODE_SELECT_DEFER based on condition */
int bgp_dest_set_defer_flag(struct bgp_dest *dest, bool delete)
{
struct peer *peer;
struct bgp_path_info *old_pi, *nextpi;
bool pi_is_imported_from_evpn = false;
bool set_flag = false;
struct bgp *bgp = NULL;
struct bgp_table *table = NULL;
@ -462,53 +465,77 @@ static int bgp_dest_set_defer_flag(struct bgp_dest *dest, bool delete)
}
table = bgp_dest_table(dest);
if (!table)
return -1;
bgp = table->bgp;
afi = table->afi;
safi = table->safi;
for (old_pi = bgp_dest_get_bgp_path_info(dest);
(old_pi != NULL) && (nextpi = old_pi->next, 1); old_pi = nextpi) {
if (CHECK_FLAG(old_pi->flags, BGP_PATH_SELECTED))
continue;
/* Route selection is deferred if there is a stale path, which
* indicates the peer is in restart mode
*/
if (CHECK_FLAG(old_pi->flags, BGP_PATH_STALE) &&
(old_pi->sub_type == BGP_ROUTE_NORMAL ||
IS_PATH_IMPORTED_FROM_EVPN_TABLE(old_pi))) {
set_flag = true;
} else {
/*
* If we haven't received EORs from all the multihop
* peers then defer bestpath calculation
*/
peer = old_pi->peer;
if (BGP_PEER_GRACEFUL_RESTART_CAPABLE(peer) &&
BGP_PEER_RESTARTING_MODE(peer) &&
(old_pi && (old_pi->sub_type == BGP_ROUTE_NORMAL ||
IS_PATH_IMPORTED_FROM_EVPN_TABLE(old_pi)))) {
set_flag = true;
}
}
if (set_flag) {
if (IS_PATH_IMPORTED_FROM_EVPN_TABLE(old_pi))
pi_is_imported_from_evpn = true;
break;
}
if (!set_flag)
return -1;
struct bgp *bgp_evpn = bgp_get_evpn();
/* Set the flag BGP_NODE_SELECT_DEFER on prefix/dest if route selection
* deferral timer is active. RFC4724 says that a restarting BGP node must
* defer bestpath selection for a prefix/dest until EORs are received
* from all the GR helpers.
*
* If the dest is an imported route in the destination VRF, check if the
* GR timer for this afi-safi in the destination VRF is running. If the GR
* timer for this afi-safi in destination VRF is not running, then check
* if GR timer for L2VPN EVPN in global table is running. If yes, then
* mark the route as deferred.
*/
if (BGP_GR_SELECT_DEFERRAL_TIMER_IS_RUNNING(bgp, afi, safi) ||
(pi_is_imported_from_evpn && bgp_evpn != NULL &&
BGP_GR_SELECT_DEFERRAL_TIMER_IS_RUNNING(bgp_evpn, AFI_L2VPN, SAFI_EVPN))) {
if (!CHECK_FLAG(dest->flags, BGP_NODE_SELECT_DEFER))
bgp->gr_info[afi][safi].gr_deferred++;
SET_FLAG(dest->flags, BGP_NODE_SELECT_DEFER);
if (BGP_DEBUG(graceful_restart, GRACEFUL_RESTART))
zlog_debug("%s: Defer route %pBD, dest %p", bgp->name_pretty, dest, dest);
return 0;
}
return -1;
}
@ -3084,6 +3111,9 @@ static void bgp_route_select_timer_expire(struct event *event)
bgp->name_pretty, get_afi_safi_str(afi, safi, false),
bgp->gr_info[afi][safi].gr_deferred);
frrtrace(4, frr_bgp, gr_continue_deferred_path_selection, bgp->name_pretty, afi, safi,
bgp->gr_info[afi][safi].gr_deferred);
bgp_do_deferred_path_selection(bgp, afi, safi);
}
@ -3898,8 +3928,7 @@ bgp_mplsvpn_handle_label_allocation(struct bgp *bgp, struct bgp_dest *dest,
* We have no eligible route that we can announce or the rn
* is being removed.
*/
void bgp_process_main_one(struct bgp *bgp, struct bgp_dest *dest, afi_t afi, safi_t safi)
{
struct bgp_path_info *new_select;
struct bgp_path_info *old_select;
@ -4165,13 +4194,27 @@ void bgp_process_gr_deferral_complete(struct bgp *bgp, afi_t afi, safi_t safi)
bool route_sync_pending = false;
bgp_send_delayed_eor(bgp);
/*
* Check if tier2 timer needs to be started if this
* afi-safi is enabled for multihop peer
*/
bgp_gr_start_tier2_timer_if_required(bgp, afi, safi);
/* Send route processing complete message to RIB */
if (!bgp->gr_info[afi][safi].route_sync_tier2 && BGP_GR_SELECT_DEFER_DONE(bgp, afi, safi)) {
bgp_zebra_update(bgp, afi, safi, ZEBRA_CLIENT_ROUTE_UPDATE_COMPLETE);
bgp->gr_info[afi][safi].route_sync_tier2 = true;
}
bgp->gr_info[afi][safi].route_sync = true;
/*
* If this instance is all done,
* check for GR completion overall
*/
FOREACH_AFI_SAFI (afi, safi) {
if (bgp->gr_info[afi][safi].af_enabled &&
!bgp->gr_info[afi][safi].route_sync_tier2) {
route_sync_pending = true;
break;
}
@ -4179,6 +4222,8 @@ void bgp_process_gr_deferral_complete(struct bgp *bgp, afi_t afi, safi_t safi)
if (!route_sync_pending) {
bgp->gr_route_sync_pending = false;
/* Set bgp master GR COMPLETE flag */
frrtrace(3, frr_bgp, gr_update_complete, bgp->name_pretty, afi, safi);
bgp_update_gr_completion();
}
}
@ -4256,12 +4301,181 @@ void bgp_dest_decrement_gr_fib_install_pending_count(struct bgp_dest *dest)
}
}
/*
* If a multihop peer is configured for this AFI SAFI and if
* tier 2 timer has not been started yet, then this function
* will start it.
*/
static void bgp_gr_start_tier2_timer_if_required(struct bgp *bgp, afi_t afi, safi_t safi)
{
struct listnode *node, *nnode;
struct peer *peer = NULL;
if (BGP_DEBUG(graceful_restart, GRACEFUL_RESTART))
zlog_debug("%s: Checking if tier 2 timer needs to be started for %s",
bgp->name_pretty, get_afi_safi_str(afi, safi, false));
/*
* If there's no multihop peer in this VRF or
* if the tier2 timer has been started for this afi-safi or
* if the select_defer_tier2 timer was not started since it was
* not required (this happens when multihop peers come up before
* directly connected peers) and select_defer_over_tier2 was
* set to true in bgp_gr_check_path_select(),
* there's nothing to do.
*/
if (!bgp->gr_multihop_peer_exists || bgp->gr_info[afi][safi].select_defer_tier2_required ||
bgp->gr_info[afi][safi].select_defer_over_tier2)
return;
for (ALL_LIST_ELEMENTS(bgp->peer, node, nnode, peer)) {
if (!PEER_IS_MULTIHOP(peer))
continue;
if ((!peer->afc[afi][safi]) || !CHECK_FLAG(peer->flags, PEER_FLAG_CONFIG_NODE) ||
CHECK_FLAG(peer->flags, PEER_FLAG_SHUTDOWN) ||
!CHECK_FLAG(peer->flags, PEER_FLAG_GRACEFUL_RESTART))
continue;
/*
* Multihop peer has this GR AFI SAFI enabled. If
* all the directly connected peers with this afi-safi
* enabled did not come up, then the tier1 timer will
* expire.
*
* But the tier2 GR timer is started only after all the
* directly connected peers come up.
* So in the case where all directly connected peers with
* this afi-safi enabled did not come up, we did not
* start the tier2 GR timer even though a multihop peer exists
* and has this afi-safi enabled.
*
* So start the tier2 timer here.
*/
if (bgp->gr_info[afi][safi].select_defer_over)
bgp_start_tier2_deferral_timer(bgp, afi, safi);
}
}
/*
* Starts GR route select timer to process remaining routes
*/
static inline void bgp_gr_start_route_select_timer(struct bgp *bgp, afi_t afi, safi_t safi)
{
struct afi_safi_info *thread_info;
thread_info = XMALLOC(MTYPE_TMP, sizeof(struct afi_safi_info));
thread_info->afi = afi;
thread_info->safi = safi;
thread_info->bgp = bgp;
/*
* If there are more routes to be processed, start the
* selection timer
*/
event_add_timer(bm->master, bgp_route_select_timer_expire, thread_info,
BGP_ROUTE_SELECT_DELAY, &bgp->gr_info[afi][safi].t_route_select);
}
/*
* Trigger deferred bestpath calculation for IPv4 and IPv6
* unicast tables in non-default VRFs by starting the route-select
* timer.
*/
static inline void bgp_evpn_handle_deferred_bestpath_for_vrfs(void)
{
struct listnode *node;
struct bgp *bgp_vrf;
afi_t tmp_afi = AFI_UNSPEC;
safi_t tmp_safi = SAFI_UNICAST;
for (ALL_LIST_ELEMENTS_RO(bm->bgp, node, bgp_vrf)) {
/* NO-OP for default/global VRF */
if (bgp_vrf == bgp_get_evpn())
continue;
for (tmp_afi = AFI_IP; tmp_afi <= AFI_IP6; tmp_afi++) {
if (BGP_DEBUG(graceful_restart, GRACEFUL_RESTART))
zlog_debug("%s: Evaluating deferred path selection for %s",
bgp_vrf->name_pretty,
get_afi_safi_str(tmp_afi, tmp_safi, false));
/*
* If the route-select timer or
* select-deferral timers are still running for
* this VRF, or the VRF has no deferred routes, then
* there is nothing to do. If not, start the route-select
* timer for the VRF, AFI, SAFI so that the
* deferred bestpath selection can be done for
* this VRF, AFI, SAFI.
*/
if (bgp_vrf->gr_info[tmp_afi][tmp_safi].t_route_select ||
BGP_GR_SELECT_DEFERRAL_TIMER_IS_RUNNING(bgp_vrf, tmp_afi, tmp_safi) ||
!bgp_vrf->gr_info[tmp_afi][tmp_safi].gr_deferred)
continue;
if (BGP_DEBUG(graceful_restart, GRACEFUL_RESTART))
zlog_debug("%s: Starting GR route select timer for %s",
bgp_vrf->name_pretty,
get_afi_safi_str(tmp_afi, tmp_safi, false));
/*
* The piece of code below handles non-default EVPN VRF
* instances where the variables are to be set
* appropriately.
*
* NOTE:
* - Prior to this change, peer_unshut_after_cfg() sends
* UPD_PENDING + UPD_COMPLETE for non-default VRFs
* prematurely.
* - So, when the default VRF deferral calculation is
* complete, it invokes this function to queue the
* deferral for non-default VRFs.
* - However, it then ends up sending the UPDATE_COMPLETE
* and marks GR done for all instances, since the below
* variables (especially gr_route_sync_pending) are
* not set for non-default VRFs.
*
* Sending UPDATE_PENDING here makes sense to tell zebra
* that the non-default VRF is a work in progress and is
* yet to go through the deferred path selection.
*/
bgp_vrf->gr_route_sync_pending = true;
bgp_vrf->gr_info[tmp_afi][tmp_safi].af_enabled = true;
bgp_zebra_update(bgp_vrf, tmp_afi, tmp_safi,
ZEBRA_CLIENT_ROUTE_UPDATE_PENDING);
bgp_vrf->gr_info[tmp_afi][tmp_safi].select_defer_over = true;
/*
* The reason we start the timer here instead of
* doing the deferred BP calculation in place is
* that if this route table has more than
* BGP_MAX_BEST_ROUTE_SELECT routes, we need to
* delay the deferred BP by a second
*/
bgp_gr_start_route_select_timer(bgp_vrf, tmp_afi, tmp_safi);
}
}
}
/*
* Process the routes with the flag BGP_NODE_SELECT_DEFER set
*
* NOTE: A few important places where bgp_do_deferred_path_selection() is
* invoked are as below:
* 1) For the default VRF when EORs are received.
* 2) At the start of the deferral time, when config read is done and peers
* are not admin down, in peer_unshut_after_cfg()
* 3) Via bgp_gr_start_route_select_timer() in 2 cases:
* a) When there are still routes to be processed at the end of this
* function
* b) For non-default VRFs if EVPN is enabled in the default VRF, via
* bgp_evpn_handle_deferred_bestpath_for_vrfs()
*/
void bgp_do_deferred_path_selection(struct bgp *bgp, afi_t afi, safi_t safi)
{
uint16_t cnt = 0;
if (bgp->gr_info[afi][safi].t_route_select) {
struct event *t = bgp->gr_info[afi][safi].t_route_select;
@ -4271,55 +4485,120 @@ void bgp_do_deferred_path_selection(struct bgp *bgp, afi_t afi, safi_t safi)
event_cancel(&bgp->gr_info[afi][safi].t_route_select);
}
if (BGP_DEBUG(graceful_restart, GRACEFUL_RESTART))
zlog_debug("%s: Started doing BGP deferred path selection for %s",
bgp->name_pretty, get_afi_safi_str(afi, safi, false));
frrtrace(4, frr_bgp, gr_eors, bgp->name_pretty, afi, safi, 7);
if (afi == AFI_L2VPN && safi == SAFI_EVPN) {
struct bgp_dest *rd_dest = NULL;
struct bgp_table *table = NULL;
/*
* Calculate bestpaths for all the RDs in global EVPN table
*/
for (rd_dest = bgp_table_top(bgp->rib[afi][safi]);
rd_dest && bgp->gr_info[afi][safi].gr_deferred != 0 &&
cnt < BGP_MAX_BEST_ROUTE_SELECT;
rd_dest = bgp_route_next(rd_dest)) {
table = bgp_dest_get_bgp_table_info(rd_dest);
if (!table)
continue;
cnt = bgp_deferred_path_selection(bgp, afi, safi, table, cnt, NULL, false);
}
/*
* If iteration stopped before all the RD tables were
* traversed then the node needs to be unlocked.
*/
if (rd_dest) {
bgp_dest_unlock_node(rd_dest);
rd_dest = NULL;
}
/*
* Calculate the bestpaths for ip-table and mac-table for all
* the L2VNIs
*/
bgp_evpn_handle_deferred_bestpath_for_vnis(bgp, cnt);
/*
* Trigger deferred bestpath calculation for IPv4 and IPv6
* unicast tables in non-default VRFs by starting the
* route-select timer.
*
* This handles the case where a non-default VRF
* is not GR enabled, in which case none of the GR timers will
* be started/running. So for such VRFs, this trigger will do
* the deferred bestpath selection.
*
* This also handles the case where default BGP has EVPN enabled
* and non-default VRFs (tenant VRFs) don't have any peer.
*/
bgp_evpn_handle_deferred_bestpath_for_vrfs();
} else if (safi == SAFI_UNICAST && (afi == AFI_IP || afi == AFI_IP6)) {
struct bgp *bgp_evpn = bgp_get_evpn();
if (bgp->vrf_id == VRF_DEFAULT) {
/*
* Process the route list for IPv4/IPv6 unicast table
* in default VRF
*/
bgp_deferred_path_selection(bgp, afi, safi, bgp->rib[afi][safi], cnt, NULL,
false);
} else if (!bgp_evpn || !bgp_evpn->gr_info[AFI_L2VPN][SAFI_EVPN].af_enabled ||
bgp_evpn->gr_info[AFI_L2VPN][SAFI_EVPN].route_sync_tier2) {
/*
* Process the route list for IPv4/IPv6 unicast table
* in non-default VRF.
*
* For non-default VRF, deferred bestpath selection will
* take place if:
*
* 1. GR is NOT enabled for l2vpn evpn afi safi in EVPN
* default VRF
*
* OR
*
* 2. GR is enabled for l2vpn evpn afi safi in EVPN
* default VRF and GR is complete for default VRF
*/
bgp_deferred_path_selection(bgp, afi, safi, bgp->rib[afi][safi], cnt, NULL,
false);
} else {
if (bgp_evpn && BGP_DEBUG(graceful_restart, GRACEFUL_RESTART)) {
zlog_debug("%s: Skipped BGP deferred path selection for %s. GR %s started for %s L2VPN EVPN. UPDATE_COMPLETE %s",
bgp->name_pretty, get_afi_safi_str(afi, safi, false),
bgp_evpn->gr_info[AFI_L2VPN][SAFI_EVPN].af_enabled
? ""
: "NOT",
bgp_evpn->name_pretty,
bgp_evpn->gr_info[AFI_L2VPN][SAFI_EVPN].route_sync_tier2
? "done"
: "NOT done");
}
}
}
/*
* Send EOR message when all routes are processed
* and if select deferral timer for tier 2 peers is
* not running or has expired.
*/
if (!bgp->gr_info[afi][safi].gr_deferred) {
/* t_select_deferral will be NULL when either gr_route_fib_install_pending_cnt is 0
* or deferral timer for fib install expires
*/
if (!BGP_SUPPRESS_FIB_ENABLED(bgp) || !bgp->gr_info[afi][safi].t_select_deferral)
bgp_process_gr_deferral_complete(bgp, afi, safi);
} else {
/*
* Check if there are more routes to be processed
*/
bgp_gr_start_route_select_timer(bgp, afi, safi);
}
}
static const char *subqueue2str(enum meta_queue_indexes index)
@ -6614,14 +6893,12 @@ static wq_item_status bgp_clear_route_node(struct work_queue *wq, void *data)
continue;
/* graceful restart STALE flag set. */
if (((CHECK_FLAG(peer->sflags, PEER_STATUS_NSF_WAIT) && peer->nsf[afi][safi]) ||
CHECK_FLAG(peer->af_sflags[afi][safi], PEER_STATUS_ENHANCED_REFRESH)) &&
!CHECK_FLAG(pi->flags, BGP_PATH_STALE) &&
!CHECK_FLAG(pi->flags, BGP_PATH_UNUSEABLE)) {
bgp_path_info_set_flag(dest, pi, BGP_PATH_STALE);
} else {
/* If this is an EVPN route, process for
* un-import. */
if (safi == SAFI_EVPN)
@ -7351,6 +7628,15 @@ void bgp_clear_stale_route(struct peer *peer, afi_t afi, safi_t safi)
BGP_PATH_STALE))
continue;
/*
* If the stale route being deleted
* is an l2vpn evpn route, then unimport
* it from all the VRFs and VNIs.
*/
if (safi == SAFI_EVPN && pi->sub_type == BGP_ROUTE_NORMAL)
bgp_evpn_unimport_route(peer->bgp, afi, safi,
bgp_dest_get_prefix(rm),
pi);
/*
* If this is VRF leaked route
* process for withdraw.


@ -1016,4 +1016,6 @@ extern int early_route_process(struct bgp *bgp, struct bgp_dest *dest);
extern int other_route_process(struct bgp *bgp, struct bgp_dest *dest);
extern int eoiu_marker_process(struct bgp *bgp, struct bgp_dest *dest);
extern uint32_t bgp_med_value(struct attr *attr, struct bgp *bgp);
extern int bgp_dest_set_defer_flag(struct bgp_dest *dest, bool delete);
extern void bgp_process_main_one(struct bgp *bgp, struct bgp_dest *dest, afi_t afi, safi_t safi);
#endif /* _QUAGGA_BGP_ROUTE_H */


@ -636,6 +636,181 @@ TRACEPOINT_EVENT(
)
TRACEPOINT_LOGLEVEL(frr_bgp, evpn_local_l3vni_del_zrecv, TRACE_INFO)
/*
* Loc 1 - gr_tier1_deferral_timer_start,
* Loc 2 - gr_tier2_deferral_timer_start,
*/
TRACEPOINT_EVENT(
frr_bgp,
gr_deferral_timer_start,
TP_ARGS(char *, bgp_name, uint8_t, afi, uint8_t, safi,
uint32_t, defer_time, uint8_t, loc),
TP_FIELDS(ctf_string(bgp_instance, bgp_name)
ctf_integer(uint8_t, afi, afi)
ctf_integer(uint8_t, safi, safi)
ctf_integer(uint32_t, defer_time, defer_time)
ctf_integer(uint8_t, location, loc)
)
)
TRACEPOINT_LOGLEVEL(frr_bgp, gr_deferral_timer_start, TRACE_INFO)
TRACEPOINT_EVENT(
frr_bgp,
gr_deferral_timer_expiry,
TP_ARGS(char *, bgp_name, bool, tier2, uint8_t, afi, uint8_t, safi,
uint32_t, deferred_rt_cnt),
TP_FIELDS(ctf_string(bgp_instance, bgp_name)
ctf_string(gr_tier, tier2 ? "2" : "1")
ctf_integer(uint8_t, afi, afi)
ctf_integer(uint8_t, safi, safi)
ctf_integer(uint32_t, deferred_routes, deferred_rt_cnt)
)
)
TRACEPOINT_LOGLEVEL(frr_bgp, gr_deferral_timer_expiry, TRACE_INFO)
/*
* Loc1: gr_check_all_eors
* Loc2: gr_all_directly_connected_eors_rcvd
* Loc3: gr_all_multihop_eors_not_rcvd
* Loc4: gr_all_eors_rcvd
* Loc5: gr_no_multihop_eors_pending
* Loc6: gr_eor_rcvd_check_path_select
* Loc7: gr_do_deferred_path_selection
*/
TRACEPOINT_EVENT(
frr_bgp,
gr_eors,
TP_ARGS(char *, bgp_name, uint8_t, afi, uint8_t, safi, uint8_t, loc),
TP_FIELDS(ctf_string(bgp_instance, bgp_name)
ctf_integer(uint8_t, afi, afi)
ctf_integer(uint8_t, safi, safi)
ctf_integer(uint8_t, location, loc)
)
)
TRACEPOINT_LOGLEVEL(frr_bgp, gr_eors, TRACE_INFO)
TRACEPOINT_EVENT(
frr_bgp,
gr_update_complete,
TP_ARGS(char *, bgp_name, uint8_t, afi, uint8_t, safi),
TP_FIELDS(ctf_string(bgp_instance, bgp_name)
ctf_integer(uint8_t, afi, afi)
ctf_integer(uint8_t, safi, safi)
)
)
TRACEPOINT_LOGLEVEL(frr_bgp, gr_update_complete, TRACE_INFO)
/*
* Loc1: gr_eor_awaited_from
* Loc2: gr_eor_ignore
* Loc3: gr_multihop_eor_awaited
* Loc4: gr_eor_ignore_after_tier1_timer_expiry
* Loc5: gr_directly_connected_eor_awaited
*/
TRACEPOINT_EVENT(
frr_bgp,
gr_eor_peer,
TP_ARGS(char *, bgp_name, uint8_t, afi, uint8_t, safi,
char *, peer_name, uint8_t, loc),
TP_FIELDS(ctf_string(bgp_instance, bgp_name)
ctf_integer(uint8_t, afi, afi)
ctf_integer(uint8_t, safi, safi)
ctf_string(peer, peer_name)
ctf_integer(uint8_t, location, loc)
)
)
TRACEPOINT_LOGLEVEL(frr_bgp, gr_eor_peer, TRACE_INFO)
TRACEPOINT_EVENT(
frr_bgp,
gr_start_deferred_path_selection,
TP_ARGS(char *, bgp_name, uint8_t, afi, uint8_t, safi,
uint32_t, deferred_rt_cnt),
TP_FIELDS(ctf_string(bgp_instance, bgp_name)
ctf_integer(uint8_t, afi, afi)
ctf_integer(uint8_t, safi, safi)
ctf_integer(uint32_t, deferred_routes, deferred_rt_cnt)
)
)
TRACEPOINT_LOGLEVEL(frr_bgp, gr_start_deferred_path_selection, TRACE_INFO)
TRACEPOINT_EVENT(
frr_bgp,
gr_peer_up_ignore,
TP_ARGS(char *, bgp_name, char *, peer_host,
uint32_t, peer_cap, uint64_t, peer_flags),
TP_FIELDS(ctf_string(bgp_instance, bgp_name)
ctf_string(peer, peer_host)
ctf_integer(uint32_t, capability, peer_cap)
ctf_integer(uint64_t, peer_flags, peer_flags)
)
)
TRACEPOINT_LOGLEVEL(frr_bgp, gr_peer_up_ignore, TRACE_INFO)
TRACEPOINT_EVENT(
frr_bgp,
gr_send_rbit_capability,
TP_ARGS(char *, bgp_name, char *, peer_host,
uint32_t, restart_time, bool, restart),
TP_FIELDS(ctf_string(bgp_instance, bgp_name)
ctf_string(peer, peer_host)
ctf_integer(uint32_t, restart_time, restart_time)
ctf_integer(bool, R_bit, restart)
)
)
TRACEPOINT_LOGLEVEL(frr_bgp, gr_send_rbit_capability, TRACE_INFO)
TRACEPOINT_EVENT(
frr_bgp,
gr_send_fbit_capability,
TP_ARGS(char *, bgp_name, char *, peer_host,
uint8_t, afi, uint8_t, safi, bool, f_bit),
TP_FIELDS(ctf_string(bgp_instance, bgp_name)
ctf_string(peer, peer_host)
ctf_integer(uint8_t, afi, afi)
ctf_integer(uint8_t, safi, safi)
ctf_integer(bool, F_bit, f_bit)
)
)
TRACEPOINT_LOGLEVEL(frr_bgp, gr_send_fbit_capability, TRACE_INFO)
TRACEPOINT_EVENT(
frr_bgp,
gr_continue_deferred_path_selection,
TP_ARGS(char *, bgp_name, uint8_t, afi, uint8_t, safi,
uint32_t, deferred_rt_remain),
TP_FIELDS(ctf_string(bgp_instance, bgp_name)
ctf_integer(uint8_t, afi, afi)
ctf_integer(uint8_t, safi, safi)
ctf_integer(uint32_t, remaining_routes, deferred_rt_remain)
)
)
TRACEPOINT_LOGLEVEL(frr_bgp, gr_continue_deferred_path_selection, TRACE_INFO)
TRACEPOINT_EVENT(
frr_bgp,
gr_send_capabilities,
TP_ARGS(char *, bgp_name, uint32_t, vrf_id, bool, disable),
TP_FIELDS(ctf_string(bgp_instance, bgp_name)
ctf_integer(uint32_t, vrf_id, vrf_id)
ctf_integer(bool, disable, disable)
)
)
TRACEPOINT_LOGLEVEL(frr_bgp, gr_send_capabilities, TRACE_INFO)
TRACEPOINT_EVENT(
frr_bgp,
gr_zebra_update,
TP_ARGS(char *, bgp_name, uint8_t, afi, uint8_t, safi, const char *, type),
TP_FIELDS(ctf_string(bgp_instance, bgp_name)
ctf_integer(uint8_t, afi, afi)
ctf_integer(uint8_t, safi, safi)
ctf_string(type, type)
)
)
TRACEPOINT_LOGLEVEL(frr_bgp, gr_zebra_update, TRACE_INFO)
TRACEPOINT_EVENT(
frr_bgp,
eor_send,


@ -14027,6 +14027,20 @@ static void bgp_show_peer_gr_info_afi_safi(struct vty *vty, struct peer *peer, b
peer->bgp->gr_info[afi][safi]
.t_select_deferral));
}
if (peer->bgp->gr_multihop_peer_exists) {
if (CHECK_FLAG(peer->flags, PEER_FLAG_GRACEFUL_RESTART))
json_object_int_add(json_timer,
"selectionDeferralTier2Timer",
peer->bgp->select_defer_time);
if (peer->bgp->gr_info[afi][safi].t_select_deferral_tier2 != NULL)
json_object_int_add(json_timer,
"selectionDeferralTier2TimerRemaining",
event_timer_remain_second(
peer->bgp->gr_info[afi][safi]
.t_select_deferral_tier2));
}
} else {
vty_out(vty, " Timers:\n");
vty_out(vty,
@ -14057,6 +14071,15 @@ static void bgp_show_peer_gr_info_afi_safi(struct vty *vty, struct peer *peer, b
event_timer_remain_second(
peer->bgp->gr_info[afi][safi]
.t_select_deferral));
if (peer->bgp->gr_multihop_peer_exists) {
vty_out(vty, " Multihop GR peer exists\n");
if (peer->bgp->gr_info[afi][safi].t_select_deferral_tier2 != NULL)
vty_out(vty,
" Selection Deferral Tier2 Time Remaining(sec): %ld\n",
event_timer_remain_second(
peer->bgp->gr_info[afi][safi]
.t_select_deferral_tier2));
}
}
if (json) {
json_object_object_add(json_afi_safi, "endOfRibStatus",


@ -4595,6 +4595,9 @@ int bgp_zebra_send_capabilities(struct bgp *bgp, bool disable)
if (BGP_DEBUG(zebra, ZEBRA) || BGP_DEBUG(graceful_restart, GRACEFUL_RESTART))
zlog_debug("%s: %s send capability success", __func__, bgp->name_pretty);
frrtrace(3, frr_bgp, gr_send_capabilities, bgp->name_pretty, bgp->vrf_id, disable);
ret = BGP_GR_SUCCESS;
}
return ret;
@ -4608,7 +4611,19 @@ int bgp_zebra_update(struct bgp *bgp, afi_t afi, safi_t safi,
{
struct zapi_cap api = {0};
/*
* For a non-default VRF, do not communicate UPDATE_PENDING or
* UPDATE_COMPLETE for the l2vpn evpn AFI SAFI.
*/
if (IS_L2VPN_AFI_IN_NON_DEFAULT_VRF(bgp, afi, safi)) {
if (BGP_DEBUG(graceful_restart, GRACEFUL_RESTART))
zlog_debug("%s: %s afi: %u safi: %u Command %s ignore", __func__,
bgp->name_pretty, afi, safi, zserv_gr_client_cap_string(type));
return BGP_GR_SUCCESS;
}
if (BGP_DEBUG(graceful_restart, GRACEFUL_RESTART))
zlog_debug("%s: %s afi: %u safi: %u Command %s", __func__,
bgp->name_pretty, afi, safi,
zserv_gr_client_cap_string(type));
@ -4640,6 +4655,10 @@ int bgp_zebra_update(struct bgp *bgp, afi_t afi, safi_t safi,
bgp->name_pretty);
return BGP_GR_FAILURE;
}
frrtrace(4, frr_bgp, gr_zebra_update, bgp->name_pretty, afi, safi,
zserv_gr_client_cap_string(type));
return BGP_GR_SUCCESS;
}


@ -3652,10 +3652,7 @@ peer_init:
bgp_maximum_paths_set(bgp, afi, safi, BGP_PEER_IBGP,
multipath_num, 0);
/* Initialize graceful restart info */
memset(&bgp->gr_info[afi][safi], 0, sizeof(struct graceful_restart_info));
}
bgp->v_update_delay = bm->v_update_delay;
@ -9109,12 +9106,13 @@ static int peer_unshut_after_cfg(struct bgp *bgp)
bgp_in_graceful_restart(), global_gr_mode, gr_cfgd_at_nbr);
/*
* If BGP is not in GR and startup timer is not running
* OR
* If this VRF doesn't have GR configured at global and neighbor level
* then return
*/
if ((!bgp_in_graceful_restart() && !bgp->t_startup) ||
(global_gr_mode != GLOBAL_GR && !gr_cfgd_at_nbr))
return 0;
/*


@ -332,10 +332,58 @@ enum bgp_instance_type {
BGP_INSTANCE_TYPE_VIEW
};
/*
* If BGP has started gracefully and this VRF has a
* multihop peer and tier1 processing is already done,
* then check whether the tier2 timer
* was started but tier2 GR processing is still pending.
*/
#define BGP_MULTIHOP_GR_PENDING(bgp, afi, safi) \
((CHECK_FLAG(bm->flags, BM_FLAG_GRACEFUL_RESTART) && bgp->gr_multihop_peer_exists && \
bgp->gr_info[afi][safi].select_defer_over && \
bgp->gr_info[afi][safi].select_defer_tier2_required && \
!bgp->gr_info[afi][safi].select_defer_over_tier2))
/*
* If this VRF has a bgp multihop peer, then
* 1. If tier2 processing is not required, then check if tier1
* processing is complete
* OR
* 2. If tier2 processing is required, then check if tier2
* processing is complete
*/
#define BGP_GR_MULTIHOP_SELECT_DEFER_DONE(bgp, afi, safi) \
((bgp->gr_multihop_peer_exists && \
((!bgp->gr_info[afi][safi].select_defer_tier2_required && \
bgp->gr_info[afi][safi].select_defer_over) || \
(bgp->gr_info[afi][safi].select_defer_tier2_required && \
bgp->gr_info[afi][safi].select_defer_over_tier2))))
/*
* Check if tier1 and tier2 processing (if required)
* is complete
*/
#define BGP_GR_SELECT_DEFER_DONE(bgp, afi, safi) \
((!bgp->gr_multihop_peer_exists && bgp->gr_info[afi][safi].select_defer_over) || \
BGP_GR_MULTIHOP_SELECT_DEFER_DONE(bgp, afi, safi))
/*
* Send EOR if EOR is enabled and:
* 1. GR is NOT enabled
* OR
* 2. GR is enabled and complete
*/
#define BGP_SEND_EOR(bgp, afi, safi) \
(!CHECK_FLAG(bgp->flags, BGP_FLAG_GR_DISABLE_EOR) && !bgp_in_graceful_restart())
/*
* Checks if the tier1 or tier2 GR select deferral timer is
* running for the given afi-safi in the given BGP instance
*/
#define BGP_GR_SELECT_DEFERRAL_TIMER_IS_RUNNING(bgp, afi, safi) \
(bgp->gr_info[afi][safi].t_select_deferral || \
bgp->gr_info[afi][safi].t_select_deferral_tier2)
/* BGP GR Global ds */
@ -346,6 +394,14 @@ enum bgp_instance_type {
struct graceful_restart_info {
/* Deferral Timer */
struct event *t_select_deferral;
/* If multihop BGP peers are present, and if their
* loopback is learnt via another BGP peer,
* then BGP needs to do 2 level deferred bestpath
* calculation. Hence we need an additional select
* deferral timer
*/
struct event *t_select_deferral_tier2;
/* Routes Deferred */
uint32_t gr_deferred;
/* Routes waiting for FIB install */
@ -360,6 +416,9 @@ struct graceful_restart_info {
uint8_t flags;
/* Flag to skip backpressure logic for GR */
#define BGP_GR_SKIP_BP (1 << 0)
bool select_defer_tier2_required;
bool select_defer_over_tier2;
bool route_sync_tier2;
};
enum global_mode {
@ -693,6 +752,8 @@ struct bgp {
*/
bool gr_select_defer_evaluated;
bool gr_multihop_peer_exists;
/* Is deferred path selection still not complete? */
bool gr_route_sync_pending;
@ -1048,6 +1109,7 @@ struct afi_safi_info {
afi_t afi;
safi_t safi;
struct bgp *bgp;
bool tier2_gr;
};
#define BGP_ROUTE_ADV_HOLD(bgp) (bgp->main_peers_update_hold)
@ -2368,7 +2430,7 @@ struct bgp_nlri {
/* BGP graceful restart */
#define BGP_DEFAULT_RESTART_TIME 120
#define BGP_DEFAULT_STALEPATH_TIME 360
#define BGP_DEFAULT_SELECT_DEFERRAL_TIME 240
#define BGP_DEFAULT_SELECT_DEFERRAL_TIME 120
#define BGP_DEFAULT_RIB_STALE_TIME 500
#define BGP_DEFAULT_UPDATE_ADVERTISEMENT_TIME 1
@ -2811,6 +2873,7 @@ int bgp_enqueue_conn_err(struct bgp *bgp, struct peer_connection *connection,
int errcode);
struct peer_connection *bgp_dequeue_conn_err(struct bgp *bgp, bool *more_p);
void bgp_conn_err_reschedule(struct bgp *bgp);
static inline bool bgp_gr_supported_for_afi_safi(afi_t afi, safi_t safi);
#define BGP_GR_ROUTER_DETECT_AND_SEND_CAPABILITY_TO_ZEBRA(_bgp, _peer_list) \
do { \
@ -3133,10 +3196,11 @@ static inline bool bgp_gr_is_forwarding_preserved(struct bgp *bgp)
static inline bool bgp_gr_supported_for_afi_safi(afi_t afi, safi_t safi)
{
/*
* GR restarter behavior is supported only for IPv4-unicast
* and IPv6-unicast.
* GR restarter behavior is supported only for IPv4-unicast,
* IPv6-unicast, and L2VPN EVPN
*/
if ((afi == AFI_IP && safi == SAFI_UNICAST) || (afi == AFI_IP6 && safi == SAFI_UNICAST))
if ((afi == AFI_IP && safi == SAFI_UNICAST) || (afi == AFI_IP6 && safi == SAFI_UNICAST) ||
(afi == AFI_L2VPN && safi == SAFI_EVPN))
return true;
return false;
}

View file

@ -1124,6 +1124,28 @@ BGP GR Peer Mode Commands
This command will disable the entire BGP graceful restart functionality
at the peer level.
BGP GR Support For L2VPN EVPN
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
To support GR for the L2VPN EVPN AFI-SAFI in BGP, the following changes were
made (a minimal configuration sketch follows this list):

1. If GR is enabled for a BGP instance, the GR select deferral timer is
   started for all GR-supported AFI-SAFIs once BGP has finished reading its
   configuration. This way, BGP does not have to wait for the first peer to
   come up to start the select deferral timer for that VRF and AFI-SAFI.
2. All L2VPN routes are marked as deferred in the default VRF, imported into
   the destination VRF/VNI table, and then marked as deferred in the
   destination VRF/VNI as well.
3. Deferred bestpath selection is skipped for IPv4 and IPv6 unicast in
   non-default VRFs until the L2VPN EVPN AFI-SAFI in the default VRF
   completes GR. This ensures that deferred bestpath calculation in a
   non-default VRF is done only after paths are imported from the default
   VRF to the non-default VRF.
4. On a GR helper, when a peer that has GR enabled goes down, BGP marks all
   L2VPN EVPN routes in the default VRF as stale. If the peer does not come
   up before the GR restart timer expires, the L2VPN EVPN routes in the
   default VRF that were previously marked stale need to be unimported from
   all VRFs and VNIs (IP and MAC tables) before they are deleted from the
   L2VPN EVPN table in the default VRF.
BGP GR Show Commands
^^^^^^^^^^^^^^^^^^^^

View file

@ -189,9 +189,9 @@ typedef enum {
for (afi = AFI_IP; afi < AFI_MAX; afi++) \
for (safi = SAFI_UNICAST; safi < SAFI_MAX; safi++)
#define FOREACH_AFI_SAFI_NSF(afi, safi) \
for (afi = AFI_IP; afi < AFI_MAX; afi++) \
for (safi = SAFI_UNICAST; safi <= SAFI_MPLS_VPN; safi++)
#define FOREACH_AFI_SAFI_NSF(afi, safi) \
for (afi = AFI_IP; afi < AFI_MAX; afi++) \
for (safi = SAFI_UNICAST; safi <= SAFI_EVPN; safi++)
/* Flag manipulation macros. */
#define CHECK_FLAG(V,F) ((V) & (F))

View file

@ -0,0 +1,45 @@
log file zebra.log
!
ip route 10.100.0.2/32 10.0.1.2
!
vrf vrf-blue
vni 1000 prefix-routes-only
exit-vrf
!
interface lo
ip address 10.100.0.1/32
interface PE1-eth0
ip address 10.0.1.1/24
!
bgp graceful-restart
!
router bgp 101
bgp router-id 10.100.0.1
no bgp ebgp-requires-policy
no bgp network import-check
neighbor 10.0.1.2 remote-as 102
!
address-family l2vpn evpn
neighbor 10.0.1.2 activate
advertise-all-vni
exit-address-family
!
router bgp 101 vrf vrf-blue
bgp router-id 10.100.0.1
no bgp ebgp-requires-policy
no bgp network import-check
neighbor 192.168.50.11 remote-as 111
neighbor fd00:50:1::11 remote-as 111
!
address-family ipv4 unicast
no neighbor fd00:50:1::11 activate
exit-address-family
!
address-family ipv6 unicast
neighbor fd00:50:1::11 activate
exit-address-family
!
address-family l2vpn evpn
advertise ipv4 unicast gateway-ip
advertise ipv6 unicast gateway-ip
exit-address-family

View file

@ -0,0 +1,50 @@
log file zebra.log
!
ip route 10.100.0.1/32 10.0.1.1
!
vrf vrf-blue
vni 1000 prefix-routes-only
exit-vrf
!
interface lo
ip address 10.100.0.2/32
interface PE2-eth0
ip address 10.0.1.2/24
!
bgp graceful-restart
bgp graceful-restart preserve-fw-state
!
router bgp 102
bgp router-id 10.100.0.2
no bgp ebgp-requires-policy
no bgp network import-check
neighbor 10.0.1.1 remote-as 101
!
address-family l2vpn evpn
neighbor 10.0.1.1 activate
advertise-all-vni
enable-resolve-overlay-index
exit-address-family
!
router bgp 102 vrf vrf-blue
bgp router-id 10.100.0.2
no bgp ebgp-requires-policy
no bgp network import-check
neighbor 192.168.50.21 remote-as 112
neighbor fd00:50:1::21 remote-as 112
!
address-family ipv4 unicast
no neighbor fd00:50:1::21 activate
exit-address-family
!
address-family ipv6 unicast
neighbor fd00:50:1::21 activate
exit-address-family
!
address-family l2vpn evpn
rd 10.100.0.2:102
advertise ipv4 unicast
advertise ipv6 unicast
exit-address-family
end

View file

View file

@ -0,0 +1,21 @@
!
interface host1-eth0
ip address 192.168.50.11/24
ipv6 address fd00:50:1::11/48
!
router bgp 111
bgp router-id 10.100.0.11
no bgp ebgp-requires-policy
no bgp network import-check
neighbor 192.168.50.1 remote-as 101
neighbor fd00:50:1::1 remote-as 101
!
address-family ipv4 unicast
network 172.31.0.21/32
no neighbor fd00:50:1::1 activate
exit-address-family
!
address-family ipv6 unicast
network fd00:100::21/128
neighbor fd00:50:1::1 activate
exit-address-family

View file

@ -0,0 +1,21 @@
!
interface host2-eth0
ip address 192.168.50.21/24
ipv6 address fd00:50:1::21/48
!
router bgp 112
bgp router-id 10.100.0.21
no bgp ebgp-requires-policy
no bgp network import-check
neighbor 192.168.50.2 remote-as 102
neighbor fd00:50:1::2 remote-as 102
!
address-family ipv4 unicast
network 172.31.0.22/32
no neighbor fd00:50:1::2 activate
exit-address-family
!
address-family ipv6 unicast
network fd00:100::22/128
neighbor fd00:50:1::2 activate
exit-address-family

View file

@ -0,0 +1,801 @@
#!/usr/bin/env python
# SPDX-License-Identifier: ISC
#
# Reference topology from bgp_evpn_overlay_index_gateway/test_bgp_evpn_overlay_index_gateway.py:
#
# +--------+ BGP +--------+ BGP +--------+ +--------+
# SN1 | | IPv4/v6 | | EVPN | | | |
# =======+ Host1 +---------+ PE1 +------+ PE2 +------+ Host2 +
# | | | | | | | |
# +--------+ +--------+ +--------+ +--------+
#
# Host1 is connected to PE1 and Host2 is connected to PE2.
# Host1 and PE1 have IPv4/v6 BGP sessions.
# PE1 and PE2 have an EVPN session.
# Host1 advertises IPv4/v6 prefixes to PE1.
# PE1 advertises these prefixes to PE2 as EVPN type-5 routes (gateway IP = Host1 IP).
# Host1 MAC/IP is advertised by PE1 as EVPN type-2 route.
import os
import sys
import json
import time
import functools
import pytest
pytestmark = [pytest.mark.bgpd]
CWD = os.path.dirname(os.path.realpath(__file__))
sys.path.append(os.path.join(CWD, "../"))
# pylint: disable=C0413
from lib import topotest
from lib.topogen import Topogen, TopoRouter, get_topogen
from lib.topolog import logger
from lib.common_config import (
check_router_status,
kill_router_daemons,
start_router_daemons,
)
def build_topo(tgen):
# Create PE and host routers
for name in ["PE1", "PE2", "host1", "host2"]:
tgen.add_router(name)
# Links: PE1-PE2, PE1-host1, PE2-host2
tgen.add_link(tgen.gears["PE1"], tgen.gears["PE2"], "PE1-eth0", "PE2-eth0")
tgen.add_link(tgen.gears["PE1"], tgen.gears["host1"], "PE1-eth1", "host1-eth0")
tgen.add_link(tgen.gears["PE2"], tgen.gears["host2"], "PE2-eth1", "host2-eth0")
def setup_module(mod):
tgen = Topogen(build_topo, mod.__name__)
tgen.start_topology()
# Configure host MACs
host_macs = {"host1": "1a:2b:3c:4d:5e:61", "host2": "1a:2b:3c:4d:5e:62"}
for name, mac in host_macs.items():
host = tgen.net[name]
host.cmd_raises(f"ip link set dev {name}-eth0 down")
host.cmd_raises(f"ip link set dev {name}-eth0 address {mac}")
host.cmd_raises(f"ip link set dev {name}-eth0 up")
# Configure PE devices: vrf-blue, vxlan100/1000, bridges and sysctls
pe_suffix = {"PE1": "1", "PE2": "2"}
for name, suf in pe_suffix.items():
pe = tgen.net[name]
vtep_ip = f"10.100.0.{suf}"
bridge_ip = f"192.168.50.{suf}/24"
bridge_ipv6 = f"fd00:50:1::{suf}/48"
pe.cmd_raises("ip link add vrf-blue type vrf table 10")
pe.cmd_raises("ip link set dev vrf-blue up")
pe.cmd_raises(f"ip link add vxlan100 type vxlan id 100 dstport 4789 local {vtep_ip}")
pe.cmd_raises("ip link add name br100 type bridge stp_state 0")
pe.cmd_raises("ip link set dev vxlan100 master br100")
pe.cmd_raises(f"ip link set dev {name}-eth1 master br100")
pe.cmd_raises(f"ip addr add {bridge_ip} dev br100")
pe.cmd_raises("ip link set up dev br100")
pe.cmd_raises("ip link set up dev vxlan100")
pe.cmd_raises(f"ip link set up dev {name}-eth1")
pe.cmd_raises("ip link set dev br100 master vrf-blue")
pe.cmd_raises(f"ip -6 addr add {bridge_ipv6} dev br100")
pe.cmd_raises(f"ip link add vxlan1000 type vxlan id 1000 dstport 4789 local {vtep_ip}")
pe.cmd_raises("ip link add name br1000 type bridge stp_state 0")
pe.cmd_raises("ip link set dev vxlan1000 master br1000")
pe.cmd_raises("ip link set up dev br1000")
pe.cmd_raises("ip link set up dev vxlan1000")
pe.cmd_raises("ip link set dev br1000 master vrf-blue")
pe.cmd_raises("sysctl -w net.ipv4.ip_forward=1")
pe.cmd_raises("sysctl -w net.ipv6.conf.all.forwarding=1")
# Load FRR configuration for each router
for rname, router in tgen.routers().items():
logger.info(f"Loading config to router {rname}")
router.load_frr_config(os.path.join(CWD, f"{rname}/frr.conf"))
tgen.start_router()
def teardown_module(mod):
tgen = get_topogen()
tgen.stop_topology()
def _evpn_peer_established(router: TopoRouter, neighbor_ip: str) -> bool:
try:
output = router.vtysh_cmd("show bgp l2vpn evpn summary json")
data = json.loads(output)
except Exception:
return False
peers = data.get("peers", {})
if neighbor_ip not in peers:
logger.info(f"Neighbor {neighbor_ip} not found in BGP summary")
return False
logger.info(f"Neighbor {neighbor_ip} found in BGP summary with state: {peers[neighbor_ip].get('state')}")
return peers[neighbor_ip].get("state") == "Established"
def _evpn_any_prefixes(router: TopoRouter) -> bool:
try:
output = router.vtysh_cmd("show bgp l2vpn evpn route json")
data = json.loads(output)
logger.info(f"EVPN routes found: {data.get('numPrefix', 0)}")
except Exception:
return False
# numPrefix > 0 indicates some EVPN routes exist (e.g., IMET)
return bool(data) and data.get("numPrefix", 0) > 0
def _evpn_has_any_stale(router: TopoRouter) -> bool:
try:
output = router.vtysh_cmd("show bgp l2vpn evpn route json")
evpn = json.loads(output)
except Exception:
return False
# Iterate all RDs and prefixes to find any path marked stale
for key, rd_data in evpn.items():
if not isinstance(rd_data, dict):
continue
for prefix, pdata in rd_data.items():
if not isinstance(pdata, dict):
continue
for path_entry in pdata.get("paths", []):
# path_entry may be a dict or list of dicts based on FRR JSON
if isinstance(path_entry, dict):
if path_entry.get("stale"):
logger.info(f"Path {path_entry} is stale")
return True
elif isinstance(path_entry, list):
for p in path_entry:
if isinstance(p, dict) and p.get("stale"):
logger.info(f"Path {p} is stale")
return True
logger.info(f"No stale paths found")
return False
def _evpn_no_stale(router: TopoRouter) -> bool:
return not _evpn_has_any_stale(router)
def _evpn_has_remote_route_type(router: TopoRouter, route_type: int) -> bool:
try:
output = router.vtysh_cmd("show bgp l2vpn evpn route json")
evpn = json.loads(output)
except Exception:
return False
for key, rd_data in evpn.items():
if not isinstance(rd_data, dict):
continue
for prefix, pdata in rd_data.items():
if not isinstance(pdata, dict):
continue
for path_entry in pdata.get("paths", []):
entries = path_entry if isinstance(path_entry, list) else [path_entry]
for p in entries:
if not isinstance(p, dict):
continue
if p.get("routeType") == route_type and not p.get("local", False):
logger.info(f"Remote route type {route_type} found")
return True
logger.info(f"No remote route type {route_type} found")
return False
def _evpn_remote_type_paths_stale(router: TopoRouter, route_type: int) -> bool:
try:
output = router.vtysh_cmd("show bgp l2vpn evpn route json")
evpn = json.loads(output)
except Exception:
return False
found_remote = False
found_stale = False
for key, rd_data in evpn.items():
if not isinstance(rd_data, dict):
continue
for prefix, pdata in rd_data.items():
if not isinstance(pdata, dict):
continue
for path_entry in pdata.get("paths", []):
entries = path_entry if isinstance(path_entry, list) else [path_entry]
for p in entries:
if not isinstance(p, dict):
continue
if p.get("routeType") == route_type and not p.get("local", False):
found_remote = True
logger.info(f"Remote route type {route_type} found")
if p.get("stale"):
found_stale = True
logger.info(f"Stale route type {route_type} found")
logger.info(f"No remote route type {route_type} found")
logger.info(f"No stale route type {route_type} found")
return found_remote and found_stale
def _evpn_routes_with_stale_only_for_rd(router: TopoRouter, rd: str, route_type: int) -> bool:
"""
Verify that all paths whose RD matches the input RD and route_type are marked as stale.
Succeeds only if at least one matching path is found and all such paths are stale.
"""
try:
output = router.vtysh_cmd("show bgp l2vpn evpn route json")
evpn = json.loads(output)
except Exception:
return False
found_matching_path = False
for key, rd_data in evpn.items():
if not isinstance(rd_data, dict):
continue
if key != rd:
continue
for prefix, pdata in rd_data.items():
if not isinstance(pdata, dict):
continue
for path_entry in pdata.get("paths", []):
entries = path_entry if isinstance(path_entry, list) else [path_entry]
for p in entries:
if not isinstance(p, dict):
continue
rtype = p.get("routeType")
if rtype != route_type:
continue
found_matching_path = True
logger.info(f"Checking prefix: {prefix} path: {p}")
if not bool(p.get("stale")):
logger.info(f"Checking prefix: {prefix} path: {p} is not stale, returning False")
return False
if not found_matching_path:
logger.info(f"No matching path found for RD: {rd} and route type: {route_type}")
return False
return True
def _vrf_has_kernel_routes(router: TopoRouter, vrf_name: str, prefixes):
if isinstance(prefixes, str):
prefixes = [prefixes]
output = router.cmd(f"ip -j route show vrf {vrf_name}")
try:
routes = json.loads(output)
except Exception:
return False
have = set()
for r in routes:
dst = r.get("dst") or r.get("destination") or r.get("to")
if dst:
have.add(dst)
for pfx in prefixes:
if pfx not in have:
logger.info(f"Prefix {pfx} not found in kernel VRF {vrf_name}")
return False
logger.info(f"All prefixes {prefixes} found in kernel VRF {vrf_name}")
return True
def _bridge_has_extern_learn(router: TopoRouter, dev: str, mac: str) -> bool:
# Check for external-learned MAC on vxlan device
out = router.cmd(f"bridge fdb show dev {dev}")
for line in out.splitlines():
if mac.lower() in line.lower() and "extern_learn" in line:
logger.info(f"MAC {mac} found in bridge FDB on device {dev}")
return True
logger.info(f"MAC {mac} not found in bridge FDB on device {dev}")
return False
def _ip_neigh_has_extern_learn(router: TopoRouter, mac: str) -> bool:
"""
Check kernel's ip neigh show for MAC entry with extern_learn flag.
(Extern_learn shows as 'extern_learn' in 'ip neigh' flags.)
"""
output = router.cmd("ip neigh show")
for line in output.splitlines():
if mac.lower() in line.lower() and "extern_learn" in line:
logger.info(f"MAC {mac} found as extern_learn in 'ip neigh'")
return True
logger.info(f"MAC {mac} NOT extern_learn in 'ip neigh'")
return False
def fetch_vni_rd_from_pe2(pe2: TopoRouter, vni: int):
"""
Fetch the Route Distinguisher (RD) of the given l2vni from PE2
using 'show bgp l2vpn evpn vni <vni>' command.
Returns:
The RD as a string, e.g., '10.100.0.2:102', or None if not found.
"""
output = pe2.vtysh_cmd(f"show bgp l2vpn evpn vni {vni} json")
try:
if isinstance(output, str):
output = json.loads(output)
if "rd" in output:
logger.info(f"RD for VNI {vni} found: {output.get('rd')}")
return output.get("rd")
except Exception as e:
logger.info(f"Failed to fetch RD from PE2 VNI {vni}: {e}")
return None
def _evpn_f_bit_set(router: TopoRouter, neighbor_ip: str) -> bool:
"""
Verify that EVPN AF F-bit is set for a neighbor during GR.
Tries EVPN-specific CLI first, then falls back to generic neighbors JSON.
"""
commands = [
f"show bgp neighbors {neighbor_ip} graceful-restart json",
]
for cmd in commands:
try:
output = router.vtysh_cmd(cmd)
data = json.loads(output)
except Exception:
continue
# EVPN-specific show typically returns dict keyed by neighbor IP
if isinstance(data, dict) and neighbor_ip in data:
nbr = data[neighbor_ip]
gr = nbr.get("gracefulRestartInfo", {})
if isinstance(gr, dict):
# Some builds expose fBit directly
if isinstance(gr.get("fBit"), bool):
if gr.get("fBit") is True:
logger.info(f"F-bit is set for neighbor {neighbor_ip}")
return True
# Others nest per-AF
evpn = gr.get("l2VpnEvpn", {})
if isinstance(evpn, dict) and isinstance(evpn.get("fBit"), bool):
if evpn.get("fBit") is True:
logger.info(f"F-bit is set for neighbor {neighbor_ip}")
return True
# Generic neighbors JSON; try to locate EVPN AF entry
if isinstance(data, dict):
# data may be the neighbor object directly
gr = data.get("gracefulRestartInfo", {})
if isinstance(gr, dict):
if isinstance(gr.get("fBit"), bool) and gr.get("fBit") is True:
logger.info(f"F-bit is set for neighbor {neighbor_ip}")
return True
evpn = gr.get("l2VpnEvpn", {})
if isinstance(evpn, dict) and isinstance(evpn.get("fBit"), bool):
if evpn.get("fBit") is True:
logger.info(f"F-bit is set for neighbor {neighbor_ip}")
return True
logger.info(f"F-bit is not set for neighbor {neighbor_ip}")
return False
def _gr_r_bit_set(router: TopoRouter, neighbor_ip: str) -> bool:
"""
Verify that R-bit is set in GR capability for the neighbor (restarting peer).
Checks both generic neighbors JSON and EVPN-specific GR JSON.
"""
commands = [
f"show bgp neighbors {neighbor_ip} graceful-restart json",
]
for cmd in commands:
try:
output = router.vtysh_cmd(cmd)
data = json.loads(output)
except Exception:
continue
# EVPN-specific keyed by neighbor
if isinstance(data, dict) and neighbor_ip in data:
nbr = data[neighbor_ip]
gr = nbr.get("gracefulRestartInfo", {})
if isinstance(gr, dict):
if isinstance(gr.get("rBit"), bool) and gr.get("rBit") is True:
logger.info(f"R-bit is set for neighbor {neighbor_ip}")
return True
# Also try per-AF container if present
evpn = gr.get("l2VpnEvpn", {})
if isinstance(evpn, dict) and isinstance(evpn.get("rBit"), bool):
if evpn.get("rBit") is True:
logger.info(f"R-bit is set for neighbor {neighbor_ip}")
return True
# Generic neighbor JSON shape
if isinstance(data, dict):
gr = data.get("gracefulRestartInfo", {})
if isinstance(gr, dict):
if isinstance(gr.get("rBit"), bool) and gr.get("rBit") is True:
logger.info(f"R-bit is set for neighbor {neighbor_ip}")
return True
evpn = gr.get("l2VpnEvpn", {})
if isinstance(evpn, dict) and isinstance(evpn.get("rBit"), bool):
if evpn.get("rBit") is True:
logger.info(f"R-bit is set for neighbor {neighbor_ip}")
return True
logger.info(f"R-bit is not set for neighbor {neighbor_ip}")
return False
def test_bgp_evpn_gr_stale_and_recovery():
tgen = get_topogen()
pe1 = tgen.gears["PE1"]
pe2 = tgen.gears["PE2"]
logger.info("STEP 1: Verify routers are up and healthy")
check_router_status(tgen)
# Wait for EVPN session to establish
logger.info("STEP 2: Wait for EVPN session to establish between PE1 and PE2")
test_func = functools.partial(_evpn_peer_established, pe1, "10.0.1.2")
result, _ = topotest.run_and_expect(test_func, True, count=60, wait=2)
assert result, "PE1 EVPN session with PE2 not established"
test_func = functools.partial(_evpn_peer_established, pe2, "10.0.1.1")
result, _ = topotest.run_and_expect(test_func, True, count=60, wait=2)
assert result, "PE2 EVPN session with PE1 not established"
# Ensure we have some EVPN routes (e.g., IMET)
logger.info("STEP 3: Ensure EVPN routes (e.g., IMET) are present")
test_func = functools.partial(_evpn_any_prefixes, pe1)
result, _ = topotest.run_and_expect(test_func, True, count=30, wait=1)
assert result, "No EVPN routes present on PE1"
# Ensure type-5 routes exist on both PEs (host1->PE1->PE2, host2->PE2->PE1)
logger.info("STEP 4: Verify remote EVPN type-5 routes exist on both PEs")
test_func = functools.partial(_evpn_has_remote_route_type, pe1, 5)
result, _ = topotest.run_and_expect(test_func, True, count=60, wait=2)
assert result, "No remote EVPN type-5 routes seen on PE1"
test_func = functools.partial(_evpn_has_remote_route_type, pe2, 5)
result, _ = topotest.run_and_expect(test_func, True, count=60, wait=2)
assert result, "No remote EVPN type-5 routes seen on PE2"
# Kernel VRF routes imported (type-5): verify PE2 has host1, PE1 has host2
logger.info("STEP 5: Verify type-5 routes are installed into kernel VRF on both PEs")
test_func = functools.partial(_vrf_has_kernel_routes, pe2, "vrf-blue", ["172.31.0.21"])
result, _ = topotest.run_and_expect(test_func, True, count=60, wait=2)
assert result, "Type-5 prefix 172.31.0.21/32 not installed in PE2 kernel VRF vrf-blue"
test_func = functools.partial(_vrf_has_kernel_routes, pe1, "vrf-blue", ["172.31.0.22"])
result, _ = topotest.run_and_expect(test_func, True, count=60, wait=2)
assert result, "Type-5 prefix 172.31.0.22/32 not installed in PE1 kernel VRF vrf-blue"
# Ensure type-2 routes exist on both PEs
logger.info("STEP 6: Verify remote EVPN type-2 routes exist on both PEs")
test_func = functools.partial(_evpn_has_remote_route_type, pe1, 2)
result, _ = topotest.run_and_expect(test_func, True, count=60, wait=2)
assert result, "No remote EVPN type-2 routes seen on PE1"
test_func = functools.partial(_evpn_has_remote_route_type, pe2, 2)
result, _ = topotest.run_and_expect(test_func, True, count=60, wait=2)
assert result, "No remote EVPN type-2 routes seen on PE2"
# Ensure type-2 are installed as extern_learn in FDB (remote MACs)
logger.info("STEP 7: Verify remote MACs are extern_learn in FDB (type-2)")
test_func = functools.partial(_bridge_has_extern_learn, pe1, "vxlan100", "1a:2b:3c:4d:5e:62")
result, _ = topotest.run_and_expect(test_func, True, count=60, wait=2)
assert result, "Remote MAC (host2) not extern_learn in PE1 FDB for vxlan100"
test_func = functools.partial(_bridge_has_extern_learn, pe2, "vxlan100", "1a:2b:3c:4d:5e:61")
result, _ = topotest.run_and_expect(test_func, True, count=60, wait=2)
assert result, "Remote MAC (host1) not extern_learn in PE2 FDB for vxlan100"
# Verify that remote MACs on PE1 are installed as extern_learn in kernel's ip_neigh table
logger.info("STEP 7b: Verify remote MACs are extern_learn in PE1 kernel's ip_neigh table")
# For PE1, check host2's MAC (remote side)
test_func = functools.partial(_ip_neigh_has_extern_learn, pe1, "1a:2b:3c:4d:5e:62")
result, _ = topotest.run_and_expect(test_func, True, count=30, wait=1)
assert result, "Remote MAC (host2) not extern_learn in kernel ip_neigh table on PE1"
# For PE2, check host1's MAC (remote side)
test_func = functools.partial(_ip_neigh_has_extern_learn, pe2, "1a:2b:3c:4d:5e:61")
result, _ = topotest.run_and_expect(test_func, True, count=30, wait=1)
assert result, "Remote MAC (host1) not extern_learn in kernel ip_neigh table on PE2"
# Fetch L2VNI RD from PE2 for validation
pe2_rd_vni_100 = fetch_vni_rd_from_pe2(pe2, 100)
# Fetch L3VNI RD from PE2 for validation
pe2_rd_vni_1000 = fetch_vni_rd_from_pe2(pe2, 1000)
# Kill bgpd on PE2 to trigger GR restart
logger.info("STEP 8: Stop bgpd on PE2 to trigger graceful restart")
kill_router_daemons(tgen, "PE2", ["bgpd"])
# PE1 should retain only PE2-originated EVPN routes as stale during GR (type-2 and type-5), not local
# Verify only type-5 routes from PE2's RD are stale
logger.info("STEP 9: Check PE1 retains ONLY PE2-originated type-5 routes as stale")
test_func = functools.partial(_evpn_routes_with_stale_only_for_rd, pe1, rd=pe2_rd_vni_1000, route_type=5)
result, _ = topotest.run_and_expect(test_func, True, count=60, wait=2)
assert result, (
"PE1 did not retain ONLY PE2-originated EVPN type-5 routes as stale during PE2 restart"
)
logger.info(f"PE2 RD for VNI 100: {pe2_rd_vni_100}")
# Verify only type-2 routes from PE2's RD are stale
logger.info("STEP 10: Check PE1 retains ONLY PE2-originated type-2 routes as stale")
test_func = functools.partial(_evpn_routes_with_stale_only_for_rd, pe1, rd=pe2_rd_vni_100, route_type=2)
result, _ = topotest.run_and_expect(test_func, True, count=60, wait=2)
assert result, (
"PE1 did not retain ONLY PE2-originated EVPN type-2 routes as stale during PE2 restart"
)
# Also generic check for any stale presence
logger.info("STEP 11: Confirm PE1 shows some EVPN routes as stale during PE2 restart")
test_func = functools.partial(_evpn_has_any_stale, pe1)
result, _ = topotest.run_and_expect(test_func, True, count=60, wait=2)
assert result, "PE1 did not retain EVPN routes as stale during PE2 restart"
# Verify PE1 kernel still has routes learned from PE2 in vrf-blue (type-5 retained)
logger.info("STEP 12: Verify PE1 kernel retains type-5 routes from PE2 during GR")
test_func = functools.partial(_vrf_has_kernel_routes, pe1, "vrf-blue", ["172.31.0.22"])
result, _ = topotest.run_and_expect(test_func, True, count=60, wait=2)
assert result, "PE1 kernel VRF routes learned from PE2 disappeared during GR"
# Verify PE1 FDB retains extern learned MAC from PE2 (type-2 retained)
logger.info("STEP 13: Verify PE1 FDB retains extern_learn MAC from PE2 during GR")
test_func = functools.partial(_bridge_has_extern_learn, pe1, "vxlan100", "1a:2b:3c:4d:5e:62")
result, _ = topotest.run_and_expect(test_func, True, count=60, wait=2)
assert result, "PE1 FDB extern_learn entry from PE2 disappeared during GR"
# Verify PE1 kernel still has routes learned from PE2 in vrf-blue (type-5 retained)
test_func = functools.partial(_ip_neigh_has_extern_learn, pe1, "1a:2b:3c:4d:5e:62")
result, _ = topotest.run_and_expect(test_func, True, count=30, wait=1)
assert result, "PE1 kernel ip_neigh table extern_learn entry from PE2 disappeared during GR"
# Bring bgpd back on PE2
logger.info("STEP 14: Restart bgpd on PE2 to recover session")
# Get config file path and router object
source_config = os.path.join(CWD, "PE2/frr.conf")
router_pe2 = tgen.gears["PE2"]
# Restart BGP daemon and load configuration using load_config
logger.info("Starting BGP daemon on PE2...")
try:
start_router_daemons(tgen, "PE2", ["bgpd"])
logger.info("BGP daemon start command completed")
# Apply BGP configuration using vtysh -f
logger.info(f"Applying BGP config from: {source_config}")
config_result = router_pe2.cmd(f"vtysh -f {source_config}")
logger.info("BGP configuration applied successfully")
except Exception as e:
logger.error(f"Failed to start daemon or load BGP config: {e}")
raise
# Wait for EVPN session to establish
logger.info("STEP 15: Wait for EVPN session to establish between PE1 and PE2")
test_func = functools.partial(_evpn_peer_established, pe1, "10.0.1.2")
result, _ = topotest.run_and_expect(test_func, True, count=60, wait=2)
assert result, "PE1 EVPN session with PE2 not established"
test_func = functools.partial(_evpn_peer_established, pe2, "10.0.1.1")
result, _ = topotest.run_and_expect(test_func, True, count=60, wait=2)
assert result, "PE2 EVPN session with PE1 not established"
# Verify R-bit and F-bit set on PE1 neighbor view after PE2 restart
logger.info("STEP 16: Verify GR R-bit and EVPN AF F-bit set on PE1 neighbor view")
test_func = functools.partial(_gr_r_bit_set, pe1, "10.0.1.2")
result, _ = topotest.run_and_expect(test_func, True, count=60, wait=2)
assert result, "EVPN GR R-bit not set on PE1 neighbor view after PE2 restart"
test_func = functools.partial(_evpn_f_bit_set, pe1, "10.0.1.2")
result, _ = topotest.run_and_expect(test_func, True, count=60, wait=2)
assert result, "EVPN AF F-bit not set on PE1 neighbor view during PE2 restart"
# After session recovery, stale flags should be cleared for type-2 and type-5 on PE1
logger.info("STEP 17: Ensure remote EVPN type-5 and type-2 remain active after recovery")
test_func = functools.partial(_evpn_has_remote_route_type, pe1, 5)
result, _ = topotest.run_and_expect(test_func, True, count=120, wait=2)
assert result, "Remote EVPN type-5 routes disappeared on PE1 after PE2 recovered"
test_func = functools.partial(_evpn_has_remote_route_type, pe1, 2)
result, _ = topotest.run_and_expect(test_func, True, count=120, wait=2)
assert result, "Remote EVPN type-2 routes disappeared on PE1 after PE2 recovered"
# After bgpd recovery on PE2, verify PE1 kernel still has routes learned from PE2
logger.info("STEP 18: Verify PE1 kernel still has routes from PE2 after recovery")
test_func = functools.partial(_vrf_has_kernel_routes, pe1, "vrf-blue", ["172.31.0.22"])
result, _ = topotest.run_and_expect(test_func, True, count=120, wait=2)
assert result, "PE1 kernel VRF routes learned from PE2 disappeared after recovery"
# And verify PE1 FDB still has extern_learn entry from PE2
logger.info("STEP 19: Verify PE1 FDB retains extern_learn MAC after recovery")
test_func = functools.partial(_bridge_has_extern_learn, pe1, "vxlan100", "1a:2b:3c:4d:5e:62")
result, _ = topotest.run_and_expect(test_func, True, count=120, wait=2)
assert result, "PE1 FDB extern_learn entry from PE2 disappeared after recovery"
logger.info("STEP 20: Confirm no EVPN stale routes remain on PE1 after recovery")
test_func = functools.partial(_evpn_no_stale, pe1)
result, _ = topotest.run_and_expect(test_func, True, count=120, wait=2)
assert result, "PE1 still shows EVPN stale routes after PE2 recovered"
def _vrf_routes_absent(router: TopoRouter, vrf_name: str, prefixes):
if isinstance(prefixes, str):
prefixes = [prefixes]
output = router.cmd(f"ip -j route show vrf {vrf_name}")
try:
routes = json.loads(output)
except Exception:
# If we can't parse routes, treat as absent
return True
have = set()
for r in routes:
dst = r.get("dst") or r.get("destination") or r.get("to")
if dst:
have.add(dst)
for pfx in prefixes:
if pfx in have:
return False
return True
def _bridge_extern_absent(router: TopoRouter, dev: str, mac: str) -> bool:
out = router.cmd(f"bridge fdb show dev {dev}")
for line in out.splitlines():
if mac.lower() in line.lower() and "extern_learn" in line:
return False
return True
def test_bgp_evpn_gr_stale_cleanup_on_timeout():
tgen = get_topogen()
pe1 = tgen.gears["PE1"]
logger.info("STEP 1: Verify routers are up and healthy")
check_router_status(tgen)
# Ensure EVPN session and baseline presence
logger.info("STEP 2: Verify EVPN session established (PE1 -> PE2)")
test_func = functools.partial(_evpn_peer_established, pe1, "10.0.1.2")
result, _ = topotest.run_and_expect(test_func, True, count=60, wait=2)
assert result, "EVPN session not established (PE1->PE2)"
logger.info("STEP 3: Verify remote EVPN type-5 routes present on PE1")
test_func = functools.partial(_evpn_has_remote_route_type, pe1, 5)
result, _ = topotest.run_and_expect(test_func, True, count=60, wait=2)
assert result, "No remote EVPN type-5 routes on PE1"
logger.info("STEP 4: Verify kernel VRF has type-5 route on PE1 prior to GR")
test_func = functools.partial(_vrf_has_kernel_routes, pe1, "vrf-blue", ["172.31.0.22"])
result, _ = topotest.run_and_expect(test_func, True, count=60, wait=2)
assert result, "Missing kernel VRF routes on PE1 prior to GR"
logger.info("STEP 5: Verify extern_learn MAC is present on PE1 prior to GR")
test_func = functools.partial(_bridge_has_extern_learn, pe1, "vxlan100", "1a:2b:3c:4d:5e:62")
result, _ = topotest.run_and_expect(test_func, True, count=60, wait=2)
assert result, "Missing extern_learn MAC on PE1 prior to GR"
# Kill bgpd on PE2 and do not restart for >140 seconds
logger.info("STEP 6: Stop bgpd on PE2 and keep it down (>140s)")
kill_router_daemons(tgen, "PE2", ["bgpd"])
# Wait for restart timer (120 secs) on PE1 to expire and stale paths to be cleaned up
logger.info("STEP 7: Wait for GR stalepath-time to expire on PE1 (sleep 150s)")
time.sleep(150)
# Expect kernel VRF routes and FDB extern entry to be cleaned from PE1
logger.info("STEP 8: Verify kernel VRF routes learned from PE2 are cleaned on PE1")
test_func = functools.partial(_vrf_routes_absent, pe1, "vrf-blue", ["172.31.0.22"])
result, _ = topotest.run_and_expect(test_func, True, count=160, wait=1)
assert result, "VRF kernel routes on PE1 not cleaned after GR stalepath-time expiry"
logger.info("STEP 9: Verify FDB extern_learn MAC learned from PE2 is cleaned on PE1")
test_func = functools.partial(_bridge_extern_absent, pe1, "vxlan100", "1a:2b:3c:4d:5e:62")
result, _ = topotest.run_and_expect(test_func, True, count=160, wait=1)
assert result, "FDB extern_learn MAC on PE1 not cleaned after GR stalepath-time expiry"
# Restore bgpd on PE2 for subsequent tests
logger.info("STEP 10: Restart bgpd on PE2 for subsequent tests")
source_config = os.path.join(CWD, "PE2/frr.conf")
router_pe2 = tgen.gears["PE2"]
# Restart BGP daemon and load configuration using load_config
logger.info("Starting BGP daemon on PE2...")
try:
start_router_daemons(tgen, "PE2", ["bgpd"])
logger.info("BGP daemon start command completed")
# Apply BGP configuration using vtysh -f
logger.info(f"Applying BGP config from: {source_config}")
config_result = router_pe2.cmd(f"vtysh -f {source_config}")
logger.info("BGP configuration applied successfully")
except Exception as e:
logger.error(f"Failed to start daemon or load BGP config: {e}")
raise
"""
Commenting this test out until MR 12975 is merged
def test_bgp_evpn_gr_select_deferral_cleanup_on_pe2():
tgen = get_topogen()
pe1 = tgen.gears["PE1"]
pe2 = tgen.gears["PE2"]
logger.info("STEP 1: Verify routers are up and healthy")
check_router_status(tgen)
# Baseline: session up and PE2 has remote routes/MAC from PE1
logger.info("STEP 2: Verify EVPN session established (PE2 -> PE1)")
test_func = functools.partial(_evpn_peer_established, pe2, "10.0.1.1")
result, _ = topotest.run_and_expect(test_func, True, count=60, wait=2)
assert result, "EVPN session not established (PE2->PE1)"
logger.info("STEP 3: Verify remote EVPN type-5 routes present on PE2 (from PE1)")
test_func = functools.partial(_evpn_has_remote_route_type, pe2, 5)
result, _ = topotest.run_and_expect(test_func, True, count=60, wait=2)
assert result, "No remote EVPN type-5 routes on PE2 (from PE1)"
# PE1-originated type-5 network should be in PE2 kernel VRF
logger.info("STEP 4: Verify kernel VRF has type-5 route on PE2 prior to GR")
test_func = functools.partial(_vrf_has_kernel_routes, pe2, "vrf-blue", ["172.31.0.21"])
result, _ = topotest.run_and_expect(test_func, True, count=60, wait=2)
assert result, "Missing kernel VRF routes on PE2 prior to GR/select-deferral"
# PE1 MAC should be extern_learn on PE2
logger.info("STEP 5: Verify extern_learn MAC is present on PE2 prior to GR")
test_func = functools.partial(_bridge_has_extern_learn, pe2, "vxlan100", "1a:2b:3c:4d:5e:61")
result, _ = topotest.run_and_expect(test_func, True, count=60, wait=2)
assert result, "Missing extern_learn MAC on PE2 prior to GR/select-deferral"
pe2.vtysh_cmd(
"configure terminal\n"
"log syslog debugging\n"
"log file zebra.log\n"
"log timestamp precision 6\n"
"debug zebra events\n"
"debug bgp graceful-restart\n"
"debug bgp neighbor-events\n"
"exit\n"
"write\n"
)
# Simulate PE2 restart: stop bgpd on PE2
logger.info("STEP 6: Stop bgpd and zebra on PE2 to simulate restart")
kill_router_daemons(tgen, "PE2", ["zebra", "bgpd"])
# Before starting PE2, administratively shutdown neighbor on PE1 to keep session down
logger.info("STEP 7: Shutdown neighbor 10.0.1.2 on PE1 to keep session down")
pe1.vtysh_cmd(
"configure terminal\n"
"router bgp 101\n"
"neighbor 10.0.1.2 shutdown\n"
)
# Start bgpd on PE2; session will remain down due to neighbor shutdown on PE1
logger.info("STEP 8: Start bgpd on PE2 (session should stay down due to neighbor shutdown)")
#start_router_daemons(tgen, "PE2", ["bgpd", "zebra"])
source_config = os.path.join(CWD, "PE2/frr.conf")
router_pe2 = tgen.gears["PE2"]
# Restart BGP daemon and load configuration using load_config
logger.info("Starting BGP and zebra daemon on PE2...")
try:
start_router_daemons(tgen, "PE2", ["bgpd, "zebra""])
logger.info("BGP and zebra daemon start command completed")
# Apply BGP configuration using vtysh -f
logger.info(f"Applying BGP and zebra config from: {source_config}")
config_result = router_pe2.cmd(f"vtysh -f {source_config}")
logger.info("BGP and zebra configuration applied successfully")
except Exception as e:
logger.error(f"Failed to start daemon or load BGP and zebra config: {e}")
raise
# Wait beyond select deferral timer (default 120s) so PE2 purges stale paths
logger.info("STEP 9: Wait beyond select-deferral (sleep 150s) so PE2 purges stale paths")
time.sleep(150)
# Verify PE2 kernel cleaned routes learned from PE1
logger.info("STEP 10: Verify kernel VRF routes learned from PE1 are cleaned on PE2")
test_func = functools.partial(_vrf_routes_absent, pe2, "vrf-blue", ["172.31.0.21"])
result, _ = topotest.run_and_expect(test_func, True, count=160, wait=1)
assert result, "VRF kernel routes on PE2 not cleaned after select-deferral expiry"
# Verify PE2 FDB cleaned extern_learn entry learned from PE1
logger.info("STEP 11: Verify FDB extern_learn MAC learned from PE1 is cleaned on PE2")
test_func = functools.partial(_bridge_extern_absent, pe2, "vxlan100", "1a:2b:3c:4d:5e:61")
result, _ = topotest.run_and_expect(test_func, True, count=160, wait=1)
assert result, "FDB extern_learn MAC on PE2 not cleaned after select-deferral expiry"
# Cleanup: re-enable neighbor on PE1 so subsequent tests can proceed
logger.info("STEP 12: Re-enable neighbor 10.0.1.2 on PE1 to restore normal operation")
pe1.vtysh_cmd(
"configure terminal\n"
"router bgp 101\n"
"no neighbor 10.0.1.2 shutdown\n"
)
"""

View file

@ -0,0 +1,39 @@
hostname r1
ip forwarding
!
interface r1-eth0
ip address 10.0.1.2/24
!
interface lo
ip address 10.1.1.1/32
ip address 10.1.1.2/32
ip address 10.1.1.3/32
ip address 10.1.1.4/32
!
router bgp 65001
no bgp ebgp-requires-policy
bgp graceful-restart
bgp graceful-restart preserve-fw-state
neighbor 10.3.3.3 remote-as 65003
neighbor 10.3.3.3 timers 1 3
neighbor 10.3.3.3 timers connect 1
neighbor 10.3.3.3 ebgp-multihop 5
neighbor 10.3.3.3 update-source lo
neighbor 10.0.1.1 remote-as 65002
neighbor 10.0.1.1 timers 1 3
neighbor 10.0.1.1 timers connect 1
neighbor 10.0.1.1 route-map R2_OUT out
!
address-family ipv4 unicast
redistribute connected
exit-address-family
!
ip prefix-list R1_EXTRA seq 5 permit 10.1.1.2/32
ip prefix-list R1_EXTRA seq 10 permit 10.1.1.3/32
ip prefix-list R1_EXTRA seq 15 permit 10.1.1.4/32
!
route-map R2_OUT deny 10
match ip address prefix-list R1_EXTRA
route-map R2_OUT permit 100
!

View file

@ -0,0 +1,21 @@
hostname r2
ip forwarding
!
interface r2-eth0
ip address 10.0.1.1/24
!
interface r2-eth1
ip address 10.0.2.1/24
!
router bgp 65002
no bgp ebgp-requires-policy
bgp graceful-restart
bgp graceful-restart preserve-fw-state
neighbor 10.0.1.2 remote-as 65001
neighbor 10.0.1.2 timers 1 3
neighbor 10.0.1.2 timers connect 1
neighbor 10.0.2.2 remote-as 65003
neighbor 10.0.2.2 timers 1 3
neighbor 10.0.2.2 timers connect 1
!

View file

@ -0,0 +1,37 @@
hostname r3
ip forwarding
!
interface r3-eth0
ip address 10.0.2.2/24
!
interface lo
ip address 10.3.3.3/32
ip address 10.3.3.4/32
ip address 10.3.3.5/32
ip address 10.3.3.6/32
!
router bgp 65003
no bgp ebgp-requires-policy
neighbor 10.1.1.1 remote-as 65001
neighbor 10.1.1.1 timers 1 3
neighbor 10.1.1.1 timers connect 1
neighbor 10.1.1.1 ebgp-multihop 5
neighbor 10.1.1.1 update-source lo
neighbor 10.0.2.1 remote-as 65002
neighbor 10.0.2.1 timers 1 3
neighbor 10.0.2.1 timers connect 1
neighbor 10.0.2.1 route-map R2_OUT out
!
address-family ipv4 unicast
redistribute connected
exit-address-family
!
ip prefix-list R3_EXTRA seq 5 permit 10.3.3.4/32
ip prefix-list R3_EXTRA seq 10 permit 10.3.3.5/32
ip prefix-list R3_EXTRA seq 15 permit 10.3.3.6/32
!
route-map R2_OUT deny 10
match ip address prefix-list R3_EXTRA
route-map R2_OUT permit 100
!

View file

@ -0,0 +1,339 @@
#!/usr/bin/env python
# SPDX-License-Identifier: ISC
"""
Test BGP Graceful Restart behavior for multihop eBGP peers.
Topology:
r1(AS65001, lo 10.1.1.1/32) ---- r2(AS65002) ---- r3(AS65003, lo 10.3.3.3/32)
   (GR restarting node)          (GR helper)            (GR helper)
eBGP peering is done over loopbacks (multihop). GR is enabled on both sides.
We verify that when r1 restarts, r3 retains routes as stale and keeps
forwarding state in kernel, then recovers when r1 comes back.
"""
import os
import sys
import json
import pytest
import functools
from lib import topotest
from lib.topogen import Topogen, TopoRouter, get_topogen
from lib.common_config import step, kill_router_daemons, start_router_daemons
from lib.topolog import logger
pytestmark = [pytest.mark.bgpd]
# Import topogen and required test modules
CWD = os.path.dirname(os.path.realpath(__file__))
sys.path.append(os.path.join(CWD, "../"))
def build_topo(tgen):
for routern in range(1, 4):
tgen.add_router("r{}".format(routern))
s1 = tgen.add_switch("s1")
s1.add_link(tgen.gears["r1"]) # r1-eth0
s1.add_link(tgen.gears["r2"]) # r2-eth0
s2 = tgen.add_switch("s2")
s2.add_link(tgen.gears["r2"]) # r2-eth1
s2.add_link(tgen.gears["r3"]) # r3-eth0
def setup_module(mod):
"""Set up the pytest environment."""
tgen = Topogen(build_topo, mod.__name__)
tgen.start_topology()
# Enable required daemons for all routers
router_list = tgen.routers()
for rname, router in router_list.items():
logger.info(f"Enabling daemons for router {rname}")
# Enable mgmtd, zebra, and bgpd
router.load_config(router.RD_MGMTD, "")
router.load_config(router.RD_ZEBRA, "")
router.load_config(router.RD_BGP, "")
# Load FRR configuration for each router
for rname, router in router_list.items():
logger.info(f"Loading config to router {rname}")
router.load_frr_config(os.path.join(CWD, f"{rname}/frr.conf"))
# Initialize all routers
tgen.start_router()
def teardown_module(mod):
tgen = get_topogen()
tgen.stop_topology()
def test_bgp_gr_multihop():
tgen = get_topogen()
if tgen.routers_have_failure():
pytest.skip(tgen.errors)
r1 = tgen.gears["r1"]
r2 = tgen.gears["r2"]
r3 = tgen.gears["r3"]
# Helper functions
def _bgp_converged_on_r3_via_r2():
# Convergence for the session r3<->r2
output = json.loads(r3.vtysh_cmd("show bgp ipv4 neighbors 10.0.2.1 json"))
n = output.get("10.0.2.1", {})
if n.get("bgpState") != "Established":
return {"bgpState": n.get("bgpState")}
afi = n.get("addressFamilyInfo", {}).get("ipv4Unicast", {})
if afi.get("acceptedPrefixCounter", 0) < 1:
return {"acceptedPrefixCounter": afi.get("acceptedPrefixCounter")}
return None
def _r3_has_stale_route():
# Verify that 10.1.1.2/32, 10.1.1.3/32, and 10.1.1.4/32 are marked as stale
stale_routes = ["10.1.1.2/32", "10.1.1.3/32", "10.1.1.4/32"]
for route in stale_routes:
output = json.loads(r3.vtysh_cmd(f"show bgp ipv4 unicast {route} json"))
expected = {"paths": [{"stale": True}]}
res = topotest.json_cmp(output, expected)
if res is not None:
return {route: res}
return None
def _r3_kernel_kept_route():
# Expect stale routes from r1 are retained in kernel
# These routes are 10.1.1.2/32, 10.1.1.3/32, and 10.1.1.4/32
stale_routes = ["10.1.1.2", "10.1.1.3", "10.1.1.4"]
expected_routes = [
{"dst": route, "gateway": "10.0.2.1", "metric": 20} for route in stale_routes
]
# Collect all routes from kernel for these prefixes
output = []
for route in stale_routes:
show = r3.cmd(f"ip -j route show {route}/32 proto bgp dev r3-eth0")
try:
# Output could be "[]" when not present
entries = json.loads(show)
except Exception:
entries = []
output.extend(entries)
# Now check all expected routes are present
def compare_kept_routes(output, expected):
# All expected routes must be present in output
for exp in expected:
found = False
for route in output:
if (
route.get("dst") == exp["dst"]
and route.get("gateway") == exp["gateway"]
and route.get("metric") == exp["metric"]
):
found = True
break
if not found:
return {"missing": exp}
return None
return compare_kept_routes(output, expected_routes)
def _r2_direct_ebgp_up():
out1 = json.loads(r2.vtysh_cmd("show bgp ipv4 neighbors 10.0.1.2 json"))
out2 = json.loads(r2.vtysh_cmd("show bgp ipv4 neighbors 10.0.2.2 json"))
n1 = out1.get("10.0.1.2", {}).get("bgpState") == "Established"
n2 = out2.get("10.0.2.2", {}).get("bgpState") == "Established"
return None if (n1 and n2) else {"r1": n1, "r3": n2}
def _r1_sessions_up_to_r2_r3():
n2 = json.loads(r1.vtysh_cmd("show bgp ipv4 neighbors 10.0.1.1 json")).get(
"10.0.1.1", {}
)
n3 = json.loads(r1.vtysh_cmd("show bgp ipv4 neighbors 10.3.3.3 json")).get(
"10.3.3.3", {}
)
ok = n2.get("bgpState") == "Established" and n3.get("bgpState") == "Established"
return None if ok else {"r2": n2.get("bgpState"), "r3": n3.get("bgpState")}
def _r1_verify_mh_peer_is_present():
output = r1.vtysh_cmd("show bgp ipv4 neighbors 10.3.3.3 json")
if not "Multihop GR peer exists" in output:
return None
else:
return output
def _r3_has_r1_routes_in_bgp():
# Before killing r1 bgpd, ensure r3 has r1's prefixes in BGP
prefixes = ["10.1.1.2/32", "10.1.1.3/32", "10.1.1.4/32"]
for pfx in prefixes:
output = json.loads(r3.vtysh_cmd(f"show bgp ipv4 unicast {pfx} json"))
paths = output.get("paths", [])
if not paths:
return {"bgp_missing": pfx}
return None
# Converge
step("Wait for direct eBGP sessions on r2 to establish")
test_func = functools.partial(_r2_direct_ebgp_up)
_, result = topotest.run_and_expect(test_func, None, count=60, wait=0.5)
assert result is None, "Failed to establish direct eBGP sessions on r2"
step("Verify R1 BGP sessions to R2 and R3 are Established")
test_func = functools.partial(_r1_sessions_up_to_r2_r3)
_, result = topotest.run_and_expect(test_func, None, count=60, wait=0.5)
assert result is None, "R1 BGP sessions to R2/R3 not Established"
step("Verify R1 BGP correctly detects that multihop peer exists")
test_func = functools.partial(_r1_verify_mh_peer_is_present)
_, result = topotest.run_and_expect(test_func, None, count=60, wait=0.5)
assert result is None, "R1 BGP did not detect that multihop peer exists"
# Pre-checks: r3 should have r1's prefixes in BGP and kernel before killing r1 bgpd
step("Verify r3 has r1 prefixes in BGP before r1 bgpd kill")
test_func = functools.partial(_r3_has_r1_routes_in_bgp)
_, result = topotest.run_and_expect(test_func, None, count=60, wait=0.5)
assert result is None, f"r3 missing r1 prefixes in BGP before kill: {result}"
step("Verify r3 kernel has r1 prefixes before r1 bgpd kill")
test_func = functools.partial(_r3_kernel_kept_route)
_, result = topotest.run_and_expect(test_func, None, count=60, wait=0.5)
assert result is None, f"r3 kernel missing r1 prefixes before kill: {result}"
# Stop only bgpd on r1 (simulate a BGP process failure, not full router restart)
step("Kill bgpd on r1")
kill_router_daemons(tgen, "r1", ["bgpd"]) # align with BGP_GR_TC_50_p1
# Verify retained stale in BGP
step("Verify r3 marks route from r1 as stale during GR")
test_func = functools.partial(_r3_has_stale_route)
_, result = topotest.run_and_expect(test_func, None, count=60, wait=0.5)
assert result is None, "Failed to see stale route retention on r3"
# Verify retained in kernel
step("Verify r3 keeps FIB route during GR")
assert _r3_kernel_kept_route() is None, "Kernel did not retain BGP route on r3"
# Get config file path and router object
source_config = os.path.join(CWD, "r1/frr.conf")
router_r1 = tgen.gears["r1"]
# Restart BGP daemon and load configuration using load_config
logger.info("Starting BGP daemon on r1...")
try:
start_router_daemons(tgen, "r1", ["bgpd"])
logger.info("BGP daemon start command completed")
# Apply BGP configuration using vtysh -f
logger.info(f"Applying BGP config from: {source_config}")
config_result = router_r1.cmd(f"vtysh -f {source_config}")
logger.info("BGP configuration applied successfully")
except Exception as e:
logger.error(f"Failed to start daemon or load BGP config: {e}")
raise
step("Verify R1 BGP sessions to R2 and R3 are Established after BGP on R1 is up")
test_func = functools.partial(_r1_sessions_up_to_r2_r3)
_, result = topotest.run_and_expect(test_func, None, count=60, wait=0.5)
assert result is None, "R1 BGP sessions to R2/R3 not Established"
def _r3_has_no_stale_prefixes():
for pfx in ["10.1.1.2/32", "10.1.1.3/32", "10.1.1.4/32"]:
output = json.loads(r3.vtysh_cmd(f"show bgp ipv4 unicast {pfx} json"))
# No 'stale': True flag should exist in active path anymore
if any(p.get("stale") for p in output.get("paths", [])):
return f"{pfx} still marked stale after recovery"
return None
step("Verify that prefixes from r1 are not marked stale after recovery")
test_func = functools.partial(_r3_has_no_stale_prefixes)
_, result = topotest.run_and_expect(test_func, None, count=60, wait=0.5)
assert result is None, result
def test_r1_kernel_retains_routes_on_bgpd_kill():
tgen = get_topogen()
if tgen.routers_have_failure():
pytest.skip(tgen.errors)
r1 = tgen.gears["r1"]
r2 = tgen.gears["r2"]
r3 = tgen.gears["r3"]
def _r1_neighbors_up():
n2 = json.loads(r1.vtysh_cmd("show bgp ipv4 neighbors 10.0.1.1 json")).get(
"10.0.1.1", {}
)
n3 = json.loads(r1.vtysh_cmd("show bgp ipv4 neighbors 10.3.3.3 json")).get(
"10.3.3.3", {}
)
if n2.get("bgpState") != "Established" or n3.get("bgpState") != "Established":
return {"r2": n2.get("bgpState"), "r3": n3.get("bgpState")}
return None
def _r1_kernel_has_routes():
# List of prefixes from r3
loopbacks = ["10.3.3.4", "10.3.3.5", "10.3.3.6"]
for lo in loopbacks:
out = json.loads(
r1.cmd(f"ip -j route show {lo}/32 proto bgp dev r1-eth0")
)
exp = [{"dst": lo, "gateway": "10.0.1.1", "metric": 20}]
cmp = topotest.json_cmp(out, exp)
if cmp:
return cmp
# Route to r3 LAN via r2 (advertised by r3, possibly best via multihop)
out2 = json.loads(
r1.cmd("ip -j route show 10.0.2.0/24 proto bgp dev r1-eth0")
)
exp2 = [{"dst": "10.0.2.0/24", "gateway": "10.0.1.1", "metric": 20}]
cmp2 = topotest.json_cmp(out2, exp2)
# Return first mismatch found
return cmp or cmp2
step("Ensure r1 BGP neighbors (r2 direct and r3 multihop) are Established")
_, result = topotest.run_and_expect(_r1_neighbors_up, None, count=60, wait=0.5)
assert result is None, "r1 neighbors not Established"
step("Verify r1 kernel has BGP routes before killing bgpd")
_, result = topotest.run_and_expect(_r1_kernel_has_routes, None, count=60, wait=0.5)
assert result is None, "r1 kernel missing expected BGP routes before kill"
step("Kill bgpd on r1 and verify kernel retains routes")
kill_router_daemons(tgen, "r1", ["bgpd"]) # kill only bgpd
# Routes should remain present during GR interval
_, result = topotest.run_and_expect(_r1_kernel_has_routes, None, count=60, wait=0.5)
assert result is None, "r1 kernel did not retain BGP routes after bgpd kill"
step("Start bgpd on r1 and re-verify neighbors")
# Get config file path and router object
source_config = os.path.join(CWD, "r1/frr.conf")
router_r1 = tgen.gears["r1"]
# Restart BGP daemon and load configuration using load_config
logger.info("Starting BGP daemon on r1...")
try:
start_router_daemons(tgen, "r1", ["bgpd"])
logger.info("BGP daemon start command completed")
# Apply BGP configuration using vtysh -f
logger.info(f"Applying BGP config from: {source_config}")
config_result = router_r1.cmd(f"vtysh -f {source_config}")
logger.info("BGP configuration applied successfully")
except Exception as e:
logger.error(f"Failed to start daemon or load BGP config: {e}")
raise
step("Verify R1 BGP sessions to R2 and R3 are Established after BGP on R1 is up")
_, result = topotest.run_and_expect(_r1_neighbors_up, None, count=60, wait=0.5)
assert result is None, "r1 neighbors not Established after bgpd restart"
if __name__ == "__main__":
args = ["-s"] + sys.argv[1:]
sys.exit(pytest.main(args))

View file

@ -30,6 +30,8 @@ import sys
import time
import pytest
from copy import deepcopy
import functools
from lib import topotest
# Save the Current Working Directory to find configuration files.
CWD = os.path.dirname(os.path.realpath(__file__))
@ -1337,7 +1339,10 @@ def test_verify_bgp_local_as_GR_EBGP_p0(request):
aspath = "110 200 100"
for addr_type in ADDR_TYPES:
input_static_r1 = {"r1": {"static_routes": [{"network": NETWORK[addr_type]}]}}
result = verify_bgp_rib(tgen, addr_type, dut, input_static_r1, aspath=aspath)
test_func = functools.partial(
verify_bgp_rib, tgen, addr_type, dut, input_static_r1, aspath=aspath
)
result, _ = topotest.run_and_expect(test_func, True, count=60, wait=1)
assert result is True, "Testcase {} : Failed \n Error: {}".format(
tc_name, result
)

View file

@ -42,7 +42,9 @@ import os
import sys
import time
import pytest
import functools
from copy import deepcopy
from lib import topotest
# Save the Current Working Directory to find configuration files.
CWD = os.path.dirname(os.path.realpath(__file__))
@ -1358,7 +1360,10 @@ def test_verify_bgp_local_as_GR_EBGP_p0(request):
aspath = "1.110 1.200 1.100"
for addr_type in ADDR_TYPES:
input_static_r1 = {"r1": {"static_routes": [{"network": NETWORK[addr_type]}]}}
result = verify_bgp_rib(tgen, addr_type, dut, input_static_r1, aspath=aspath)
test_func = functools.partial(
verify_bgp_rib, tgen, addr_type, dut, input_static_r1, aspath=aspath
)
result, _ = topotest.run_and_expect(test_func, True, count=60, wait=1)
assert result is True, "Testcase {} : Failed \n Error: {}".format(
tc_name, result
)

View file

@ -20,6 +20,45 @@ import babeltrace
########################### common parsers - start ############################
def print_location_gr_deferral_timer_start(field_val):
if field_val == 1:
return "Tier 1 deferral timer start"
elif field_val == 2:
return "Tier 2 deferral timer start"
def print_location_gr_eors(field_val):
if field_val == 1:
return "Check all EORs"
elif field_val == 2:
return "All dir conn EORs rcvd"
elif field_val == 3:
return "All multihop EORs NOT rcvd"
elif field_val == 4:
return "All EORs rcvd"
elif field_val == 5:
return "No multihop EORs pending"
elif field_val == 6:
return "EOR rcvd,check path select"
elif field_val == 7:
return "Do deferred path selection"
def print_location_gr_eor_peer(field_val):
if field_val == 1:
return "EOR awaited from"
elif field_val == 2:
return "EOR ignore"
elif field_val == 3:
return "Multihop EOR awaited"
elif field_val == 4:
return "Ignore EOR rcvd after tier1 expiry"
elif field_val == 5:
return "Dir conn EOR awaited"
def print_ip_addr(field_val):
"""
pretty print "struct ipaddr"
@ -268,6 +307,13 @@ def print_family_str(field_val):
return cmd_str
def location_gr_client_not_found(field_val):
if field_val == 1:
return "Process from GR queue"
elif field_val == 2:
return "Stale route delete from table"
############################ common parsers - end #############################
@ -464,6 +510,69 @@ def parse_frr_bgp_evpn_withdraw_type5(event):
"""
field_parsers = {"ip": print_ip_addr}
def parse_frr_bgp_gr_deferral_timer_start(event):
field_parsers = {
"location": print_location_gr_deferral_timer_start,
"afi": print_afi_string,
"safi": print_safi_string,
}
parse_event(event, field_parsers)
def parse_frr_bgp_gr_deferral_timer_expiry(event):
field_parsers = {"afi": print_afi_string, "safi": print_safi_string}
parse_event(event, field_parsers)
def parse_frr_bgp_gr_eors(event):
field_parsers = {
"location": print_location_gr_eors,
"afi": print_afi_string,
"safi": print_safi_string,
}
parse_event(event, field_parsers)
def parse_frr_bgp_gr_eor_peer(event):
field_parsers = {
"location": print_location_gr_eor_peer,
"afi": print_afi_string,
"safi": print_safi_string,
}
parse_event(event, field_parsers)
def parse_frr_bgp_gr_start_deferred_path_selection(event):
field_parsers = {"afi": print_afi_string, "safi": print_safi_string}
parse_event(event, field_parsers)
def parse_frr_bgp_gr_send_fbit_capability(event):
field_parsers = {"afi": print_afi_string, "safi": print_safi_string}
parse_event(event, field_parsers)
def parse_frr_bgp_gr_continue_deferred_path_selection(event):
field_parsers = {"afi": print_afi_string, "safi": print_safi_string}
parse_event(event, field_parsers)
def parse_frr_bgp_gr_zebra_update(event):
field_parsers = {"afi": print_afi_string, "safi": print_safi_string}
parse_event(event, field_parsers)
def parse_frr_zebra_gr_client_not_found(event):
field_parsers = {"location": location_gr_client_not_found}
parse_event(event, field_parsers)
@ -1261,6 +1370,15 @@ def main():
"frr_zebra:zevpn_build_l2vni_hash": parse_frr_zebra_zevpn_build_l2vni_hash,
"frr_zebra:zevpn_build_vni_hash": parse_frr_zebra_zevpn_build_vni_hash,
"frr_zebra:if_netlink_parse_error": parse_frr_zebra_if_netlink_parse_error,
"frr_bgp:gr_deferral_timer_start": parse_frr_bgp_gr_deferral_timer_start,
"frr_bgp:gr_deferral_timer_expiry": parse_frr_bgp_gr_deferral_timer_expiry,
"frr_bgp:gr_eors": parse_frr_bgp_gr_eors,
"frr_bgp:gr_eor_peer": parse_frr_bgp_gr_eor_peer,
"frr_bgp:gr_start_deferred_path_selection": parse_frr_bgp_gr_start_deferred_path_selection,
"frr_bgp:gr_send_fbit_capability": parse_frr_bgp_gr_send_fbit_capability,
"frr_bgp:gr_continue_deferred_path_selection": parse_frr_bgp_gr_continue_deferred_path_selection,
"frr_bgp:gr_zebra_update": parse_frr_bgp_gr_zebra_update,
"frr_zebra:gr_client_not_found": parse_frr_zebra_gr_client_not_found,
}
# get the trace path from the first command line argument

View file

@ -634,10 +634,12 @@ static inline struct nexthop_group *rib_get_fib_backup_nhg(
}
extern void zebra_gr_process_client(afi_t afi, vrf_id_t vrf_id, uint8_t proto, uint8_t instance,
time_t restart_time, bool stale_client_cleanup);
time_t restart_time, time_t update_pending_time,
bool stale_client_cleanup);
extern int rib_add_gr_run(afi_t afi, vrf_id_t vrf_id, uint8_t proto, uint8_t instance,
time_t restart_time, bool stale_client_cleanup);
time_t restart_time, time_t update_pending_time,
bool stale_client_cleanup);
extern void zebra_vty_init(void);
extern uint32_t zebra_rib_dplane_results_count(void);
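For contrast, condensed from the call sites in zebra_gr.c, the two calling conventions for the widened rib_add_gr_run() look like this — a normal deferred GR run after a client reports route-update completion, versus the final sweep for a stale client:

/* Normal GR run: both recorded timestamps are passed through. */
rib_add_gr_run(api.afi, api.vrf_id, client->proto, client->instance,
	       client->restart_time, client->update_pending_time, false);

/* Stale-client sweep: timestamps are zeroed, cleanup flag set. */
rib_add_gr_run(0, info->vrf_id, client->proto, client->instance, 0, 0, true);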

View file

@ -1217,6 +1217,7 @@ struct zebra_vtep *zebra_evpn_vtep_add(struct zebra_evpn *zevpn, struct ipaddr *
zvtep->vtep_ip = *vtep_ip;
zvtep->flood_control = flood_control;
zvtep->gr_refresh_time = monotime(NULL);
if (zevpn->vteps)
zevpn->vteps->prev = zvtep;
@ -1248,7 +1249,7 @@ int zebra_evpn_vtep_del(struct zebra_evpn *zevpn, struct zebra_vtep *zvtep)
* Delete all remote VTEPs for this EVPN (upon VNI delete). Also
* uninstall from kernel if asked to.
*/
int zebra_evpn_vtep_del_all(struct zebra_evpn *zevpn, int uninstall)
int zebra_evpn_vtep_del_all(struct zebra_evpn *zevpn, int uninstall, struct l2vni_walk_ctx *l2_wctx)
{
struct zebra_vtep *zvtep, *zvtep_next;
@ -1257,8 +1258,17 @@ int zebra_evpn_vtep_del_all(struct zebra_evpn *zevpn, int uninstall)
for (zvtep = zevpn->vteps; zvtep; zvtep = zvtep_next) {
zvtep_next = zvtep->next;
/*
* Skip if we are doing stale cleanup, but this entry is not
* stale.
*/
if (l2_wctx && l2_wctx->gr_stale_cleanup &&
(zvtep->gr_refresh_time > l2_wctx->gr_cleanup_time))
continue;
if (uninstall)
zebra_evpn_vtep_uninstall(zevpn, &zvtep->vtep_ip);
zebra_evpn_vtep_del(zevpn, zvtep);
}
@ -1338,11 +1348,11 @@ void zebra_evpn_cleanup_all(struct hash_bucket *bucket, void *arg)
zevpn = (struct zebra_evpn *)bucket->data;
/* Free up all neighbors and MACs, if any. */
zebra_evpn_neigh_del_all(zevpn, 1, 0, DEL_ALL_NEIGH);
zebra_evpn_mac_del_all(zevpn, 1, 0, DEL_ALL_MAC);
zebra_evpn_neigh_del_all(zevpn, 1, 0, DEL_ALL_NEIGH, NULL);
zebra_evpn_mac_del_all(zevpn, 1, 0, DEL_ALL_MAC, NULL);
/* Free up all remote VTEPs, if any. */
zebra_evpn_vtep_del_all(zevpn, 1);
zebra_evpn_vtep_del_all(zevpn, 1, NULL);
/* Delete the hash entry. */
zebra_evpn_del(zevpn);
@ -1474,6 +1484,8 @@ void zebra_evpn_rem_macip_add(vni_t vni, const struct ethaddr *macaddr, uint16_t
}
zebra_evpn_vtep_install(zevpn, zvtep);
} else {
zvtep->gr_refresh_time = monotime(NULL);
}
}
@ -1619,15 +1631,17 @@ void zebra_evpn_rem_macip_del(vni_t vni, const struct ethaddr *macaddr, uint16_t
void zebra_evpn_cfg_cleanup(struct hash_bucket *bucket, void *ctxt)
{
struct zebra_evpn *zevpn = NULL;
struct l2vni_walk_ctx *wctx = ctxt;
zevpn = (struct zebra_evpn *)bucket->data;
zevpn->advertise_gw_macip = 0;
zevpn->advertise_svi_macip = 0;
zevpn->advertise_subnet = 0;
zebra_evpn_neigh_del_all(zevpn, 1, 0,
DEL_REMOTE_NEIGH | DEL_REMOTE_NEIGH_FROM_VTEP);
zebra_evpn_mac_del_all(zevpn, 1, 0,
DEL_REMOTE_MAC | DEL_REMOTE_MAC_FROM_VTEP);
zebra_evpn_vtep_del_all(zevpn, 1);
if (!wctx->gr_stale_cleanup) {
zevpn->advertise_gw_macip = 0;
zevpn->advertise_svi_macip = 0;
zevpn->advertise_subnet = 0;
}
zebra_evpn_neigh_del_all(zevpn, 1, 0, DEL_REMOTE_NEIGH | DEL_REMOTE_NEIGH_FROM_VTEP, wctx);
zebra_evpn_mac_del_all(zevpn, 1, 0, DEL_REMOTE_MAC | DEL_REMOTE_MAC_FROM_VTEP, wctx);
zebra_evpn_vtep_del_all(zevpn, 1, wctx);
}
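The VTEP, MAC, and neighbor walks in this file all apply the same keep-or-delete test: during GR stale cleanup an entry survives only if it was refreshed after the recorded cleanup time. A minimal restatement of that predicate, as an illustrative helper that is not part of this change (note the walks differ by one tick on the exact-equality boundary):

/* Illustrative only. With no walk context, the legacy unconditional
 * deletion applies; otherwise delete iff the entry was not refreshed
 * after the cleanup timestamp. */
static inline bool gr_entry_is_stale(uint64_t gr_refresh_time,
				     const struct l2vni_walk_ctx *wctx)
{
	if (!wctx || !wctx->gr_stale_cleanup)
		return true;
	return gr_refresh_time <= wctx->gr_cleanup_time;
}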

View file

@ -51,6 +51,12 @@ struct zebra_vtep {
/* Links. */
struct zebra_vtep *next;
struct zebra_vtep *prev;
/*
* Timestamp of when this entry was created/refreshed.
* This field is used for GR stale entry cleanup.
*/
uint64_t gr_refresh_time;
};
/*
@ -148,6 +154,11 @@ static inline struct interface *zevpn_map_to_svi(struct zebra_evpn *zevpn)
return zvni_map_to_svi(vni->access_vlan, zif->brslave_info.br_if);
}
struct l2vni_walk_ctx {
bool gr_stale_cleanup;
uint64_t gr_cleanup_time;
};
int advertise_gw_macip_enabled(struct zebra_evpn *zevpn);
int advertise_svi_macip_enabled(struct zebra_evpn *zevpn);
void zebra_evpn_print(struct zebra_evpn *zevpn, void **ctxt);
@ -191,7 +202,8 @@ struct zebra_vtep *zebra_evpn_vtep_find(struct zebra_evpn *zevpn, struct ipaddr
struct zebra_vtep *zebra_evpn_vtep_add(struct zebra_evpn *zevpn, struct ipaddr *vtep_ip,
int flood_control);
int zebra_evpn_vtep_del(struct zebra_evpn *zevpn, struct zebra_vtep *zvtep);
int zebra_evpn_vtep_del_all(struct zebra_evpn *zevpn, int uninstall);
int zebra_evpn_vtep_del_all(struct zebra_evpn *zevpn, int uninstall,
struct l2vni_walk_ctx *l2_wctx);
int zebra_evpn_vtep_install(struct zebra_evpn *zevpn, struct zebra_vtep *zvtep);
int zebra_evpn_vtep_uninstall(struct zebra_evpn *zevpn, struct ipaddr *vtep_ip);
void zebra_evpn_handle_flooding_remote_vteps(struct hash_bucket *bucket, void *args[]);
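A short caller-side sketch of the new parameter, assuming a zevpn is already in hand and a gr_cleanup_time was recorded when GR began: passing NULL keeps the old unconditional deletion, while a populated context limits deletion to entries that have not been refreshed since the cleanup time.

struct l2vni_walk_ctx wctx = {
	.gr_stale_cleanup = true,
	.gr_cleanup_time = gr_cleanup_time, /* assumed recorded earlier */
};

zebra_evpn_vtep_del_all(zevpn, 1, &wctx); /* stale entries only */
zebra_evpn_vtep_del_all(zevpn, 1, NULL);  /* unconditional, as before */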

View file

@ -1092,6 +1092,8 @@ struct zebra_mac *zebra_evpn_mac_add(struct zebra_evpn *zevpn,
mac->neigh_list->cmp = neigh_list_cmp;
mac->uptime = monotime(NULL);
mac->gr_refresh_time = monotime(NULL);
if (IS_ZEBRA_DEBUG_VXLAN || IS_ZEBRA_DEBUG_EVPN_MH_MAC) {
char mac_buf[MAC_BUF_SIZE];
@ -1184,15 +1186,34 @@ static bool zebra_evpn_check_mac_del_from_db(struct mac_walk_ctx *wctx,
CHECK_FLAG(mac->flags, ZEBRA_MAC_LOCAL))
return true;
else if (CHECK_FLAG(wctx->flags, DEL_REMOTE_MAC) &&
CHECK_FLAG(mac->flags, ZEBRA_MAC_REMOTE))
return true;
else if (CHECK_FLAG(wctx->flags, DEL_REMOTE_MAC_FROM_VTEP) &&
CHECK_FLAG(mac->flags, ZEBRA_MAC_REMOTE) &&
ipaddr_is_same(&mac->fwd_info.r_vtep_ip, &wctx->r_vtep_ip))
return true;
else if (CHECK_FLAG(wctx->flags, DEL_LOCAL_MAC) &&
CHECK_FLAG(mac->flags, ZEBRA_MAC_AUTO) &&
!listcount(mac->neigh_list)) {
CHECK_FLAG(mac->flags, ZEBRA_MAC_REMOTE)) {
if (wctx->gr_stale_cleanup) {
/*
* If zebra is doing stale cleanup, then return true
* only if this is a stale remote MAC entry.
* Return false if this entry was refreshed.
*/
if (mac->gr_refresh_time < wctx->gr_cleanup_time)
return true;
} else {
return true;
}
} else if (CHECK_FLAG(wctx->flags, DEL_REMOTE_MAC_FROM_VTEP) &&
CHECK_FLAG(mac->flags, ZEBRA_MAC_REMOTE) &&
ipaddr_is_same(&mac->fwd_info.r_vtep_ip, &wctx->r_vtep_ip)) {
if (wctx->gr_stale_cleanup) {
/*
* If zebra is doing stale cleanup, then return true
* only if this is a stale remote MAC entry.
* Return false if this entry was refreshed.
*/
if (mac->gr_refresh_time < wctx->gr_cleanup_time)
return true;
} else {
return true;
}
} else if (CHECK_FLAG(wctx->flags, DEL_LOCAL_MAC) &&
CHECK_FLAG(mac->flags, ZEBRA_MAC_AUTO) && !listcount(mac->neigh_list)) {
if (IS_ZEBRA_DEBUG_VXLAN) {
char mac_buf[MAC_BUF_SIZE];
@ -1242,8 +1263,8 @@ static void zebra_evpn_mac_del_hash_entry(struct hash_bucket *bucket, void *arg)
/*
* Delete all MAC entries for this EVPN.
*/
void zebra_evpn_mac_del_all(struct zebra_evpn *zevpn, int uninstall,
int upd_client, uint32_t flags)
void zebra_evpn_mac_del_all(struct zebra_evpn *zevpn, int uninstall, int upd_client,
uint32_t flags, struct l2vni_walk_ctx *l2_wctx)
{
struct mac_walk_ctx wctx;
@ -1255,6 +1276,10 @@ void zebra_evpn_mac_del_all(struct zebra_evpn *zevpn, int uninstall,
wctx.uninstall = uninstall;
wctx.upd_client = upd_client;
wctx.flags = flags;
if (l2_wctx) {
wctx.gr_stale_cleanup = l2_wctx->gr_stale_cleanup;
wctx.gr_cleanup_time = l2_wctx->gr_cleanup_time;
}
hash_iterate(zevpn->mac_table, zebra_evpn_mac_del_hash_entry, &wctx);
}
@ -1694,6 +1719,7 @@ struct zebra_mac *zebra_evpn_proc_sync_mac_update(struct zebra_evpn *zevpn,
bool remote_gw;
mac->uptime = monotime(NULL);
mac->gr_refresh_time = monotime(NULL);
old_flags = mac->flags;
sticky = !!CHECK_FLAG(old_flags, ZEBRA_MAC_STICKY);
@ -1981,6 +2007,10 @@ int zebra_evpn_mac_remote_macip_add(struct zebra_evpn *zevpn, struct zebra_vrf *
remote_gw = !!CHECK_FLAG(flags, ZEBRA_MACIP_TYPE_GW);
mac = zebra_evpn_mac_lookup(zevpn, macaddr);
if (mac) {
/* Refresh the timestamp */
mac->gr_refresh_time = monotime(NULL);
}
/* Ignore if the mac is already present as a gateway mac */
if (mac && CHECK_FLAG(mac->flags, ZEBRA_MAC_DEF_GW) &&
@ -2147,6 +2177,8 @@ int zebra_evpn_add_update_local_mac(struct zebra_vrf *zvrf,
SET_FLAG(mac->flags, ZEBRA_MAC_STICKY);
inform_client = true;
} else {
mac->gr_refresh_time = monotime(NULL);
if (IS_ZEBRA_DEBUG_VXLAN || IS_ZEBRA_DEBUG_EVPN_MH_MAC) {
char mac_buf[MAC_BUF_SIZE];
@ -2432,6 +2464,7 @@ void zebra_evpn_mac_gw_macip_add(struct interface *ifp, struct zebra_evpn *zevpn
} else
mac = *macp;
mac->gr_refresh_time = monotime(NULL);
/* Set "local" forwarding info. */
zebra_evpn_mac_clear_fwd_info(mac);
SET_FLAG(mac->flags, ZEBRA_MAC_LOCAL);
@ -2501,3 +2534,75 @@ void zebra_evpn_mac_svi_add(struct interface *ifp, struct zebra_evpn *zevpn)
new_bgp_ready = zebra_evpn_mac_is_ready_for_bgp(mac->flags);
zebra_evpn_mac_send_add_del_to_client(mac, old_bgp_ready, new_bgp_ready);
}
static void zebra_vxlan_stale_remote_mac_add_l2vni(struct zebra_evpn *zevpn,
struct ethaddr *macaddr, struct ipaddr vtep_ip,
bool sticky)
{
struct zebra_mac *mac;
mac = zebra_evpn_mac_lookup(zevpn, macaddr);
if (mac) {
if (IS_ZEBRA_DEBUG_VXLAN)
zlog_debug("EVPN-GR: Remote %sMAC %pEA (%p) zevpn %p,VTEP %pIA L2VNI %d exists",
sticky ? "sticky " : "", macaddr, mac, zevpn, &vtep_ip,
zevpn->vni);
return;
}
/* Create remote MAC entry in table */
mac = zebra_evpn_mac_add(zevpn, macaddr);
if (!mac) {
zlog_debug("EVPN-GR: Failed to add remote MAC %pEA, VTEP %pIA, L2VNI %d", macaddr,
&vtep_ip, zevpn->vni);
return;
}
/* Set "remote" forwarding info. */
SET_FLAG(mac->flags, ZEBRA_MAC_REMOTE);
mac->fwd_info.r_vtep_ip = vtep_ip;
/*
* Sticky could be set either when ZEBRA_MAC_STICKY or
* ZEBRA_MAC_REMOTE_DEF_GW is set. So set both here.
* If one of them is not required, then zebra will
* update it correctly when BGP downloads the remote MAC
* to zebra.
*/
if (sticky) {
SET_FLAG(mac->flags, ZEBRA_MAC_STICKY);
SET_FLAG(mac->flags, ZEBRA_MAC_REMOTE_DEF_GW);
} else {
UNSET_FLAG(mac->flags, ZEBRA_MAC_STICKY);
UNSET_FLAG(mac->flags, ZEBRA_MAC_REMOTE_DEF_GW);
}
if (IS_ZEBRA_DEBUG_VXLAN)
zlog_debug("EVPN-GR: Added stale remote %sMAC %pEA (%p) zevpn %p, VTEP %pIA L2VNI %d",
sticky ? "sticky " : "", macaddr, mac, zevpn, &vtep_ip, zevpn->vni);
}
void zebra_vxlan_stale_remote_mac_add(struct ethaddr *macaddr, struct ipaddr vtep_ip, bool sticky,
vni_t vni)
{
struct zebra_evpn *zevpn;
struct zebra_l3vni *zl3vni = NULL;
/* Restore remote Router-MAC */
zl3vni = zl3vni_lookup(vni);
if (zl3vni) {
zebra_vxlan_stale_remote_mac_add_l3vni(zl3vni, macaddr, vtep_ip);
return;
}
/* Restore remote MAC */
zevpn = zebra_evpn_lookup(vni);
if (!zevpn || !zevpn->vxlan_if) {
if (IS_ZEBRA_DEBUG_VXLAN)
zlog_debug("EVPN-GR: Add of remote %sMAC %pEA VNI %u, could not find EVPN inst/intf (%p)",
sticky ? "sticky " : "", macaddr, vni, zevpn);
return;
}
zebra_vxlan_stale_remote_mac_add_l2vni(zevpn, macaddr, vtep_ip, sticky);
}

View file

@ -130,6 +130,12 @@ struct zebra_mac {
uint32_t sync_neigh_cnt;
time_t uptime;
/*
* Timestamp of when this entry was created/refreshed.
* This field is used for GR stale entry cleanup.
*/
uint64_t gr_refresh_time;
};
/*
@ -154,6 +160,8 @@ struct mac_walk_ctx {
uint32_t count; /* Used by VTY handlers */
struct json_object *json; /* Used for JSON Output */
bool print_dup; /* Used to print dup addr list */
bool gr_stale_cleanup; /* Used for cleaning up stale entries for GR */
uint64_t gr_cleanup_time;
};
struct rmac_walk_ctx {
@ -223,8 +231,8 @@ void zebra_evpn_mac_send_add_del_to_client(struct zebra_mac *mac,
bool old_bgp_ready,
bool new_bgp_ready);
void zebra_evpn_mac_del_all(struct zebra_evpn *zevi, int uninstall,
int upd_client, uint32_t flags);
void zebra_evpn_mac_del_all(struct zebra_evpn *zevi, int uninstall, int upd_client, uint32_t flags,
struct l2vni_walk_ctx *l2_wctx);
int zebra_evpn_mac_send_add_to_client(vni_t vni, const struct ethaddr *macaddr,
uint32_t mac_flags, uint32_t seq,
struct zebra_evpn_es *es);
@ -264,6 +272,8 @@ void zebra_evpn_mac_svi_add(struct interface *ifp, struct zebra_evpn *zevpn);
void zebra_evpn_mac_svi_del(struct interface *ifp, struct zebra_evpn *zevpn);
void zebra_evpn_mac_ifp_del(struct interface *ifp);
void zebra_evpn_mac_clear_fwd_info(struct zebra_mac *zmac);
extern void zebra_vxlan_stale_remote_mac_add(struct ethaddr *macaddr, struct ipaddr vtep_ip,
bool sticky, vni_t vni);
#ifdef __cplusplus
}

View file

@ -565,6 +565,7 @@ static struct zebra_neigh *zebra_evpn_neigh_add(struct zebra_evpn *zevpn,
n->dad_ip_auto_recovery_timer = NULL;
n->flags = n_flags;
n->uptime = monotime(NULL);
n->gr_refresh_time = monotime(NULL);
if (!zmac)
zmac = zebra_evpn_mac_lookup(zevpn, mac);
@ -798,6 +799,7 @@ struct zebra_neigh *zebra_evpn_proc_sync_neigh_update(
}
n->uptime = monotime(NULL);
n->gr_refresh_time = monotime(NULL);
}
/* update the neigh seq. we don't bother with the mac seq as
@ -885,6 +887,14 @@ static void zebra_evpn_neigh_del_hash_entry(struct hash_bucket *bucket,
((wctx->flags & DEL_REMOTE_NEIGH) && (n->flags & ZEBRA_NEIGH_REMOTE)) ||
((wctx->flags & DEL_REMOTE_NEIGH_FROM_VTEP) && (n->flags & ZEBRA_NEIGH_REMOTE) &&
ipaddr_is_same(&n->r_vtep_ip, &wctx->r_vtep_ip))) {
/*
* If we are doing stale cleanup of remote neighs
* and if this neigh is not marked stale, then don't delete it.
*/
if (wctx->gr_stale_cleanup && CHECK_FLAG(n->flags, ZEBRA_NEIGH_REMOTE) &&
(n->gr_refresh_time > wctx->gr_cleanup_time))
return;
if (wctx->upd_client && (n->flags & ZEBRA_NEIGH_LOCAL))
zebra_evpn_neigh_send_del_to_client(
wctx->zevpn->vni, &n->ip, &n->emac, n->flags,
@ -909,8 +919,8 @@ static void zebra_evpn_neigh_del_hash_entry(struct hash_bucket *bucket,
/*
* Delete all neighbor entries for this EVPN.
*/
void zebra_evpn_neigh_del_all(struct zebra_evpn *zevpn, int uninstall,
int upd_client, uint32_t flags)
void zebra_evpn_neigh_del_all(struct zebra_evpn *zevpn, int uninstall, int upd_client,
uint32_t flags, struct l2vni_walk_ctx *l2_wctx)
{
struct neigh_walk_ctx wctx;
@ -922,6 +932,10 @@ void zebra_evpn_neigh_del_all(struct zebra_evpn *zevpn, int uninstall,
wctx.uninstall = uninstall;
wctx.upd_client = upd_client;
wctx.flags = flags;
if (l2_wctx) {
wctx.gr_stale_cleanup = l2_wctx->gr_stale_cleanup;
wctx.gr_cleanup_time = l2_wctx->gr_cleanup_time;
}
hash_iterate(zevpn->neigh_table, zebra_evpn_neigh_del_hash_entry,
&wctx);
@ -1349,6 +1363,8 @@ int zebra_evpn_local_neigh_update(struct zebra_evpn *zevpn,
n->ifindex = ifp->ifindex;
created = true;
} else {
n->gr_refresh_time = monotime(NULL);
if (CHECK_FLAG(n->flags, ZEBRA_NEIGH_LOCAL)) {
bool mac_different;
bool cur_is_router;
@ -1614,19 +1630,63 @@ int zebra_evpn_local_neigh_update(struct zebra_evpn *zevpn,
return 0;
}
int zebra_evpn_remote_neigh_update(struct zebra_evpn *zevpn,
struct interface *ifp,
const struct ipaddr *ip,
const struct ethaddr *macaddr,
uint16_t state)
static void zebra_evpn_stale_remote_neigh_add(struct zebra_evpn *zevpn, const struct ipaddr *ip,
const struct ethaddr *macaddr, bool is_router)
{
struct zebra_neigh *n = NULL;
struct zebra_mac *zmac = NULL;
/* Nothing to do if the entry already exists */
if (zebra_evpn_neigh_lookup(zevpn, ip))
return;
/* Check if the MAC exists. */
zmac = zebra_evpn_mac_lookup(zevpn, macaddr);
if (!zmac) {
if (IS_ZEBRA_DEBUG_VXLAN)
zlog_debug("EVPN-GR: zmac for MAC %pEA not found. L2VNI %u", macaddr,
zevpn->vni);
return;
}
/* New neighbor - create */
n = zebra_evpn_neigh_add(zevpn, ip, macaddr, zmac, 0);
if (!n) {
if (IS_ZEBRA_DEBUG_VXLAN)
zlog_debug("EVPN-GR: Can't create neigh entry for IP %pIA MAC %pEA, L2VNI %u",
ip, macaddr, zevpn->vni);
return;
}
/* Set "remote" forwarding info. */
SET_FLAG(n->flags, ZEBRA_NEIGH_REMOTE);
ZEBRA_NEIGH_SET_ACTIVE(n);
n->r_vtep_ip = zmac->fwd_info.r_vtep_ip;
if (is_router)
SET_FLAG(n->flags, ZEBRA_NEIGH_ROUTER_FLAG);
else
UNSET_FLAG(n->flags, ZEBRA_NEIGH_ROUTER_FLAG);
if (IS_ZEBRA_DEBUG_VXLAN)
zlog_debug("EVPN-GR: Added stale remote %sneigh entry IP %pIA MAC %pEA, L2VNI %u",
is_router ? "router " : "", ip, macaddr, zevpn->vni);
}
int zebra_evpn_remote_neigh_update(struct zebra_evpn *zevpn, struct interface *ifp,
const struct ipaddr *ip, const struct ethaddr *macaddr,
uint16_t state, bool is_router)
{
struct zebra_neigh *n = NULL;
struct zebra_mac *zmac = NULL;
/* If the neighbor is unknown, there is no further action. */
n = zebra_evpn_neigh_lookup(zevpn, ip);
if (!n)
if (!n) {
if (zrouter.graceful_restart)
zebra_evpn_stale_remote_neigh_add(zevpn, ip, macaddr, is_router);
return 0;
}
/* If a remote entry, see if it needs to be refreshed */
if (CHECK_FLAG(n->flags, ZEBRA_NEIGH_REMOTE)) {
@ -2081,6 +2141,11 @@ void zebra_evpn_neigh_remote_macip_add(struct zebra_evpn *zevpn, struct zebra_vr
* change. If so, create or update and then install the entry.
*/
n = zebra_evpn_neigh_lookup(zevpn, ipaddr);
if (n) {
/* Refresh entry */
n->gr_refresh_time = monotime(NULL);
}
if (!n || !CHECK_FLAG(n->flags, ZEBRA_NEIGH_REMOTE) ||
is_router != !!CHECK_FLAG(n->flags, ZEBRA_NEIGH_ROUTER_FLAG) ||
(memcmp(&n->emac, &mac->macaddr, sizeof(struct ethaddr)) != 0) ||
@ -2198,6 +2263,8 @@ int zebra_evpn_neigh_gw_macip_add(struct interface *ifp,
n = zebra_evpn_neigh_lookup(zevpn, ip);
if (!n)
n = zebra_evpn_neigh_add(zevpn, ip, &mac->macaddr, mac, 0);
else
n->gr_refresh_time = monotime(NULL);
/* Set "local" forwarding info. */
SET_FLAG(n->flags, ZEBRA_NEIGH_LOCAL);

View file

@ -99,9 +99,14 @@ struct zebra_neigh {
time_t dad_dup_detect_time;
time_t uptime;
/* used for ageing out the PEER_ACTIVE flag */
struct event *hold_timer;
/*
* Timestamp of when this entry was created/refreshed.
* This field is used for GR stale entry cleanup.
*/
uint64_t gr_refresh_time;
};
/*
@ -127,6 +132,9 @@ struct neigh_walk_ctx {
uint8_t addr_width; /* Used by VTY handlers */
uint8_t r_vtep_width; /* Used by VTY handlers */
struct json_object *json; /* Used for JSON Output */
bool gr_stale_cleanup; /* Used for cleaning up stale entries after GR */
uint64_t gr_cleanup_time;
};
/**************************** SYNC neigh handling **************************/
@ -220,8 +228,8 @@ struct zebra_neigh *zebra_evpn_proc_sync_neigh_update(
struct zebra_evpn *zevpn, struct zebra_neigh *n, uint16_t ipa_len,
const struct ipaddr *ipaddr, uint8_t flags, uint32_t seq,
const esi_t *esi, struct zebra_mac *mac);
void zebra_evpn_neigh_del_all(struct zebra_evpn *zevpn, int uninstall,
int upd_client, uint32_t flags);
void zebra_evpn_neigh_del_all(struct zebra_evpn *zevpn, int uninstall, int upd_client,
uint32_t flags, struct l2vni_walk_ctx *l2_wctx);
struct zebra_neigh *zebra_evpn_neigh_lookup(struct zebra_evpn *zevpn,
const struct ipaddr *ip);
@ -242,11 +250,9 @@ int zebra_evpn_local_neigh_update(struct zebra_evpn *zevpn,
const struct ipaddr *ip,
const struct ethaddr *macaddr, bool is_router,
bool local_inactive, bool dp_static);
int zebra_evpn_remote_neigh_update(struct zebra_evpn *zevpn,
struct interface *ifp,
const struct ipaddr *ip,
const struct ethaddr *macaddr,
uint16_t state);
int zebra_evpn_remote_neigh_update(struct zebra_evpn *zevpn, struct interface *ifp,
const struct ipaddr *ip, const struct ethaddr *macaddr,
uint16_t state, bool is_router);
void zebra_evpn_send_neigh_to_client(struct zebra_evpn *zevpn);
void zebra_evpn_clear_dup_neigh_hash(struct hash_bucket *bucket, void *ctxt);
void zebra_evpn_print_neigh(struct zebra_neigh *n, void *ctxt,

View file

@ -33,9 +33,21 @@
#include "zebra/debug.h"
#include "zebra/zapi_msg.h"
#include "zebra/zebra_trace.h"
#include "zebra/zebra_vxlan.h"
DEFINE_MTYPE_STATIC(ZEBRA, ZEBRA_GR, "GR");
struct zebra_gr_afi_clean {
/* Do NOT store pointers to client_gr_info; it can be freed asynchronously. */
vrf_id_t vrf_id;
afi_t afi;
uint8_t proto;
uint8_t instance;
time_t restart_time;
struct event *t_gac;
time_t update_pending_time;
};
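Condensed from the scheduling sites below (the exact event call varies by site), a cleanup unit is built and queued like this; because only plain identifiers are stored, the queued event stays safe even if the originating client is deleted before it runs:

struct zebra_gr_afi_clean *gac = XCALLOC(MTYPE_ZEBRA_GR, sizeof(*gac));

gac->vrf_id = info->vrf_id;
gac->afi = afi;
gac->proto = proto;
gac->instance = instance;
gac->restart_time = restart_time;
gac->update_pending_time = update_pending_time;
event_add_timer(zrouter.master, zebra_gr_delete_stale_route_table_afi,
		gac, ZEBRA_DEFAULT_STALE_UPDATE_DELAY, &gac->t_gac);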
/*
* Forward declaration.
*/
@ -45,6 +57,9 @@ static int32_t zebra_gr_delete_stale_routes(struct client_gr_info *info);
static void zebra_gr_process_client_stale_routes(struct zserv *client,
struct client_gr_info *info);
static void zebra_gr_delete_stale_route_table_afi(struct event *event);
static bool zebra_gr_unicast_stale_route_delete(struct route_table *table,
struct zebra_gr_afi_clean *gac, bool no_max);
/*
* Debug macros.
*/
@ -271,16 +286,6 @@ void zebra_gr_client_reconnect(struct zserv *client)
zserv_client_delete(old_client);
}
struct zebra_gr_afi_clean {
struct client_gr_info *info;
afi_t afi;
uint8_t proto;
uint8_t instance;
time_t restart_time;
struct event *t_gac;
};
/*
* Functions to deal with capabilities
*/
@ -320,7 +325,7 @@ void zread_client_capabilities(ZAPI_HANDLER_ARGS)
* If this ever matters uncomment and add safi to the
* arrays as needed to track
*/
if (api.safi != SAFI_UNICAST)
if (api.safi != SAFI_UNICAST && api.safi != SAFI_EVPN)
return;
/* GR only for dynamic clients */
@ -349,6 +354,9 @@ void zread_client_capabilities(ZAPI_HANDLER_ARGS)
zebra_route_string(client->proto),
client->gr_instance_count);
frrtrace(3, frr_zebra, gr_client_capability, api.cap, api.vrf_id,
client->gr_instance_count);
if ((info->gr_enable) && (client->gr_instance_count > 0))
client->gr_instance_count--;
@ -363,11 +371,24 @@ void zread_client_capabilities(ZAPI_HANDLER_ARGS)
if (!info->gr_enable) {
client->gr_instance_count++;
if (!zrouter.gr_stale_cleanup_time_recorded) {
/*
* Record the time at which GR started.
* This timestamp will later be used to
* clean up stale routes and EVPN entries.
*/
client->restart_time = monotime(NULL);
zrouter.gr_stale_cleanup_time_recorded = true;
}
LOG_GR("%s: Cient %s vrf %s(%u) GR enabled count %d",
__func__, zebra_route_string(client->proto),
VRF_LOGNAME(vrf), api.vrf_id,
client->gr_instance_count);
frrtrace(3, frr_zebra, gr_client_capability, api.cap, api.vrf_id,
client->gr_instance_count);
info->capabilities = api.cap;
info->stale_removal_time = api.stale_removal_time;
info->vrf_id = api.vrf_id;
@ -386,6 +407,9 @@ void zread_client_capabilities(ZAPI_HANDLER_ARGS)
info->stale_removal_time,
api.stale_removal_time);
frrtrace(3, frr_zebra, gr_client_stale_time, api.cap, api.vrf_id,
api.stale_removal_time);
info->stale_removal_time = api.stale_removal_time;
}
@ -401,13 +425,16 @@ void zread_client_capabilities(ZAPI_HANDLER_ARGS)
LOG_GR("%s: Client %s vrf %s(%u) route update complete for AFI %d, SAFI %d",
__func__, zebra_route_string(client->proto),
VRF_LOGNAME(vrf), info->vrf_id, api.afi, api.safi);
frrtrace(4, frr_zebra, gr_client_update, api.cap, info->vrf_id, api.afi, api.safi);
info->route_sync[api.afi] = true;
/*
* Schedule for after anything already in the meta Q
*/
rib_add_gr_run(api.afi, api.vrf_id, client->proto, client->instance,
client->restart_time, false);
client->restart_time, client->update_pending_time, false);
zebra_gr_process_client_stale_routes(client, info);
break;
case ZEBRA_CLIENT_ROUTE_UPDATE_PENDING:
@ -421,7 +448,14 @@ void zread_client_capabilities(ZAPI_HANDLER_ARGS)
VRF_LOGNAME(vrf), info->vrf_id, api.afi,
api.safi);
frrtrace(4, frr_zebra, gr_client_update, api.cap, info->vrf_id, api.afi,
api.safi);
info->af_enabled[api.afi] = true;
if (!zrouter.gr_update_pending_time_recorded) {
client->update_pending_time = monotime(NULL);
zrouter.gr_update_pending_time_recorded = true;
}
}
break;
}
@ -431,6 +465,67 @@ void zread_client_capabilities(ZAPI_HANDLER_ARGS)
* Stale route handling
*/
static bool zebra_gr_enabled_for_vrf(struct zserv *client, vrf_id_t vrf_id)
{
struct client_gr_info *info = NULL;
TAILQ_FOREACH (info, &client->gr_info_queue, gr_info) {
if (info->vrf_id == vrf_id && info->gr_enable)
return true;
}
return false;
}
/* Clean up stale IPv4 and IPv6 unicast routes that
* were imported from the default EVPN VRF into a
* GR-disabled destination VRF and installed in the
* kernel in that destination VRF.
*/
static void zebra_gr_cleanup_of_non_gr_vrf(struct zebra_gr_afi_clean *gac)
{
struct vrf *vrf;
struct zebra_vrf *zvrf;
struct route_table *table;
afi_t afi;
struct zserv *client = zserv_find_client(gac->proto, gac->instance);
RB_FOREACH (vrf, vrf_id_head, &vrfs_by_id) {
zvrf = vrf->info;
if (!zvrf)
continue;
/*
* Skip if this is the default EVPN VRF.
*/
if (zvrf == zebra_vrf_get_evpn())
continue;
/*
* If GR is enabled for this VRF, then zebra
* would have done the stale cleanup when BGP
* indicated UPDATE_COMPLETE for this VRF for
* all gr-enabled afi-safis. So skip such VRFs.
*/
if (zebra_gr_enabled_for_vrf(client, vrf->vrf_id))
continue;
for (afi = AFI_IP; afi <= AFI_IP6; afi++) {
table = zvrf->table[afi][SAFI_UNICAST];
if (!table)
continue;
LOG_GR("EVPN-GR: Cleaning up imported stale afi:%d unicast routes in %s(%u)",
afi, vrf->name, vrf->vrf_id);
/*
* Cleanup stale unicast routes
*/
zebra_gr_unicast_stale_route_delete(table, gac, true);
}
}
}
/*
* Delete all the stale routes that have not been refreshed
* post restart.
@ -459,7 +554,7 @@ static void zebra_gr_route_stale_delete_timer_expiry(struct event *event)
}
/* Schedule GR info and stale client deletion */
rib_add_gr_run(0, info->vrf_id, client->proto, client->instance, 0, true);
rib_add_gr_run(0, info->vrf_id, client->proto, client->instance, 0, 0, true);
}
@ -474,15 +569,11 @@ static bool zebra_gr_process_route_entry(struct route_node *rn,
time_t compare_time, uint8_t proto)
{
struct nexthop *nexthop;
char buf[PREFIX2STR_BUFFER];
/* If the route is not refreshed after restart, delete the entry */
if (re->uptime < compare_time) {
if (IS_ZEBRA_DEBUG_RIB) {
prefix2str(&rn->p, buf, sizeof(buf));
zlog_debug("%s: Client %s stale route %s is deleted",
__func__, zebra_route_string(proto), buf);
}
LOG_GR("GR %s: Client %s stale route %pFX is deleted", __func__,
zebra_route_string(proto), &rn->p);
SET_FLAG(re->status, ROUTE_ENTRY_INSTALLED);
for (ALL_NEXTHOPS(re->nhe->nhg, nexthop))
SET_FLAG(nexthop->flags, NEXTHOP_FLAG_FIB);
@ -498,28 +589,48 @@ static bool zebra_gr_process_route_entry(struct route_node *rn,
static void zebra_gr_delete_stale_info_client(struct event *event)
{
struct zebra_gr_afi_clean *gac = EVENT_ARG(event);
struct zserv *s_client = zebra_gr_find_stale_client(gac->proto, gac->instance);
if (gac->info->stale_client)
zebra_gr_delete_stale_client(gac->info);
if (s_client) {
struct client_gr_info *info;
TAILQ_FOREACH (info, &s_client->gr_info_queue, gr_info) {
if (info->vrf_id == gac->vrf_id) {
if (info->stale_client)
zebra_gr_delete_stale_client(info);
break;
}
}
}
XFREE(MTYPE_ZEBRA_GR, gac);
}
static void zebra_gr_delete_stale_route_table_afi(struct event *event)
/* Helper: return whether we should reschedule based on live info state. */
static bool zebra_gr_should_reschedule(const struct zebra_gr_afi_clean *gac)
{
struct zserv *client;
struct client_gr_info *info;
client = zserv_find_client(gac->proto, gac->instance);
if (!client)
client = zebra_gr_find_stale_client(gac->proto, gac->instance);
if (!client)
return false;
TAILQ_FOREACH (info, &client->gr_info_queue, gr_info) {
if (info->vrf_id == gac->vrf_id)
return !info->do_delete;
}
return false;
}
static bool zebra_gr_unicast_stale_route_delete(struct route_table *table,
struct zebra_gr_afi_clean *gac, bool no_max)
{
struct zebra_gr_afi_clean *gac = EVENT_ARG(event);
struct route_table *table;
struct route_node *rn;
struct route_entry *re, *next;
struct zebra_vrf *zvrf = zebra_vrf_lookup_by_id(gac->info->vrf_id);
int32_t n = 0;
if (!zvrf)
goto done;
table = zvrf->table[gac->afi][SAFI_UNICAST];
if (!table)
goto done;
uint32_t n = 0;
for (rn = route_top(table); rn; rn = srcdest_route_next(rn)) {
RNODE_FOREACH_RE_SAFE (rn, re, next) {
@ -543,15 +654,46 @@ static void zebra_gr_delete_stale_route_table_afi(struct event *event)
* Store the current prefix and afi
*/
if ((n >= ZEBRA_MAX_STALE_ROUTE_COUNT) &&
(gac->info->do_delete == false)) {
zebra_gr_should_reschedule(gac) && !no_max) {
LOG_GR("GR: Stale routes deleted %d. Restarting timer.", n);
event_add_timer(
zrouter.master,
zebra_gr_delete_stale_route_table_afi,
gac, ZEBRA_DEFAULT_STALE_UPDATE_DELAY,
&gac->t_gac);
return true;
}
}
}
return false;
}
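The loop above bounds the work done per invocation: once ZEBRA_MAX_STALE_ROUTE_COUNT deletions have been issued, and the live client state still wants deferral (unless no_max suppresses the cap), it re-arms a short timer and resumes later so a large stale sweep never monopolizes zrouter.master. The general shape of that pattern, with hypothetical names (sweep_ctx, work_remaining, process_one, finish — only the batching structure is real):

static void batched_sweep(struct event *ev)
{
	struct sweep_ctx *ctx = EVENT_ARG(ev); /* hypothetical context */
	uint32_t n = 0;

	while (work_remaining(ctx)) {
		process_one(ctx);
		if (++n >= ZEBRA_MAX_STALE_ROUTE_COUNT) {
			/* Yield: resume after a short delay. */
			event_add_timer(zrouter.master, batched_sweep, ctx,
					ZEBRA_DEFAULT_STALE_UPDATE_DELAY,
					&ctx->t_sweep);
			return;
		}
	}
	finish(ctx);
}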
static void zebra_gr_delete_stale_route_table_afi(struct event *event)
{
struct zebra_gr_afi_clean *gac = EVENT_ARG(event);
struct route_table *table;
struct zebra_vrf *zvrf = zebra_vrf_lookup_by_id(gac->vrf_id);
if (!zvrf)
goto done;
LOG_GR("GR: Deleting stale routes for %s, afi %d", zvrf->vrf->name, gac->afi);
frrtrace(2, frr_zebra, gr_delete_stale_route_table_afi, zvrf->vrf->name, gac->afi);
if (gac->afi == AFI_L2VPN && zvrf == zebra_vrf_get_evpn()) {
zebra_gr_cleanup_of_non_gr_vrf(gac);
zebra_evpn_stale_entries_cleanup(gac->update_pending_time);
goto done;
}
table = zvrf->table[gac->afi][SAFI_UNICAST];
if (!table)
goto done;
/* Return if timer was restarted */
if (zebra_gr_unicast_stale_route_delete(table, gac, false))
return;
done:
XFREE(MTYPE_ZEBRA_GR, gac);
@ -604,7 +746,8 @@ static int32_t zebra_gr_delete_stale_route(struct client_gr_info *info,
* Schedule for immediately after anything in the
* meta-Q
*/
rib_add_gr_run(afi, info->vrf_id, proto, instance, restart_time, false);
rib_add_gr_run(afi, info->vrf_id, proto, instance, restart_time, restart_time,
false);
}
return 0;
}
@ -651,6 +794,8 @@ static void zebra_gr_process_client_stale_routes(struct zserv *client,
LOG_GR("%s: Client %s vrf: %s(%u) route update not completed for AFI %d",
__func__, zebra_route_string(client->proto),
VRF_LOGNAME(vrf), info->vrf_id, afi);
frrtrace(4, frr_zebra, gr_process_client_stale_routes,
zebra_route_string(client->proto), VRF_LOGNAME(vrf), afi, 1);
return;
}
}
@ -667,11 +812,15 @@ static void zebra_gr_process_client_stale_routes(struct zserv *client,
__func__, zebra_route_string(client->proto),
VRF_LOGNAME(vrf), info->vrf_id);
event_cancel(&info->t_stale_removal);
frrtrace(4, frr_zebra, gr_process_client_stale_routes,
zebra_route_string(client->proto), VRF_LOGNAME(vrf), afi, 0);
}
}
void zebra_gr_process_client(afi_t afi, vrf_id_t vrf_id, uint8_t proto, uint8_t instance,
time_t restart_time, bool stale_client_cleanup)
time_t restart_time, time_t update_pending_time,
bool stale_client_cleanup)
{
struct zserv *client = zserv_find_client(proto, instance);
struct client_gr_info *info = NULL;
@ -687,6 +836,7 @@ void zebra_gr_process_client(afi_t afi, vrf_id_t vrf_id, uint8_t proto, uint8_t
client = zebra_gr_find_stale_client(proto, instance);
if (!client) {
LOG_GR("GR: %s: Neither active nor stale client found", __func__);
frrtrace(3, frr_zebra, gr_client_not_found, vrf_id, afi, 1);
return;
}
}
@ -700,11 +850,12 @@ void zebra_gr_process_client(afi_t afi, vrf_id_t vrf_id, uint8_t proto, uint8_t
return;
gac = XCALLOC(MTYPE_ZEBRA_GR, sizeof(*gac));
gac->info = info;
gac->vrf_id = vrf_id;
gac->afi = afi;
gac->proto = proto;
gac->instance = instance;
gac->restart_time = restart_time;
gac->update_pending_time = update_pending_time;
/*
* If stale_client_cleanup is set, then we are being asked to cleanup

View file

@ -583,11 +583,26 @@ static void zebra_neigh_macfdb_update(struct zebra_dplane_ctx *ctx)
if (op == DPLANE_OP_NEIGH_INSTALL) {
/* Drop "permanent" entries. */
if (!vni_mcast_grp && (ndm_state & ZEBRA_NUD_PERMANENT)) {
/*
* If zebra started gracefully and this is an HREP
* entry, then restore it.
*/
if (zrouter.graceful_restart && is_zero_mac(&mac))
zebra_vxlan_stale_hrep_add(*vtep_ip, vni);
if (IS_ZEBRA_DEBUG_KERNEL)
zlog_debug(" Dropping entry because of ZEBRA_NUD_PERMANENT");
return;
}
/*
* If zebra started gracefully and this is a remote MAC/RMAC
* entry, then restore it.
*/
if (zrouter.graceful_restart && CHECK_FLAG(ndm_flags, ZEBRA_NTF_EXT_LEARNED))
zebra_vxlan_stale_remote_mac_add(&mac, *vtep_ip, sticky, vni);
if (IS_ZEBRA_IF_VXLAN(ifp)) {
if (!dst_present)
return;

View file

@ -2004,7 +2004,6 @@ done:
}
/*
* Route-update results processing after async dataplane update.
*/
@ -3268,6 +3267,7 @@ struct meta_q_gr_run {
uint8_t proto;
uint8_t instance;
time_t restart_time;
time_t update_pending_time;
bool stale_client_cleanup;
};
@ -3276,7 +3276,8 @@ static void process_subq_gr_run(struct listnode *lnode)
struct meta_q_gr_run *gr_run = listgetdata(lnode);
zebra_gr_process_client(gr_run->afi, gr_run->vrf_id, gr_run->proto, gr_run->instance,
gr_run->restart_time, gr_run->stale_client_cleanup);
gr_run->restart_time, gr_run->update_pending_time,
gr_run->stale_client_cleanup);
XFREE(MTYPE_WQ_WRAPPER, gr_run);
}
@ -4477,7 +4478,7 @@ void rib_meta_queue_early_route_cleanup(const struct prefix *p, int route_type)
}
int rib_add_gr_run(afi_t afi, vrf_id_t vrf_id, uint8_t proto, uint8_t instance,
time_t restart_time, bool stale_client_cleanup)
time_t restart_time, time_t update_pending_time, bool stale_client_cleanup)
{
struct meta_q_gr_run *gr_run;
@ -4488,6 +4489,7 @@ int rib_add_gr_run(afi_t afi, vrf_id_t vrf_id, uint8_t proto, uint8_t instance,
gr_run->vrf_id = vrf_id;
gr_run->instance = instance;
gr_run->restart_time = restart_time;
gr_run->update_pending_time = update_pending_time;
gr_run->stale_client_cleanup = stale_client_cleanup;
return mq_add_handler(gr_run, rib_meta_queue_gr_run_add);
@ -5026,6 +5028,7 @@ void rib_sweep_route(struct event *t)
zebra_router_sweep_route();
zebra_router_sweep_nhgs();
zebra_evpn_stale_entries_cleanup(zrouter.startup_time);
}
/* Remove specific by protocol routes from 'table'. */

View file

@ -297,6 +297,9 @@ void zebra_router_init(bool asic_offload, bool notify_on_ack, bool v6_with_v4_ne
zrouter.nhg_keep = ZEBRA_DEFAULT_NHG_KEEP_TIMER;
zrouter.gr_stale_cleanup_time_recorded = false;
zrouter.gr_update_pending_time_recorded = false;
/* Initialize the red-black tree for router tables */
RB_INIT(zebra_router_table_head, &zrouter.tables);

View file

@ -210,6 +210,8 @@ struct zebra_router {
struct zebra_vrf *evpn_vrf;
struct zebra_architectural_values zav;
bool gr_stale_cleanup_time_recorded;
bool gr_update_pending_time_recorded;
/*
* zebra start time and time of sweeping RIB of old routes

View file

@ -1267,8 +1267,52 @@ TRACEPOINT_EVENT(
TRACEPOINT_LOGLEVEL(frr_zebra, if_netlink_parse_error, TRACE_INFO)
/* clang-format on */
/*
* Loc 1: zebra_gr_process_client
* Loc 2: zebra_gr_delete_stale_route_table_afi
*/
TRACEPOINT_EVENT(frr_zebra, gr_client_not_found,
TP_ARGS(vrf_id_t, vrf_id, uint8_t, afi, uint8_t, loc),
TP_FIELDS(ctf_integer(vrf_id_t, vrf_id, vrf_id) ctf_integer(uint8_t, afi, afi)
ctf_integer(uint8_t, location, loc)))
TRACEPOINT_LOGLEVEL(frr_zebra, gr_client_not_found, TRACE_INFO)
TRACEPOINT_EVENT(frr_zebra, gr_client_capability,
TP_ARGS(uint8_t, cap, vrf_id_t, vrf_id, uint32_t, gr_instance_count),
TP_FIELDS(ctf_integer(int, capability, cap) ctf_integer(vrf_id_t, vrf_id, vrf_id)
ctf_integer(uint32_t, gr_instance_count, gr_instance_count)))
TRACEPOINT_LOGLEVEL(frr_zebra, gr_client_capability, TRACE_INFO)
TRACEPOINT_EVENT(frr_zebra, gr_client_stale_time,
TP_ARGS(uint8_t, cap, vrf_id_t, vrf_id, uint32_t, stale_removal_time),
TP_FIELDS(ctf_integer(int, capability, cap) ctf_integer(vrf_id_t, vrf_id, vrf_id)
ctf_integer(uint32_t, stale_removal_time, stale_removal_time)))
TRACEPOINT_LOGLEVEL(frr_zebra, gr_client_stale_time, TRACE_INFO)
TRACEPOINT_EVENT(frr_zebra, gr_client_update,
TP_ARGS(uint8_t, cap, vrf_id_t, vrf_id, uint8_t, afi, uint8_t, safi),
TP_FIELDS(ctf_integer(int, capability, cap) ctf_integer(vrf_id_t, vrf_id, vrf_id)
ctf_integer(uint8_t, afi, afi) ctf_integer(uint8_t, safi, safi)))
TRACEPOINT_LOGLEVEL(frr_zebra, gr_client_update, TRACE_INFO)
TRACEPOINT_EVENT(frr_zebra, gr_process_client_stale_routes,
TP_ARGS(const char *, proto, const char *, vrf, uint8_t, afi, bool, pending),
TP_FIELDS(ctf_string(client, proto) ctf_string(vrf, vrf)
ctf_integer(uint8_t, afi, afi)
ctf_integer(bool, gr_pending, pending)))
TRACEPOINT_LOGLEVEL(frr_zebra, gr_process_client_stale_routes, TRACE_INFO)
TRACEPOINT_EVENT(frr_zebra, gr_delete_stale_route_table_afi, TP_ARGS(char *, vrf, uint8_t, afi),
TP_FIELDS(ctf_string(vrf, vrf) ctf_integer(uint8_t, afi, afi)))
TRACEPOINT_LOGLEVEL(frr_zebra, gr_delete_stale_route_table_afi, TRACE_INFO)
TRACEPOINT_EVENT(frr_zebra, gr_evpn_stale_entries_cleanup,
TP_ARGS(const char *, vrf, uint64_t, gr_cleanup_time),
TP_FIELDS(ctf_string(vrf, vrf)
ctf_integer(uint64_t, gr_cleanup_time, gr_cleanup_time)))
TRACEPOINT_LOGLEVEL(frr_zebra, gr_evpn_stale_entries_cleanup, TRACE_INFO)
/* clang-format on */
#include <lttng/tracepoint-event.h>
#endif /* HAVE_LTTNG */

View file

@ -115,6 +115,8 @@ static struct zebra_vxlan_sg *
zebra_vxlan_sg_do_ref(struct zebra_vrf *vrf, const struct ipaddr *sip,
const struct in_addr mcast_grp);
static void zebra_vxlan_cleanup_sg_table(struct zebra_vrf *zvrf);
static void zl3vni_stale_remote_nh_read_add(struct zebra_l3vni *zl3vni, struct ipaddr *ip,
struct ethaddr *macaddr);
bool zebra_evpn_do_dup_addr_detect(struct zebra_vrf *zvrf)
{
@ -1260,6 +1262,8 @@ static struct zebra_mac *zl3vni_rmac_add(struct zebra_l3vni *zl3vni,
SET_FLAG(zrmac->flags, ZEBRA_MAC_REMOTE);
SET_FLAG(zrmac->flags, ZEBRA_MAC_REMOTE_RMAC);
zrmac->gr_refresh_time = monotime(NULL);
return zrmac;
}
@ -1436,6 +1440,8 @@ static int zl3vni_remote_rmac_add(struct zebra_l3vni *zl3vni,
/* install rmac in kernel */
zl3vni_rmac_install(zl3vni, zrmac);
} else {
zrmac->gr_refresh_time = monotime(NULL);
if (!ipaddr_is_same(&zrmac->fwd_info.r_vtep_ip, &ip_vtep)) {
if (IS_ZEBRA_DEBUG_VXLAN)
zlog_debug("L3VNI %u Remote VTEP change(%pIA -> %pIA) for RMAC %pEA",
@ -1618,6 +1624,8 @@ static struct zebra_neigh *_nh_add(struct zebra_l3vni *zl3vni,
SET_FLAG(n->flags, ZEBRA_NEIGH_REMOTE);
SET_FLAG(n->flags, ZEBRA_NEIGH_REMOTE_NH);
n->gr_refresh_time = monotime(NULL);
return n;
}
@ -1782,6 +1790,7 @@ static int zl3vni_remote_nh_add(struct zebra_l3vni *zl3vni,
/* install the nh neigh in kernel */
zl3vni_nh_install(zl3vni, nh);
} else if (memcmp(&nh->emac, rmac, ETH_ALEN) != 0) {
nh->gr_refresh_time = monotime(NULL);
if (IS_ZEBRA_DEBUG_VXLAN)
zlog_debug(
"L3VNI %u RMAC change(%pEA --> %pEA) for nexthop %pIA, prefix %pFX",
@ -1794,6 +1803,8 @@ static int zl3vni_remote_nh_add(struct zebra_l3vni *zl3vni,
memcpy(&nh->emac, rmac, ETH_ALEN);
/* install (update) the nh neigh in kernel */
zl3vni_nh_install(zl3vni, nh);
} else {
nh->gr_refresh_time = monotime(NULL);
}
rb_find_or_add_host(&nh->host_rb, host_prefix);
@ -2457,11 +2468,11 @@ static int zebra_vxlan_handle_vni_transition(struct zebra_vrf *zvrf, vni_t vni,
/* Delete EVPN from BGP. */
zebra_evpn_send_del_to_client(zevpn);
zebra_evpn_neigh_del_all(zevpn, 0, 0, DEL_ALL_NEIGH);
zebra_evpn_mac_del_all(zevpn, 0, 0, DEL_ALL_MAC);
zebra_evpn_neigh_del_all(zevpn, 0, 0, DEL_ALL_NEIGH, NULL);
zebra_evpn_mac_del_all(zevpn, 0, 0, DEL_ALL_MAC, NULL);
/* Free up all remote VTEPs, if any. */
zebra_evpn_vtep_del_all(zevpn, 1);
zebra_evpn_vtep_del_all(zevpn, 1, NULL);
zl3vni = zl3vni_from_vrf(zevpn->vrf_id);
if (zl3vni)
@ -2537,14 +2548,29 @@ static int zebra_vxlan_handle_vni_transition(struct zebra_vrf *zvrf, vni_t vni,
return 0;
}
struct l3vni_walk_ctx {
struct zebra_l3vni *zl3vni;
bool gr_stale_cleanup;
uint64_t gr_cleanup_time;
};
/* delete and uninstall rmac hash entry */
static void zl3vni_del_rmac_hash_entry(struct hash_bucket *bucket, void *ctx)
{
struct zebra_mac *zrmac = NULL;
struct zebra_l3vni *zl3vni = NULL;
struct l3vni_walk_ctx *wctx = ctx;
zrmac = (struct zebra_mac *)bucket->data;
zl3vni = (struct zebra_l3vni *)ctx;
zl3vni = wctx->zl3vni;
/*
* If we are doing stale cleanup but this RMAC is not
* marked stale, then do not delete it
*/
if (wctx->gr_stale_cleanup && (zrmac->gr_refresh_time > wctx->gr_cleanup_time))
return;
zl3vni_rmac_uninstall(zl3vni, zrmac);
/* Send RMAC for FPM processing */
@ -2558,9 +2584,10 @@ static void zl3vni_del_nh_hash_entry(struct hash_bucket *bucket, void *ctx)
{
struct zebra_neigh *n = NULL, *svd_nh = NULL;
struct zebra_l3vni *zl3vni = NULL;
struct l3vni_walk_ctx *wctx = ctx;
n = (struct zebra_neigh *)bucket->data;
zl3vni = (struct zebra_l3vni *)ctx;
zl3vni = wctx->zl3vni;
/* remove SVD based remote nexthop neigh entry */
svd_nh = svd_nh_lookup(&n->ip);
@ -2576,6 +2603,13 @@ static void zl3vni_del_nh_hash_entry(struct hash_bucket *bucket, void *ctx)
}
}
/*
* If we are doing stale cleanup but this neigh is not
* marked stale, then do not delete it
*/
if (wctx->gr_stale_cleanup && (n->gr_refresh_time > wctx->gr_cleanup_time))
return;
zl3vni_nh_uninstall(zl3vni, n);
zl3vni_nh_del(zl3vni, n);
}
@ -4222,8 +4256,13 @@ int zebra_vxlan_handle_kernel_neigh_update(struct interface *ifp, struct interfa
* next-hop
*/
zl3vni = zl3vni_from_svi(ifp, link_if);
if (zl3vni)
if (zl3vni) {
/* Restore remote nexthops if zebra started gracefully*/
if (zrouter.graceful_restart && is_own)
zl3vni_stale_remote_nh_read_add(zl3vni, ip, macaddr);
return zl3vni_local_nh_add_update(zl3vni, ip, state);
}
/* We are only interested in neighbors on an SVI that resides on top
* of a VxLAN bridge.
@ -4245,7 +4284,7 @@ int zebra_vxlan_handle_kernel_neigh_update(struct interface *ifp, struct interfa
is_router, local_inactive,
dp_static);
return zebra_evpn_remote_neigh_update(zevpn, ifp, ip, macaddr, state);
return zebra_evpn_remote_neigh_update(zevpn, ifp, ip, macaddr, state, is_router);
}
static int32_t zebra_vxlan_remote_macip_helper(bool add, struct stream *s, vni_t *vni,
@ -4436,6 +4475,8 @@ int zebra_vxlan_check_readd_vtep(struct interface *ifp, vni_t vni, struct ipaddr
if (!zvtep)
return 0;
zvtep->gr_refresh_time = monotime(NULL);
if (IS_ZEBRA_DEBUG_VXLAN)
zlog_debug("Del MAC for remote VTEP %pIA intf %s(%u) VNI %u - readd", vtep_ip,
ifp->name, ifp->ifindex, vni);
@ -4798,6 +4839,8 @@ void zebra_vxlan_remote_vtep_del(vrf_id_t vrf_id, vni_t vni, struct ipaddr *vtep
if (!zvtep)
return;
zvtep->gr_refresh_time = monotime(NULL);
zebra_evpn_vtep_uninstall(zevpn, vtep_ip);
zebra_evpn_vtep_del(zevpn, zvtep);
}
@ -4866,6 +4909,9 @@ void zebra_vxlan_remote_vtep_add(vrf_id_t vrf_id, vni_t vni, struct ipaddr *vtep
zlog_debug("%s: VTEP %pIA already exists for VNI %u flood_control %d (received flood_control %d)",
__func__, vtep_ip, vni, zvtep->flood_control, flood_control);
/* Refresh entry */
zvtep->gr_refresh_time = monotime(NULL);
/* If the remote VTEP already exists check if
* the flood mode has changed
*/
@ -5358,13 +5404,17 @@ void zebra_vxlan_process_vrf_vni_cmd(struct zebra_vrf *zvrf, vni_t vni,
zebra_vxlan_process_l3vni_oper_down(zl3vni);
struct l3vni_walk_ctx wctx;
wctx.zl3vni = zl3vni;
wctx.gr_stale_cleanup = false;
wctx.gr_cleanup_time = 0;
/* delete and uninstall all rmacs */
hash_iterate(zl3vni->rmac_table, zl3vni_del_rmac_hash_entry,
zl3vni);
hash_iterate(zl3vni->rmac_table, zl3vni_del_rmac_hash_entry, &wctx);
/* delete and uninstall all next-hops */
hash_iterate(zl3vni->nh_table, zl3vni_del_nh_hash_entry,
zl3vni);
hash_iterate(zl3vni->nh_table, zl3vni_del_nh_hash_entry, &wctx);
zvrf->l3vni = 0;
zl3vni_del(zl3vni);
@ -5392,6 +5442,7 @@ int zebra_vxlan_vrf_enable(struct zebra_vrf *zvrf)
int zebra_vxlan_vrf_disable(struct zebra_vrf *zvrf)
{
struct zebra_l3vni *zl3vni = NULL;
struct l3vni_walk_ctx wctx;
if (zvrf->l3vni)
zl3vni = zl3vni_lookup(zvrf->l3vni);
@ -5400,10 +5451,14 @@ int zebra_vxlan_vrf_disable(struct zebra_vrf *zvrf)
zebra_vxlan_process_l3vni_oper_down(zl3vni);
wctx.zl3vni = zl3vni;
wctx.gr_stale_cleanup = false;
wctx.gr_cleanup_time = 0;
/* delete and uninstall all rmacs */
hash_iterate(zl3vni->rmac_table, zl3vni_del_rmac_hash_entry, zl3vni);
hash_iterate(zl3vni->rmac_table, zl3vni_del_rmac_hash_entry, &wctx);
/* delete and uninstall all next-hops */
hash_iterate(zl3vni->nh_table, zl3vni_del_nh_hash_entry, zl3vni);
hash_iterate(zl3vni->nh_table, zl3vni_del_nh_hash_entry, &wctx);
zl3vni->vrf_id = VRF_UNKNOWN;
@ -6276,26 +6331,60 @@ void zebra_vxlan_sg_replay(ZAPI_HANDLER_ARGS)
/* Cleanup EVPN configuration of a specific VRF */
static void zebra_evpn_vrf_cfg_cleanup(struct zebra_vrf *zvrf)
static void zebra_evpn_vrf_cfg_cleanup(struct zebra_vrf *zvrf, bool stale_cleanup,
uint64_t gr_cleanup_time)
{
struct zebra_l3vni *zl3vni = NULL;
struct l2vni_walk_ctx wctx;
zvrf->advertise_all_vni = 0;
zvrf->advertise_gw_macip = 0;
zvrf->advertise_svi_macip = 0;
zvrf->vxlan_flood_ctrl = VXLAN_FLOOD_HEAD_END_REPL;
if (!stale_cleanup) {
zvrf->advertise_all_vni = 0;
zvrf->advertise_gw_macip = 0;
zvrf->advertise_svi_macip = 0;
zvrf->vxlan_flood_ctrl = VXLAN_FLOOD_HEAD_END_REPL;
}
hash_iterate(zvrf->evpn_table, zebra_evpn_cfg_cleanup, NULL);
wctx.gr_stale_cleanup = stale_cleanup;
wctx.gr_cleanup_time = gr_cleanup_time;
hash_iterate(zvrf->evpn_table, zebra_evpn_cfg_cleanup, &wctx);
if (zvrf->l3vni)
zl3vni = zl3vni_lookup(zvrf->l3vni);
if (zl3vni) {
struct l3vni_walk_ctx l3wctx;
l3wctx.zl3vni = zl3vni;
l3wctx.gr_stale_cleanup = stale_cleanup;
l3wctx.gr_cleanup_time = gr_cleanup_time;
/* delete and uninstall all rmacs */
hash_iterate(zl3vni->rmac_table, zl3vni_del_rmac_hash_entry,
zl3vni);
hash_iterate(zl3vni->rmac_table, zl3vni_del_rmac_hash_entry, &l3wctx);
/* delete and uninstall all next-hops */
hash_iterate(zl3vni->nh_table, zl3vni_del_nh_hash_entry,
zl3vni);
hash_iterate(zl3vni->nh_table, zl3vni_del_nh_hash_entry, &l3wctx);
}
}
/*
* Cleanup stale EVPN entries in VRF
*/
void zebra_evpn_stale_entries_cleanup(uint64_t gr_cleanup_time)
{
struct vrf *vrf;
struct zebra_vrf *zvrf;
if (IS_ZEBRA_DEBUG_EVENT)
zlog_debug("EVPN-GR: Cleaning up stale entries in all VRFs");
RB_FOREACH (vrf, vrf_id_head, &vrfs_by_id) {
if (IS_ZEBRA_DEBUG_EVENT)
zlog_debug("EVPN-GR: Cleaning up stale entries in %s(%u)", vrf->name,
vrf->vrf_id);
frrtrace(2, frr_zebra, gr_evpn_stale_entries_cleanup, VRF_LOGNAME(vrf),
gr_cleanup_time);
zvrf = vrf->info;
if (zvrf)
zebra_evpn_vrf_cfg_cleanup(zvrf, true, gr_cleanup_time);
}
}
@ -6308,7 +6397,7 @@ static int zebra_evpn_bgp_cfg_clean_up(struct zserv *client)
RB_FOREACH (vrf, vrf_id_head, &vrfs_by_id) {
zvrf = vrf->info;
if (zvrf)
zebra_evpn_vrf_cfg_cleanup(zvrf);
zebra_evpn_vrf_cfg_cleanup(zvrf, false, 0);
}
return 0;
@ -6329,8 +6418,20 @@ static int zebra_evpn_pim_cfg_clean_up(struct zserv *client)
static int zebra_evpn_cfg_clean_up(struct zserv *client)
{
if (client->proto == ZEBRA_ROUTE_BGP)
return zebra_evpn_bgp_cfg_clean_up(client);
if (client->proto == ZEBRA_ROUTE_BGP) {
if (DYNAMIC_CLIENT_GR_DISABLED(client)) {
if (IS_ZEBRA_DEBUG_EVENT)
zlog_debug(
"EVPN-GR: client bgp has GR disabled. Cleaning up EVPN entries");
return zebra_evpn_bgp_cfg_clean_up(client);
}
/*
* BGP has GR enabled; do not clean up the neigh
* and FDB entries from the kernel.
*/
if (IS_ZEBRA_DEBUG_EVENT)
zlog_debug("EVPN-GR: client bgp has GR enabled. Retaining EVPN entries");
}
if (client->proto == ZEBRA_ROUTE_PIM)
return zebra_evpn_pim_cfg_clean_up(client);
@ -6482,3 +6583,90 @@ void zebra_vlan_dplane_result(struct zebra_dplane_ctx *ctx)
state);
}
}
/*********************** EVPN graceful restart *******************/
void zebra_vxlan_stale_hrep_add(struct ipaddr vtep_ip, vni_t vni)
{
struct zebra_evpn *zevpn = NULL;
struct zebra_vtep *zvtep = NULL;
zevpn = zebra_evpn_lookup(vni);
if (!zevpn || !zevpn->vxlan_if) {
if (IS_ZEBRA_DEBUG_VXLAN)
zlog_debug("EVPN-GR: Add HREP %pIA, VNI %u,could not find EVPN inst/intf (%p)",
&vtep_ip, vni, zevpn);
return;
}
zvtep = zebra_evpn_vtep_find(zevpn, &vtep_ip);
if (!zvtep) {
/* A remote VTEP is installed in the kernel only if its
* flood_control type is VXLAN_FLOOD_HEAD_END_REPL. So
* if we found a remote neigh with a zero MAC, it is
* safe to set the flood_control type to
* VXLAN_FLOOD_HEAD_END_REPL.
*/
zvtep = zebra_evpn_vtep_add(zevpn, &vtep_ip, VXLAN_FLOOD_HEAD_END_REPL);
if (!zvtep) {
zlog_debug("EVPN-GR: Failed to add HREP entry for %pIA, vni %u)", &vtep_ip,
vni);
return;
}
if (IS_ZEBRA_DEBUG_VXLAN)
zlog_debug("EVPN-GR: Added stale HREP entry for %pIA, vni %u", &vtep_ip,
vni);
}
}
void zebra_vxlan_stale_remote_mac_add_l3vni(struct zebra_l3vni *zl3vni, struct ethaddr *macaddr,
struct ipaddr vtep_ip)
{
struct zebra_mac *zrmac = NULL;
zrmac = zl3vni_rmac_lookup(zl3vni, macaddr);
if (zrmac) {
if (IS_ZEBRA_DEBUG_VXLAN)
zlog_debug("EVPN-GR: RMAC %pEA (%p) zl3vni %p,VTEP %pIA L3VNI %d exists",
macaddr, zrmac, zl3vni, &vtep_ip, zl3vni->vni);
return;
}
/* Create the RMAC entry */
zrmac = zl3vni_rmac_add(zl3vni, macaddr);
if (!zrmac) {
zlog_debug("EVPN-GR: Failed to add RMAC %pEA. VTEP %pIA L3VNI %u", macaddr,
&vtep_ip, zl3vni->vni);
return;
}
zrmac->fwd_info.r_vtep_ip = vtep_ip;
if (IS_ZEBRA_DEBUG_VXLAN)
zlog_debug("EVPN-GR: Added stale RMAC %pEA (%p) zl3vni %p, VTEP %pIA L3VNI %d",
macaddr, zrmac, zl3vni, &vtep_ip, zl3vni->vni);
}
static void zl3vni_stale_remote_nh_read_add(struct zebra_l3vni *zl3vni, struct ipaddr *ip,
struct ethaddr *macaddr)
{
#ifdef GNU_LINUX
struct zebra_neigh *n = NULL;
/* Return if the NH exists */
if (zl3vni_nh_lookup(zl3vni, ip))
return;
/* Create remote NH */
n = zl3vni_nh_add(zl3vni, ip, macaddr);
if (!n) {
zlog_debug("EVPN-GR: Failed to add remote NH:IP %pIA MAC %pEA, L3VNI %u)", ip,
macaddr, zl3vni->vni);
return;
}
if (IS_ZEBRA_DEBUG_VXLAN)
zlog_debug("EVPN-GR: Added stale remote NH entry: IP %pIA MAC %pEA, L3VNI %u", ip,
macaddr, zl3vni->vni);
#endif
}

View file

@ -225,6 +225,10 @@ extern int zebra_vxlan_dp_network_mac_del(struct interface *ifp,
extern void zebra_vxlan_set_accept_bgp_seq(bool set);
extern bool zebra_vxlan_get_accept_bgp_seq(void);
extern void zebra_vlan_dplane_result(struct zebra_dplane_ctx *ctx);
extern void zebra_vxlan_stale_hrep_add(struct ipaddr vtep_ip, vni_t vni);
extern void zebra_vxlan_stale_remote_mac_add(struct ethaddr *macaddr, struct ipaddr vtep_ip,
bool sticky, vni_t vni);
extern void zebra_evpn_stale_entries_cleanup(uint64_t gr_cleanup_time);
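Both drivers of this entry point appear elsewhere in this change: the startup RIB sweep passes zebra's start time, while the AFI_L2VPN branch of the GR cleanup passes the recorded update-pending time:

/* From rib_sweep_route(): */
zebra_evpn_stale_entries_cleanup(zrouter.startup_time);

/* From zebra_gr_delete_stale_route_table_afi(), AFI_L2VPN in the
 * default EVPN VRF: */
zebra_evpn_stale_entries_cleanup(gac->update_pending_time);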
#ifdef __cplusplus
}
#endif

View file

@ -147,11 +147,11 @@ static int zebra_vxlan_if_del_vni(struct interface *ifp,
zebra_evpn_send_del_to_client(zevpn);
/* Free up all neighbors and MAC, if any. */
zebra_evpn_neigh_del_all(zevpn, 1, 0, DEL_ALL_NEIGH);
zebra_evpn_mac_del_all(zevpn, 1, 0, DEL_ALL_MAC);
zebra_evpn_neigh_del_all(zevpn, 1, 0, DEL_ALL_NEIGH, NULL);
zebra_evpn_mac_del_all(zevpn, 1, 0, DEL_ALL_MAC, NULL);
/* Free up all remote VTEPs, if any. */
zebra_evpn_vtep_del_all(zevpn, 1);
zebra_evpn_vtep_del_all(zevpn, 1, NULL);
/* Delete the hash entry. */
if (zebra_evpn_vxlan_del(zevpn)) {
@ -278,9 +278,9 @@ static int zebra_vxlan_if_update_vni(struct interface *ifp,
/* Also, free up all MACs and neighbors. */
zevpn->svi_if = NULL;
zebra_evpn_send_del_to_client(zevpn);
zebra_evpn_neigh_del_all(zevpn, 1, 0, DEL_ALL_NEIGH);
zebra_evpn_mac_del_all(zevpn, 1, 0, DEL_ALL_MAC);
zebra_evpn_vtep_del_all(zevpn, 1);
zebra_evpn_neigh_del_all(zevpn, 1, 0, DEL_ALL_NEIGH, NULL);
zebra_evpn_mac_del_all(zevpn, 1, 0, DEL_ALL_MAC, NULL);
zebra_evpn_vtep_del_all(zevpn, 1, NULL);
return 0;
}
@ -291,8 +291,8 @@ static int zebra_vxlan_if_update_vni(struct interface *ifp,
*/
access_vlan = vnip->access_vlan;
vnip->access_vlan = ctx->old_vni.access_vlan;
zebra_evpn_neigh_del_all(zevpn, 0, 1, DEL_LOCAL_MAC);
zebra_evpn_mac_del_all(zevpn, 0, 1, DEL_LOCAL_MAC);
zebra_evpn_neigh_del_all(zevpn, 0, 1, DEL_LOCAL_MAC, NULL);
zebra_evpn_mac_del_all(zevpn, 0, 1, DEL_LOCAL_MAC, NULL);
zebra_evpn_rem_mac_uninstall_all(zevpn);
vnip->access_vlan = access_vlan;
}
@ -907,11 +907,11 @@ int zebra_vxlan_if_vni_down(struct interface *ifp, struct zebra_vxlan_vni *vnip)
zebra_evpn_send_del_to_client(zevpn);
/* Free up all neighbors and MACs, if any. */
zebra_evpn_neigh_del_all(zevpn, 1, 0, DEL_ALL_NEIGH);
zebra_evpn_mac_del_all(zevpn, 1, 0, DEL_ALL_MAC);
zebra_evpn_neigh_del_all(zevpn, 1, 0, DEL_ALL_NEIGH, NULL);
zebra_evpn_mac_del_all(zevpn, 1, 0, DEL_ALL_MAC, NULL);
/* Free up all remote VTEPs, if any. */
zebra_evpn_vtep_del_all(zevpn, 1);
zebra_evpn_vtep_del_all(zevpn, 1, NULL);
}
return 0;
}

View file

@ -258,5 +258,7 @@ extern void zebra_vxlan_sg_deref(struct ipaddr *local_vtep_ip, struct in_addr mc
extern void zebra_vxlan_process_l3vni_oper_up(struct zebra_l3vni *zl3vni);
extern void zebra_vxlan_process_l3vni_oper_down(struct zebra_l3vni *zl3vni);
extern int zebra_evpn_vxlan_del(struct zebra_evpn *zevpn);
extern void zebra_vxlan_stale_remote_mac_add_l3vni(struct zebra_l3vni *zl3vni,
struct ethaddr *macaddr, struct ipaddr vtep_ip);
#endif /* _ZEBRA_VXLAN_PRIVATE_H */

View file

@ -232,7 +232,7 @@ struct zserv {
*/
uint32_t gr_instance_count;
time_t restart_time;
time_t update_pending_time;
/*
* Graceful restart information for
* each instance