From 526679c681c34a4d5f8f7159b29cf51b971677c2 Mon Sep 17 00:00:00 2001 From: Amir Shehata Date: Tue, 26 Mar 2019 14:16:32 -0700 Subject: [PATCH] LU-11299 lnet: discover each gateway Net Wake up every (gateway aliveness interval / number of local networks). Discover each local gateway network in round-robin order. This is done to make sure the gateway keeps its networks up. Test-Parameters: forbuildonly Signed-off-by: Amir Shehata Change-Id: I4035e39c286cb599d4eb8f9df7ed5d278e6d744a Reviewed-on: https://review.whamcloud.com/34511 Tested-by: Jenkins Reviewed-by: Olaf Weber --- lnet/include/lnet/lib-lnet.h | 4 +++ lnet/include/lnet/lib-types.h | 10 +++++--- lnet/lnet/api-ni.c | 41 ++++++++++++++++++++++++++---- lnet/lnet/lib-move.c | 23 ++++++++++++++--- lnet/lnet/peer.c | 33 ++++++++++++++++++++++++ lnet/lnet/router.c | 58 +++++++++++++++++++++++++++++++++++++++---- 6 files changed, 153 insertions(+), 16 deletions(-) diff --git a/lnet/include/lnet/lib-lnet.h b/lnet/include/lnet/lib-lnet.h index a1ad5a7..aae7ead 100644 --- a/lnet/include/lnet/lib-lnet.h +++ b/lnet/include/lnet/lib-lnet.h @@ -615,6 +615,7 @@ struct lnet_net *lnet_get_net_locked(__u32 net_id); int lnet_islocalnid(lnet_nid_t nid); int lnet_islocalnet(__u32 net); +int lnet_islocalnet_locked(__u32 net); void lnet_msg_attach_md(struct lnet_msg *msg, struct lnet_libmd *md, unsigned int offset, unsigned int mlen); @@ -905,7 +906,10 @@ bool lnet_net_unique(__u32 net_id, struct list_head *nilist, bool lnet_ni_unique_net(struct list_head *nilist, char *iface); void lnet_incr_dlc_seq(void); __u32 lnet_get_dlc_seq_locked(void); +int lnet_get_net_count(void); +struct lnet_peer_net *lnet_get_next_peer_net_locked(struct lnet_peer *lp, + __u32 prev_lpn_id); struct lnet_peer_ni *lnet_get_next_peer_ni_locked(struct lnet_peer *peer, struct lnet_peer_net *peer_net, struct lnet_peer_ni *prev); diff --git a/lnet/include/lnet/lib-types.h b/lnet/include/lnet/lib-types.h index c959e5c..f00108f 100644 --- a/lnet/include/lnet/lib-types.h 
+++ b/lnet/include/lnet/lib-types.h @@ -618,6 +618,9 @@ struct lnet_peer { /* primary NID of the peer */ lnet_nid_t lp_primary_nid; + /* net to perform discovery on */ + __u32 lp_disc_net_id; + /* CPT of peer_table */ int lp_cpt; @@ -639,9 +642,6 @@ struct lnet_peer { /* routes on this peer */ struct list_head lp_routes; - /* time of last router check attempt */ - time64_t lp_rtrcheck_timestamp; - /* reference count */ atomic_t lp_refcount; @@ -762,6 +762,10 @@ struct lnet_peer_net { /* Net ID */ __u32 lpn_net_id; + /* time of last router net check attempt */ + time64_t lpn_rtrcheck_timestamp; + + /* reference count */ atomic_t lpn_refcount; }; diff --git a/lnet/lnet/api-ni.c b/lnet/lnet/api-ni.c index 56a6f9c..8784577 100644 --- a/lnet/lnet/api-ni.c +++ b/lnet/lnet/api-ni.c @@ -221,6 +221,7 @@ MODULE_PARM_DESC(lnet_retry_count, unsigned lnet_lnd_timeout = LNET_LND_DEFAULT_TIMEOUT; +unsigned int lnet_current_net_count; /* * This sequence number keeps track of how many times DLC was used to @@ -1368,18 +1369,28 @@ lnet_cpt_of_nid(lnet_nid_t nid, struct lnet_ni *ni) EXPORT_SYMBOL(lnet_cpt_of_nid); int -lnet_islocalnet(__u32 net_id) +lnet_islocalnet_locked(__u32 net_id) { struct lnet_net *net; - int cpt; - bool local; - - cpt = lnet_net_lock_current(); + bool local; net = lnet_get_net_locked(net_id); local = net != NULL; + return local; +} + +int +lnet_islocalnet(__u32 net_id) +{ + int cpt; + bool local; + + cpt = lnet_net_lock_current(); + + local = lnet_islocalnet_locked(net_id); + lnet_net_unlock(cpt); return local; @@ -1536,6 +1547,23 @@ lnet_get_ni_count(void) return count; } +int +lnet_get_net_count(void) +{ + struct lnet_net *net; + int count = 0; + + lnet_net_lock(0); + + list_for_each_entry(net, &the_lnet.ln_nets, net_list) { + count++; + } + + lnet_net_unlock(0); + + return count; +} + void lnet_swap_pinginfo(struct lnet_ping_buffer *pbuf) { @@ -2382,6 +2410,9 @@ lnet_startup_lndnet(struct lnet_net *net, struct lnet_lnd_tunables *tun) 
lnet_net_unlock(LNET_LOCK_EX); } + /* update net count */ + lnet_current_net_count = lnet_get_net_count(); + return ni_count; failed1: diff --git a/lnet/lnet/lib-move.c b/lnet/lnet/lib-move.c index 70ece19..7067164 100644 --- a/lnet/lnet/lib-move.c +++ b/lnet/lnet/lib-move.c @@ -42,6 +42,8 @@ #include #include +extern unsigned int lnet_current_net_count; + static int local_nid_dist_zero = 1; module_param(local_nid_dist_zero, int, 0444); MODULE_PARM_DESC(local_nid_dist_zero, "Reserved"); @@ -2126,7 +2128,8 @@ lnet_handle_spec_router_dst(struct lnet_send_data *sd) } struct lnet_ni * -lnet_find_best_ni_on_local_net(struct lnet_peer *peer, int md_cpt) +lnet_find_best_ni_on_local_net(struct lnet_peer *peer, int md_cpt, + bool discovery) { struct lnet_peer_net *peer_net = NULL; struct lnet_ni *best_ni = NULL; @@ -2148,6 +2151,14 @@ lnet_find_best_ni_on_local_net(struct lnet_peer *peer, int md_cpt) continue; best_ni = lnet_find_best_ni_on_spec_net(best_ni, peer, peer_net, md_cpt, false); + + /* + * if this is a discovery message and lp_disc_net_id is + * specified then use that net to send the discovery on. + */ + if (peer->lp_disc_net_id == peer_net->lpn_net_id && + discovery) + break; } if (best_ni) @@ -2317,7 +2328,8 @@ lnet_handle_any_mr_dsta(struct lnet_send_data *sd) * networks. */ sd->sd_best_ni = lnet_find_best_ni_on_local_net(sd->sd_peer, - sd->sd_md_cpt); + sd->sd_md_cpt, + lnet_msg_discovery(sd->sd_msg)); if (sd->sd_best_ni) { sd->sd_best_lpni = lnet_find_best_lpni_on_net(sd, sd->sd_peer, @@ -3405,9 +3417,14 @@ lnet_monitor_thread(void *arg) * if we wake up every 1 second? Although, we've seen * cases where we get a complaint that an idle thread * is waking up unnecessarily. + * + * Take into account the current net_count when you wake + * up for alive router checking, since we need to check + * possibly as many networks as we have configured. 
*/ interval = min(lnet_recovery_interval, - min((unsigned int) alive_router_check_interval, + min((unsigned int) alive_router_check_interval / + lnet_current_net_count, lnet_transaction_timeout / 2)); wait_event_interruptible_timeout(the_lnet.ln_mt_waitq, false, diff --git a/lnet/lnet/peer.c b/lnet/lnet/peer.c index c7dee63..f58e0e0 100644 --- a/lnet/lnet/peer.c +++ b/lnet/lnet/peer.c @@ -716,6 +716,39 @@ lnet_find_peer(lnet_nid_t nid) return lp; } +struct lnet_peer_net * +lnet_get_next_peer_net_locked(struct lnet_peer *lp, __u32 prev_lpn_id) +{ + struct lnet_peer_net *net; + + if (!prev_lpn_id) { + /* no net id provided return the first net */ + net = list_first_entry_or_null(&lp->lp_peer_nets, + struct lnet_peer_net, + lpn_peer_nets); + + return net; + } + + /* find the net after the one provided */ + list_for_each_entry(net, &lp->lp_peer_nets, lpn_peer_nets) { + if (net->lpn_net_id == prev_lpn_id) { + /* + * if we reached the end of the list loop to the + * beginning. + */ + if (net->lpn_peer_nets.next == &lp->lp_peer_nets) + return list_first_entry_or_null(&lp->lp_peer_nets, + struct lnet_peer_net, + lpn_peer_nets); + else + return list_next_entry(net, lpn_peer_nets); + } + } + + return NULL; +} + struct lnet_peer_ni * lnet_get_next_peer_ni_locked(struct lnet_peer *peer, struct lnet_peer_net *peer_net, diff --git a/lnet/lnet/router.c b/lnet/lnet/router.c index b0ed4be..7d7e08d 100644 --- a/lnet/lnet/router.c +++ b/lnet/lnet/router.c @@ -35,6 +35,8 @@ #define LNET_NRB_LARGE_PAGES ((LNET_MTU + PAGE_SIZE - 1) >> \ PAGE_SHIFT) +extern unsigned int lnet_current_net_count; + static char *forwarding = ""; module_param(forwarding, charp, 0444); MODULE_PARM_DESC(forwarding, "Explicitly enable/disable forwarding between networks"); @@ -387,8 +389,9 @@ static void lnet_shuffle_seed(void) static void lnet_add_route_to_rnet(struct lnet_remotenet *rnet, struct lnet_route *route) { - unsigned int len = 0; + struct lnet_peer_net *lpn; unsigned int offset = 0; + unsigned int 
len = 0; struct list_head *e; lnet_shuffle_seed(); @@ -412,7 +415,10 @@ lnet_add_route_to_rnet(struct lnet_remotenet *rnet, struct lnet_route *route) * force a router check on the gateway to make sure the route is * alive */ - route->lr_gateway->lp_rtrcheck_timestamp = 0; + list_for_each_entry(lpn, &route->lr_gateway->lp_peer_nets, + lpn_peer_nets) { + lpn->lpn_rtrcheck_timestamp = 0; + } the_lnet.ln_remote_nets_version++; @@ -642,6 +648,18 @@ lnet_del_route(__u32 net, lnet_nid_t gw_nid) } delete_zombies: + /* + * check if there are any routes remaining on the gateway + * If there are no more routes make sure to set the peer's + * lp_disc_net_id to 0 (invalid), in case we add more routes in + * the future on that gateway, then we start our discovery process + * from scratch + */ + if (lpni) { + if (list_empty(&lp->lp_routes)) + lp->lp_disc_net_id = 0; + } + lnet_net_unlock(LNET_LOCK_EX); while (!list_empty(&zombies)) { @@ -864,11 +882,15 @@ lnet_router_checker_active(void) void lnet_check_routers(void) { + struct lnet_peer_net *first_lpn = NULL; + struct lnet_peer_net *lpn; struct lnet_peer_ni *lpni; struct list_head *entry; struct lnet_peer *rtr; bool push = false; + bool found_lpn; __u64 version; + __u32 net_id; time64_t now; int cpt; int rc; @@ -889,8 +911,31 @@ rescan: * interfaces could be down and in that case they would be * undergoing recovery separately from this discovery. 
*/ - if (now - rtr->lp_rtrcheck_timestamp < - alive_router_check_interval) + /* find next peer net which is also local */ + net_id = rtr->lp_disc_net_id; + do { + lpn = lnet_get_next_peer_net_locked(rtr, net_id); + if (!lpn) { + CERROR("gateway %s has no networks\n", + libcfs_nid2str(rtr->lp_primary_nid)); + break; + } + if (first_lpn == lpn) + break; + if (!first_lpn) + first_lpn = lpn; + found_lpn = lnet_islocalnet_locked(lpn->lpn_net_id); + net_id = lpn->lpn_net_id; + } while (!found_lpn); + + if (!found_lpn || !lpn) { + CERROR("no local network found for gateway %s\n", + libcfs_nid2str(rtr->lp_primary_nid)); + continue; + } + + if (now - lpn->lpn_rtrcheck_timestamp < + alive_router_check_interval / lnet_current_net_count) continue; /* @@ -916,6 +961,9 @@ rescan: } lnet_peer_ni_addref_locked(lpni); + /* specify the net to use */ + rtr->lp_disc_net_id = lpn->lpn_net_id; + /* discover the router */ CDEBUG(D_NET, "discover %s, cpt = %d\n", libcfs_nid2str(lpni->lpni_nid), cpt); @@ -925,7 +973,7 @@ rescan: lnet_peer_ni_decref_locked(lpni); if (!rc) - rtr->lp_rtrcheck_timestamp = now; + lpn->lpn_rtrcheck_timestamp = now; else CERROR("Failed to discover router %s\n", libcfs_nid2str(rtr->lp_primary_nid)); -- 1.8.3.1