X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=blobdiff_plain;f=lnet%2Flnet%2Frouter.c;h=50441b366f7ac07c9ed1a5bcd2ccb33c37de21fa;hp=e5dd77bf0094a7ff25e15a5908a17197a8b16f5f;hb=0131d39a622f1efc07dc49df7bceed1bbe16357d;hpb=0f81c5ae973bf7fba45b6ba7f9c5f4fb1f6eadcb diff --git a/lnet/lnet/router.c b/lnet/lnet/router.c index e5dd77b..50441b3 100644 --- a/lnet/lnet/router.c +++ b/lnet/lnet/router.c @@ -619,6 +619,7 @@ lnet_add_route_to_rnet(struct lnet_remotenet *rnet, struct lnet_route *route) unsigned int offset = 0; unsigned int len = 0; struct list_head *e; + time64_t now; lnet_shuffle_seed(); @@ -641,9 +642,10 @@ lnet_add_route_to_rnet(struct lnet_remotenet *rnet, struct lnet_route *route) * force a router check on the gateway to make sure the route is * alive */ + now = ktime_get_real_seconds(); list_for_each_entry(lpn, &route->lr_gateway->lp_peer_nets, lpn_peer_nets) { - lpn->lpn_rtrcheck_timestamp = 0; + lpn->lpn_next_ping = now; } the_lnet.ln_remote_nets_version++; @@ -1137,12 +1139,13 @@ bool lnet_router_checker_active(void) void lnet_check_routers(void) { - struct lnet_peer_net *first_lpn = NULL; + struct lnet_peer_net *first_lpn; struct lnet_peer_net *lpn; struct lnet_peer_ni *lpni; struct list_head *entry; struct lnet_peer *rtr; bool push = false; + bool needs_ping; bool found_lpn; __u64 version; __u32 net_id; @@ -1158,15 +1161,18 @@ rescan: rtr = list_entry(entry, struct lnet_peer, lp_rtr_list); + /* If we're currently discovering the peer then don't + * issue another discovery + */ + if (rtr->lp_state & LNET_PEER_RTR_DISCOVERY) + continue; + now = ktime_get_real_seconds(); - /* - * only discover the router if we've passed - * alive_router_check_interval seconds. Some of the router - * interfaces could be down and in that case they would be - * undergoing recovery separately from this discovery. - */ - /* find next peer net which is also local */ + /* find the next local peer net which needs to be ping'd */ + needs_ping = false; + first_lpn = NULL; + found_lpn = false; net_id = rtr->lp_disc_net_id; do { lpn = lnet_get_next_peer_net_locked(rtr, net_id); @@ -1175,13 +1181,27 @@ rescan: libcfs_nid2str(rtr->lp_primary_nid)); break; } + + /* We looped back to the first peer net */ if (first_lpn == lpn) break; if (!first_lpn) first_lpn = lpn; - found_lpn = lnet_islocalnet_locked(lpn->lpn_net_id); + net_id = lpn->lpn_net_id; - } while (!found_lpn); + if (!lnet_islocalnet_locked(net_id)) + continue; + + found_lpn = true; + + CDEBUG(D_NET, "rtr %s(%p) %s(%p) next ping %lld\n", + libcfs_nid2str(rtr->lp_primary_nid), rtr, + libcfs_net2str(net_id), lpn, + lpn->lpn_next_ping); + + needs_ping = now >= lpn->lpn_next_ping; + + } while (!needs_ping); if (!found_lpn || !lpn) { CERROR("no local network found for gateway %s\n", @@ -1189,19 +1209,10 @@ rescan: continue; } - if (now - lpn->lpn_rtrcheck_timestamp < - alive_router_check_interval / lnet_current_net_count) - continue; + if (!needs_ping) + continue; - /* - * If we're currently discovering the peer then don't - * issue another discovery - */ spin_lock(&rtr->lp_lock); - if (rtr->lp_state & LNET_PEER_RTR_DISCOVERY) { - spin_unlock(&rtr->lp_lock); - continue; - } /* make sure we fully discover the router */ rtr->lp_state &= ~LNET_PEER_NIDS_UPTODATE; rtr->lp_state |= LNET_PEER_FORCE_PING | LNET_PEER_FORCE_PUSH | @@ -1225,16 +1236,16 @@ rescan: libcfs_nid2str(lpni->lpni_nid), cpt); rc = lnet_discover_peer_locked(lpni, cpt, false); - /* decrement ref count acquired by find_peer_ni_locked() */ + /* drop ref taken above */ lnet_peer_ni_decref_locked(lpni); if (!rc) - lpn->lpn_rtrcheck_timestamp = now; + lpn->lpn_next_ping = now + alive_router_check_interval; else CERROR("Failed to discover router %s\n", libcfs_nid2str(rtr->lp_primary_nid)); - /* NB dropped lock */ + /* NB cpt lock was dropped in lnet_discover_peer_locked() */ if (version != the_lnet.ln_routers_version) { /* the routers list has changed */ goto rescan;