X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=blobdiff_plain;f=lnet%2Flnet%2Frouter.c;h=6b45ffd2be6e16b93c6163b886ae6d034640523a;hp=6daca44d55aa4bb065e09de0ba791c4f5c3040de;hb=416e67222b769df490a8be034ef987a596dd8dff;hpb=b81bcc6c6f0c54c48e908eccb13adc620582881e;ds=sidebyside diff --git a/lnet/lnet/router.c b/lnet/lnet/router.c index 6daca44..6b45ffd 100644 --- a/lnet/lnet/router.c +++ b/lnet/lnet/router.c @@ -35,8 +35,6 @@ #define LNET_NRB_LARGE_PAGES ((LNET_MTU + PAGE_SIZE - 1) >> \ PAGE_SHIFT) -extern unsigned int lnet_current_net_count; - static char *forwarding = ""; module_param(forwarding, charp, 0444); MODULE_PARM_DESC(forwarding, "Explicitly enable/disable forwarding between networks"); @@ -187,7 +185,8 @@ lnet_peers_start_down(void) } /* - * A net is alive if at least one gateway NI on the network is alive. + * The peer_net of a gateway is alive if at least one of the peer_ni's on + * that peer_net is alive. */ static bool lnet_is_gateway_net_alive(struct lnet_peer_net *lpn) @@ -210,6 +209,9 @@ bool lnet_is_gateway_alive(struct lnet_peer *gw) { struct lnet_peer_net *lpn; + if (!gw->lp_alive) + return false; + list_for_each_entry(lpn, &gw->lp_peer_nets, lpn_peer_nets) { if (!lnet_is_gateway_net_alive(lpn)) return false; @@ -230,7 +232,10 @@ bool lnet_is_route_alive(struct lnet_route *route) struct lnet_peer *gw = route->lr_gateway; struct lnet_peer_net *llpn; struct lnet_peer_net *rlpn; - bool route_alive; + + /* If the gateway is down then all routes are considered down */ + if (!gw->lp_alive) + return false; /* * if discovery is disabled then rely on the cached aliveness @@ -239,40 +244,39 @@ bool lnet_is_route_alive(struct lnet_route *route) * aliveness information can only be obtained when discovery is * enabled. */ - if (lnet_peer_discovery_disabled) + if (lnet_is_discovery_disabled(gw)) return route->lr_alive; /* - * check the gateway's interfaces on the route rnet to make sure - * that the gateway is viable. 
+ * check the gateway's interfaces on the local network */ llpn = lnet_peer_get_net_locked(gw, route->lr_lnet); if (!llpn) return false; - route_alive = lnet_is_gateway_net_alive(llpn); + if (!lnet_is_gateway_net_alive(llpn)) + return false; if (avoid_asym_router_failure) { + /* Check the gateway's interfaces on the remote network */ rlpn = lnet_peer_get_net_locked(gw, route->lr_net); if (!rlpn) return false; - route_alive = route_alive && - lnet_is_gateway_net_alive(rlpn); + if (!lnet_is_gateway_net_alive(rlpn)) + return false; } - if (!route_alive) - return route_alive; - spin_lock(&gw->lp_lock); if (!(gw->lp_state & LNET_PEER_ROUTER_ENABLED)) { + spin_unlock(&gw->lp_lock); if (gw->lp_rtr_refcount > 0) CERROR("peer %s is being used as a gateway but routing feature is not turned on\n", libcfs_nid2str(gw->lp_primary_nid)); - route_alive = false; + return false; } spin_unlock(&gw->lp_lock); - return route_alive; + return true; } void @@ -332,18 +336,23 @@ lnet_router_discovery_ping_reply(struct lnet_peer *lp) spin_lock(&lp->lp_lock); lp_state = lp->lp_state; - spin_unlock(&lp->lp_lock); /* only handle replies if discovery is disabled. */ - if (!lnet_peer_discovery_disabled) + if (!lnet_is_discovery_disabled_locked(lp)) { + spin_unlock(&lp->lp_lock); return; + } - if (lp_state & LNET_PEER_PING_FAILED) { - CDEBUG(D_NET, - "Ping failed with %d. Set routes down for gw %s\n", - lp->lp_ping_error, libcfs_nid2str(lp->lp_primary_nid)); - /* If the ping failed then mark the routes served by this - * peer down + spin_unlock(&lp->lp_lock); + + if (lp_state & LNET_PEER_PING_FAILED || + pbuf->pb_info.pi_features & LNET_PING_FEAT_RTE_DISABLED) { + CDEBUG(D_NET, "Set routes down for gw %s because %s %d\n", + libcfs_nid2str(lp->lp_primary_nid), + lp_state & LNET_PEER_PING_FAILED ? 
"ping failed" : + "route feature is disabled", lp->lp_ping_error); + /* If the ping failed or the peer has routing disabled then + * mark the routes served by this peer down */ list_for_each_entry(route, &lp->lp_routes, lr_gwlist) lnet_set_route_aliveness(route, false); @@ -373,13 +382,6 @@ lnet_router_discovery_ping_reply(struct lnet_peer *lp) route->lr_gateway->lp_primary_nid) continue; - /* gateway has the routing feature disabled */ - if (pbuf->pb_info.pi_features & - LNET_PING_FEAT_RTE_DISABLED) { - lnet_set_route_aliveness(route, false); - continue; - } - llpn = lnet_peer_get_net_locked(lp, route->lr_lnet); if (!llpn) { lnet_set_route_aliveness(route, false); @@ -429,23 +431,25 @@ lnet_router_discovery_complete(struct lnet_peer *lp) spin_lock(&lp->lp_lock); lp->lp_state &= ~LNET_PEER_RTR_DISCOVERY; + lp->lp_state |= LNET_PEER_RTR_DISCOVERED; + lp->lp_alive = lp->lp_dc_error == 0; spin_unlock(&lp->lp_lock); /* * Router discovery successful? All peer information would've been * updated already. No need to do any more processing */ - if (!lp->lp_dc_error) + if (lp->lp_alive) return; + /* - * discovery failed? then we need to set the status of each lpni - * to DOWN. It will be updated the next time we discover the - * router. For router peer NIs not on local networks, we never send - * messages directly to them, so their health will always remain - * at maximum. We can only tell if they are up or down from the - * status returned in the PING response. If we fail to get that - * status in our scheduled router discovery, then we'll assume - * it's down until we're told otherwise. + * We do not send messages directly to the remote interfaces + * of an LNet router. As such, we rely on the PING response + * to determine the up/down status of these interfaces. 
If + * a PING response is not received, or some other problem with + * discovery occurs that prevents us from getting this status, + * we assume all interfaces are down until we're able to + * determine otherwise. */ CDEBUG(D_NET, "%s: Router discovery failed %d\n", libcfs_nid2str(lp->lp_primary_nid), lp->lp_dc_error); @@ -522,7 +526,6 @@ static void lnet_shuffle_seed(void) add_device_randomness(&ni->ni_nid, sizeof(ni->ni_nid)); seeded = 1; - return; } /* NB expects LNET_LOCK held */ @@ -751,7 +754,7 @@ lnet_del_route(__u32 net, lnet_nid_t gw_nid) struct lnet_peer_ni *lpni; struct lnet_route *route; struct list_head zombies; - struct lnet_peer *lp; + struct lnet_peer *lp = NULL; int i = 0; INIT_LIST_HEAD(&rnet_zombies); @@ -927,7 +930,7 @@ lnet_wait_known_routerstate(void) spin_lock(&rtr->lp_lock); - if ((rtr->lp_state & LNET_PEER_DISCOVERED) == 0) { + if ((rtr->lp_state & LNET_PEER_RTR_DISCOVERED) == 0) { all_known = 0; spin_unlock(&rtr->lp_lock); break; @@ -1667,6 +1670,7 @@ lnet_notify(struct lnet_ni *ni, lnet_nid_t nid, bool alive, bool reset, lnet_peer_ni_decref_locked(lpni); if (lpni && lpni->lpni_peer_net && lpni->lpni_peer_net->lpn_peer) { lp = lpni->lpni_peer_net->lpn_peer; + lp->lp_alive = alive; list_for_each_entry(route, &lp->lp_routes, lr_gwlist) lnet_set_route_aliveness(route, alive); }