X-Git-Url: https://git.whamcloud.com/?a=blobdiff_plain;f=lnet%2Flnet%2Frouter.c;h=cae8da60de9aad8aef0f7ef792107aefcfaa3451;hb=135b5c0009e5201ac70394ee1fe98e523fe86072;hp=380f5ea5b7fdc95eda4d09db6a84f5ca8848d088;hpb=010f6b1819b9009745abda8d6119589dc336bd95;p=fs%2Flustre-release.git diff --git a/lnet/lnet/router.c b/lnet/lnet/router.c index 380f5ea..cae8da6 100644 --- a/lnet/lnet/router.c +++ b/lnet/lnet/router.c @@ -35,8 +35,6 @@ #define LNET_NRB_LARGE_PAGES ((LNET_MTU + PAGE_SIZE - 1) >> \ PAGE_SHIFT) -extern unsigned int lnet_current_net_count; - static char *forwarding = ""; module_param(forwarding, charp, 0444); MODULE_PARM_DESC(forwarding, "Explicitly enable/disable forwarding between networks"); @@ -187,7 +185,8 @@ lnet_peers_start_down(void) } /* - * A net is alive if at least one gateway NI on the network is alive. + * The peer_net of a gateway is alive if at least one of the peer_ni's on + * that peer_net is alive. */ static bool lnet_is_gateway_net_alive(struct lnet_peer_net *lpn) @@ -210,6 +209,9 @@ bool lnet_is_gateway_alive(struct lnet_peer *gw) { struct lnet_peer_net *lpn; + if (!gw->lp_alive) + return false; + list_for_each_entry(lpn, &gw->lp_peer_nets, lpn_peer_nets) { if (!lnet_is_gateway_net_alive(lpn)) return false; @@ -230,7 +232,10 @@ bool lnet_is_route_alive(struct lnet_route *route) struct lnet_peer *gw = route->lr_gateway; struct lnet_peer_net *llpn; struct lnet_peer_net *rlpn; - bool route_alive; + + /* If the gateway is down then all routes are considered down */ + if (!gw->lp_alive) + return false; /* * if discovery is disabled then rely on the cached aliveness @@ -243,36 +248,35 @@ bool lnet_is_route_alive(struct lnet_route *route) return route->lr_alive; /* - * check the gateway's interfaces on the route rnet to make sure - * that the gateway is viable. 
+ * check the gateway's interfaces on the local network */ llpn = lnet_peer_get_net_locked(gw, route->lr_lnet); if (!llpn) return false; - route_alive = lnet_is_gateway_net_alive(llpn); + if (!lnet_is_gateway_net_alive(llpn)) + return false; if (avoid_asym_router_failure) { + /* Check the gateway's interfaces on the remote network */ rlpn = lnet_peer_get_net_locked(gw, route->lr_net); if (!rlpn) return false; - route_alive = route_alive && - lnet_is_gateway_net_alive(rlpn); + if (!lnet_is_gateway_net_alive(rlpn)) + return false; } - if (!route_alive) - return route_alive; - spin_lock(&gw->lp_lock); if (!(gw->lp_state & LNET_PEER_ROUTER_ENABLED)) { + spin_unlock(&gw->lp_lock); if (gw->lp_rtr_refcount > 0) CERROR("peer %s is being used as a gateway but routing feature is not turned on\n", libcfs_nid2str(gw->lp_primary_nid)); - route_alive = false; + return false; } spin_unlock(&gw->lp_lock); - return route_alive; + return true; } void @@ -341,12 +345,14 @@ lnet_router_discovery_ping_reply(struct lnet_peer *lp) spin_unlock(&lp->lp_lock); - if (lp_state & LNET_PEER_PING_FAILED) { - CDEBUG(D_NET, - "Ping failed with %d. Set routes down for gw %s\n", - lp->lp_ping_error, libcfs_nid2str(lp->lp_primary_nid)); - /* If the ping failed then mark the routes served by this - * peer down + if (lp_state & LNET_PEER_PING_FAILED || + pbuf->pb_info.pi_features & LNET_PING_FEAT_RTE_DISABLED) { + CDEBUG(D_NET, "Set routes down for gw %s because %s %d\n", + libcfs_nid2str(lp->lp_primary_nid), + lp_state & LNET_PEER_PING_FAILED ? 
"ping failed" : + "route feature is disabled", lp->lp_ping_error); + /* If the ping failed or the peer has routing disabled then + * mark the routes served by this peer down */ list_for_each_entry(route, &lp->lp_routes, lr_gwlist) lnet_set_route_aliveness(route, false); @@ -376,13 +382,6 @@ lnet_router_discovery_ping_reply(struct lnet_peer *lp) route->lr_gateway->lp_primary_nid) continue; - /* gateway has the routing feature disabled */ - if (pbuf->pb_info.pi_features & - LNET_PING_FEAT_RTE_DISABLED) { - lnet_set_route_aliveness(route, false); - continue; - } - llpn = lnet_peer_get_net_locked(lp, route->lr_lnet); if (!llpn) { lnet_set_route_aliveness(route, false); @@ -432,23 +431,25 @@ lnet_router_discovery_complete(struct lnet_peer *lp) spin_lock(&lp->lp_lock); lp->lp_state &= ~LNET_PEER_RTR_DISCOVERY; + lp->lp_state |= LNET_PEER_RTR_DISCOVERED; + lp->lp_alive = lp->lp_dc_error == 0; spin_unlock(&lp->lp_lock); /* * Router discovery successful? All peer information would've been * updated already. No need to do any more processing */ - if (!lp->lp_dc_error) + if (lp->lp_alive) return; + /* - * discovery failed? then we need to set the status of each lpni - * to DOWN. It will be updated the next time we discover the - * router. For router peer NIs not on local networks, we never send - * messages directly to them, so their health will always remain - * at maximum. We can only tell if they are up or down from the - * status returned in the PING response. If we fail to get that - * status in our scheduled router discovery, then we'll assume - * it's down until we're told otherwise. + * We do not send messages directly to the remote interfaces + * of an LNet router. As such, we rely on the PING response + * to determine the up/down status of these interfaces. 
If + * a PING response is not received, or some other problem with + * discovery occurs that prevents us from getting this status, + * we assume all interfaces are down until we're able to + * determine otherwise. */ CDEBUG(D_NET, "%s: Router discovery failed %d\n", libcfs_nid2str(lp->lp_primary_nid), lp->lp_dc_error); @@ -525,7 +526,6 @@ static void lnet_shuffle_seed(void) add_device_randomness(&ni->ni_nid, sizeof(ni->ni_nid)); seeded = 1; - return; } /* NB expects LNET_LOCK held */ @@ -747,19 +747,16 @@ lnet_del_route_from_rnet(lnet_nid_t gw_nid, struct list_head *route_list, int lnet_del_route(__u32 net, lnet_nid_t gw_nid) { - struct list_head rnet_zombies; + LIST_HEAD(rnet_zombies); struct lnet_remotenet *rnet; struct lnet_remotenet *tmp; struct list_head *rn_list; struct lnet_peer_ni *lpni; struct lnet_route *route; - struct list_head zombies; - struct lnet_peer *lp; + LIST_HEAD(zombies); + struct lnet_peer *lp = NULL; int i = 0; - INIT_LIST_HEAD(&rnet_zombies); - INIT_LIST_HEAD(&zombies); - CDEBUG(D_NET, "Del route: net %s : gw %s\n", libcfs_net2str(net), libcfs_nid2str(gw_nid)); @@ -930,7 +927,7 @@ lnet_wait_known_routerstate(void) spin_lock(&rtr->lp_lock); - if ((rtr->lp_state & LNET_PEER_DISCOVERED) == 0) { + if ((rtr->lp_state & LNET_PEER_RTR_DISCOVERED) == 0) { all_known = 0; spin_unlock(&rtr->lp_lock); break; @@ -1198,13 +1195,11 @@ lnet_rtrpool_free_bufs(struct lnet_rtrbufpool *rbp, int cpt) { int npages = rbp->rbp_npages; struct lnet_rtrbuf *rb; - struct list_head tmp; + LIST_HEAD(tmp); if (rbp->rbp_nbuffers == 0) /* not initialized or already freed */ return; - INIT_LIST_HEAD(&tmp); - lnet_net_lock(cpt); list_splice_init(&rbp->rbp_msgs, &tmp); lnet_drop_routed_msgs_locked(&tmp, cpt); @@ -1225,7 +1220,7 @@ lnet_rtrpool_free_bufs(struct lnet_rtrbufpool *rbp, int cpt) static int lnet_rtrpool_adjust_bufs(struct lnet_rtrbufpool *rbp, int nbufs, int cpt) { - struct list_head rb_list; + LIST_HEAD(rb_list); struct lnet_rtrbuf *rb; int num_rb; int 
num_buffers = 0; @@ -1253,8 +1248,6 @@ lnet_rtrpool_adjust_bufs(struct lnet_rtrbufpool *rbp, int nbufs, int cpt) rbp->rbp_req_nbuffers = nbufs; lnet_net_unlock(cpt); - INIT_LIST_HEAD(&rb_list); - /* allocate the buffers on a local list first. If all buffers are * allocated successfully then join this list to the rbp buffer * list. If not then free all allocated buffers. */ @@ -1670,6 +1663,7 @@ lnet_notify(struct lnet_ni *ni, lnet_nid_t nid, bool alive, bool reset, lnet_peer_ni_decref_locked(lpni); if (lpni && lpni->lpni_peer_net && lpni->lpni_peer_net->lpn_peer) { lp = lpni->lpni_peer_net->lpn_peer; + lp->lp_alive = alive; list_for_each_entry(route, &lp->lp_routes, lr_gwlist) lnet_set_route_aliveness(route, alive); }