Keep track of the aliveness of a peer so that we can optimize for
situations where an LNet router hasn't responded to a ping. In
this situation we consider all routes down, and we needn't spend time
inspecting each route, or inspecting all of the router's local and
remote interfaces in order to determine the router's aliveness.
Cray-bug-id: LUS-7860
Test-Parameters: trivial
Signed-off-by: Chris Horn <hornc@cray.com>
Change-Id: Ie63c1ef40de3ad818639bae6b040923898fd5b46
Reviewed-on: https://review.whamcloud.com/36678
Tested-by: jenkins <devops@whamcloud.com>
Reviewed-by: Neil Brown <neilb@suse.de>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Alexey Lyashkov <c17817@cray.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
/* tasks waiting on discovery of this peer */
wait_queue_head_t lp_dc_waitq;
/* tasks waiting on discovery of this peer */
wait_queue_head_t lp_dc_waitq;
+
+ /* cached peer aliveness */
+ bool lp_alive;
init_waitqueue_head(&lp->lp_dc_waitq);
spin_lock_init(&lp->lp_lock);
lp->lp_primary_nid = nid;
init_waitqueue_head(&lp->lp_dc_waitq);
spin_lock_init(&lp->lp_lock);
lp->lp_primary_nid = nid;
+ if (lnet_peers_start_down())
+ lp->lp_alive = false;
+ else
+ lp->lp_alive = true;
/*
* all peers created on a router should have health on
/*
* all peers created on a router should have health on
- * A net is alive if at least one gateway NI on the network is alive.
+ * The peer_net of a gateway is alive if at least one of the peer_ni's on
+ * that peer_net is alive.
*/
static bool
lnet_is_gateway_net_alive(struct lnet_peer_net *lpn)
*/
static bool
lnet_is_gateway_net_alive(struct lnet_peer_net *lpn)
{
struct lnet_peer_net *lpn;
{
struct lnet_peer_net *lpn;
+ if (!gw->lp_alive)
+ return false;
+
list_for_each_entry(lpn, &gw->lp_peer_nets, lpn_peer_nets) {
if (!lnet_is_gateway_net_alive(lpn))
return false;
list_for_each_entry(lpn, &gw->lp_peer_nets, lpn_peer_nets) {
if (!lnet_is_gateway_net_alive(lpn))
return false;
struct lnet_peer *gw = route->lr_gateway;
struct lnet_peer_net *llpn;
struct lnet_peer_net *rlpn;
struct lnet_peer *gw = route->lr_gateway;
struct lnet_peer_net *llpn;
struct lnet_peer_net *rlpn;
+
+ /* If the gateway is down then all routes are considered down */
+ if (!gw->lp_alive)
+ return false;
/*
* if discovery is disabled then rely on the cached aliveness
/*
* if discovery is disabled then rely on the cached aliveness
return route->lr_alive;
/*
return route->lr_alive;
/*
- * check the gateway's interfaces on the route rnet to make sure
- * that the gateway is viable.
+ * check the gateway's interfaces on the local network
*/
llpn = lnet_peer_get_net_locked(gw, route->lr_lnet);
if (!llpn)
return false;
*/
llpn = lnet_peer_get_net_locked(gw, route->lr_lnet);
if (!llpn)
return false;
- route_alive = lnet_is_gateway_net_alive(llpn);
+ if (!lnet_is_gateway_net_alive(llpn))
+ return false;
if (avoid_asym_router_failure) {
if (avoid_asym_router_failure) {
+ /* Check the gateway's interfaces on the remote network */
rlpn = lnet_peer_get_net_locked(gw, route->lr_net);
if (!rlpn)
return false;
rlpn = lnet_peer_get_net_locked(gw, route->lr_net);
if (!rlpn)
return false;
- route_alive = route_alive &&
- lnet_is_gateway_net_alive(rlpn);
+ if (!lnet_is_gateway_net_alive(rlpn))
+ return false;
- if (!route_alive)
- return route_alive;
-
spin_lock(&gw->lp_lock);
if (!(gw->lp_state & LNET_PEER_ROUTER_ENABLED)) {
spin_lock(&gw->lp_lock);
if (!(gw->lp_state & LNET_PEER_ROUTER_ENABLED)) {
+ spin_unlock(&gw->lp_lock);
if (gw->lp_rtr_refcount > 0)
CERROR("peer %s is being used as a gateway but routing feature is not turned on\n",
libcfs_nid2str(gw->lp_primary_nid));
if (gw->lp_rtr_refcount > 0)
CERROR("peer %s is being used as a gateway but routing feature is not turned on\n",
libcfs_nid2str(gw->lp_primary_nid));
}
spin_unlock(&gw->lp_lock);
}
spin_unlock(&gw->lp_lock);
spin_lock(&lp->lp_lock);
lp->lp_state &= ~LNET_PEER_RTR_DISCOVERY;
lp->lp_state |= LNET_PEER_RTR_DISCOVERED;
spin_lock(&lp->lp_lock);
lp->lp_state &= ~LNET_PEER_RTR_DISCOVERY;
lp->lp_state |= LNET_PEER_RTR_DISCOVERED;
+ lp->lp_alive = lp->lp_dc_error == 0;
spin_unlock(&lp->lp_lock);
/*
* Router discovery successful? All peer information would've been
* updated already. No need to do any more processing
*/
spin_unlock(&lp->lp_lock);
/*
* Router discovery successful? All peer information would've been
* updated already. No need to do any more processing
*/
- * discovery failed? then we need to set the status of each lpni
- * to DOWN. It will be updated the next time we discover the
- * router. For router peer NIs not on local networks, we never send
- * messages directly to them, so their health will always remain
- * at maximum. We can only tell if they are up or down from the
- * status returned in the PING response. If we fail to get that
- * status in our scheduled router discovery, then we'll assume
- * it's down until we're told otherwise.
+ * We do not send messages directly to the remote interfaces
+ * of an LNet router. As such, we rely on the PING response
+ * to determine the up/down status of these interfaces. If
+ * a PING response is not received, or some other problem with
+ * discovery occurs that prevents us from getting this status,
+ * we assume all interfaces are down until we're able to
+ * determine otherwise.
*/
CDEBUG(D_NET, "%s: Router discovery failed %d\n",
libcfs_nid2str(lp->lp_primary_nid), lp->lp_dc_error);
*/
CDEBUG(D_NET, "%s: Router discovery failed %d\n",
libcfs_nid2str(lp->lp_primary_nid), lp->lp_dc_error);
lnet_peer_ni_decref_locked(lpni);
if (lpni && lpni->lpni_peer_net && lpni->lpni_peer_net->lpn_peer) {
lp = lpni->lpni_peer_net->lpn_peer;
lnet_peer_ni_decref_locked(lpni);
if (lpni && lpni->lpni_peer_net && lpni->lpni_peer_net->lpn_peer) {
lp = lpni->lpni_peer_net->lpn_peer;
list_for_each_entry(route, &lp->lp_routes, lr_gwlist)
lnet_set_route_aliveness(route, alive);
}
list_for_each_entry(route, &lp->lp_routes, lr_gwlist)
lnet_set_route_aliveness(route, alive);
}