Whamcloud - gitweb
LU-13785 lnet: Use lr_hops for avoid_asym_router_failure
[fs/lustre-release.git] / lnet / lnet / router.c
index acddc75..fb60c56 100644 (file)
@@ -326,7 +326,8 @@ bool lnet_is_route_alive(struct lnet_route *route)
         * that the remote net must exist on the gateway. For multi-hop
         * routes the next-hop will not have the remote net.
         */
-       if (avoid_asym_router_failure && route->lr_single_hop) {
+       if (avoid_asym_router_failure &&
+           (route->lr_hops == 1 || route->lr_hops == LNET_UNDEFINED_HOPS)) {
                rlpn = lnet_peer_get_net_locked(gw, route->lr_net);
                if (!rlpn)
                        return false;
@@ -377,7 +378,8 @@ lnet_consolidate_routes_locked(struct lnet_peer *orig_lp,
 static inline void
 lnet_check_route_inconsistency(struct lnet_route *route)
 {
-       if (!route->lr_single_hop && (int)route->lr_hops <= 1) {
+       if (!route->lr_single_hop &&
+           (route->lr_hops == 1 || route->lr_hops == LNET_UNDEFINED_HOPS)) {
                CWARN("route %s->%s is detected to be multi-hop but hop count is set to %d\n",
                        libcfs_net2str(route->lr_net),
                        libcfs_nid2str(route->lr_gateway->lp_primary_nid),
@@ -401,6 +403,7 @@ lnet_set_route_hop_type(struct lnet_peer *gw, struct lnet_route *route)
        lnet_check_route_inconsistency(route);
 }
 
+/* Must hold net_lock/EX */
 static inline void
 lnet_set_route_aliveness(struct lnet_route *route, bool alive)
 {
@@ -415,6 +418,7 @@ lnet_set_route_aliveness(struct lnet_route *route, bool alive)
        }
 }
 
+/* Must hold net_lock/EX */
 void
 lnet_router_discovery_ping_reply(struct lnet_peer *lp)
 {
@@ -491,7 +495,9 @@ lnet_router_discovery_ping_reply(struct lnet_peer *lp)
                }
 
                route->lr_single_hop = single_hop;
-               if (avoid_asym_router_failure && single_hop)
+               if (avoid_asym_router_failure &&
+                   (route->lr_hops == 1 ||
+                    route->lr_hops == LNET_UNDEFINED_HOPS))
                        lnet_set_route_aliveness(route, net_up);
                else
                        lnet_set_route_aliveness(route, true);
@@ -1770,6 +1776,37 @@ lnet_notify(struct lnet_ni *ni, lnet_nid_t nid, bool alive, bool reset,
 
        /* recalculate aliveness */
        alive = lnet_is_peer_ni_alive(lpni);
+
+       lp = lpni->lpni_peer_net->lpn_peer;
+       /* If this is an LNet router then update route aliveness */
+       if (lp->lp_rtr_refcount) {
+               if (reset)
+                       /* reset flag indicates gateway peer went up or down */
+                       lp->lp_alive = alive;
+
+               /* If discovery is disabled, locally or on the gateway, then
+                * any routes using lpni as next-hop need to be updated.
+                *
+                * NB: We can get many notifications while a route is down, so
+                * we try to avoid the expensive net_lock/EX here for the
+                * common case of receiving duplicate lnet_notify() calls (i.e.
+                * we only grab the EX lock when we actually need to update the
+                * route aliveness).
+                */
+               if (lnet_is_discovery_disabled(lp)) {
+                       list_for_each_entry(route, &lp->lp_routes, lr_gwlist) {
+                               if (route->lr_nid == lpni->lpni_nid &&
+                                   route->lr_alive != alive) {
+                                       lnet_net_unlock(0);
+                                       lnet_net_lock(LNET_LOCK_EX);
+                                       lnet_set_route_aliveness(route, alive);
+                                       lnet_net_unlock(LNET_LOCK_EX);
+                                       lnet_net_lock(0);
+                               }
+                       }
+               }
+       }
+
        lnet_net_unlock(0);
 
        if (ni != NULL && !alive)
@@ -1778,12 +1815,6 @@ lnet_notify(struct lnet_ni *ni, lnet_nid_t nid, bool alive, bool reset,
        cpt = lpni->lpni_cpt;
        lnet_net_lock(cpt);
        lnet_peer_ni_decref_locked(lpni);
-       if (lpni && lpni->lpni_peer_net && lpni->lpni_peer_net->lpn_peer) {
-               lp = lpni->lpni_peer_net->lpn_peer;
-               lp->lp_alive = alive;
-               list_for_each_entry(route, &lp->lp_routes, lr_gwlist)
-                       lnet_set_route_aliveness(route, alive);
-       }
        lnet_net_unlock(cpt);
 
        return 0;