Whamcloud - gitweb
LU-6142 lnet: use list_first_entry() in lnet/lnet subdirectory.
[fs/lustre-release.git] / lnet / lnet / router.c
index 599a8a1..d581fa2 100644 (file)
@@ -35,8 +35,6 @@
 #define LNET_NRB_LARGE_PAGES   ((LNET_MTU + PAGE_SIZE - 1) >> \
                                  PAGE_SHIFT)
 
-extern unsigned int lnet_current_net_count;
-
 static char *forwarding = "";
 module_param(forwarding, charp, 0444);
 MODULE_PARM_DESC(forwarding, "Explicitly enable/disable forwarding between networks");
@@ -80,6 +78,14 @@ int avoid_asym_router_failure = 1;
 module_param(avoid_asym_router_failure, int, 0644);
 MODULE_PARM_DESC(avoid_asym_router_failure, "Avoid asymmetrical router failures (0 to disable)");
 
+int dead_router_check_interval = INT_MIN; /* read-only (0444) parameter, described below as deprecated; INT_MIN presumably means "not set" - TODO confirm */
+module_param(dead_router_check_interval, int, 0444);
+MODULE_PARM_DESC(dead_router_check_interval, "(DEPRECATED - Use alive_router_check_interval)");
+
+int live_router_check_interval = INT_MIN; /* same treatment as dead_router_check_interval above */
+module_param(live_router_check_interval, int, 0444);
+MODULE_PARM_DESC(live_router_check_interval, "(DEPRECATED - Use alive_router_check_interval)");
+
 int alive_router_check_interval = 60;
 module_param(alive_router_check_interval, int, 0644);
 MODULE_PARM_DESC(alive_router_check_interval, "Seconds between live router health checks (<= 0 to disable)");
@@ -112,6 +118,12 @@ module_param_call(router_sensitivity_percentage, rtr_sensitivity_set, param_get_
 MODULE_PARM_DESC(router_sensitivity_percentage,
                "How healthy a gateway should be to be used in percent");
 
+static void lnet_add_route_to_rnet(struct lnet_remotenet *rnet,
+                                  struct lnet_route *route); /* forward declaration: lnet_move_route() calls this before its definition */
+static void lnet_del_route_from_rnet(struct lnet_nid *gw_nid,
+                                    struct list_head *route_list,
+                                    struct list_head *zombies); /* NOTE(review): declared static here, but this patch writes the definition as plain "void"; linkage stays internal, yet the two should agree */
+
 static int
 rtr_sensitivity_set(const char *val, cfs_kernel_param_arg_t *kp)
 {
@@ -144,30 +156,91 @@ rtr_sensitivity_set(const char *val, cfs_kernel_param_arg_t *kp)
 }
 
 void
+lnet_move_route(struct lnet_route *route, struct lnet_peer *lp,
+               struct list_head *rt_list)
+{ /* Pull @route off its remote net; then re-add it under gateway @lp, or free it when @lp is NULL */
+       struct lnet_remotenet *rnet;
+       struct list_head zombies; /* private sink, used when the caller passes no @rt_list */
+       struct list_head *l; /* list handed to lnet_del_route_from_rnet() as its zombies argument */
+
+       INIT_LIST_HEAD(&zombies);
+
+       if (rt_list)
+               l = rt_list;
+       else
+               l = &zombies;
+
+       rnet = lnet_find_rnet_locked(route->lr_net);
+       LASSERT(rnet); /* the remote net of @route is required to exist already */
+
+       CDEBUG(D_NET, "deleting route %s->%s\n",
+              libcfs_net2str(route->lr_net),
+              libcfs_nidstr(&route->lr_nid));
+
+       /*
+        * use the gateway's lp_primary_nid to delete the route as the
+        * lr_nid can be a constituent NID of the peer
+        */
+       lnet_del_route_from_rnet(
+               &route->lr_gateway->lp_primary_nid,
+               &rnet->lrn_routes, l);
+
+       if (lp) { /* NOTE(review): takes the first entry of l with no list_empty() check; assumes the delete above put @route there - confirm, especially when @rt_list was not empty on entry */
+               route = list_first_entry(l, struct lnet_route,
+                                        lr_list);
+               route->lr_gateway = lp;
+               lnet_add_route_to_rnet(rnet, route);
+       } else {
+               while (!list_empty(l) && !rt_list) { /* frees only from the private list; with @rt_list the entries are left for the caller */
+                       route = list_first_entry(l, struct lnet_route,
+                                                lr_list);
+                       list_del(&route->lr_list);
+                       LIBCFS_FREE(route, sizeof(*route));
+               }
+       }
+}
+
+void /* transfers the queued router messages and the routes of @src to @target while holding LNET_LOCK_EX */
 lnet_rtr_transfer_to_peer(struct lnet_peer *src, struct lnet_peer *target)
 {
        struct lnet_route *route;
+       struct lnet_route *tmp, *tmp2; /* lookahead cursors for the two _safe list walks below */
 
        lnet_net_lock(LNET_LOCK_EX);
-       target->lp_rtr_refcount += src->lp_rtr_refcount;
-       /* move the list of queued messages to the new peer */
+       CDEBUG(D_NET, "transfering routes from %s -> %s\n",
+              libcfs_nidstr(&src->lp_primary_nid),
+              libcfs_nidstr(&target->lp_primary_nid));
+       list_for_each_entry(route, &src->lp_routes, lr_gwlist) { /* debug dump only: nothing is modified in this loop */
+               CDEBUG(D_NET, "%s: %s->%s\n",
+                      libcfs_nidstr(&src->lp_primary_nid),
+                      libcfs_net2str(route->lr_net),
+                      libcfs_nidstr(&route->lr_nid));
+       }
        list_splice_init(&src->lp_rtrq, &target->lp_rtrq);
-       /* move all the routes that reference the peer */
-       list_splice_init(&src->lp_routes, &target->lp_routes);
-       /* update all the routes to point to the new peer */
-       list_for_each_entry(route, &target->lp_routes, lr_gwlist)
-               route->lr_gateway = target;
-       /* remove the old peer from the ln_routers list */
-       list_del_init(&src->lp_rtr_list);
-       /* add the new peer to the ln_routers list */
+       list_for_each_entry_safe(route, tmp, &src->lp_routes, lr_gwlist) { /* decide per source route: drop it, or re-home it onto @target */
+               struct lnet_route *r2;
+               bool present = false; /* set when @target already has a route to the same net that is kept */
+               list_for_each_entry_safe(r2, tmp2, &target->lp_routes, lr_gwlist) {
+                       if (route->lr_net == r2->lr_net) { /* both routes lead to the same remote net */
+                               if (route->lr_priority >= r2->lr_priority)
+                                       present = true;
+                               else if (route->lr_hops >= r2->lr_hops)
+                                       present = true;
+                               else
+                                       lnet_move_route(r2, NULL, NULL); /* source route has both lower lr_priority and lower lr_hops: remove the route on @target */
+                       }
+               }
+               if (present)
+                       lnet_move_route(route, NULL, NULL); /* @target keeps its own route; the source route is removed */
+               else
+                       lnet_move_route(route, target, NULL); /* re-add the source route with @target as gateway */
+       }
+
        if (list_empty(&target->lp_rtr_list)) {
                lnet_peer_addref_locked(target);
                list_add_tail(&target->lp_rtr_list, &the_lnet.ln_routers);
        }
-       /* reset the ref count on the old peer and decrement its ref count */
-       src->lp_rtr_refcount = 0;
-       lnet_peer_decref_locked(src);
-       /* update the router version */
+
        the_lnet.ln_routers_version++;
        lnet_net_unlock(LNET_LOCK_EX);
 }
@@ -179,7 +252,8 @@ lnet_peers_start_down(void)
 }
 
 /*
- * A net is alive if at least one gateway NI on the network is alive.
+ * The peer_net of a gateway is alive if at least one of the peer_ni's on
+ * that peer_net is alive.
  */
 static bool
 lnet_is_gateway_net_alive(struct lnet_peer_net *lpn)
@@ -202,6 +276,9 @@ bool lnet_is_gateway_alive(struct lnet_peer *gw)
 {
        struct lnet_peer_net *lpn;
 
+       if (!gw->lp_alive)
+               return false;
+
        list_for_each_entry(lpn, &gw->lp_peer_nets, lpn_peer_nets) {
                if (!lnet_is_gateway_net_alive(lpn))
                        return false;
@@ -222,39 +299,56 @@ bool lnet_is_route_alive(struct lnet_route *route)
        struct lnet_peer *gw = route->lr_gateway;
        struct lnet_peer_net *llpn;
        struct lnet_peer_net *rlpn;
-       bool route_alive;
+
+       /* If the gateway is down then all routes are considered down */
+       if (!gw->lp_alive)
+               return false;
+
+       /*
+        * if discovery is disabled then rely on the cached aliveness
+        * information. This is handicapped information which we log when
+        * we receive the discovery ping response. The most up-to-date
+        * aliveness information can only be obtained when discovery is
+        * enabled.
+        */
+       if (lnet_is_discovery_disabled(gw))
+               return atomic_read(&route->lr_alive) == 1;
 
        /*
-        * check the gateway's interfaces on the route rnet to make sure
-        * that the gateway is viable.
+        * check the gateway's interfaces on the local network
         */
        llpn = lnet_peer_get_net_locked(gw, route->lr_lnet);
        if (!llpn)
                return false;
 
-       route_alive = lnet_is_gateway_net_alive(llpn);
+       if (!lnet_is_gateway_net_alive(llpn))
+               return false;
 
-       if (avoid_asym_router_failure) {
+       /*
+        * For single hop routes avoid_asym_router_failure dictates
+        * that the remote net must exist on the gateway. For multi-hop
+        * routes the next-hop will not have the remote net.
+        */
+       if (avoid_asym_router_failure &&
+           (route->lr_hops == 1 || route->lr_single_hop)) {
                rlpn = lnet_peer_get_net_locked(gw, route->lr_net);
                if (!rlpn)
                        return false;
-               route_alive = route_alive &&
-                             lnet_is_gateway_net_alive(rlpn);
+               if (!lnet_is_gateway_net_alive(rlpn))
+                       return false;
        }
 
-       if (!route_alive)
-               return route_alive;
-
        spin_lock(&gw->lp_lock);
        if (!(gw->lp_state & LNET_PEER_ROUTER_ENABLED)) {
+               spin_unlock(&gw->lp_lock);
                if (gw->lp_rtr_refcount > 0)
                        CERROR("peer %s is being used as a gateway but routing feature is not turned on\n",
-                              libcfs_nid2str(gw->lp_primary_nid));
-               route_alive = false;
+                              libcfs_nidstr(&gw->lp_primary_nid));
+               return false;
        }
        spin_unlock(&gw->lp_lock);
 
-       return route_alive;
+       return true;
 }
 
 void
@@ -275,45 +369,180 @@ lnet_consolidate_routes_locked(struct lnet_peer *orig_lp,
         * intent here is not to confuse the user who added the route.
         */
        list_for_each_entry(route, &orig_lp->lp_routes, lr_gwlist) {
-               lpni = lnet_peer_get_ni_locked(orig_lp, route->lr_nid);
+               lpni = lnet_peer_ni_get_locked(orig_lp, &route->lr_nid);
                if (!lpni) {
                        lnet_net_lock(LNET_LOCK_EX);
                        list_move(&route->lr_gwlist, &new_lp->lp_routes);
                        lnet_net_unlock(LNET_LOCK_EX);
                }
        }
+}
+
+static inline void
+lnet_check_route_inconsistency(struct lnet_route *route)
+{ /* Emit a warning when @route is not flagged single-hop yet its hop count is 1 or LNET_UNDEFINED_HOPS */
+       if (!route->lr_single_hop &&
+           (route->lr_hops == 1 || route->lr_hops == LNET_UNDEFINED_HOPS)) {
+               CWARN("route %s->%s is detected to be multi-hop but hop count is set to %d\n",
+                       libcfs_net2str(route->lr_net),
+                       libcfs_nidstr(&route->lr_gateway->lp_primary_nid),
+                       (int) route->lr_hops);
+       }
+}
+
+static void
+lnet_set_route_hop_type(struct lnet_peer *gw, struct lnet_route *route)
+{ /* Set route->lr_single_hop according to whether gateway @gw has a peer net matching route->lr_net */
+       struct lnet_peer_net *lpn;
+       bool single_hop = false;
+
+       list_for_each_entry(lpn, &gw->lp_peer_nets, lpn_peer_nets) {
+               if (route->lr_net == lpn->lpn_net_id) { /* the gateway itself sits on the remote net of the route */
+                       single_hop = true;
+                       break;
+               }
+       }
+       route->lr_single_hop = single_hop;
+       lnet_check_route_inconsistency(route); /* warn if the configured hop count disagrees with what was just detected */
+}
+
+/* Update the aliveness of the routes served by gateway @lp from its ping buffer; acts only when discovery is disabled. Must hold net_lock/EX */
+void
+lnet_router_discovery_ping_reply(struct lnet_peer *lp)
+{
+       struct lnet_ping_buffer *pbuf = lp->lp_data;
+       struct lnet_peer_net *llpn;
+       struct lnet_route *route;
+       bool single_hop = false;
+       bool net_up = false;
+       unsigned lp_state; /* copy of lp->lp_state taken under lp_lock and tested after the unlock */
+       __u32 net;
+       int i;
+
+
+       spin_lock(&lp->lp_lock);
+       lp_state = lp->lp_state;
+
+       /* only handle replies if discovery is disabled. */
+       if (!lnet_is_discovery_disabled_locked(lp)) {
+               spin_unlock(&lp->lp_lock);
+               return;
+       }
+
+       spin_unlock(&lp->lp_lock);
+
+       if (lp_state & LNET_PEER_PING_FAILED ||
+           pbuf->pb_info.pi_features & LNET_PING_FEAT_RTE_DISABLED) {
+               CDEBUG(D_NET, "Set routes down for gw %s because %s %d\n",
+                      libcfs_nidstr(&lp->lp_primary_nid),
+                      lp_state & LNET_PEER_PING_FAILED ? "ping failed" :
+                      "route feature is disabled", lp->lp_ping_error);
+               /* If the ping failed or the peer has routing disabled then
+                * mark the routes served by this peer down
+                */
+               list_for_each_entry(route, &lp->lp_routes, lr_gwlist)
+                       lnet_set_route_aliveness(route, false);
+               return;
+       }
+
+       CDEBUG(D_NET, "Discovery is disabled. Processing reply for gw: %s:%d\n",
+              libcfs_nidstr(&lp->lp_primary_nid), pbuf->pb_info.pi_nnis);
+
+       /*
+        * examine the ping response to determine if the routes on that
+        * gateway should be declared alive.
+        * The route is alive if:
+        *  1. local network to reach the route is alive and
+        *  2. if avoid_asym_router_failure is set and the route is single hop,
+        *     at least one NI on the route's remote net has status UP
+        */
+       list_for_each_entry(route, &lp->lp_routes, lr_gwlist) {
+               llpn = lnet_peer_get_net_locked(lp, route->lr_lnet);
+               if (!llpn) {
+                       lnet_set_route_aliveness(route, false);
+                       continue;
+               }
+
+               if (!lnet_is_gateway_net_alive(llpn)) {
+                       lnet_set_route_aliveness(route, false);
+                       continue;
+               }
 
+               single_hop = net_up = false;
+               for (i = 1; i < pbuf->pb_info.pi_nnis; i++) { /* NOTE(review): scan starts at index 1, so entry 0 is never examined - presumably the loopback NI, confirm */
+                       net = LNET_NIDNET(pbuf->pb_info.pi_ni[i].ns_nid);
+
+                       if (route->lr_net == net) {
+                               single_hop = true;
+                               if (pbuf->pb_info.pi_ni[i].ns_status ==
+                                   LNET_NI_STATUS_UP) {
+                                       net_up = true;
+                                       break;
+                               }
+                       }
+               }
+
+               route->lr_single_hop = single_hop; /* true if the reply listed any NI on the remote net, whether up or not */
+               if (avoid_asym_router_failure &&
+                   (route->lr_hops == 1 || route->lr_single_hop))
+                       lnet_set_route_aliveness(route, net_up);
+               else
+                       lnet_set_route_aliveness(route, true); /* remote net is not consulted here; the local net was verified above */
+
+               /*
+                * warn that the route is configured as single-hop but it
+                * really is multi-hop as far as we can tell.
+                */
+               lnet_check_route_inconsistency(route);
+       }
 }
 
 void
 lnet_router_discovery_complete(struct lnet_peer *lp)
 {
        struct lnet_peer_ni *lpni = NULL;
+       struct lnet_route *route;
 
        spin_lock(&lp->lp_lock);
        lp->lp_state &= ~LNET_PEER_RTR_DISCOVERY;
+       lp->lp_state |= LNET_PEER_RTR_DISCOVERED; /* set whether or not discovery reported an error */
+       lp->lp_alive = lp->lp_dc_error == 0; /* gateway is marked alive only when discovery recorded no error */
        spin_unlock(&lp->lp_lock);
 
-       /*
-        * Router discovery successful? All peer information would've been
-        * updated already. No need to do any more processing
-        */
-       if (!lp->lp_dc_error)
+       if (!lp->lp_dc_error) {
+               /* ping replies are being handled when discovery is disabled */
+               if (lnet_is_discovery_disabled_locked(lp)) /* NOTE(review): called after lp_lock was released above, whereas lnet_router_discovery_ping_reply() calls it under lp_lock - confirm which is intended */
+                       return;
+
+               /*
+                * mark single-hop routes.  If the remote net is not configured on
+                * the gateway we assume this is intentional and we mark the
+                * route as multi-hop
+                */
+               list_for_each_entry(route, &lp->lp_routes, lr_gwlist) {
+                       lnet_set_route_aliveness(route, true);
+                       lnet_set_route_hop_type(lp, route);
+               }
+
                return;
+       }
+
        /*
-        * discovery failed? then we need to set the status of each lpni
-        * to DOWN. It will be updated the next time we discover the
-        * router. For router peer NIs not on local networks, we never send
-        * messages directly to them, so their health will always remain
-        * at maximum. We can only tell if they are up or down from the
-        * status returned in the PING response. If we fail to get that
-        * status in our scheduled router discovery, then we'll assume
-        * it's down until we're told otherwise.
+        * We do not send messages directly to the remote interfaces
+        * of an LNet router. As such, we rely on the PING response
+        * to determine the up/down status of these interfaces. If
+        * a PING response is not received, or some other problem with
+        * discovery occurs that prevents us from getting this status,
+        * we assume all interfaces are down until we're able to
+        * determine otherwise.
         */
        CDEBUG(D_NET, "%s: Router discovery failed %d\n",
-              libcfs_nid2str(lp->lp_primary_nid), lp->lp_dc_error);
+              libcfs_nidstr(&lp->lp_primary_nid), lp->lp_dc_error);
        while ((lpni = lnet_get_next_peer_ni_locked(lp, NULL, lpni)) != NULL)
                lpni->lpni_ns_status = LNET_NI_STATUS_DOWN;
+
+       list_for_each_entry(route, &lp->lp_routes, lr_gwlist)
+               lnet_set_route_aliveness(route, false); /* every route served by this gateway is marked down as well */
 }
 
 static void
@@ -353,15 +582,12 @@ struct lnet_remotenet *
 lnet_find_rnet_locked(__u32 net)
 {
        struct lnet_remotenet *rnet;
-       struct list_head *tmp;
        struct list_head *rn_list;
 
        LASSERT(the_lnet.ln_state == LNET_STATE_RUNNING);
 
        rn_list = lnet_net2rnethash(net);
-       list_for_each(tmp, rn_list) {
-               rnet = list_entry(tmp, struct lnet_remotenet, lrn_list);
-
+       list_for_each_entry(rnet, rn_list, lrn_list) {
                if (rnet->lrn_net == net)
                        return rnet;
        }
@@ -382,7 +608,6 @@ static void lnet_shuffle_seed(void)
                add_device_randomness(&ni->ni_nid, sizeof(ni->ni_nid));
 
        seeded = 1;
-       return;
 }
 
 /* NB expects LNET_LOCK held */
@@ -393,6 +618,7 @@ lnet_add_route_to_rnet(struct lnet_remotenet *rnet, struct lnet_route *route)
        unsigned int offset = 0;
        unsigned int len = 0;
        struct list_head *e;
+       time64_t now;
 
        lnet_shuffle_seed();
 
@@ -415,9 +641,10 @@ lnet_add_route_to_rnet(struct lnet_remotenet *rnet, struct lnet_route *route)
         * force a router check on the gateway to make sure the route is
         * alive
         */
+       now = ktime_get_real_seconds();
        list_for_each_entry(lpn, &route->lr_gateway->lp_peer_nets,
                            lpn_peer_nets) {
-               lpn->lpn_rtrcheck_timestamp = 0;
+               lpn->lpn_next_ping = now;
        }
 
        the_lnet.ln_remote_nets_version++;
@@ -430,7 +657,7 @@ lnet_add_route_to_rnet(struct lnet_remotenet *rnet, struct lnet_route *route)
 }
 
 int
-lnet_add_route(__u32 net, __u32 hops, lnet_nid_t gateway,
+lnet_add_route(__u32 net, __u32 hops, struct lnet_nid *gateway,
               __u32 priority, __u32 sensitivity)
 {
        struct list_head *route_entry;
@@ -443,13 +670,13 @@ lnet_add_route(__u32 net, __u32 hops, lnet_nid_t gateway,
        int rc;
 
        CDEBUG(D_NET, "Add route: remote net %s hops %d priority %u gw %s\n",
-              libcfs_net2str(net), hops, priority, libcfs_nid2str(gateway));
+              libcfs_net2str(net), hops, priority, libcfs_nidstr(gateway));
 
-       if (gateway == LNET_NID_ANY ||
-           LNET_NETTYP(LNET_NIDNET(gateway)) == LOLND ||
-           net == LNET_NIDNET(LNET_NID_ANY) ||
+       if (LNET_NID_IS_ANY(gateway) ||
+           nid_is_lo0(gateway) ||
+           net == LNET_NET_ANY ||
            LNET_NETTYP(net) == LOLND ||
-           LNET_NIDNET(gateway) == net ||
+           LNET_NID_NET(gateway) == net ||
            (hops != LNET_UNDEFINED_HOPS && (hops < 1 || hops > 255)))
                return -EINVAL;
 
@@ -457,11 +684,11 @@ lnet_add_route(__u32 net, __u32 hops, lnet_nid_t gateway,
        if (lnet_islocalnet(net))
                return -EEXIST;
 
-       if (!lnet_islocalnet(LNET_NIDNET(gateway))) {
+       if (!lnet_islocalnet(LNET_NID_NET(gateway))) {
                CERROR("Cannot add route with gateway %s. There is no local interface configured on LNet %s\n",
-                      libcfs_nid2str(gateway),
-                      libcfs_net2str(LNET_NIDNET(gateway)));
-               return -EINVAL;
+                      libcfs_nidstr(gateway),
+                      libcfs_net2str(LNET_NID_NET(gateway)));
+               return -EHOSTUNREACH;
        }
 
        /* Assume net, route, all new */
@@ -469,7 +696,7 @@ lnet_add_route(__u32 net, __u32 hops, lnet_nid_t gateway,
        LIBCFS_ALLOC(rnet, sizeof(*rnet));
        if (route == NULL || rnet == NULL) {
                CERROR("Out of memory creating route %s %d %s\n",
-                      libcfs_net2str(net), hops, libcfs_nid2str(gateway));
+                      libcfs_net2str(net), hops, libcfs_nidstr(gateway));
                if (route != NULL)
                        LIBCFS_FREE(route, sizeof(*route));
                if (rnet != NULL)
@@ -480,11 +707,15 @@ lnet_add_route(__u32 net, __u32 hops, lnet_nid_t gateway,
        INIT_LIST_HEAD(&rnet->lrn_routes);
        rnet->lrn_net = net;
        /* store the local and remote net that the route represents */
-       route->lr_lnet = LNET_NIDNET(gateway);
+       route->lr_lnet = LNET_NID_NET(gateway);
        route->lr_net = net;
-       route->lr_nid = gateway;
+       route->lr_nid = *gateway;
        route->lr_priority = priority;
        route->lr_hops = hops;
+       if (lnet_peers_start_down())
+               atomic_set(&route->lr_alive, 0);
+       else
+               atomic_set(&route->lr_alive, 1);
 
        lnet_net_lock(LNET_LOCK_EX);
 
@@ -492,7 +723,7 @@ lnet_add_route(__u32 net, __u32 hops, lnet_nid_t gateway,
         * lnet_nid2peerni_ex() grabs a ref on the lpni. We will need to
         * lose that once we're done
         */
-       lpni = lnet_nid2peerni_ex(gateway, LNET_LOCK_EX);
+       lpni = lnet_nid2peerni_ex(gateway);
        if (IS_ERR(lpni)) {
                lnet_net_unlock(LNET_LOCK_EX);
 
@@ -502,11 +733,13 @@ lnet_add_route(__u32 net, __u32 hops, lnet_nid_t gateway,
                rc = PTR_ERR(lpni);
                CERROR("Error %d creating route %s %d %s\n", rc,
                        libcfs_net2str(net), hops,
-                       libcfs_nid2str(gateway));
+                       libcfs_nidstr(gateway));
                return rc;
        }
 
-       LASSERT(lpni->lpni_peer_net && lpni->lpni_peer_net->lpn_peer);
+       LASSERT(lpni);
+       LASSERT(lpni->lpni_peer_net);
+       LASSERT(lpni->lpni_peer_net->lpn_peer);
        gw = lpni->lpni_peer_net->lpn_peer;
 
        route->lr_gateway = gw;
@@ -530,7 +763,8 @@ lnet_add_route(__u32 net, __u32 hops, lnet_nid_t gateway,
                }
 
                /* our lookups must be true */
-               LASSERT(route2->lr_gateway->lp_primary_nid != gateway);
+               LASSERT(!nid_same(&route2->lr_gateway->lp_primary_nid,
+                                 gateway));
        }
 
        /*
@@ -544,6 +778,8 @@ lnet_add_route(__u32 net, __u32 hops, lnet_nid_t gateway,
        if (add_route) {
                gw->lp_health_sensitivity = sensitivity;
                lnet_add_route_to_rnet(rnet2, route);
+               if (lnet_peer_discovery_disabled)
+                       CWARN("Consider turning discovery on to enable full Multi-Rail routing functionality\n");
        }
 
        /*
@@ -552,6 +788,14 @@ lnet_add_route(__u32 net, __u32 hops, lnet_nid_t gateway,
        lnet_peer_ni_decref_locked(lpni);
        lnet_net_unlock(LNET_LOCK_EX);
 
+       /* If avoid_asym_router_failure is enabled and hop count is not
+        * set to 1 for a route that is actually single-hop, then the
+        * feature will fail to prevent the router from being selected
+        * if it is missing a NI on the remote network due to misconfiguration.
+        */
+       if (avoid_asym_router_failure && hops == LNET_UNDEFINED_HOPS)
+               CWARN("Use hops = 1 for a single-hop route when avoid_asym_router_failure feature is enabled\n");
+
        rc = 0;
 
        if (!add_route) {
@@ -563,13 +807,14 @@ lnet_add_route(__u32 net, __u32 hops, lnet_nid_t gateway,
                LIBCFS_FREE(rnet, sizeof(*rnet));
 
        /* kick start the monitor thread to handle the added route */
-       wake_up(&the_lnet.ln_mt_waitq);
+       complete(&the_lnet.ln_mt_wait_complete);
 
        return rc;
 }
 
-static void
-lnet_del_route_from_rnet(lnet_nid_t gw_nid, struct list_head *route_list,
+void
+lnet_del_route_from_rnet(struct lnet_nid *gw_nid,
+                        struct list_head *route_list,
                         struct list_head *zombies)
 {
        struct lnet_peer *gateway;
@@ -578,8 +823,7 @@ lnet_del_route_from_rnet(lnet_nid_t gw_nid, struct list_head *route_list,
 
        list_for_each_entry_safe(route, tmp, route_list, lr_list) {
                gateway = route->lr_gateway;
-               if (gw_nid != LNET_NID_ANY &&
-                   gw_nid != gateway->lp_primary_nid)
+               if (gw_nid && !nid_same(gw_nid, &gateway->lp_primary_nid))
                        continue;
 
                /*
@@ -599,44 +843,46 @@ lnet_del_route_from_rnet(lnet_nid_t gw_nid, struct list_head *route_list,
 }
 
 int
-lnet_del_route(__u32 net, lnet_nid_t gw_nid)
+lnet_del_route(__u32 net, struct lnet_nid *gw)
 {
-       struct list_head rnet_zombies;
+       LIST_HEAD(rnet_zombies);
        struct lnet_remotenet *rnet;
        struct lnet_remotenet *tmp;
        struct list_head *rn_list;
        struct lnet_peer_ni *lpni;
        struct lnet_route *route;
-       struct list_head zombies;
-       struct lnet_peer *lp;
+       struct lnet_nid gw_nid;
+       LIST_HEAD(zombies);
+       struct lnet_peer *lp = NULL;
        int i = 0;
 
-       INIT_LIST_HEAD(&rnet_zombies);
-       INIT_LIST_HEAD(&zombies);
-
        CDEBUG(D_NET, "Del route: net %s : gw %s\n",
-              libcfs_net2str(net), libcfs_nid2str(gw_nid));
+              libcfs_net2str(net), libcfs_nidstr(gw));
 
        /* NB Caller may specify either all routes via the given gateway
         * or a specific route entry actual NIDs) */
 
        lnet_net_lock(LNET_LOCK_EX);
 
-       lpni = lnet_find_peer_ni_locked(gw_nid);
+       if (gw)
+               lpni = lnet_peer_ni_find_locked(gw);
+       else
+               lpni = NULL;
        if (lpni) {
                lp = lpni->lpni_peer_net->lpn_peer;
                LASSERT(lp);
                gw_nid = lp->lp_primary_nid;
+               gw = &gw_nid;
                lnet_peer_ni_decref_locked(lpni);
        }
 
-       if (net != LNET_NIDNET(LNET_NID_ANY)) {
+       if (net != LNET_NET_ANY) {
                rnet = lnet_find_rnet_locked(net);
                if (!rnet) {
                        lnet_net_unlock(LNET_LOCK_EX);
                        return -ENOENT;
                }
-               lnet_del_route_from_rnet(gw_nid, &rnet->lrn_routes,
+               lnet_del_route_from_rnet(gw, &rnet->lrn_routes,
                                         &zombies);
                if (list_empty(&rnet->lrn_routes))
                        list_move(&rnet->lrn_list, &rnet_zombies);
@@ -647,7 +893,7 @@ lnet_del_route(__u32 net, lnet_nid_t gw_nid)
                rn_list = &the_lnet.ln_remote_nets_hash[i];
 
                list_for_each_entry_safe(rnet, tmp, rn_list, lrn_list) {
-                       lnet_del_route_from_rnet(gw_nid, &rnet->lrn_routes,
+                       lnet_del_route_from_rnet(gw, &rnet->lrn_routes,
                                                 &zombies);
                        if (list_empty(&rnet->lrn_routes))
                                list_move(&rnet->lrn_list, &rnet_zombies);
@@ -686,9 +932,9 @@ delete_zombies:
 }
 
 void
-lnet_destroy_routes (void)
+lnet_destroy_routes(void)
 {
-       lnet_del_route(LNET_NIDNET(LNET_NID_ANY), LNET_NID_ANY);
+       lnet_del_route(LNET_NET_ANY, NULL); /* wildcard net and NULL gateway: no net or gateway filter is applied, so presumably every route is deleted - confirm against the elided part of lnet_del_route() */
 }
 
 int lnet_get_rtr_pool_cfg(int cpt, struct lnet_ioctl_pool_cfg *pool_cfg)
@@ -724,14 +970,12 @@ int lnet_get_rtr_pool_cfg(int cpt, struct lnet_ioctl_pool_cfg *pool_cfg)
 }
 
 int
-lnet_get_route(int idx, __u32 *net, __u32 *hops,
-              lnet_nid_t *gateway, __u32 *alive, __u32 *priority, __u32 *sensitivity)
+lnet_get_route(int idx, __u32 *net, __u32 *hops, lnet_nid_t *gateway,
+              __u32 *flags, __u32 *priority, __u32 *sensitivity)
 {
        struct lnet_remotenet *rnet;
        struct list_head *rn_list;
        struct lnet_route *route;
-       struct list_head *e1;
-       struct list_head *e2;
        int cpt;
        int i;
 
@@ -739,21 +983,23 @@ lnet_get_route(int idx, __u32 *net, __u32 *hops,
 
        for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE; i++) {
                rn_list = &the_lnet.ln_remote_nets_hash[i];
-               list_for_each(e1, rn_list) {
-                       rnet = list_entry(e1, struct lnet_remotenet, lrn_list);
-
-                       list_for_each(e2, &rnet->lrn_routes) {
-                               route = list_entry(e2, struct lnet_route,
-                                                  lr_list);
-
+               list_for_each_entry(rnet, rn_list, lrn_list) {
+                       list_for_each_entry(route, &rnet->lrn_routes, lr_list) {
                                if (idx-- == 0) {
                                        *net      = rnet->lrn_net;
-                                       *gateway  = route->lr_nid;
+                                       *gateway  = lnet_nid_to_nid4(&route->lr_nid);
                                        *hops     = route->lr_hops;
                                        *priority = route->lr_priority;
                                        *sensitivity = route->lr_gateway->
                                                lp_health_sensitivity;
-                                       *alive    = lnet_is_route_alive(route);
+                                       if (lnet_is_route_alive(route))
+                                               *flags |= LNET_RT_ALIVE;
+                                       else
+                                               *flags &= ~LNET_RT_ALIVE;
+                                       if (route->lr_single_hop)
+                                               *flags &= ~LNET_RT_MULTI_HOP;
+                                       else
+                                               *flags |= LNET_RT_MULTI_HOP;
                                        lnet_net_unlock(cpt);
                                        return 0;
                                }
@@ -769,7 +1015,6 @@ static void
 lnet_wait_known_routerstate(void)
 {
        struct lnet_peer *rtr;
-       struct list_head *entry;
        int all_known;
 
        LASSERT(the_lnet.ln_mt_state == LNET_MT_STATE_RUNNING);
@@ -778,13 +1023,10 @@ lnet_wait_known_routerstate(void)
                int cpt = lnet_net_lock_current();
 
                all_known = 1;
-               list_for_each(entry, &the_lnet.ln_routers) {
-                       rtr = list_entry(entry, struct lnet_peer,
-                                        lp_rtr_list);
-
+               list_for_each_entry(rtr, &the_lnet.ln_routers, lp_rtr_list) {
                        spin_lock(&rtr->lp_lock);
 
-                       if ((rtr->lp_state & LNET_PEER_DISCOVERED) == 0) {
+                       if ((rtr->lp_state & LNET_PEER_RTR_DISCOVERED) == 0) {
                                all_known = 0;
                                spin_unlock(&rtr->lp_lock);
                                break;
@@ -797,8 +1039,7 @@ lnet_wait_known_routerstate(void)
                if (all_known)
                        return;
 
-               set_current_state(TASK_UNINTERRUPTIBLE);
-               schedule_timeout(cfs_time_seconds(1));
+               schedule_timeout_uninterruptible(cfs_time_seconds(1));
        }
 }
 
@@ -808,15 +1049,9 @@ lnet_net_set_status_locked(struct lnet_net *net, __u32 status)
        struct lnet_ni *ni;
        bool update = false;
 
-       list_for_each_entry(ni, &net->net_ni_list, ni_netlist) {
-               lnet_ni_lock(ni);
-               if (ni->ni_status &&
-                   ni->ni_status->ns_status != status) {
-                   ni->ni_status->ns_status = status;
-                   update = true;
-               }
-               lnet_ni_unlock(ni);
-       }
+       list_for_each_entry(ni, &net->net_ni_list, ni_netlist)
+               if (lnet_ni_set_status(ni, status))
+                       update = true;
 
        return update;
 }
@@ -825,6 +1060,7 @@ static bool
 lnet_update_ni_status_locked(void)
 {
        struct lnet_net *net;
+       struct lnet_ni *ni;
        bool push = false;
        time64_t now;
        time64_t timeout;
@@ -833,19 +1069,19 @@ lnet_update_ni_status_locked(void)
 
        timeout = router_ping_timeout + alive_router_check_interval;
 
-       now = ktime_get_real_seconds();
+       now = ktime_get_seconds();
        list_for_each_entry(net, &the_lnet.ln_nets, net_list) {
                if (net->net_lnd->lnd_type == LOLND)
                        continue;
 
                if (now < net->net_last_alive + timeout)
-                       continue;
+                       goto check_ni_fatal;
 
                spin_lock(&net->net_lock);
                /* re-check with lock */
                if (now < net->net_last_alive + timeout) {
                        spin_unlock(&net->net_lock);
-                       continue;
+                       goto check_ni_fatal;
                }
                spin_unlock(&net->net_lock);
 
@@ -854,7 +1090,25 @@ lnet_update_ni_status_locked(void)
                 * timeout on any of its constituent NIs, then mark all
                 * the NIs down.
                 */
-               push = lnet_net_set_status_locked(net, LNET_NI_STATUS_DOWN);
+               if (lnet_net_set_status_locked(net, LNET_NI_STATUS_DOWN)) {
+                       push = true;
+                       continue;
+               }
+
+check_ni_fatal:
+               list_for_each_entry(ni, &net->net_ni_list, ni_netlist) {
+                       /* lnet_ni_set_status() will perform the same check of
+                        * ni_status while holding the ni lock. We can safely
+                        * check ni_status without that lock because it is only
+                        * written to under net_lock/EX and our caller is
+                        * holding a net lock.
+                        */
+                       if (atomic_read(&ni->ni_fatal_error_on) &&
+                           ni->ni_status &&
+                           ni->ni_status->ns_status != LNET_NI_STATUS_DOWN &&
+                           lnet_ni_set_status(ni, LNET_NI_STATUS_DOWN))
+                               push = true;
+               }
        }
 
        return push;
@@ -874,8 +1128,7 @@ void lnet_wait_router_start(void)
  * This function is called from the monitor thread to check if there are
  * any active routers that need to be checked.
  */
-inline bool
-lnet_router_checker_active(void)
+bool lnet_router_checker_active(void)
 {
        /* Router Checker thread needs to run when routing is enabled in
         * order to call lnet_update_ni_status_locked() */
@@ -889,12 +1142,12 @@ lnet_router_checker_active(void)
 void
 lnet_check_routers(void)
 {
-       struct lnet_peer_net *first_lpn = NULL;
+       struct lnet_peer_net *first_lpn;
        struct lnet_peer_net *lpn;
        struct lnet_peer_ni *lpni;
-       struct list_head *entry;
        struct lnet_peer *rtr;
        bool push = false;
+       bool needs_ping;
        bool found_lpn;
        __u64 version;
        __u32 net_id;
@@ -906,64 +1159,70 @@ lnet_check_routers(void)
 rescan:
        version = the_lnet.ln_routers_version;
 
-       list_for_each(entry, &the_lnet.ln_routers) {
-               rtr = list_entry(entry, struct lnet_peer,
-                                lp_rtr_list);
+       list_for_each_entry(rtr, &the_lnet.ln_routers, lp_rtr_list) {
+               /* If we're currently discovering the peer then don't
+                * issue another discovery
+                */
+               if (rtr->lp_state & LNET_PEER_RTR_DISCOVERY)
+                       continue;
 
                now = ktime_get_real_seconds();
 
-               /*
-                * only discover the router if we've passed
-                * alive_router_check_interval seconds. Some of the router
-                * interfaces could be down and in that case they would be
-                * undergoing recovery separately from this discovery.
-                */
-               /* find next peer net which is also local */
+               /* find the next local peer net which needs to be ping'd */
+               needs_ping = false;
+               first_lpn = NULL;
+               found_lpn = false;
                net_id = rtr->lp_disc_net_id;
                do {
                        lpn = lnet_get_next_peer_net_locked(rtr, net_id);
                        if (!lpn) {
                                CERROR("gateway %s has no networks\n",
-                               libcfs_nid2str(rtr->lp_primary_nid));
+                               libcfs_nidstr(&rtr->lp_primary_nid));
                                break;
                        }
+
+                       /* We looped back to the first peer net */
                        if (first_lpn == lpn)
                                break;
                        if (!first_lpn)
                                first_lpn = lpn;
-                       found_lpn = lnet_islocalnet_locked(lpn->lpn_net_id);
+
                        net_id = lpn->lpn_net_id;
-               } while (!found_lpn);
+                       if (!lnet_islocalnet_locked(net_id))
+                               continue;
+
+                       found_lpn = true;
+
+                       CDEBUG(D_NET, "rtr %s(%p) %s(%p) next ping %lld\n",
+                              libcfs_nidstr(&rtr->lp_primary_nid), rtr,
+                              libcfs_net2str(net_id), lpn,
+                              lpn->lpn_next_ping);
+
+                       needs_ping = now >= lpn->lpn_next_ping;
+
+               } while (!needs_ping);
 
                if (!found_lpn || !lpn) {
                        CERROR("no local network found for gateway %s\n",
-                              libcfs_nid2str(rtr->lp_primary_nid));
+                              libcfs_nidstr(&rtr->lp_primary_nid));
                        continue;
                }
 
-               if (now - lpn->lpn_rtrcheck_timestamp <
-                   alive_router_check_interval / lnet_current_net_count)
-                      continue;
+               if (!needs_ping)
+                       continue;
 
-               /*
-                * If we're currently discovering the peer then don't
-                * issue another discovery
-                */
                spin_lock(&rtr->lp_lock);
-               if (rtr->lp_state & LNET_PEER_RTR_DISCOVERY) {
-                       spin_unlock(&rtr->lp_lock);
-                       continue;
-               }
-               /* make sure we actively discover the router */
+               /* make sure we fully discover the router */
                rtr->lp_state &= ~LNET_PEER_NIDS_UPTODATE;
-               rtr->lp_state |= LNET_PEER_RTR_DISCOVERY;
+               rtr->lp_state |= LNET_PEER_FORCE_PING | LNET_PEER_FORCE_PUSH |
+                       LNET_PEER_RTR_DISCOVERY;
                spin_unlock(&rtr->lp_lock);
 
                /* find the peer_ni associated with the primary NID */
-               lpni = lnet_peer_get_ni_locked(rtr, rtr->lp_primary_nid);
+               lpni = lnet_peer_ni_get_locked(rtr, &rtr->lp_primary_nid);
                if (!lpni) {
                        CDEBUG(D_NET, "Expected to find an lpni for %s, but non found\n",
-                              libcfs_nid2str(rtr->lp_primary_nid));
+                              libcfs_nidstr(&rtr->lp_primary_nid));
                        continue;
                }
                lnet_peer_ni_addref_locked(lpni);
@@ -973,19 +1232,19 @@ rescan:
 
                /* discover the router */
                CDEBUG(D_NET, "discover %s, cpt = %d\n",
-                      libcfs_nid2str(lpni->lpni_nid), cpt);
+                      libcfs_nidstr(&lpni->lpni_nid), cpt);
                rc = lnet_discover_peer_locked(lpni, cpt, false);
 
-               /* decrement ref count acquired by find_peer_ni_locked() */
+               /* drop ref taken above */
                lnet_peer_ni_decref_locked(lpni);
 
                if (!rc)
-                       lpn->lpn_rtrcheck_timestamp = now;
+                       lpn->lpn_next_ping = now + alive_router_check_interval;
                else
                        CERROR("Failed to discover router %s\n",
-                              libcfs_nid2str(rtr->lp_primary_nid));
+                              libcfs_nidstr(&rtr->lp_primary_nid));
 
-               /* NB dropped lock */
+               /* NB cpt lock was dropped in lnet_discover_peer_locked() */
                if (version != the_lnet.ln_routers_version) {
                        /* the routers list has changed */
                        goto rescan;
@@ -1008,7 +1267,7 @@ lnet_destroy_rtrbuf(struct lnet_rtrbuf *rb, int npages)
        int sz = offsetof(struct lnet_rtrbuf, rb_kiov[npages]);
 
        while (--npages >= 0)
-               __free_page(rb->rb_kiov[npages].kiov_page);
+               __free_page(rb->rb_kiov[npages].bv_page);
 
        LIBCFS_FREE(rb, sz);
 }
@@ -1029,19 +1288,19 @@ lnet_new_rtrbuf(struct lnet_rtrbufpool *rbp, int cpt)
        rb->rb_pool = rbp;
 
        for (i = 0; i < npages; i++) {
-               page = cfs_page_cpt_alloc(lnet_cpt_table(), cpt,
-                                         GFP_KERNEL | __GFP_ZERO);
+               page = cfs_page_cpt_alloc(lnet_cpt_table(), cpt, GFP_KERNEL |
+                                         __GFP_ZERO | __GFP_NORETRY);
                if (page == NULL) {
                        while (--i >= 0)
-                               __free_page(rb->rb_kiov[i].kiov_page);
+                               __free_page(rb->rb_kiov[i].bv_page);
 
                        LIBCFS_FREE(rb, sz);
                        return NULL;
                }
 
-               rb->rb_kiov[i].kiov_len = PAGE_SIZE;
-               rb->rb_kiov[i].kiov_offset = 0;
-               rb->rb_kiov[i].kiov_page = page;
+               rb->rb_kiov[i].bv_len = PAGE_SIZE;
+               rb->rb_kiov[i].bv_offset = 0;
+               rb->rb_kiov[i].bv_page = page;
        }
 
        return rb;
@@ -1052,13 +1311,11 @@ lnet_rtrpool_free_bufs(struct lnet_rtrbufpool *rbp, int cpt)
 {
        int npages = rbp->rbp_npages;
        struct lnet_rtrbuf *rb;
-       struct list_head tmp;
+       LIST_HEAD(tmp);
 
        if (rbp->rbp_nbuffers == 0) /* not initialized or already freed */
                return;
 
-       INIT_LIST_HEAD(&tmp);
-
        lnet_net_lock(cpt);
        list_splice_init(&rbp->rbp_msgs, &tmp);
        lnet_drop_routed_msgs_locked(&tmp, cpt);
@@ -1070,7 +1327,7 @@ lnet_rtrpool_free_bufs(struct lnet_rtrbufpool *rbp, int cpt)
 
        /* Free buffers on the free list. */
        while (!list_empty(&tmp)) {
-               rb = list_entry(tmp.next, struct lnet_rtrbuf, rb_list);
+               rb = list_first_entry(&tmp, struct lnet_rtrbuf, rb_list);
                list_del(&rb->rb_list);
                lnet_destroy_rtrbuf(rb, npages);
        }
@@ -1079,7 +1336,7 @@ lnet_rtrpool_free_bufs(struct lnet_rtrbufpool *rbp, int cpt)
 static int
 lnet_rtrpool_adjust_bufs(struct lnet_rtrbufpool *rbp, int nbufs, int cpt)
 {
-       struct list_head rb_list;
+       LIST_HEAD(rb_list);
        struct lnet_rtrbuf *rb;
        int             num_rb;
        int             num_buffers = 0;
@@ -1107,16 +1364,14 @@ lnet_rtrpool_adjust_bufs(struct lnet_rtrbufpool *rbp, int nbufs, int cpt)
        rbp->rbp_req_nbuffers = nbufs;
        lnet_net_unlock(cpt);
 
-       INIT_LIST_HEAD(&rb_list);
-
        /* allocate the buffers on a local list first.  If all buffers are
         * allocated successfully then join this list to the rbp buffer
         * list.  If not then free all allocated buffers. */
        while (num_rb-- > 0) {
                rb = lnet_new_rtrbuf(rbp, cpt);
                if (rb == NULL) {
-                       CERROR("Failed to allocate %d route bufs of %d pages\n",
-                              nbufs, npages);
+                       CERROR("lnet: error allocating %ux%u page router buffers on CPT %u: rc = %d\n",
+                              nbufs, npages, cpt, -ENOMEM);
 
                        lnet_net_lock(cpt);
                        rbp->rbp_req_nbuffers = old_req_nbufs;
@@ -1146,8 +1401,9 @@ lnet_rtrpool_adjust_bufs(struct lnet_rtrbufpool *rbp, int nbufs, int cpt)
        return 0;
 
 failed:
-       while (!list_empty(&rb_list)) {
-               rb = list_entry(rb_list.next, struct lnet_rtrbuf, rb_list);
+       while ((rb = list_first_entry_or_null(&rb_list,
+                                             struct lnet_rtrbuf,
+                                             rb_list)) != NULL) {
                list_del(&rb->rb_list);
                lnet_destroy_rtrbuf(rb, npages);
        }
@@ -1264,9 +1520,11 @@ lnet_rtrpools_alloc(int im_a_router)
        } else if (!strcmp(forwarding, "enabled")) {
                /* explicitly enabled */
        } else {
-               LCONSOLE_ERROR_MSG(0x10b, "'forwarding' not set to either "
-                                  "'enabled' or 'disabled'\n");
-               return -EINVAL;
+               rc = -EINVAL;
+               LCONSOLE_ERROR_MSG(0x10b,
+                                  "lnet: forwarding='%s' not set to either 'enabled' or 'disabled': rc = %d\n",
+                                  forwarding, rc);
+               return rc;
        }
 
        nrb_tiny = lnet_nrb_tiny_calculate();
@@ -1285,37 +1543,39 @@ lnet_rtrpools_alloc(int im_a_router)
                                                LNET_NRBPOOLS *
                                                sizeof(struct lnet_rtrbufpool));
        if (the_lnet.ln_rtrpools == NULL) {
+               rc = -ENOMEM;
                LCONSOLE_ERROR_MSG(0x10c,
-                                  "Failed to initialize router buffe pool\n");
-               return -ENOMEM;
+                       "lnet: error allocating router buffer pool: rc = %d\n",
+                       rc);
+               return rc;
        }
 
        cfs_percpt_for_each(rtrp, i, the_lnet.ln_rtrpools) {
                lnet_rtrpool_init(&rtrp[LNET_TINY_BUF_IDX], 0);
                rc = lnet_rtrpool_adjust_bufs(&rtrp[LNET_TINY_BUF_IDX],
                                              nrb_tiny, i);
-               if (rc != 0)
+               if (rc)
                        goto failed;
 
                lnet_rtrpool_init(&rtrp[LNET_SMALL_BUF_IDX],
                                  LNET_NRB_SMALL_PAGES);
                rc = lnet_rtrpool_adjust_bufs(&rtrp[LNET_SMALL_BUF_IDX],
                                              nrb_small, i);
-               if (rc != 0)
+               if (rc)
                        goto failed;
 
                lnet_rtrpool_init(&rtrp[LNET_LARGE_BUF_IDX],
                                  LNET_NRB_LARGE_PAGES);
                rc = lnet_rtrpool_adjust_bufs(&rtrp[LNET_LARGE_BUF_IDX],
                                              nrb_large, i);
-               if (rc != 0)
+               if (rc)
                        goto failed;
        }
 
        lnet_net_lock(LNET_LOCK_EX);
        the_lnet.ln_routing = 1;
        lnet_net_unlock(LNET_LOCK_EX);
-       wake_up(&the_lnet.ln_mt_waitq);
+       complete(&the_lnet.ln_mt_wait_complete);
        return 0;
 
  failed:
@@ -1407,6 +1667,10 @@ lnet_rtrpools_enable(void)
                ~LNET_PING_FEAT_RTE_DISABLED;
        lnet_net_unlock(LNET_LOCK_EX);
 
+       if (lnet_peer_discovery_disabled)
+               CWARN("Consider turning discovery on to enable full "
+                     "Multi-Rail routing functionality\n");
+
        return rc;
 }
 
@@ -1429,7 +1693,7 @@ lnet_rtrpools_disable(void)
 }
 
 static inline void
-lnet_notify_peer_down(struct lnet_ni *ni, lnet_nid_t nid)
+lnet_notify_peer_down(struct lnet_ni *ni, struct lnet_nid *nid)
 {
        if (ni->ni_net->net_lnd->lnd_notify_peer_down != NULL)
                (ni->ni_net->net_lnd->lnd_notify_peer_down)(nid);
@@ -1443,34 +1707,36 @@ lnet_notify_peer_down(struct lnet_ni *ni, lnet_nid_t nid)
  * when: notification time.
  */
 int
-lnet_notify(struct lnet_ni *ni, lnet_nid_t nid, bool alive, bool reset,
+lnet_notify(struct lnet_ni *ni, lnet_nid_t nid4, bool alive, bool reset,
            time64_t when)
 {
        struct lnet_peer_ni *lpni = NULL;
+       struct lnet_route *route;
+       struct lnet_peer *lp;
        time64_t now = ktime_get_seconds();
+       struct lnet_nid nid;
        int cpt;
 
-       LASSERT (!in_interrupt ());
+       lnet_nid4_to_nid(nid4, &nid);
+       LASSERT(!in_interrupt());
 
-       CDEBUG (D_NET, "%s notifying %s: %s\n",
-               (ni == NULL) ? "userspace" : libcfs_nid2str(ni->ni_nid),
-               libcfs_nid2str(nid),
-               alive ? "up" : "down");
+       CDEBUG(D_NET, "%s notifying %s: %s\n",
+              (ni == NULL) ? "userspace" : libcfs_nidstr(&ni->ni_nid),
+              libcfs_nidstr(&nid), alive ? "up" : "down");
 
        if (ni != NULL &&
-           LNET_NIDNET(ni->ni_nid) != LNET_NIDNET(nid)) {
+           LNET_NID_NET(&ni->ni_nid) != LNET_NID_NET(&nid)) {
                CWARN("Ignoring notification of %s %s by %s (different net)\n",
-                     libcfs_nid2str(nid), alive ? "birth" : "death",
-                     libcfs_nid2str(ni->ni_nid));
+                     libcfs_nidstr(&nid), alive ? "birth" : "death",
+                     libcfs_nidstr(&ni->ni_nid));
                return -EINVAL;
        }
 
        /* can't do predictions... */
        if (when > now) {
-               CWARN("Ignoring prediction from %s of %s %s "
-                     "%lld seconds in the future\n",
-                     (ni == NULL) ? "userspace" : libcfs_nid2str(ni->ni_nid),
-                     libcfs_nid2str(nid), alive ? "up" : "down", when - now);
+               CWARN("Ignoring prediction from %s of %s %s %lld seconds in the future\n",
+                       ni ? libcfs_nidstr(&ni->ni_nid) :  "userspace",
+                       libcfs_nidstr(&nid), alive ? "up" : "down", when - now);
                return -EINVAL;
        }
 
@@ -1488,30 +1754,62 @@ lnet_notify(struct lnet_ni *ni, lnet_nid_t nid, bool alive, bool reset,
                return -ESHUTDOWN;
        }
 
-       lpni = lnet_find_peer_ni_locked(nid);
+       lpni = lnet_peer_ni_find_locked(&nid);
        if (lpni == NULL) {
                /* nid not found */
                lnet_net_unlock(0);
-               CDEBUG(D_NET, "%s not found\n", libcfs_nid2str(nid));
+               CDEBUG(D_NET, "%s not found\n", libcfs_nidstr(&nid));
                return 0;
        }
 
        if (alive) {
-               if (reset)
-                       lnet_set_healthv(&lpni->lpni_healthv,
-                                        LNET_MAX_HEALTH_VALUE);
-               else
-                       lnet_inc_healthv(&lpni->lpni_healthv);
-       } else {
-               lnet_handle_remote_failure_locked(lpni);
+               if (reset) {
+                       lpni->lpni_ns_status = LNET_NI_STATUS_UP;
+                       lnet_set_lpni_healthv_locked(lpni,
+                                                    LNET_MAX_HEALTH_VALUE);
+               } else {
+                       __u32 sensitivity = lpni->lpni_peer_net->
+                                       lpn_peer->lp_health_sensitivity;
+
+                       lnet_inc_lpni_healthv_locked(lpni,
+                                       (sensitivity) ? sensitivity :
+                                       lnet_health_sensitivity);
+               }
+       } else if (reset) {
+               lpni->lpni_ns_status = LNET_NI_STATUS_DOWN;
        }
 
        /* recalculate aliveness */
        alive = lnet_is_peer_ni_alive(lpni);
+
+       lp = lpni->lpni_peer_net->lpn_peer;
+       /* If this is an LNet router then update route aliveness */
+       if (lp->lp_rtr_refcount) {
+               if (reset)
+                       /* reset flag indicates gateway peer went up or down */
+                       lp->lp_alive = alive;
+
+               /* If discovery is disabled, locally or on the gateway, then
+                * any routes using lpni as next-hop need to be updated
+                *
+                * NB: We can get many notifications while a route is down, so
+                * we try and avoid the expensive net_lock/EX here for the
+                * common case of receiving duplicate lnet_notify() calls (i.e.
+                * only grab EX lock when we actually need to update the route
+                * aliveness).
+                */
+               if (lnet_is_discovery_disabled(lp)) {
+                       list_for_each_entry(route, &lp->lp_routes, lr_gwlist) {
+                               if (nid_same(&route->lr_nid, &lpni->lpni_nid))
+                                       lnet_set_route_aliveness(route, alive);
+                       }
+               }
+       }
+
        lnet_net_unlock(0);
 
        if (ni != NULL && !alive)
-               lnet_notify_peer_down(ni, lpni->lpni_nid);
+               lnet_notify_peer_down(ni, &lpni->lpni_nid);
 
        cpt = lpni->lpni_cpt;
        lnet_net_lock(cpt);