Whamcloud - gitweb
LU-11300 lnet: router aliveness 85/33185/34
authorAmir Shehata <ashehata@whamcloud.com>
Thu, 6 Sep 2018 00:03:45 +0000 (17:03 -0700)
committerAmir Shehata <ashehata@whamcloud.com>
Fri, 7 Jun 2019 18:14:48 +0000 (18:14 +0000)
A route is considered alive if the gateway is able to route
messages from the local to the remote net. That means that
at least one of the network interfaces on the remote net of
the gateway is viable.

Introduced the concept of sensitivity percentage. This defaults
to 100%. It holds a dual meaning:
1. A route is considered alive if at least one of its interfaces'
health is >= LNET_MAX_HEALTH_VALUE * router_sensitivity_percentage
100 means at least one interface has to be 100% healthy
2. On a router consider a peer_ni dead if its health is not at least
LNET_MAX_HEALTH_VALUE * router_sensitivity_percentage.
100% means the interface has to be 100% healthy.

Re-implemented lnet_notify() to decrement the health of the
peer interface if the LND reports a failure on that peer.

Test-Parameters: forbuildonly
Signed-off-by: Amir Shehata <ashehata@whamcloud.com>
Change-Id: Ie97561fb70bf6a558bc90fa9266a6ba38fa3d293
Reviewed-on: https://review.whamcloud.com/33185
Tested-by: Jenkins
lnet/include/lnet/lib-lnet.h
lnet/lnet/router.c
lnet/lnet/router_proc.c

index 72f5c64..48a959c 100644 (file)
@@ -92,15 +92,8 @@ extern struct lnet the_lnet;                 /* THE network */
                kernel_getsockname(sock, addr, addrlen)
 #endif
 
-static inline int lnet_is_route_alive(struct lnet_route *route)
-{
-       /* TODO re-implement gateway alive indication */
-       CDEBUG(D_NET, "TODO: reimplement routing. gateway = %s\n",
-              route->lr_gateway ?
-               libcfs_nid2str(route->lr_gateway->lp_primary_nid) :
-               "undefined");
-       return 1;
-}
+bool lnet_is_route_alive(struct lnet_route *route);
+bool lnet_is_gateway_alive(struct lnet_peer *gw);
 
 static inline int lnet_is_wire_handle_none(struct lnet_handle_wire *wh)
 {
index 4fc8faf..37bc811 100644 (file)
@@ -151,6 +151,85 @@ lnet_peers_start_down(void)
        return check_routers_before_use;
 }
 
+/*
+ * Returns true if any peer NI on lpn passes lnet_is_peer_ni_alive().
+ */
+static bool
+lnet_is_gateway_net_alive(struct lnet_peer_net *lpn)
+{
+       struct lnet_peer_ni *lpni;
+
+       list_for_each_entry(lpni, &lpn->lpn_peer_nis, lpni_peer_nis) {
+               if (lnet_is_peer_ni_alive(lpni))
+                       return true;
+       }
+
+       return false;
+}
+
+/*
+ * A gateway is alive only if every net on its lp_peer_nets list is alive.
+ * Must be called with the cpt lock held.
+ */
+bool lnet_is_gateway_alive(struct lnet_peer *gw)
+{
+       struct lnet_peer_net *lpn;
+
+       list_for_each_entry(lpn, &gw->lp_peer_nets, lpn_peer_nets) {
+               if (!lnet_is_gateway_net_alive(lpn))
+                       return false;
+       }
+
+       return true;
+}
+
+/*
+ * lnet_is_route_alive() needs to be called with cpt lock held
+ * A route is alive if the gateway has an alive NI on the route's local net
+ * (lr_lnet) and, only when avoid_asym_router_failure is set, on its remote
+ * net (lr_net), and the gateway has LNET_PEER_ROUTER_ENABLED set.
+ * NOTE(review): lr_gateway is dereferenced unchecked - confirm never NULL.
+ */
+bool lnet_is_route_alive(struct lnet_route *route)
+{
+       struct lnet_peer *gw = route->lr_gateway;
+       struct lnet_peer_net *llpn;
+       struct lnet_peer_net *rlpn;
+       bool route_alive;
+
+       /*
+        * Check the gateway's interfaces on the route's lr_lnet first; a
+        * gateway with no peer net there makes the route not alive.
+        */
+       llpn = lnet_peer_get_net_locked(gw, route->lr_lnet);
+       if (!llpn)
+               return false;
+
+       route_alive = lnet_is_gateway_net_alive(llpn);
+
+       if (avoid_asym_router_failure) {
+               rlpn = lnet_peer_get_net_locked(gw, route->lr_net);
+               if (!rlpn)
+                       return false;
+               route_alive = route_alive &&
+                             lnet_is_gateway_net_alive(rlpn);
+       }
+
+       if (!route_alive)
+               return route_alive;
+
+       spin_lock(&gw->lp_lock);
+       if (!(gw->lp_state & LNET_PEER_ROUTER_ENABLED)) {
+               if (gw->lp_rtr_refcount > 0)
+                       CERROR("peer %s is being used as a gateway but routing feature is not turned on\n",
+                              libcfs_nid2str(gw->lp_primary_nid));
+               route_alive = false;
+       }
+       spin_unlock(&gw->lp_lock);
+
+       return route_alive;
+}
+
 void
 lnet_notify_locked(struct lnet_peer_ni *lp, int notifylnd, int alive,
                   time64_t when)
index 128c797..aaa12cc 100644 (file)
@@ -339,7 +339,7 @@ proc_lnet_routers(struct ctl_table *table, int write, void __user *buffer,
                        int nrefs     = atomic_read(&peer->lp_refcount);
                        int nrtrrefs  = peer->lp_rtr_refcount;
                        int alive_cnt = 0;
-                       int alive     = 0;
+                       int alive     = lnet_is_gateway_alive(peer);
                        int pingsent  = ((peer->lp_state & LNET_PEER_PING_SENT)
                                         != 0);
                        time64_t last_ping = now - peer->lp_rtrcheck_timestamp;