Whamcloud - gitweb
LU-5485 lnet: peer aliveness status and NI status 53/12453/4
authorLiang Zhen <liang.zhen@intel.com>
Tue, 28 Oct 2014 10:04:51 +0000 (18:04 +0800)
committerOleg Drokin <oleg.drokin@intel.com>
Tue, 9 Dec 2014 08:13:40 +0000 (08:13 +0000)
A couple of changes to improve aliveness detection:
- When LNet receives a message, it can determine that the peer which
  sent this message is alive

- When LNet receives a message from a remote network, it can determine
  that the router is alive and that the NI status on the router is UP.

Signed-off-by: Liang Zhen <liang.zhen@intel.com>
Change-Id: I7133987c5c8728248cce7bc0a95048b26bc6611a
Reviewed-on: http://review.whamcloud.com/12453
Tested-by: Jenkins
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: James Simmons <uja.ornl@gmail.com>
Reviewed-by: Isaac Huang <he.huang@intel.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
lnet/include/lnet/lib-lnet.h
lnet/lnet/lib-move.c
lnet/lnet/router.c

index d0b4b51..a48a5d3 100644 (file)
@@ -702,6 +702,7 @@ lnet_net2rnethash(__u32 net)
 }
 
 extern lnd_t the_lolnd;
+extern int avoid_asym_router_failure;
 
 #ifndef __KERNEL__
 /* unconditional registration */
@@ -989,6 +990,7 @@ int lnet_peer_buffer_credits(lnet_ni_t *ni);
 
 int lnet_router_checker_start(void);
 void lnet_router_checker_stop(void);
+void lnet_router_ni_update_locked(lnet_peer_t *gw, __u32 net);
 void lnet_swap_pinginfo(lnet_ping_info_t *info);
 
 int lnet_parse_ip2nets(char **networksp, char *ip2nets);
@@ -1010,6 +1012,14 @@ int lnet_get_peer_info(__u32 peer_index, __u64 *nid,
                       __u32 *peer_rtr_credits, __u32 *peer_min_rtr_credtis,
                       __u32 *peer_tx_qnob);
 
+/*
+ * Mark peer lp as alive as of the current time.
+ *
+ * Both lp_last_query and lp_last_alive are stamped with the value returned
+ * by cfs_time_current().  If the peer is not currently flagged alive
+ * (lp_alive is zero), lnet_notify_locked() is called with that timestamp.
+ *
+ * NOTE(review): the "_locked" callee suggests the caller must already hold
+ * the relevant LNet lock -- confirm against the call sites.
+ */
+static inline void
+lnet_peer_set_alive(lnet_peer_t *lp)
+{
+       lp->lp_last_query = cfs_time_current();
+       lp->lp_last_alive = lp->lp_last_query;
+
+       /* peer is already flagged alive: timestamps refreshed, nothing more */
+       if (lp->lp_alive)
+               return;
+
+       lnet_notify_locked(lp, 0, 1, lp->lp_last_alive);
+}
+
 #ifndef __KERNEL__
 static inline int
 lnet_parse_int_tunable(int *value, char *name)
index 4211e4f..17150a9 100644 (file)
@@ -2055,6 +2055,19 @@ lnet_parse(lnet_ni_t *ni, lnet_hdr_t *hdr, lnet_nid_t from_nid,
                goto drop;
        }
 
+       if (lnet_isrouter(msg->msg_rxpeer)) {
+               lnet_peer_set_alive(msg->msg_rxpeer);
+               if (avoid_asym_router_failure &&
+                   LNET_NIDNET(src_nid) != LNET_NIDNET(from_nid)) {
+                       /* received a remote message from router, update
+                        * remote NI status on this router.
+                        * NB: multi-hop routed message will be ignored.
+                        */
+                       lnet_router_ni_update_locked(msg->msg_rxpeer,
+                                                    LNET_NIDNET(src_nid));
+               }
+       }
+
        lnet_msg_commit(msg, cpt);
 
        /* message delay simulation */
index 1c0c3af..091cfd9 100644 (file)
@@ -87,7 +87,7 @@ static int check_routers_before_use = 0;
 CFS_MODULE_PARM(check_routers_before_use, "i", int, 0444,
                "Assume routers are down and ping them before use");
 
-static int avoid_asym_router_failure = 1;
+int avoid_asym_router_failure = 1;
 CFS_MODULE_PARM(avoid_asym_router_failure, "i", int, 0644,
                "Avoid asymmetrical router failures (0 to disable)");
 
@@ -823,6 +823,21 @@ lnet_wait_known_routerstate(void)
 }
 
+/*
+ * Reset lr_downis to 0 on the route of gateway gw whose lr_net equals net.
+ *
+ * Nothing is done unless gw has LNET_PING_FEAT_NI_STATUS set in
+ * lp_ping_feats.  Only the first matching entry on gw->lp_routes is
+ * updated; if no route matches, the list is left unchanged.
+ *
+ * NOTE(review): the "_locked" suffix suggests the caller must already hold
+ * the relevant LNet lock -- confirm against the call sites.
+ */
 void
+lnet_router_ni_update_locked(lnet_peer_t *gw, __u32 net)
+{
+       lnet_route_t *route;
+
+       /* gateway does not have the NI status feature bit set: skip it */
+       if ((gw->lp_ping_feats & LNET_PING_FEAT_NI_STATUS) == 0)
+               return;
+
+       list_for_each_entry(route, &gw->lp_routes, lr_gwlist) {
+               if (route->lr_net != net)
+                       continue;
+
+               /* first route to the requested net: clear it and stop */
+               route->lr_downis = 0;
+               break;
+       }
+}
+
+void
 lnet_update_ni_status_locked(void)
 {
        lnet_ni_t       *ni;