Whamcloud - gitweb
LU-5570 lnet: check router aliveness timestamp 48/11748/9
authorLiang Zhen <liang.zhen@intel.com>
Thu, 11 Dec 2014 20:53:31 +0000 (15:53 -0500)
committerOleg Drokin <oleg.drokin@intel.com>
Sun, 4 Jan 2015 18:33:03 +0000 (18:33 +0000)
In current LNet, the peer aliveness timestamp is only recorded for peers
on routers. This patch changes that and also records the aliveness of
routers on regular nodes, so LNet can check the aliveness timestamp of
routers before sending a message and avoid choosing a router which has
an old aliveness timestamp and may be dead or congested.

Another change in this patch is that when LNet gets a remote message
from a router, it updates the remote NI status of that router. This way
LNet can get up-to-date aliveness information without waiting for the
next RC ping.

Signed-off-by: Liang Zhen <liang.zhen@intel.com>
Change-Id: I305ba684ced231a87656458382966aeb8ff7ba48
Reviewed-on: http://review.whamcloud.com/11748
Tested-by: Jenkins
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: James Simmons <uja.ornl@gmail.com>
Reviewed-by: Doug Oucharek <doug.s.oucharek@intel.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
lnet/include/lnet/lib-lnet.h
lnet/include/lnet/lib-types.h
lnet/lnet/lib-move.c
lnet/lnet/lib-msg.c
lnet/lnet/peer.c
lnet/lnet/router.c
lnet/lnet/router_proc.c

index 36b1a63..73a4d68 100644 (file)
@@ -651,6 +651,31 @@ lnet_isrouter(lnet_peer_t *lp)
         return lp->lp_rtr_refcount != 0;
 }
 
+/* check if it's a router checker ping */
+static inline int
+lnet_msg_is_rc_ping(struct lnet_msg *msg)
+{
+       lnet_hdr_t      *hdr = &msg->msg_hdr;
+
+       return msg->msg_type == LNET_MSG_GET &&
+              hdr->msg.get.ptl_index == cpu_to_le32(LNET_RESERVED_PORTAL) &&
+              hdr->msg.get.match_bits ==
+                           cpu_to_le64(LNET_PROTO_PING_MATCHBITS);
+}
+
+/* peer aliveness is enabled in a network where lnet_ni_t::ni_peertimeout has
+ * been set to a positive value, it's only valid for router peers or peers on
+ * routers.
+ */
+static inline int
+lnet_peer_aliveness_enabled(struct lnet_peer *lp)
+{
+       if (lp->lp_ni->ni_peertimeout <= 0)
+               return 0;
+
+       return the_lnet.ln_routing || lnet_isrouter(lp);
+}
+
 static inline void
 lnet_ni_addref_locked(lnet_ni_t *ni, int cpt)
 {
index 5199f84..c25241f 100644 (file)
@@ -544,11 +544,6 @@ struct lnet_peer_table {
        struct list_head        *pt_hash;       /* NID->peer hash */
 };
 
-/* peer aliveness is enabled only on routers for peers in a network where the
- * lnet_ni_t::ni_peertimeout has been set to a positive value */
-#define lnet_peer_aliveness_enabled(lp) (the_lnet.ln_routing != 0 && \
-                                        (lp)->lp_ni->ni_peertimeout > 0)
-
 typedef struct {
        struct list_head        lr_list;        /* chain on net */
        struct list_head        lr_gwlist;      /* chain on gateway */
index 8973813..5a5e14d 100644 (file)
@@ -746,8 +746,6 @@ lnet_peer_is_alive (lnet_peer_t *lp, cfs_time_t now)
         int        alive;
         cfs_time_t deadline;
 
-        LASSERT (lnet_peer_aliveness_enabled(lp));
-
         /* Trust lnet_notify() if it has more recent aliveness news, but
          * ignore the initial assumed death (see lnet_peers_start_down()).
          */
@@ -776,37 +774,28 @@ lnet_peer_is_alive (lnet_peer_t *lp, cfs_time_t now)
 int
 lnet_peer_alive_locked (lnet_peer_t *lp)
 {
-        cfs_time_t now = cfs_time_current();
-
-        if (!lnet_peer_aliveness_enabled(lp))
-                return -ENODEV;
+       cfs_time_t now = cfs_time_current();
+       bool       query;
 
-        if (lnet_peer_is_alive(lp, now))
-                return 1;
+       if (!lnet_peer_aliveness_enabled(lp))
+               return -ENODEV;
 
-        /* Peer appears dead, but we should avoid frequent NI queries (at
-         * most once per lnet_queryinterval seconds). */
-        if (lp->lp_last_query != 0) {
-                static const int lnet_queryinterval = 1;
-
-                cfs_time_t next_query =
-                           cfs_time_add(lp->lp_last_query,
-                                        cfs_time_seconds(lnet_queryinterval));
-
-                if (cfs_time_before(now, next_query)) {
-                        if (lp->lp_alive)
-                                CWARN("Unexpected aliveness of peer %s: "
-                                      "%d < %d (%d/%d)\n",
-                                      libcfs_nid2str(lp->lp_nid),
-                                      (int)now, (int)next_query,
-                                      lnet_queryinterval,
-                                      lp->lp_ni->ni_peertimeout);
-                        return 0;
-                }
+       if (lp->lp_last_query == 0) {
+               query = true;
+       } else {
+               /* Peer appears dead, but we should avoid frequent NI queries
+                * (at most once per ni_query_interval seconds). */
+               static const int ni_query_interval = 1;
+               cfs_time_t       next_query;
+
+               next_query = cfs_time_add(lp->lp_last_query,
+                                         cfs_time_seconds(ni_query_interval));
+               query = cfs_time_aftereq(now, next_query);
         }
 
         /* query NI for latest aliveness news */
-       lnet_ni_query_locked(lp->lp_ni, lp);
+       if (query)
+               lnet_ni_query_locked(lp->lp_ni, lp);
 
         if (lnet_peer_is_alive(lp, now))
                 return 1;
@@ -841,7 +830,8 @@ lnet_post_send_locked(lnet_msg_t *msg, int do_send)
 
        /* NB 'lp' is always the next hop */
        if ((msg->msg_target.pid & LNET_PID_USERFLAG) == 0 &&
-           lnet_peer_alive_locked(lp) == 0) {
+           lnet_peer_alive_locked(lp) == 0 &&
+           !lnet_msg_is_rc_ping(msg)) { /* send RC ping even for dead router */
                the_lnet.ln_counters[cpt]->drop_count++;
                the_lnet.ln_counters[cpt]->drop_length += msg->msg_len;
                lnet_net_unlock(cpt);
@@ -1194,6 +1184,30 @@ lnet_compare_routes(lnet_route_t *r1, lnet_route_t *r2)
        lnet_peer_t *p1 = r1->lr_gateway;
        lnet_peer_t *p2 = r2->lr_gateway;
 
+       if (p1->lp_ni->ni_peertimeout > 0 &&
+           p2->lp_ni->ni_peertimeout > 0) {
+               /* if a router has queued bytes but no aliveness update for
+                * the last 10 seconds, it could be potentially dead or
+                * congested, so we prefer not to choose it even if its
+                * status is still alive.
+                */
+               int        router_slow = cfs_time_seconds(10);
+               bool       r1_slow;
+               bool       r2_slow;
+               cfs_time_t now = cfs_time_current();
+
+               r1_slow = p1->lp_txqnob != 0 &&
+                       cfs_time_aftereq(now, p1->lp_last_alive + router_slow);
+               r2_slow = p2->lp_txqnob != 0 &&
+                       cfs_time_aftereq(now, p2->lp_last_alive + router_slow);
+
+               if (!r1_slow && r2_slow)
+                       return 1;
+
+               if (r1_slow && !r2_slow)
+                       return -1;
+       }
+
        if (r1->lr_priority < r2->lr_priority)
                return 1;
 
@@ -1876,17 +1890,18 @@ int
 lnet_parse(lnet_ni_t *ni, lnet_hdr_t *hdr, lnet_nid_t from_nid,
           void *private, int rdma_req)
 {
-       int             rc = 0;
-       int             cpt;
-       int             for_me;
        struct lnet_msg *msg;
-       lnet_pid_t     dest_pid;
-       lnet_nid_t     dest_nid;
-       lnet_nid_t     src_nid;
-       __u32          payload_length;
-       __u32          type;
-
-       LASSERT (!in_interrupt ());
+       lnet_peer_t     *rxpeer;
+       lnet_pid_t       dest_pid;
+       lnet_nid_t       dest_nid;
+       lnet_nid_t       src_nid;
+       __u32            payload_length;
+       __u32            type;
+       int              for_me;
+       int              cpt;
+       int              rc = 0;
+
+       LASSERT(!in_interrupt());
 
        type = le32_to_cpu(hdr->type);
        src_nid = le64_to_cpu(hdr->src_nid);
@@ -2069,6 +2084,18 @@ lnet_parse(lnet_ni_t *ni, lnet_hdr_t *hdr, lnet_nid_t from_nid,
        }
 
        lnet_msg_commit(msg, cpt);
+       /* LND just notified me for incoming message from rxpeer, so assume
+        * it is alive */
+       rxpeer = msg->msg_rxpeer;
+       rxpeer->lp_last_alive = rxpeer->lp_last_query = cfs_time_current();
+       if (!rxpeer->lp_alive)
+               lnet_notify_locked(rxpeer, 0, 1, rxpeer->lp_last_alive);
+
+       if (lnet_isrouter(msg->msg_rxpeer) &&
+           LNET_NIDNET(src_nid) != LNET_NIDNET(from_nid)) {
+               lnet_router_ni_update_locked(msg->msg_rxpeer,
+                                            LNET_NIDNET(src_nid));
+       }
 
        /* message delay simulation */
        if (unlikely(!list_empty(&the_lnet.ln_delay_rules) &&
index ca5df09..26bd68b 100644 (file)
@@ -489,7 +489,17 @@ lnet_finalize (lnet_ni_t *ni, lnet_msg_t *msg, int status)
         * (finalize sending first then finalize receiving)
         */
        cpt = msg->msg_tx_committed ? msg->msg_tx_cpt : msg->msg_rx_cpt;
+
        lnet_net_lock(cpt);
+       if (msg->msg_tx_committed && msg->msg_txpeer != NULL && status == 0) {
+               lnet_peer_t *lp = msg->msg_txpeer;
+
+               /* LND is finalising an outgoing message for txpeer, so assume
+                * it is alive */
+               lp->lp_last_alive = lp->lp_last_query = cfs_time_current();
+               if (!lp->lp_alive)
+                       lnet_notify_locked(lp, 0, 1, lp->lp_last_alive);
+       }
 
        container = the_lnet.ln_msg_containers[cpt];
        list_add_tail(&msg->msg_list, &container->msc_finalizing);
index 6521fbc..363d270 100644 (file)
@@ -375,7 +375,7 @@ lnet_debug_peer(lnet_nid_t nid)
                 return;
         }
 
-        if (lnet_isrouter(lp) || lnet_peer_aliveness_enabled(lp))
+       if (lnet_peer_aliveness_enabled(lp))
                 aliveness = lp->lp_alive ? "up" : "down";
 
         CDEBUG(D_WARNING, "%-24s %4d %5s %5d %5d %5d %5d %5d %ld\n",
@@ -427,8 +427,7 @@ int lnet_get_peer_info(__u32 peer_index, __u64 *nid,
                                continue;
 
                        snprintf(aliveness, LNET_MAX_STR_LEN, "NA");
-                       if (lnet_isrouter(lp) ||
-                               lnet_peer_aliveness_enabled(lp))
+                       if (lnet_peer_aliveness_enabled(lp))
                                snprintf(aliveness, LNET_MAX_STR_LEN,
                                         lp->lp_alive ? "up" : "down");
 
index 58526b5..015a2e4 100644 (file)
@@ -827,6 +827,7 @@ lnet_router_ni_update_locked(lnet_peer_t *gw, __u32 net)
 {
        lnet_route_t *rte;
 
+       /* NB: this can't help for multi-hop routing */
        if ((gw->lp_ping_feats & LNET_PING_FEAT_NI_STATUS) != 0) {
                list_for_each_entry(rte, &gw->lp_routes, lr_gwlist) {
                        if (rte->lr_net == net) {
@@ -1137,11 +1138,9 @@ lnet_router_checker_start(void)
         /* EQ size doesn't matter; the callback is guaranteed to get every
          * event */
        eqsz = 0;
-        rc = LNetEQAlloc(eqsz, lnet_router_checker_event,
-                         &the_lnet.ln_rc_eqh);
+       rc = LNetEQAlloc(eqsz, lnet_router_checker_event, &the_lnet.ln_rc_eqh);
 #else
-        rc = LNetEQAlloc(eqsz, LNET_EQ_HANDLER_NONE,
-                         &the_lnet.ln_rc_eqh);
+       rc = LNetEQAlloc(eqsz, LNET_EQ_HANDLER_NONE, &the_lnet.ln_rc_eqh);
 #endif
         if (rc != 0) {
                 CERROR("Can't allocate EQ(%d): %d\n", eqsz, rc);
index e39e1ee..95d34c9 100644 (file)
@@ -507,14 +507,11 @@ proc_lnet_peers(struct ctl_table *table, int write, void __user *buffer,
                         int        minrtrcr  = peer->lp_minrtrcredits;
                         int        txqnob    = peer->lp_txqnob;
 
-                        if (lnet_isrouter(peer) ||
-                            lnet_peer_aliveness_enabled(peer))
-                                aliveness = peer->lp_alive ? "up" : "down";
-
                         if (lnet_peer_aliveness_enabled(peer)) {
-                                cfs_time_t     now = cfs_time_current();
-                                cfs_duration_t delta;
+                               cfs_time_t     now = cfs_time_current();
+                               cfs_duration_t delta;
 
+                               aliveness = peer->lp_alive ? "up" : "down";
                                 delta = cfs_time_sub(now, peer->lp_last_alive);
                                 lastalive = cfs_duration_sec(delta);