Whamcloud - gitweb
i=liang,b=13065:
[fs/lustre-release.git] / lnet / lnet / lib-move.c
index ef9a7bd..f31b0fe 100644 (file)
@@ -899,12 +899,104 @@ lnet_eager_recv_locked(lnet_msg_t *msg)
         return rc;
 }
 
+/* NB: caller shall hold a ref on 'lp' as I'd drop LNET_LOCK */
+void
+lnet_ni_peer_alive(lnet_peer_t *lp)
+{
+        time_t      last_alive = 0;
+        lnet_ni_t  *ni = lp->lp_ni;
+
+        LASSERT (ni != NULL);
+        LASSERT (ni->ni_peertimeout > 0);
+        LASSERT (ni->ni_lnd->lnd_query != NULL);
+
+        LNET_UNLOCK();
+        (ni->ni_lnd->lnd_query)(ni, lp->lp_nid, &last_alive);
+        LNET_LOCK();
+
+        lp->lp_last_query = cfs_time_current_sec();
+
+        if (last_alive != 0) /* NI has updated timestamp */
+                lp->lp_last_alive = last_alive;
+        return;
+}
+
+/* NB: always called with LNET_LOCK held */
+static inline int
+lnet_peer_is_alive (lnet_peer_t *lp, time_t now)
+{
+        lnet_ni_t  *ni = lp->lp_ni;
+        time_t      deadline;
+        int         alive;
+
+        LASSERT (ni != NULL);
+        LASSERT (ni->ni_peertimeout > 0);
+
+        if (!lp->lp_alive && lp->lp_alive_count > 0 &&
+            cfs_time_aftereq(lp->lp_timestamp, lp->lp_last_alive))
+                        return 0;
+
+        deadline = cfs_time_add(lp->lp_last_alive, ni->ni_peertimeout);
+        alive = cfs_time_after(deadline, now);
+        if (alive && !lp->lp_alive) /* update obsolete lp_alive */
+                lnet_notify_locked(lp, 0, 1, lp->lp_last_alive);
+
+        return alive;
+}
+
+/* don't query LND about aliveness of a dead peer more frequently than: */
+static int lnet_queryinterval = 1; /* 1 second */
+
+/* NB: returns 1 when alive, 0 when dead, negative when error;
+ *     may drop the LNET_LOCK */
+int
+lnet_peer_alive_locked (lnet_peer_t *lp)
+{
+        lnet_ni_t  *ni = lp->lp_ni;
+        time_t      now = cfs_time_current_sec();
+
+        LASSERT (ni != NULL);
+
+        if (ni->ni_peertimeout <= 0)  /* disabled */
+                return -ENODEV;
+
+        if (lnet_peer_is_alive(lp, now))
+                return 1;
+
+        /* peer appears dead, should we query right now? */
+        if (lp->lp_last_query != 0) {
+                time_t deadline =
+                        cfs_time_add(lp->lp_last_query,
+                                     lnet_queryinterval);
+
+                if (cfs_time_before(now, deadline)) {
+                        if (lp->lp_alive)
+                                CWARN("Unexpected aliveness of peer %s: "
+                                      "%d < %d (%d/%d)\n",
+                                      libcfs_nid2str(lp->lp_nid),
+                                      (int)now, (int)deadline,
+                                      lnet_queryinterval, ni->ni_peertimeout);
+                        return 0;
+                }
+        }
+
+        /* query LND for latest aliveness news */
+        lnet_ni_peer_alive(lp);
+
+        if (lnet_peer_is_alive(lp, now))
+                return 1;
+
+        lnet_notify_locked(lp, 0, 0, lp->lp_last_alive);
+        return 0;
+}
+
 int
 lnet_post_send_locked (lnet_msg_t *msg, int do_send)
 {
         /* lnet_send is going to LNET_UNLOCK immediately after this, so it sets
          * do_send FALSE and I don't do the unlock/send/lock bit.  I return
-         * EAGAIN if msg blocked and 0 if sent or OK to send */
+         * EAGAIN if msg blocked, EHOSTUNREACH if msg_txpeer appears dead, and
+         * 0 if sent or OK to send */
         lnet_peer_t *lp = msg->msg_txpeer;
         lnet_ni_t   *ni = lp->lp_ni;
 
@@ -912,6 +1004,20 @@ lnet_post_send_locked (lnet_msg_t *msg, int do_send)
         LASSERT (!do_send || msg->msg_delayed);
         LASSERT (!msg->msg_receiving);
 
+        /* NB 'lp' is always the next hop */
+        if ((msg->msg_target.pid & LNET_PID_USERFLAG) == 0 &&
+            lnet_peer_alive_locked(lp) == 0) {
+                LNET_UNLOCK();
+
+                CDEBUG(D_NETERROR, "Dropping message for %s: peer not alive\n",
+                       libcfs_id2str(msg->msg_target));
+                if (do_send)
+                        lnet_finalize(ni, msg, -EHOSTUNREACH);
+
+                LNET_LOCK();
+                return EHOSTUNREACH;
+        }
+
         if (!msg->msg_peertxcredit) {
                 LASSERT ((lp->lp_txcredits < 0) == !list_empty(&lp->lp_txq));
 
@@ -993,7 +1099,7 @@ lnet_post_routed_recv_locked (lnet_msg_t *msg, int do_recv)
 {
         /* lnet_parse is going to LNET_UNLOCK immediately after this, so it
          * sets do_recv FALSE and I don't do the unlock/send/lock bit.  I
-         * return EAGAIN if msg blocked and 0 if sent or OK to send */
+         * return EAGAIN if msg blocked and 0 if received or OK to receive */
         lnet_peer_t         *lp = msg->msg_rxpeer;
         lnet_rtrbufpool_t   *rbp;
         lnet_rtrbuf_t       *rb;
@@ -1255,6 +1361,19 @@ lnet_send(lnet_nid_t src_nid, lnet_msg_t *msg)
                 }
                 LASSERT (lp->lp_ni == src_ni);
         } else {
+#ifndef __KERNEL__
+                LNET_UNLOCK();
+
+                /* NB
+                 * - once application finishes computation, check here to update
+                 *   router states before it waits for pending IO in LNetEQPoll
+                 * - recursion breaker: router checker sends no message
+                 *   to remote networks */
+                if (the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING)
+                        lnet_router_checker();
+
+                LNET_LOCK();
+#endif
                 /* sending to a remote network */
                 rnet = lnet_find_net_locked(LNET_NIDNET(dst_nid));
                 if (rnet == NULL) {
@@ -1327,6 +1446,9 @@ lnet_send(lnet_nid_t src_nid, lnet_msg_t *msg)
         rc = lnet_post_send_locked(msg, 0);
         LNET_UNLOCK();
 
+        if (rc == EHOSTUNREACH)
+                return -EHOSTUNREACH;
+
         if (rc == 0)
                 lnet_ni_send(src_ni, msg);
 
@@ -1605,12 +1727,14 @@ lnet_parse_put(lnet_ni_t *ni, lnet_msg_t *msg)
 {
         int               rc;
         int               index;
+        __u64             version;
         lnet_hdr_t       *hdr = &msg->msg_hdr;
         unsigned int      rlength = hdr->payload_length;
         unsigned int      mlength = 0;
         unsigned int      offset = 0;
         lnet_process_id_t src= {0};
         lnet_libmd_t     *md;
+        lnet_portal_t    *ptl;
 
         src.nid = hdr->src_nid;
         src.pid = hdr->src_pid;
@@ -1621,9 +1745,11 @@ lnet_parse_put(lnet_ni_t *ni, lnet_msg_t *msg)
         hdr->msg.put.offset = le32_to_cpu(hdr->msg.put.offset);
 
         index = hdr->msg.put.ptl_index;
+        ptl = &the_lnet.ln_portals[index];
 
         LNET_LOCK();
 
+ again:
         rc = lnet_match_md(index, LNET_MD_OP_PUT, src,
                            rlength, hdr->msg.put.offset,
                            hdr->msg.put.match_bits, msg,
@@ -1634,24 +1760,31 @@ lnet_parse_put(lnet_ni_t *ni, lnet_msg_t *msg)
 
         case LNET_MATCHMD_OK:
                 LNET_UNLOCK();
-                lnet_recv_put(md, msg, 0, offset, mlength);
+                lnet_recv_put(md, msg, msg->msg_delayed, offset, mlength);
                 return 0;
 
         case LNET_MATCHMD_NONE:
-                rc = lnet_eager_recv_locked(msg);
-                if (rc == 0 && !the_lnet.ln_shutdown) {
-                        list_add_tail(&msg->msg_list,
-                                      &the_lnet.ln_portals[index].ptl_msgq);
+                version = ptl->ptl_ml_version;
+
+                rc = 0;
+                if (!msg->msg_delayed)
+                        rc = lnet_eager_recv_locked(msg);
 
-                        the_lnet.ln_portals[index].ptl_msgq_version++;
+                if (rc == 0 &&
+                    !the_lnet.ln_shutdown &&
+                    ((ptl->ptl_options & LNET_PTL_LAZY) != 0)) {
+                        if (version != ptl->ptl_ml_version)
+                                goto again;
+
+                        list_add_tail(&msg->msg_list, &ptl->ptl_msgq);
+                        ptl->ptl_msgq_version++;
+                        LNET_UNLOCK();
 
                         CDEBUG(D_NET, "Delaying PUT from %s portal %d match "
                                LPU64" offset %d length %d: no match \n",
                                libcfs_id2str(src), index,
                                hdr->msg.put.match_bits,
                                hdr->msg.put.offset, rlength);
-
-                        LNET_UNLOCK();
                         return 0;
                 }
                 /* fall through */
@@ -1715,17 +1848,17 @@ lnet_parse_get(lnet_ni_t *ni, lnet_msg_t *msg, int rdma_get)
 
         LNET_UNLOCK();
 
+        msg->msg_ev.type = LNET_EVENT_GET;
+        msg->msg_ev.target.pid = hdr->dest_pid;
+        msg->msg_ev.target.nid = hdr->dest_nid;
+        msg->msg_ev.hdr_data = 0;
+
         reply_wmd = hdr->msg.get.return_wmd;
 
         lnet_prep_send(msg, LNET_MSG_REPLY, src, offset, mlength);
 
         msg->msg_hdr.msg.reply.dst_wmd = reply_wmd;
 
-        msg->msg_ev.type = LNET_EVENT_GET;
-        msg->msg_ev.target.pid = hdr->dest_pid;
-        msg->msg_ev.target.nid = hdr->dest_nid;
-        msg->msg_ev.hdr_data = 0;
-
         if (rdma_get) {
                 /* The LND completes the REPLY from her recv procedure */
                 lnet_ni_recv(ni, msg->msg_private, msg, 0,
@@ -1972,6 +2105,7 @@ lnet_parse(lnet_ni_t *ni, lnet_hdr_t *hdr, lnet_nid_t from_nid,
         int            rc = 0;
         int            for_me;
         lnet_msg_t    *msg;
+        lnet_pid_t     dest_pid;
         lnet_nid_t     dest_nid;
         lnet_nid_t     src_nid;
         __u32          payload_length;
@@ -1982,6 +2116,7 @@ lnet_parse(lnet_ni_t *ni, lnet_hdr_t *hdr, lnet_nid_t from_nid,
         type = le32_to_cpu(hdr->type);
         src_nid = le64_to_cpu(hdr->src_nid);
         dest_nid = le64_to_cpu(hdr->dest_nid);
+        dest_pid = le32_to_cpu(hdr->dest_pid);
         payload_length = le32_to_cpu(hdr->payload_length);
 
         for_me = (ni->ni_nid == dest_nid);
@@ -2109,7 +2244,7 @@ lnet_parse(lnet_ni_t *ni, lnet_hdr_t *hdr, lnet_nid_t from_nid,
         LASSERT (for_me);
 #else
         if (!for_me) {
-                msg->msg_target.pid = le32_to_cpu(hdr->dest_pid);
+                msg->msg_target.pid = dest_pid;
                 msg->msg_target.nid = dest_nid;
                 msg->msg_routing = 1;
                 msg->msg_offset = 0;
@@ -2123,7 +2258,6 @@ lnet_parse(lnet_ni_t *ni, lnet_hdr_t *hdr, lnet_nid_t from_nid,
                                 goto free_drop;
                         }
                 }
-
                 lnet_commit_routedmsg(msg);
                 rc = lnet_post_routed_recv_locked(msg, 0);
                 LNET_UNLOCK();
@@ -2139,7 +2273,7 @@ lnet_parse(lnet_ni_t *ni, lnet_hdr_t *hdr, lnet_nid_t from_nid,
         msg->msg_hdr.src_nid = src_nid;
         msg->msg_hdr.src_pid = le32_to_cpu(msg->msg_hdr.src_pid);
         msg->msg_hdr.dest_nid = dest_nid;
-        msg->msg_hdr.dest_pid = le32_to_cpu(msg->msg_hdr.dest_pid);
+        msg->msg_hdr.dest_pid = dest_pid;
         msg->msg_hdr.payload_length = payload_length;
 
         msg->msg_ev.sender = from_nid;