Whamcloud - gitweb
i=maxim,b=18460,b=20171:
[fs/lustre-release.git] / lnet / lnet / lib-move.c
index 5b207da..743146c 100644 (file)
@@ -899,12 +899,112 @@ lnet_eager_recv_locked(lnet_msg_t *msg)
         return rc;
 }
 
+/* NB: caller shall hold a ref on 'lp' as I'd drop LNET_LOCK */
+void
+lnet_ni_peer_alive(lnet_peer_t *lp)
+{
+        cfs_time_t  last_alive = 0;
+        lnet_ni_t  *ni = lp->lp_ni;
+
+        LASSERT (ni != NULL);
+        LASSERT (ni->ni_peertimeout > 0);
+        LASSERT (ni->ni_lnd->lnd_query != NULL);
+
+        LNET_UNLOCK();
+        (ni->ni_lnd->lnd_query)(ni, lp->lp_nid, &last_alive);
+        LNET_LOCK();
+
+        lp->lp_last_query = cfs_time_current();
+
+        if (last_alive != 0) /* NI has updated timestamp */
+                lp->lp_last_alive = last_alive;
+        return;
+}
+
+/* NB: always called with LNET_LOCK held */
+static inline int
+lnet_peer_is_alive (lnet_peer_t *lp, cfs_time_t now)
+{
+        lnet_ni_t  *ni = lp->lp_ni;
+        cfs_time_t  deadline;
+        int         alive;
+
+        LASSERT (ni != NULL);
+        LASSERT (ni->ni_peertimeout > 0);
+
+        /* Trust lnet_notify() if it has more recent aliveness news, but
+         * ignore the initial assumed death (see lnet_peers_start_down()).
+         */
+        if (!lp->lp_alive && lp->lp_alive_count > 0 &&
+            cfs_time_aftereq(lp->lp_timestamp, lp->lp_last_alive))
+                return 0;
+
+        deadline = cfs_time_add(lp->lp_last_alive,
+                                cfs_time_seconds(ni->ni_peertimeout));
+        alive = cfs_time_after(deadline, now);
+
+        /* Update obsolete lp_alive */
+        if (alive && !lp->lp_alive && lp->lp_timestamp != 0 &&
+            cfs_time_before(lp->lp_timestamp, lp->lp_last_alive))
+                lnet_notify_locked(lp, 0, 1, lp->lp_last_alive);
+
+        return alive;
+}
+
+
+/* NB: returns 1 when alive, 0 when dead, negative when error;
+ *     may drop the LNET_LOCK */
+int
+lnet_peer_alive_locked (lnet_peer_t *lp)
+{
+        lnet_ni_t  *ni = lp->lp_ni;
+        cfs_time_t  now = cfs_time_current();
+
+        LASSERT (ni != NULL);
+
+        if (ni->ni_peertimeout <= 0)  /* disabled */
+                return -ENODEV;
+
+        if (lnet_peer_is_alive(lp, now))
+                return 1;
+
+        /* Peer appears dead, but we should avoid frequent NI queries (at
+         * most once per lnet_queryinterval seconds). */
+        if (lp->lp_last_query != 0) {
+                static const int lnet_queryinterval = 1;
+
+                cfs_time_t next_query =
+                           cfs_time_add(lp->lp_last_query,
+                                        cfs_time_seconds(lnet_queryinterval));
+
+                if (cfs_time_before(now, next_query)) {
+                        if (lp->lp_alive)
+                                CWARN("Unexpected aliveness of peer %s: "
+                                      "%d < %d (%d/%d)\n",
+                                      libcfs_nid2str(lp->lp_nid),
+                                      (int)now, (int)next_query,
+                                      lnet_queryinterval, ni->ni_peertimeout);
+                        return 0;
+                }
+        }
+
+        /* query NI for latest aliveness news */
+        lnet_ni_peer_alive(lp);
+
+        if (lnet_peer_is_alive(lp, now))
+                return 1;
+
+        lnet_notify_locked(lp, 0, 0, lp->lp_last_alive);
+        return 0;
+}
+
 int
 lnet_post_send_locked (lnet_msg_t *msg, int do_send)
 {
         /* lnet_send is going to LNET_UNLOCK immediately after this, so it sets
          * do_send FALSE and I don't do the unlock/send/lock bit.  I return
-         * EAGAIN if msg blocked and 0 if sent or OK to send */
+         * EAGAIN if msg blocked, EHOSTUNREACH if msg_txpeer appears dead, and
+         * 0 if sent or OK to send */
         lnet_peer_t *lp = msg->msg_txpeer;
         lnet_ni_t   *ni = lp->lp_ni;
 
@@ -912,6 +1012,20 @@ lnet_post_send_locked (lnet_msg_t *msg, int do_send)
         LASSERT (!do_send || msg->msg_delayed);
         LASSERT (!msg->msg_receiving);
 
+        /* NB 'lp' is always the next hop */
+        if ((msg->msg_target.pid & LNET_PID_USERFLAG) == 0 &&
+            lnet_peer_alive_locked(lp) == 0) {
+                LNET_UNLOCK();
+
+                CDEBUG(D_NETERROR, "Dropping message for %s: peer not alive\n",
+                       libcfs_id2str(msg->msg_target));
+                if (do_send)
+                        lnet_finalize(ni, msg, -EHOSTUNREACH);
+
+                LNET_LOCK();
+                return EHOSTUNREACH;
+        }
+
         if (!msg->msg_peertxcredit) {
                 LASSERT ((lp->lp_txcredits < 0) == !list_empty(&lp->lp_txq));
 
@@ -993,7 +1107,7 @@ lnet_post_routed_recv_locked (lnet_msg_t *msg, int do_recv)
 {
         /* lnet_parse is going to LNET_UNLOCK immediately after this, so it
          * sets do_recv FALSE and I don't do the unlock/send/lock bit.  I
-         * return EAGAIN if msg blocked and 0 if sent or OK to send */
+         * return EAGAIN if msg blocked and 0 if received or OK to receive */
         lnet_peer_t         *lp = msg->msg_rxpeer;
         lnet_rtrbufpool_t   *rbp;
         lnet_rtrbuf_t       *rb;
@@ -1255,6 +1369,19 @@ lnet_send(lnet_nid_t src_nid, lnet_msg_t *msg)
                 }
                 LASSERT (lp->lp_ni == src_ni);
         } else {
+#ifndef __KERNEL__
+                LNET_UNLOCK();
+
+                /* NB
+                 * - once application finishes computation, check here to update
+                 *   router states before it waits for pending IO in LNetEQPoll
+                 * - recursion breaker: router checker sends no message
+                 *   to remote networks */
+                if (the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING)
+                        lnet_router_checker();
+
+                LNET_LOCK();
+#endif
                 /* sending to a remote network */
                 rnet = lnet_find_net_locked(LNET_NIDNET(dst_nid));
                 if (rnet == NULL) {
@@ -1273,6 +1400,7 @@ lnet_send(lnet_nid_t src_nid, lnet_msg_t *msg)
                         lp2 = route->lr_gateway;
 
                         if (lp2->lp_alive &&
+                            lnet_router_down_ni(lp2, rnet->lrn_net) <= 0 &&
                             (src_ni == NULL || lp2->lp_ni == src_ni) &&
                             (lp == NULL || lnet_compare_routers(lp2, lp) > 0)) {
                                 best_route = route;
@@ -1327,6 +1455,9 @@ lnet_send(lnet_nid_t src_nid, lnet_msg_t *msg)
         rc = lnet_post_send_locked(msg, 0);
         LNET_UNLOCK();
 
+        if (rc == EHOSTUNREACH)
+                return -EHOSTUNREACH;
+
         if (rc == 0)
                 lnet_ni_send(src_ni, msg);
 
@@ -1605,12 +1736,14 @@ lnet_parse_put(lnet_ni_t *ni, lnet_msg_t *msg)
 {
         int               rc;
         int               index;
+        __u64             version;
         lnet_hdr_t       *hdr = &msg->msg_hdr;
         unsigned int      rlength = hdr->payload_length;
         unsigned int      mlength = 0;
         unsigned int      offset = 0;
         lnet_process_id_t src= {0};
         lnet_libmd_t     *md;
+        lnet_portal_t    *ptl;
 
         src.nid = hdr->src_nid;
         src.pid = hdr->src_pid;
@@ -1621,9 +1754,11 @@ lnet_parse_put(lnet_ni_t *ni, lnet_msg_t *msg)
         hdr->msg.put.offset = le32_to_cpu(hdr->msg.put.offset);
 
         index = hdr->msg.put.ptl_index;
+        ptl = &the_lnet.ln_portals[index];
 
         LNET_LOCK();
 
+ again:
         rc = lnet_match_md(index, LNET_MD_OP_PUT, src,
                            rlength, hdr->msg.put.offset,
                            hdr->msg.put.match_bits, msg,
@@ -1634,24 +1769,31 @@ lnet_parse_put(lnet_ni_t *ni, lnet_msg_t *msg)
 
         case LNET_MATCHMD_OK:
                 LNET_UNLOCK();
-                lnet_recv_put(md, msg, 0, offset, mlength);
+                lnet_recv_put(md, msg, msg->msg_delayed, offset, mlength);
                 return 0;
 
         case LNET_MATCHMD_NONE:
-                rc = lnet_eager_recv_locked(msg);
-                if (rc == 0 && !the_lnet.ln_shutdown) {
-                        list_add_tail(&msg->msg_list,
-                                      &the_lnet.ln_portals[index].ptl_msgq);
+                version = ptl->ptl_ml_version;
 
-                        the_lnet.ln_portals[index].ptl_msgq_version++;
+                rc = 0;
+                if (!msg->msg_delayed)
+                        rc = lnet_eager_recv_locked(msg);
+
+                if (rc == 0 &&
+                    !the_lnet.ln_shutdown &&
+                    ((ptl->ptl_options & LNET_PTL_LAZY) != 0)) {
+                        if (version != ptl->ptl_ml_version)
+                                goto again;
+
+                        list_add_tail(&msg->msg_list, &ptl->ptl_msgq);
+                        ptl->ptl_msgq_version++;
+                        LNET_UNLOCK();
 
                         CDEBUG(D_NET, "Delaying PUT from %s portal %d match "
                                LPU64" offset %d length %d: no match \n",
                                libcfs_id2str(src), index,
                                hdr->msg.put.match_bits,
                                hdr->msg.put.offset, rlength);
-
-                        LNET_UNLOCK();
                         return 0;
                 }
                 /* fall through */
@@ -1964,7 +2106,6 @@ lnet_print_hdr(lnet_hdr_t * hdr)
 
 }
 
-
 int
 lnet_parse(lnet_ni_t *ni, lnet_hdr_t *hdr, lnet_nid_t from_nid, 
            void *private, int rdma_req)
@@ -1972,6 +2113,7 @@ lnet_parse(lnet_ni_t *ni, lnet_hdr_t *hdr, lnet_nid_t from_nid,
         int            rc = 0;
         int            for_me;
         lnet_msg_t    *msg;
+        lnet_pid_t     dest_pid;
         lnet_nid_t     dest_nid;
         lnet_nid_t     src_nid;
         __u32          payload_length;
@@ -1982,6 +2124,7 @@ lnet_parse(lnet_ni_t *ni, lnet_hdr_t *hdr, lnet_nid_t from_nid,
         type = le32_to_cpu(hdr->type);
         src_nid = le64_to_cpu(hdr->src_nid);
         dest_nid = le64_to_cpu(hdr->dest_nid);
+        dest_pid = le32_to_cpu(hdr->dest_pid);
         payload_length = le32_to_cpu(hdr->payload_length);
 
         for_me = (ni->ni_nid == dest_nid);
@@ -2019,6 +2162,19 @@ lnet_parse(lnet_ni_t *ni, lnet_hdr_t *hdr, lnet_nid_t from_nid,
                 return -EPROTO;
         }
 
+        if (the_lnet.ln_routing) {
+                cfs_time_t now = cfs_time_current();
+
+                LNET_LOCK();
+
+                ni->ni_last_alive = now;
+                if (ni->ni_status != NULL &&
+                    ni->ni_status->ns_status == LNET_NI_STATUS_DOWN)
+                        ni->ni_status->ns_status = LNET_NI_STATUS_UP;
+
+                LNET_UNLOCK();
+        }
+
         /* Regard a bad destination NID as a protocol error.  Senders should
          * know what they're doing; if they don't they're misconfigured, buggy
          * or malicious so we chop them off at the knees :) */
@@ -2109,7 +2265,7 @@ lnet_parse(lnet_ni_t *ni, lnet_hdr_t *hdr, lnet_nid_t from_nid,
         LASSERT (for_me);
 #else
         if (!for_me) {
-                msg->msg_target.pid = le32_to_cpu(hdr->dest_pid);
+                msg->msg_target.pid = dest_pid;
                 msg->msg_target.nid = dest_nid;
                 msg->msg_routing = 1;
                 msg->msg_offset = 0;
@@ -2123,7 +2279,6 @@ lnet_parse(lnet_ni_t *ni, lnet_hdr_t *hdr, lnet_nid_t from_nid,
                                 goto free_drop;
                         }
                 }
-
                 lnet_commit_routedmsg(msg);
                 rc = lnet_post_routed_recv_locked(msg, 0);
                 LNET_UNLOCK();
@@ -2139,7 +2294,7 @@ lnet_parse(lnet_ni_t *ni, lnet_hdr_t *hdr, lnet_nid_t from_nid,
         msg->msg_hdr.src_nid = src_nid;
         msg->msg_hdr.src_pid = le32_to_cpu(msg->msg_hdr.src_pid);
         msg->msg_hdr.dest_nid = dest_nid;
-        msg->msg_hdr.dest_pid = le32_to_cpu(msg->msg_hdr.dest_pid);
+        msg->msg_hdr.dest_pid = dest_pid;
         msg->msg_hdr.payload_length = payload_length;
 
         msg->msg_ev.sender = from_nid;