Whamcloud - gitweb
LU-630 lnet: only router checks peer health
[fs/lustre-release.git] / lnet / lnet / lib-move.c
index abdceaa..b93c8ee 100644 (file)
@@ -923,6 +923,7 @@ lnet_ni_peer_alive(lnet_peer_t *lp)
 
         LASSERT (lnet_peer_aliveness_enabled(lp));
         LASSERT (ni->ni_lnd->lnd_query != NULL);
+        LASSERT (the_lnet.ln_routing == 1);
 
         LNET_UNLOCK();
         (ni->ni_lnd->lnd_query)(ni, lp->lp_nid, &last_alive);
@@ -932,7 +933,6 @@ lnet_ni_peer_alive(lnet_peer_t *lp)
 
         if (last_alive != 0) /* NI has updated timestamp */
                 lp->lp_last_alive = last_alive;
-        return;
 }
 
 /* NB: always called with LNET_LOCK held */
@@ -943,6 +943,7 @@ lnet_peer_is_alive (lnet_peer_t *lp, cfs_time_t now)
         cfs_time_t deadline;
 
         LASSERT (lnet_peer_aliveness_enabled(lp));
+        LASSERT (the_lnet.ln_routing == 1);
 
         /* Trust lnet_notify() if it has more recent aliveness news, but
          * ignore the initial assumed death (see lnet_peers_start_down()).
@@ -974,6 +975,10 @@ lnet_peer_alive_locked (lnet_peer_t *lp)
 {
         cfs_time_t now = cfs_time_current();
 
+        /* LU-630: only router checks peer health. */
+        if (the_lnet.ln_routing == 0)
+                return 1;
+
         if (!lnet_peer_aliveness_enabled(lp))
                 return -ENODEV;
 
@@ -1028,10 +1033,12 @@ lnet_post_send_locked (lnet_msg_t *msg, int do_send)
         /* NB 'lp' is always the next hop */
         if ((msg->msg_target.pid & LNET_PID_USERFLAG) == 0 &&
             lnet_peer_alive_locked(lp) == 0) {
+                the_lnet.ln_counters.drop_count++;
+                the_lnet.ln_counters.drop_length += msg->msg_len;
                 LNET_UNLOCK();
 
-                CDEBUG(D_NETERROR, "Dropping message for %s: peer not alive\n",
-                       libcfs_id2str(msg->msg_target));
+                CNETERR("Dropping message for %s: peer not alive\n",
+                        libcfs_id2str(msg->msg_target));
                 if (do_send)
                         lnet_finalize(ni, msg, -EHOSTUNREACH);
 
@@ -1342,8 +1349,9 @@ lnet_send(lnet_nid_t src_nid, lnet_msg_t *msg)
                 src_ni = lnet_nid2ni_locked(src_nid);
                 if (src_ni == NULL) {
                         LNET_UNLOCK();
-                        CERROR("Can't send to %s: src %s is not a local nid\n",
-                               libcfs_nid2str(dst_nid), libcfs_nid2str(src_nid));
+                        LCONSOLE_WARN("Can't send to %s: src %s is not a "
+                                      "local nid\n", libcfs_nid2str(dst_nid),
+                                      libcfs_nid2str(src_nid));
                         return -EINVAL;
                 }
                 LASSERT (!msg->msg_routing);
@@ -1362,8 +1370,9 @@ lnet_send(lnet_nid_t src_nid, lnet_msg_t *msg)
                         lnet_ni_decref_locked(local_ni);
                         lnet_ni_decref_locked(src_ni);
                         LNET_UNLOCK();
-                        CERROR("No route to %s via from %s\n",
-                               libcfs_nid2str(dst_nid), libcfs_nid2str(src_nid));
+                        LCONSOLE_WARN("No route to %s via from %s\n",
+                                      libcfs_nid2str(dst_nid),
+                                      libcfs_nid2str(src_nid));
                         return -EINVAL;
                 }
 
@@ -1384,8 +1393,8 @@ lnet_send(lnet_nid_t src_nid, lnet_msg_t *msg)
                 lnet_ni_decref_locked(src_ni);  /* lp has ref on src_ni; lose mine */
                 if (rc != 0) {
                         LNET_UNLOCK();
-                        CERROR("Error %d finding peer %s\n", rc,
-                               libcfs_nid2str(dst_nid));
+                        LCONSOLE_WARN("Error %d finding peer %s\n", rc,
+                                      libcfs_nid2str(dst_nid));
                         /* ENOMEM or shutting down */
                         return rc;
                 }
@@ -1410,7 +1419,8 @@ lnet_send(lnet_nid_t src_nid, lnet_msg_t *msg)
                         if (src_ni != NULL)
                                 lnet_ni_decref_locked(src_ni);
                         LNET_UNLOCK();
-                        CERROR("No route to %s\n", libcfs_id2str(msg->msg_target));
+                        LCONSOLE_WARN("No route to %s\n",
+                                      libcfs_id2str(msg->msg_target));
                         return -EHOSTUNREACH;
                 }
 
@@ -1436,9 +1446,10 @@ lnet_send(lnet_nid_t src_nid, lnet_msg_t *msg)
                                 lnet_ni_decref_locked(src_ni);
                         LNET_UNLOCK();
 
-                        CERROR("No route to %s via %s (all routers down)\n",
-                               libcfs_id2str(msg->msg_target),
-                               libcfs_nid2str(src_nid));
+                        LCONSOLE_WARN("No route to %s via %s "
+                                      "(all routers down)\n",
+                                      libcfs_id2str(msg->msg_target),
+                                      libcfs_nid2str(src_nid));
                         return -EHOSTUNREACH;
                 }
 
@@ -1446,6 +1457,9 @@ lnet_send(lnet_nid_t src_nid, lnet_msg_t *msg)
                  * fairness; everything else being equal... */
                 cfs_list_del(&best_route->lr_list);
                 cfs_list_add_tail(&best_route->lr_list, &rnet->lrn_routes);
+                CDEBUG(D_NET, "Best route to %s via %s for %s %d\n",
+                       libcfs_nid2str(dst_nid), libcfs_nid2str(lp->lp_nid),
+                       lnet_msgtyp2str(msg->msg_type), msg->msg_len);
 
                 if (src_ni == NULL) {
                         src_ni = lp->lp_ni;
@@ -1860,12 +1874,11 @@ lnet_parse_put(lnet_ni_t *ni, lnet_msg_t *msg)
                 /* fall through */
 
         case LNET_MATCHMD_DROP:
-                CDEBUG(D_NETERROR,
-                       "Dropping PUT from %s portal %d match "LPU64
-                       " offset %d length %d: %d\n",
-                       libcfs_id2str(src), index,
-                       hdr->msg.put.match_bits,
-                       hdr->msg.put.offset, rlength, rc);
+                CNETERR("Dropping PUT from %s portal %d match "LPU64
+                        " offset %d length %d: %d\n",
+                        libcfs_id2str(src), index,
+                        hdr->msg.put.match_bits,
+                        hdr->msg.put.offset, rlength, rc);
                 LNET_UNLOCK();
 
                 return ENOENT;          /* +ve: OK but no match */
@@ -1899,14 +1912,13 @@ lnet_parse_get(lnet_ni_t *ni, lnet_msg_t *msg, int rdma_get)
                            hdr->msg.get.match_bits, msg,
                            &mlength, &offset, &md);
         if (rc == LNET_MATCHMD_DROP) {
-                CDEBUG(D_NETERROR,
-                       "Dropping GET from %s portal %d match "LPU64
-                       " offset %d length %d\n",
-                       libcfs_id2str(src),
-                       hdr->msg.get.ptl_index,
-                       hdr->msg.get.match_bits,
-                       hdr->msg.get.src_offset,
-                       hdr->msg.get.sink_length);
+                CNETERR("Dropping GET from %s portal %d match "LPU64
+                        " offset %d length %d\n",
+                        libcfs_id2str(src),
+                        hdr->msg.get.ptl_index,
+                        hdr->msg.get.match_bits,
+                        hdr->msg.get.src_offset,
+                        hdr->msg.get.sink_length);
                 LNET_UNLOCK();
                 return ENOENT;                  /* +ve: OK but no match */
         }
@@ -1969,12 +1981,12 @@ lnet_parse_reply(lnet_ni_t *ni, lnet_msg_t *msg)
         /* NB handles only looked up by creator (no flips) */
         md = lnet_wire_handle2md(&hdr->msg.reply.dst_wmd);
         if (md == NULL || md->md_threshold == 0 || md->md_me != NULL) {
-                CDEBUG(D_NETERROR, "%s: Dropping REPLY from %s for %s "
-                       "MD "LPX64"."LPX64"\n", 
-                       libcfs_nid2str(ni->ni_nid), libcfs_id2str(src),
-                       (md == NULL) ? "invalid" : "inactive",
-                       hdr->msg.reply.dst_wmd.wh_interface_cookie,
-                       hdr->msg.reply.dst_wmd.wh_object_cookie);
+                CNETERR("%s: Dropping REPLY from %s for %s "
+                        "MD "LPX64"."LPX64"\n",
+                        libcfs_nid2str(ni->ni_nid), libcfs_id2str(src),
+                        (md == NULL) ? "invalid" : "inactive",
+                        hdr->msg.reply.dst_wmd.wh_interface_cookie,
+                        hdr->msg.reply.dst_wmd.wh_object_cookie);
                 if (md != NULL && md->md_me != NULL)
                         CERROR("REPLY MD also attached to portal %d\n",
                                md->md_me->me_portal);
@@ -1990,9 +2002,9 @@ lnet_parse_reply(lnet_ni_t *ni, lnet_msg_t *msg)
 
         if (mlength < rlength &&
             (md->md_options & LNET_MD_TRUNCATE) == 0) {
-                CDEBUG(D_NETERROR, "%s: Dropping REPLY from %s length %d "
-                       "for MD "LPX64" would overflow (%d)\n",
-                       libcfs_nid2str(ni->ni_nid), libcfs_id2str(src),
+                CNETERR("%s: Dropping REPLY from %s length %d "
+                        "for MD "LPX64" would overflow (%d)\n",
+                        libcfs_nid2str(ni->ni_nid), libcfs_id2str(src),
                         rlength, hdr->msg.reply.dst_wmd.wh_object_cookie,
                         mlength);
                 LNET_UNLOCK();