Whamcloud - gitweb
b=21678 Add more debug info to lnd_query code path
authorIsaac Huang <he.h.huang@oracle.com>
Wed, 19 May 2010 20:58:12 +0000 (14:58 -0600)
committerJohann Lombardi <johann@sun.com>
Wed, 19 May 2010 21:05:05 +0000 (23:05 +0200)
The peer health code lacked some important debugging info in lnd_query
code paths. This patch added necessary debug prints, not just for bug
21678, but also for future troubleshooting.

i=liang
i=maxim

lnet/klnds/o2iblnd/o2iblnd.c
lnet/klnds/socklnd/socklnd.c
lnet/klnds/socklnd/socklnd_cb.c

index 0cc6faa..c7370de 100644 (file)
@@ -1078,6 +1078,7 @@ void
 kiblnd_query (lnet_ni_t *ni, lnet_nid_t nid, cfs_time_t *when)
 {
         cfs_time_t     last_alive = 0;
+        cfs_time_t     now = cfs_time_current();
         rwlock_t      *glock = &kiblnd_data.kib_global_lock;
         kib_peer_t    *peer;
         unsigned long  flags;
@@ -1101,6 +1102,10 @@ kiblnd_query (lnet_ni_t *ni, lnet_nid_t nid, cfs_time_t *when)
          * and connection establishment with a NULL tx */
         if (peer == NULL)
                 kiblnd_launch_tx(ni, NULL, nid);
+
+        CDEBUG(D_NET, "Peer %s %p, alive %ld secs ago\n",
+               libcfs_nid2str(nid), peer,
+               last_alive ? cfs_duration_sec(now - last_alive) : -1);
         return;
 }
 
index 84d7ddc..c373f60 100644 (file)
@@ -1810,6 +1810,7 @@ ksocknal_query (lnet_ni_t *ni, lnet_nid_t nid, cfs_time_t *when)
 {
         int                connect = 1;
         cfs_time_t         last_alive = 0;
+        cfs_time_t         now = cfs_time_current();
         ksock_peer_t      *peer = NULL;
         rwlock_t          *glock = &ksocknal_data.ksnd_global_lock;
         lnet_process_id_t  id = {.nid = nid, .pid = LUSTRE_SRV_LNET_PID};
@@ -1830,7 +1831,7 @@ ksocknal_query (lnet_ni_t *ni, lnet_nid_t nid, cfs_time_t *when)
                                 /* something got ACKed */
                                 conn->ksnc_tx_deadline =
                                         cfs_time_shift(*ksocknal_tunables.ksnd_timeout);
-                                peer->ksnp_last_alive = cfs_time_current();
+                                peer->ksnp_last_alive = now;
                                 conn->ksnc_tx_bufnob = bufnob;
                         }
                 }
@@ -1845,6 +1846,11 @@ ksocknal_query (lnet_ni_t *ni, lnet_nid_t nid, cfs_time_t *when)
         if (last_alive != 0)
                 *when = last_alive;
 
+        CDEBUG(D_NET, "Peer %s %p, alive %ld secs ago, connect %d\n",
+               libcfs_nid2str(nid), peer,
+               last_alive ? cfs_duration_sec(now - last_alive) : -1,
+               connect);
+
         if (!connect)
                 return;
 
index f4b0174..724f9b1 100644 (file)
@@ -780,6 +780,7 @@ ksocknal_queue_tx_locked (ksock_tx_t *tx, ksock_conn_t *conn)
 ksock_route_t *
 ksocknal_find_connectable_route_locked (ksock_peer_t *peer)
 {
+        cfs_time_t         now = cfs_time_current();
         struct list_head  *tmp;
         ksock_route_t     *route;
 
@@ -795,11 +796,17 @@ ksocknal_find_connectable_route_locked (ksock_peer_t *peer)
                 if ((ksocknal_route_mask() & ~route->ksnr_connected) == 0)
                         continue;
 
-                /* too soon to retry this guy? */
                 if (!(route->ksnr_retry_interval == 0 || /* first attempt */
-                      cfs_time_aftereq (cfs_time_current(),
-                                        route->ksnr_timeout)))
+                      cfs_time_aftereq(now, route->ksnr_timeout))) {
+                        CDEBUG(D_NET,
+                               "Too soon to retry route %u.%u.%u.%u "
+                               "(cnted %d, interval %ld, %ld secs later)\n",
+                               HIPQUAD(route->ksnr_ipaddr),
+                               route->ksnr_connected,
+                               route->ksnr_retry_interval,
+                               cfs_duration_sec(route->ksnr_timeout - now));
                         continue;
+                }
 
                 return (route);
         }