Whamcloud - gitweb
b=21678 Add more debug info to lnd_query code path
authorIsaac Huang <he.h.huang@oracle.com>
Fri, 23 Apr 2010 04:03:14 +0000 (00:03 -0400)
committerRobert Read <robert.read@oracle.com>
Sun, 25 Apr 2010 02:22:22 +0000 (19:22 -0700)
The peer health code lacked some important debugging info in lnd_query
code paths. This patch added necessary debug prints, not just for bug
21678, but also for future troubleshooting.

i=liang
i=maxim

lnet/klnds/o2iblnd/o2iblnd.c
lnet/klnds/socklnd/socklnd.c
lnet/klnds/socklnd/socklnd_cb.c

index 2251f23..55b5913 100644 (file)
@@ -1081,6 +1081,7 @@ void
 kiblnd_query (lnet_ni_t *ni, lnet_nid_t nid, cfs_time_t *when)
 {
         cfs_time_t         last_alive = 0;
 kiblnd_query (lnet_ni_t *ni, lnet_nid_t nid, cfs_time_t *when)
 {
         cfs_time_t         last_alive = 0;
+        cfs_time_t         now = cfs_time_current();
         cfs_rwlock_t      *glock = &kiblnd_data.kib_global_lock;
         kib_peer_t        *peer;
         unsigned long      flags;
         cfs_rwlock_t      *glock = &kiblnd_data.kib_global_lock;
         kib_peer_t        *peer;
         unsigned long      flags;
@@ -1104,6 +1105,10 @@ kiblnd_query (lnet_ni_t *ni, lnet_nid_t nid, cfs_time_t *when)
          * and connection establishment with a NULL tx */
         if (peer == NULL)
                 kiblnd_launch_tx(ni, NULL, nid);
          * and connection establishment with a NULL tx */
         if (peer == NULL)
                 kiblnd_launch_tx(ni, NULL, nid);
+
+        CDEBUG(D_NET, "Peer %s %p, alive %ld secs ago\n",
+               libcfs_nid2str(nid), peer,
+               last_alive ? cfs_duration_sec(now - last_alive) : -1);
         return;
 }
 
         return;
 }
 
index 45058c5..c960578 100644 (file)
@@ -1815,6 +1815,7 @@ ksocknal_query (lnet_ni_t *ni, lnet_nid_t nid, cfs_time_t *when)
 {
         int                connect = 1;
         cfs_time_t         last_alive = 0;
 {
         int                connect = 1;
         cfs_time_t         last_alive = 0;
+        cfs_time_t         now = cfs_time_current();
         ksock_peer_t      *peer = NULL;
         cfs_rwlock_t      *glock = &ksocknal_data.ksnd_global_lock;
         lnet_process_id_t  id = {.nid = nid, .pid = LUSTRE_SRV_LNET_PID};
         ksock_peer_t      *peer = NULL;
         cfs_rwlock_t      *glock = &ksocknal_data.ksnd_global_lock;
         lnet_process_id_t  id = {.nid = nid, .pid = LUSTRE_SRV_LNET_PID};
@@ -1835,7 +1836,7 @@ ksocknal_query (lnet_ni_t *ni, lnet_nid_t nid, cfs_time_t *when)
                                 /* something got ACKed */
                                 conn->ksnc_tx_deadline =
                                         cfs_time_shift(*ksocknal_tunables.ksnd_timeout);
                                 /* something got ACKed */
                                 conn->ksnc_tx_deadline =
                                         cfs_time_shift(*ksocknal_tunables.ksnd_timeout);
-                                peer->ksnp_last_alive = cfs_time_current();
+                                peer->ksnp_last_alive = now;
                                 conn->ksnc_tx_bufnob = bufnob;
                         }
                 }
                                 conn->ksnc_tx_bufnob = bufnob;
                         }
                 }
@@ -1850,6 +1851,11 @@ ksocknal_query (lnet_ni_t *ni, lnet_nid_t nid, cfs_time_t *when)
         if (last_alive != 0)
                 *when = last_alive;
 
         if (last_alive != 0)
                 *when = last_alive;
 
+        CDEBUG(D_NET, "Peer %s %p, alive %ld secs ago, connect %d\n",
+               libcfs_nid2str(nid), peer,
+               last_alive ? cfs_duration_sec(now - last_alive) : -1,
+               connect);
+
         if (!connect)
                 return;
 
         if (!connect)
                 return;
 
index ac5088d..fa343f7 100644 (file)
@@ -781,8 +781,9 @@ ksocknal_queue_tx_locked (ksock_tx_t *tx, ksock_conn_t *conn)
 ksock_route_t *
 ksocknal_find_connectable_route_locked (ksock_peer_t *peer)
 {
 ksock_route_t *
 ksocknal_find_connectable_route_locked (ksock_peer_t *peer)
 {
-        cfs_list_t        *tmp;
-        ksock_route_t     *route;
+        cfs_time_t     now = cfs_time_current();
+        cfs_list_t    *tmp;
+        ksock_route_t *route;
 
         cfs_list_for_each (tmp, &peer->ksnp_routes) {
                 route = cfs_list_entry (tmp, ksock_route_t, ksnr_list);
 
         cfs_list_for_each (tmp, &peer->ksnp_routes) {
                 route = cfs_list_entry (tmp, ksock_route_t, ksnr_list);
@@ -796,11 +797,17 @@ ksocknal_find_connectable_route_locked (ksock_peer_t *peer)
                 if ((ksocknal_route_mask() & ~route->ksnr_connected) == 0)
                         continue;
 
                 if ((ksocknal_route_mask() & ~route->ksnr_connected) == 0)
                         continue;
 
-                /* too soon to retry this guy? */
                 if (!(route->ksnr_retry_interval == 0 || /* first attempt */
                 if (!(route->ksnr_retry_interval == 0 || /* first attempt */
-                      cfs_time_aftereq (cfs_time_current(),
-                                        route->ksnr_timeout)))
+                      cfs_time_aftereq(now, route->ksnr_timeout))) {
+                        CDEBUG(D_NET,
+                               "Too soon to retry route %u.%u.%u.%u "
+                               "(cnted %d, interval %ld, %ld secs later)\n",
+                               HIPQUAD(route->ksnr_ipaddr),
+                               route->ksnr_connected,
+                               route->ksnr_retry_interval,
+                               cfs_duration_sec(route->ksnr_timeout - now));
                         continue;
                         continue;
+                }
 
                 return (route);
         }
 
                 return (route);
         }