From: Isaac Huang Date: Wed, 19 May 2010 20:58:12 +0000 (-0600) Subject: b=21678 Add more debug info to lnd_query code path X-Git-Tag: v1_8_3_51~12 X-Git-Url: https://git.whamcloud.com/?a=commitdiff_plain;h=0ff4d6245f325bc30f24e9566a39dd60e3e05dde;p=fs%2Flustre-release.git b=21678 Add more debug info to lnd_query code path The peer health code lacked some important debugging info in lnd_query code paths. This patch added necessary debug prints, not just for bug 21678, but also for future troubleshooting. i=liang i=maxim --- diff --git a/lnet/klnds/o2iblnd/o2iblnd.c b/lnet/klnds/o2iblnd/o2iblnd.c index 0cc6faa..c7370de 100644 --- a/lnet/klnds/o2iblnd/o2iblnd.c +++ b/lnet/klnds/o2iblnd/o2iblnd.c @@ -1078,6 +1078,7 @@ void kiblnd_query (lnet_ni_t *ni, lnet_nid_t nid, cfs_time_t *when) { cfs_time_t last_alive = 0; + cfs_time_t now = cfs_time_current(); rwlock_t *glock = &kiblnd_data.kib_global_lock; kib_peer_t *peer; unsigned long flags; @@ -1101,6 +1102,10 @@ kiblnd_query (lnet_ni_t *ni, lnet_nid_t nid, cfs_time_t *when) * and connection establishment with a NULL tx */ if (peer == NULL) kiblnd_launch_tx(ni, NULL, nid); + + CDEBUG(D_NET, "Peer %s %p, alive %ld secs ago\n", + libcfs_nid2str(nid), peer, + last_alive ? cfs_duration_sec(now - last_alive) : -1); return; } diff --git a/lnet/klnds/socklnd/socklnd.c b/lnet/klnds/socklnd/socklnd.c index 84d7ddc..c373f60 100644 --- a/lnet/klnds/socklnd/socklnd.c +++ b/lnet/klnds/socklnd/socklnd.c @@ -1810,6 +1810,7 @@ ksocknal_query (lnet_ni_t *ni, lnet_nid_t nid, cfs_time_t *when) { int connect = 1; cfs_time_t last_alive = 0; + cfs_time_t now = cfs_time_current(); ksock_peer_t *peer = NULL; rwlock_t *glock = &ksocknal_data.ksnd_global_lock; lnet_process_id_t id = {.nid = nid, .pid = LUSTRE_SRV_LNET_PID}; @@ -1830,7 +1831,7 @@ ksocknal_query (lnet_ni_t *ni, lnet_nid_t nid, cfs_time_t *when) /* something got ACKed */ conn->ksnc_tx_deadline = cfs_time_shift(*ksocknal_tunables.ksnd_timeout); - peer->ksnp_last_alive = cfs_time_current(); + peer->ksnp_last_alive = now; conn->ksnc_tx_bufnob = bufnob; } } @@ -1845,6 +1846,11 @@ ksocknal_query (lnet_ni_t *ni, lnet_nid_t nid, cfs_time_t *when) if (last_alive != 0) *when = last_alive; + CDEBUG(D_NET, "Peer %s %p, alive %ld secs ago, connect %d\n", + libcfs_nid2str(nid), peer, + last_alive ? cfs_duration_sec(now - last_alive) : -1, + connect); + if (!connect) return; diff --git a/lnet/klnds/socklnd/socklnd_cb.c b/lnet/klnds/socklnd/socklnd_cb.c index f4b0174..724f9b1 100644 --- a/lnet/klnds/socklnd/socklnd_cb.c +++ b/lnet/klnds/socklnd/socklnd_cb.c @@ -780,6 +780,7 @@ ksocknal_queue_tx_locked (ksock_tx_t *tx, ksock_conn_t *conn) ksock_route_t * ksocknal_find_connectable_route_locked (ksock_peer_t *peer) { + cfs_time_t now = cfs_time_current(); struct list_head *tmp; ksock_route_t *route; @@ -795,11 +796,17 @@ ksocknal_find_connectable_route_locked (ksock_peer_t *peer) if ((ksocknal_route_mask() & ~route->ksnr_connected) == 0) continue; - /* too soon to retry this guy? */ if (!(route->ksnr_retry_interval == 0 || /* first attempt */ - cfs_time_aftereq (cfs_time_current(), - route->ksnr_timeout))) + cfs_time_aftereq(now, route->ksnr_timeout))) { + CDEBUG(D_NET, + "Too soon to retry route %u.%u.%u.%u " + "(cnted %d, interval %ld, %ld secs later)\n", + HIPQUAD(route->ksnr_ipaddr), + route->ksnr_connected, + route->ksnr_retry_interval, + cfs_duration_sec(route->ksnr_timeout - now)); continue; + } return (route); }