From c5a960586832e621261e7a2dc3cab9d7af6c499d Mon Sep 17 00:00:00 2001 From: Isaac Huang Date: Fri, 23 Apr 2010 00:03:14 -0400 Subject: [PATCH 1/1] b=21678 Add more debug info to lnd_query code path The peer health code lacked some important debugging info in lnd_query code paths. This patch added necessary debug prints, not just for bug 21678, but also for future troubleshooting. i=liang i=maxim --- lnet/klnds/o2iblnd/o2iblnd.c | 5 +++++ lnet/klnds/socklnd/socklnd.c | 8 +++++++- lnet/klnds/socklnd/socklnd_cb.c | 17 ++++++++++++----- 3 files changed, 24 insertions(+), 6 deletions(-) diff --git a/lnet/klnds/o2iblnd/o2iblnd.c b/lnet/klnds/o2iblnd/o2iblnd.c index 2251f23..55b5913 100644 --- a/lnet/klnds/o2iblnd/o2iblnd.c +++ b/lnet/klnds/o2iblnd/o2iblnd.c @@ -1081,6 +1081,7 @@ void kiblnd_query (lnet_ni_t *ni, lnet_nid_t nid, cfs_time_t *when) { cfs_time_t last_alive = 0; + cfs_time_t now = cfs_time_current(); cfs_rwlock_t *glock = &kiblnd_data.kib_global_lock; kib_peer_t *peer; unsigned long flags; @@ -1104,6 +1105,10 @@ kiblnd_query (lnet_ni_t *ni, lnet_nid_t nid, cfs_time_t *when) * and connection establishment with a NULL tx */ if (peer == NULL) kiblnd_launch_tx(ni, NULL, nid); + + CDEBUG(D_NET, "Peer %s %p, alive %ld secs ago\n", + libcfs_nid2str(nid), peer, + last_alive ? cfs_duration_sec(now - last_alive) : -1); return; } diff --git a/lnet/klnds/socklnd/socklnd.c b/lnet/klnds/socklnd/socklnd.c index 45058c5..c960578 100644 --- a/lnet/klnds/socklnd/socklnd.c +++ b/lnet/klnds/socklnd/socklnd.c @@ -1815,6 +1815,7 @@ ksocknal_query (lnet_ni_t *ni, lnet_nid_t nid, cfs_time_t *when) { int connect = 1; cfs_time_t last_alive = 0; + cfs_time_t now = cfs_time_current(); ksock_peer_t *peer = NULL; cfs_rwlock_t *glock = &ksocknal_data.ksnd_global_lock; lnet_process_id_t id = {.nid = nid, .pid = LUSTRE_SRV_LNET_PID}; @@ -1835,7 +1836,7 @@ ksocknal_query (lnet_ni_t *ni, lnet_nid_t nid, cfs_time_t *when) /* something got ACKed */ conn->ksnc_tx_deadline = cfs_time_shift(*ksocknal_tunables.ksnd_timeout); - peer->ksnp_last_alive = cfs_time_current(); + peer->ksnp_last_alive = now; conn->ksnc_tx_bufnob = bufnob; } } @@ -1850,6 +1851,11 @@ ksocknal_query (lnet_ni_t *ni, lnet_nid_t nid, cfs_time_t *when) if (last_alive != 0) *when = last_alive; + CDEBUG(D_NET, "Peer %s %p, alive %ld secs ago, connect %d\n", + libcfs_nid2str(nid), peer, + last_alive ? cfs_duration_sec(now - last_alive) : -1, + connect); + if (!connect) return; diff --git a/lnet/klnds/socklnd/socklnd_cb.c b/lnet/klnds/socklnd/socklnd_cb.c index ac5088d..fa343f7 100644 --- a/lnet/klnds/socklnd/socklnd_cb.c +++ b/lnet/klnds/socklnd/socklnd_cb.c @@ -781,8 +781,9 @@ ksocknal_queue_tx_locked (ksock_tx_t *tx, ksock_conn_t *conn) ksock_route_t * ksocknal_find_connectable_route_locked (ksock_peer_t *peer) { - cfs_list_t *tmp; - ksock_route_t *route; + cfs_time_t now = cfs_time_current(); + cfs_list_t *tmp; + ksock_route_t *route; cfs_list_for_each (tmp, &peer->ksnp_routes) { route = cfs_list_entry (tmp, ksock_route_t, ksnr_list); @@ -796,11 +797,17 @@ ksocknal_find_connectable_route_locked (ksock_peer_t *peer) if ((ksocknal_route_mask() & ~route->ksnr_connected) == 0) continue; - /* too soon to retry this guy? */ if (!(route->ksnr_retry_interval == 0 || /* first attempt */ - cfs_time_aftereq (cfs_time_current(), - route->ksnr_timeout))) + cfs_time_aftereq(now, route->ksnr_timeout))) { + CDEBUG(D_NET, + "Too soon to retry route %u.%u.%u.%u " + "(cnted %d, interval %ld, %ld secs later)\n", + HIPQUAD(route->ksnr_ipaddr), + route->ksnr_connected, + route->ksnr_retry_interval, + cfs_duration_sec(route->ksnr_timeout - now)); continue; + } return (route); } -- 1.8.3.1