summary |
shortlog |
log |
commit | commitdiff |
tree
raw |
patch |
inline | side by side (from parent 1:
4c01e64)
The peer health code lacked some important debugging info in lnd_query
code paths. This patch added necessary debug prints, not just for bug
21678, but also for future troubleshooting.
i=liang
i=maxim
kiblnd_query (lnet_ni_t *ni, lnet_nid_t nid, cfs_time_t *when)
{
cfs_time_t last_alive = 0;
kiblnd_query (lnet_ni_t *ni, lnet_nid_t nid, cfs_time_t *when)
{
cfs_time_t last_alive = 0;
+ cfs_time_t now = cfs_time_current();
cfs_rwlock_t *glock = &kiblnd_data.kib_global_lock;
kib_peer_t *peer;
unsigned long flags;
cfs_rwlock_t *glock = &kiblnd_data.kib_global_lock;
kib_peer_t *peer;
unsigned long flags;
* and connection establishment with a NULL tx */
if (peer == NULL)
kiblnd_launch_tx(ni, NULL, nid);
* and connection establishment with a NULL tx */
if (peer == NULL)
kiblnd_launch_tx(ni, NULL, nid);
+
+ CDEBUG(D_NET, "Peer %s %p, alive %ld secs ago\n",
+ libcfs_nid2str(nid), peer,
+ last_alive ? cfs_duration_sec(now - last_alive) : -1);
{
int connect = 1;
cfs_time_t last_alive = 0;
{
int connect = 1;
cfs_time_t last_alive = 0;
+ cfs_time_t now = cfs_time_current();
ksock_peer_t *peer = NULL;
cfs_rwlock_t *glock = &ksocknal_data.ksnd_global_lock;
lnet_process_id_t id = {.nid = nid, .pid = LUSTRE_SRV_LNET_PID};
ksock_peer_t *peer = NULL;
cfs_rwlock_t *glock = &ksocknal_data.ksnd_global_lock;
lnet_process_id_t id = {.nid = nid, .pid = LUSTRE_SRV_LNET_PID};
/* something got ACKed */
conn->ksnc_tx_deadline =
cfs_time_shift(*ksocknal_tunables.ksnd_timeout);
/* something got ACKed */
conn->ksnc_tx_deadline =
cfs_time_shift(*ksocknal_tunables.ksnd_timeout);
- peer->ksnp_last_alive = cfs_time_current();
+ peer->ksnp_last_alive = now;
conn->ksnc_tx_bufnob = bufnob;
}
}
conn->ksnc_tx_bufnob = bufnob;
}
}
if (last_alive != 0)
*when = last_alive;
if (last_alive != 0)
*when = last_alive;
+ CDEBUG(D_NET, "Peer %s %p, alive %ld secs ago, connect %d\n",
+ libcfs_nid2str(nid), peer,
+ last_alive ? cfs_duration_sec(now - last_alive) : -1,
+ connect);
+
ksock_route_t *
ksocknal_find_connectable_route_locked (ksock_peer_t *peer)
{
ksock_route_t *
ksocknal_find_connectable_route_locked (ksock_peer_t *peer)
{
- cfs_list_t *tmp;
- ksock_route_t *route;
+ cfs_time_t now = cfs_time_current();
+ cfs_list_t *tmp;
+ ksock_route_t *route;
cfs_list_for_each (tmp, &peer->ksnp_routes) {
route = cfs_list_entry (tmp, ksock_route_t, ksnr_list);
cfs_list_for_each (tmp, &peer->ksnp_routes) {
route = cfs_list_entry (tmp, ksock_route_t, ksnr_list);
if ((ksocknal_route_mask() & ~route->ksnr_connected) == 0)
continue;
if ((ksocknal_route_mask() & ~route->ksnr_connected) == 0)
continue;
- /* too soon to retry this guy? */
if (!(route->ksnr_retry_interval == 0 || /* first attempt */
if (!(route->ksnr_retry_interval == 0 || /* first attempt */
- cfs_time_aftereq (cfs_time_current(),
- route->ksnr_timeout)))
+ cfs_time_aftereq(now, route->ksnr_timeout))) {
+ CDEBUG(D_NET,
+ "Too soon to retry route %u.%u.%u.%u "
+ "(cnted %d, interval %ld, %ld secs later)\n",
+ HIPQUAD(route->ksnr_ipaddr),
+ route->ksnr_connected,
+ route->ksnr_retry_interval,
+ cfs_duration_sec(route->ksnr_timeout - now));