From c4390bfd480bdd706e20c3522cf0faa049ae887e Mon Sep 17 00:00:00 2001 From: minhdiep Date: Mon, 22 Nov 2010 13:29:15 -0700 Subject: [PATCH] b=21525 debug enhancements + minor comment updates a=Issac i=Liang i=Maxim - when a message is dropped, the lnet counters should be increased; - /proc/sys/lnet/peers should contain some data for debugging, like the last_alive time stamp. --- lnet/lnet/api-ni.c | 4 ++-- lnet/lnet/lib-move.c | 2 ++ lnet/lnet/router_proc.c | 29 +++++++++++++++++++++++------ lustre/tests/sanity.sh | 8 ++++---- 4 files changed, 31 insertions(+), 12 deletions(-) diff --git a/lnet/lnet/api-ni.c b/lnet/lnet/api-ni.c index b8ba025..fc25673 100644 --- a/lnet/lnet/api-ni.c +++ b/lnet/lnet/api-ni.c @@ -925,7 +925,7 @@ lnet_shutdown_lndnis (void) cfs_list_del (&ni->ni_list); the_lnet.ln_nzombie_nis++; - lnet_ni_decref_locked(ni); /* drop apini's ref */ + lnet_ni_decref_locked(ni); /* drop ln_nis' ref */ } /* Drop the cached eqwait NI. */ @@ -952,7 +952,7 @@ lnet_shutdown_lndnis (void) lnet_clear_peer_table(); LNET_LOCK(); - /* Now wait for the NI's I just nuked to show up on apini_zombie_nis + /* Now wait for the NI's I just nuked to show up on ln_zombie_nis * and shut them down in guaranteed thread context */ i = 2; while (the_lnet.ln_nzombie_nis != 0) { diff --git a/lnet/lnet/lib-move.c b/lnet/lnet/lib-move.c index 95b4985..932591c 100644 --- a/lnet/lnet/lib-move.c +++ b/lnet/lnet/lib-move.c @@ -1028,6 +1028,8 @@ lnet_post_send_locked (lnet_msg_t *msg, int do_send) /* NB 'lp' is always the next hop */ if ((msg->msg_target.pid & LNET_PID_USERFLAG) == 0 && lnet_peer_alive_locked(lp) == 0) { + the_lnet.ln_counters.drop_count++; + the_lnet.ln_counters.drop_length += msg->msg_len; LNET_UNLOCK(); CNETERR("Dropping message for %s: peer not alive\n", diff --git a/lnet/lnet/router_proc.c b/lnet/lnet/router_proc.c index cd9bdca..fde0e0a 100644 --- a/lnet/lnet/router_proc.c +++ b/lnet/lnet/router_proc.c @@ -27,7 +27,8 @@ #if defined(__KERNEL__) && defined(LNET_ROUTER) -/* this is really lnet_proc.c */ +/* This is really lnet_proc.c. You might need to update sanity test 215 + * if any file format is changed. */ static cfs_sysctl_table_header_t *lnet_table_header = NULL; @@ -407,8 +408,8 @@ int LL_PROC_PROTO(proc_lnet_peers) if (*ppos == 0) { s += snprintf(s, tmpstr + tmpsiz - s, - "%-24s %4s %5s %5s %5s %5s %5s %5s %s\n", - "nid", "refs", "state", "max", + "%-24s %4s %5s %5s %5s %5s %5s %5s %5s %s\n", + "nid", "refs", "state", "last", "max", "rtr", "min", "tx", "min", "queue"); LASSERT (tmpstr + tmpsiz - s > 0); @@ -441,7 +442,7 @@ int LL_PROC_PROTO(proc_lnet_peers) if (skip == 0) { peer = lp; - /* minor optimiztion: start from idx+1 + /* minor optimization: start from idx+1 * on next iteration if we've just * drained lp_hashlist */ if (lp->lp_hashlist.next == @@ -470,6 +471,7 @@ int LL_PROC_PROTO(proc_lnet_peers) if (peer != NULL) { lnet_nid_t nid = peer->lp_nid; int nrefs = peer->lp_refcount; + int lastalive = -1; char *aliveness = "NA"; int maxcr = peer->lp_ni->ni_peertxcredits; int txcr = peer->lp_txcredits; @@ -482,10 +484,25 @@ int LL_PROC_PROTO(proc_lnet_peers) lnet_peer_aliveness_enabled(peer)) aliveness = peer->lp_alive ? "up" : "down"; + if (lnet_peer_aliveness_enabled(peer)) { + cfs_time_t now = cfs_time_current(); + cfs_duration_t delta; + + delta = cfs_time_sub(now, peer->lp_last_alive); + lastalive = cfs_duration_sec(delta); + + /* No need to mess up peers contents with + * arbitrarily long integers - it suffices to + * know that lastalive is more than 10000s old + */ + if (lastalive >= 10000) + lastalive = 9999; + } + s += snprintf(s, tmpstr + tmpsiz - s, - "%-24s %4d %5s %5d %5d %5d %5d %5d %d\n", + "%-24s %4d %5s %5d %5d %5d %5d %5d %5d %d\n", libcfs_nid2str(nid), nrefs, aliveness, - maxcr, rtrcr, minrtrcr, txcr, + lastalive, maxcr, rtrcr, minrtrcr, txcr, mintxcr, txqnob); LASSERT (tmpstr + tmpsiz - s > 0); } diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index a40f77d..4f15c5a 100755 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -7616,12 +7616,12 @@ test_215() { # for bugs 18102, 21079, 21517 remove_lnet_proc_files "routers" # /proc/sys/lnet/peers should look like this: - # nid refs state max rtr min tx min queue + # nid refs state last max rtr min tx min queue # where nid is a string like 192.168.1.1@tcp2, refs > 0, - # state is up/down/NA, max >= 0. rtr, min, tx, min are + # state is up/down/NA, max >= 0. last, rtr, min, tx, min are # numeric (0 or >0 or <0), queue >= 0. - L1="^nid +refs +state +max +rtr +min +tx +min +queue$" - BR="^$NID +$P +(up|down|NA) +$N +$I +$I +$I +$I +$N$" + L1="^nid +refs +state +last +max +rtr +min +tx +min +queue$" + BR="^$NID +$P +(up|down|NA) +$I +$N +$I +$I +$I +$I +$N$" create_lnet_proc_files "peers" check_lnet_proc_entry "peers.out" "/proc/sys/lnet/peers" "$BR" "$L1" check_lnet_proc_entry "peers.sys" "lnet.peers" "$BR" "$L1" -- 1.8.3.1