From 5d88d521ad1abc2d94ac6a9c6a9b2e023335b757 Mon Sep 17 00:00:00 2001 From: eeb Date: Fri, 16 Feb 2007 15:44:57 +0000 Subject: [PATCH] Severity : enhancement Bugzilla : 11684 Description: Added D_NETTRACE for recording network packet history (initially only for ptllnd). Also a separate userspace ptllnd facility to gather history which should really be covered by D_NETTRACE too, if only CDEBUG recorded history in userspace. --- lnet/ChangeLog | 8 +++ lnet/include/libcfs/libcfs.h | 7 +-- lnet/klnds/iiblnd/iiblnd.h | 5 +- lnet/klnds/o2iblnd/o2iblnd.h | 2 +- lnet/klnds/openiblnd/openiblnd.h | 2 +- lnet/klnds/ptllnd/ptllnd.h | 2 +- lnet/klnds/ptllnd/ptllnd_cb.c | 20 +++++++ lnet/klnds/ptllnd/ptllnd_peer.c | 31 +++++++---- lnet/klnds/ptllnd/ptllnd_rx_buf.c | 20 +++---- lnet/klnds/ptllnd/ptllnd_tx.c | 7 ++- lnet/klnds/viblnd/viblnd.h | 6 +- lnet/libcfs/debug.c | 2 + lnet/ulnds/ptllnd/ptllnd.c | 112 ++++++++++++++++++++++++++++++++++++++ lnet/ulnds/ptllnd/ptllnd.h | 23 ++++++++ lnet/ulnds/ptllnd/ptllnd_cb.c | 90 ++++++++++++++++++++++++++---- lnet/utils/debug.c | 10 ++-- 16 files changed, 296 insertions(+), 51 deletions(-) diff --git a/lnet/ChangeLog b/lnet/ChangeLog index 4b98fe5..b0a0f04 100644 --- a/lnet/ChangeLog +++ b/lnet/ChangeLog @@ -13,6 +13,14 @@ TBD Cluster File Systems, Inc. ptllnd - Portals 3.3 / UNICOS/lc 1.5.x, 2.0.x * bug fixes +Severity : enhancement +Bugzilla : 11684 +Description: Added D_NETTRACE for recording network packet history + (initially only for ptllnd). Also a separate userspace + ptllnd facility to gather history which should really be + covered by D_NETTRACE too, if only CDEBUG recorded history in + userspace. + Severity : major Frequency : rare Bugzilla : 11616 diff --git a/lnet/include/libcfs/libcfs.h b/lnet/include/libcfs/libcfs.h index 2e11e7c..8ac781b 100644 --- a/lnet/include/libcfs/libcfs.h +++ b/lnet/include/libcfs/libcfs.h @@ -94,7 +94,6 @@ extern unsigned int libcfs_catastrophe; #define S_LLITE 0x00000080 #define S_RPC 0x00000100 #define S_MGMT 0x00000200 -/* unused */ #define S_LNET 0x00000400 #define S_LND 0x00000800 /* ALL LNDs */ #define S_PINGER 0x00001000 @@ -120,7 +119,7 @@ extern unsigned int libcfs_catastrophe; /* keep these in sync with lnet/utils/debug.c and lnet/libcfs/debug.c */ /* Debugging masks (32 bits, non-overlapping) */ -/* keep these in sync with lnet/utils/debug.c */ +/* keep these in sync with lnet/utils/debug.c and lnet/libcfs/debug.c */ #define D_TRACE 0x00000001 /* ENTRY/EXIT markers */ #define D_INODE 0x00000002 #define D_SUPER 0x00000004 @@ -135,7 +134,7 @@ extern unsigned int libcfs_catastrophe; #define D_BUFFS 0x00000800 #define D_OTHER 0x00001000 #define D_DENTRY 0x00002000 -/* unused: keep in sync with lnet/utils/debug.c */ +#define D_NETTRACE 0x00004000 #define D_PAGE 0x00008000 /* bulk page handling */ #define D_DLMTRACE 0x00010000 #define D_ERROR 0x00020000 /* CERROR(...) == CDEBUG (D_ERROR, ...) */ @@ -149,7 +148,7 @@ extern unsigned int libcfs_catastrophe; #define D_CONSOLE 0x02000000 #define D_QUOTA 0x04000000 #define D_SEC 0x08000000 -/* keep these in sync with lnet/utils/debug.c */ +/* keep these in sync with lnet/{utils,libcfs}/debug.c */ #define D_CANTMASK (D_ERROR | D_EMERG | D_WARNING | D_CONSOLE) diff --git a/lnet/klnds/iiblnd/iiblnd.h b/lnet/klnds/iiblnd/iiblnd.h index 0a2fa94..8b72f24 100644 --- a/lnet/klnds/iiblnd/iiblnd.h +++ b/lnet/klnds/iiblnd/iiblnd.h @@ -68,13 +68,14 @@ #error Invalid GCC version. Must use GCC >= 3.2.3 #endif -#if CONFIG_SMP +#ifdef CONFIG_SMP # define IBNAL_N_SCHED num_online_cpus() /* # schedulers */ #else # define IBNAL_N_SCHED 1 /* # schedulers */ #endif -#define IBNAL_FMR 0 /* map on demand v. use whole mem mapping */ +#define IBNAL_USE_FMR 0 /* map on demand v. use whole mem mapping */ +#define KIBLND_DETAILED_DEBUG 0 /* tunables fixed at compile time */ #define IBNAL_PEER_HASH_SIZE 101 /* # peer lists */ diff --git a/lnet/klnds/o2iblnd/o2iblnd.h b/lnet/klnds/o2iblnd/o2iblnd.h index 24e4be2..8afba88 100644 --- a/lnet/klnds/o2iblnd/o2iblnd.h +++ b/lnet/klnds/o2iblnd/o2iblnd.h @@ -68,7 +68,7 @@ typedef int gfp_t; #include /* tunables fixed at compile time */ -#if CONFIG_SMP +#ifdef CONFIG_SMP # define IBLND_N_SCHED num_online_cpus() /* # schedulers */ #else # define IBLND_N_SCHED 1 /* # schedulers */ diff --git a/lnet/klnds/openiblnd/openiblnd.h b/lnet/klnds/openiblnd/openiblnd.h index 6ed306c..ad97c9d 100644 --- a/lnet/klnds/openiblnd/openiblnd.h +++ b/lnet/klnds/openiblnd/openiblnd.h @@ -102,7 +102,7 @@ typedef void ib_fmr_pool_t; #endif -#if CONFIG_SMP +#ifdef CONFIG_SMP # define IBNAL_N_SCHED num_online_cpus() /* # schedulers */ #else # define IBNAL_N_SCHED 1 /* # schedulers */ diff --git a/lnet/klnds/ptllnd/ptllnd.h b/lnet/klnds/ptllnd/ptllnd.h index 7243a6b..4072a41 100755 --- a/lnet/klnds/ptllnd/ptllnd.h +++ b/lnet/klnds/ptllnd/ptllnd.h @@ -65,7 +65,7 @@ */ //#define PJK_DEBUGGING -#if CONFIG_SMP +#ifdef CONFIG_SMP # define PTLLND_N_SCHED num_online_cpus() /* # schedulers */ #else # define PTLLND_N_SCHED 1 /* # schedulers */ diff --git a/lnet/klnds/ptllnd/ptllnd_cb.c b/lnet/klnds/ptllnd/ptllnd_cb.c index 89456ac..22adc85 100644 --- a/lnet/klnds/ptllnd/ptllnd_cb.c +++ b/lnet/klnds/ptllnd/ptllnd_cb.c @@ -330,6 +330,11 @@ kptllnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg) tx->tx_msg->ptlm_u.rdma.kptlrm_hdr = *hdr; kptllnd_init_msg (tx->tx_msg, PTLLND_MSG_TYPE_PUT, sizeof(kptl_rdma_msg_t)); + + CDEBUG(D_NETTRACE, "%s: passive PUT p %d %p\n", + libcfs_id2str(target), + le32_to_cpu(lntmsg->msg_hdr.msg.put.ptl_index), tx); + kptllnd_tx_launch(tx, target); return 0; @@ -374,6 +379,11 @@ kptllnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg) tx->tx_msg->ptlm_u.rdma.kptlrm_hdr = *hdr; kptllnd_init_msg (tx->tx_msg, PTLLND_MSG_TYPE_GET, sizeof(kptl_rdma_msg_t)); + + CDEBUG(D_NETTRACE, "%s: passive GET p %d %p\n", + libcfs_id2str(target), + le32_to_cpu(lntmsg->msg_hdr.msg.put.ptl_index), tx); + kptllnd_tx_launch(tx, target); return 0; @@ -408,6 +418,16 @@ kptllnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg) nob = offsetof(kptl_immediate_msg_t, kptlim_payload[payload_nob]); kptllnd_init_msg(tx->tx_msg, PTLLND_MSG_TYPE_IMMEDIATE, nob); + + CDEBUG(D_NETTRACE, "%s: immediate %s p %d %p\n", + libcfs_id2str(target), + lnet_msgtyp2str(lntmsg->msg_type), + (le32_to_cpu(lntmsg->msg_type) == LNET_MSG_PUT) ? + le32_to_cpu(lntmsg->msg_hdr.msg.put.ptl_index) : + (le32_to_cpu(lntmsg->msg_type) == LNET_MSG_GET) ? + le32_to_cpu(lntmsg->msg_hdr.msg.get.ptl_index) : -1, + tx); + kptllnd_tx_launch(tx, target); return 0; } diff --git a/lnet/klnds/ptllnd/ptllnd_peer.c b/lnet/klnds/ptllnd/ptllnd_peer.c index cefcb7d..f8999e0 100644 --- a/lnet/klnds/ptllnd/ptllnd_peer.c +++ b/lnet/klnds/ptllnd/ptllnd_peer.c @@ -572,7 +572,8 @@ kptllnd_peer_check_sends (kptl_peer_t *peer) spin_lock_irqsave(&peer->peer_lock, flags); if (list_empty(&peer->peer_sendq) && - peer->peer_outstanding_credits >= PTLLND_CREDIT_HIGHWATER) { + peer->peer_outstanding_credits >= PTLLND_CREDIT_HIGHWATER && + peer->peer_credits != 0) { /* post a NOOP to return credits */ spin_unlock_irqrestore(&peer->peer_lock, flags); @@ -612,8 +613,9 @@ kptllnd_peer_check_sends (kptl_peer_t *peer) } if (peer->peer_credits == 0) { - CDEBUG(D_NET, "%s: no credits\n", - libcfs_id2str(peer->peer_id)); + CDEBUG(D_NETTRACE, "%s[%d/%d]: no credits for %p\n", + libcfs_id2str(peer->peer_id), + peer->peer_credits, peer->peer_outstanding_credits, tx); break; } @@ -621,8 +623,9 @@ kptllnd_peer_check_sends (kptl_peer_t *peer) * return */ if (peer->peer_credits == 1 && peer->peer_outstanding_credits == 0) { - CDEBUG(D_NET, "%s: not using last credit\n", - libcfs_id2str(peer->peer_id)); + CDEBUG(D_NETTRACE, "%s[%d/%d]: not using last credit for %p\n", + libcfs_id2str(peer->peer_id), + peer->peer_credits, peer->peer_outstanding_credits, tx); break; } @@ -646,17 +649,19 @@ kptllnd_peer_check_sends (kptl_peer_t *peer) continue; } - CDEBUG(D_NET, "tx=%p nob=%d to %s(%s)\n", - tx, tx->tx_msg->ptlm_nob, - libcfs_id2str(peer->peer_id), - kptllnd_ptlid2str(peer->peer_ptlid)); - /* fill last-minute msg header fields */ kptllnd_msg_pack(tx->tx_msg, peer); peer->peer_outstanding_credits = 0; peer->peer_credits--; + CDEBUG(D_NETTRACE, "%s[%d/%d]: %s tx=%p nob=%d cred=%d\n", + libcfs_id2str(peer->peer_id), + peer->peer_credits, peer->peer_outstanding_credits, + kptllnd_msgtype2str(tx->tx_msg->ptlm_type), + tx, tx->tx_msg->ptlm_nob, + tx->tx_msg->ptlm_credits); + list_add_tail(&tx->tx_list, &peer->peer_activeq); kptllnd_tx_addref(tx); /* 1 ref for me... */ @@ -1063,6 +1068,9 @@ kptllnd_peer_handle_hello (ptl_process_id_t initiator, /* NB someone else could get in now and post a message before I post * the HELLO, but post_tx/check_sends take care of that! */ + CDEBUG(D_NETTRACE, "%s: post response hello %p\n", + libcfs_id2str(new_peer->peer_id), hello_tx); + kptllnd_post_tx(new_peer, hello_tx); kptllnd_peer_check_sends(new_peer); @@ -1187,6 +1195,9 @@ kptllnd_tx_launch(kptl_tx_t *tx, lnet_process_id_t target) /* NB someone else could get in now and post a message before I post * the HELLO, but post_tx/check_sends take care of that! */ + CDEBUG(D_NETTRACE, "%s: post initial hello %p\n", + libcfs_id2str(new_peer->peer_id), hello_tx); + peer = new_peer; kptllnd_post_tx(peer, hello_tx); diff --git a/lnet/klnds/ptllnd/ptllnd_rx_buf.c b/lnet/klnds/ptllnd/ptllnd_rx_buf.c index 74019ef..b83ab51 100644 --- a/lnet/klnds/ptllnd/ptllnd_rx_buf.c +++ b/lnet/klnds/ptllnd/ptllnd_rx_buf.c @@ -348,11 +348,11 @@ kptllnd_rx_done(kptl_rx_t *rx) LASSERT (peer->peer_outstanding_credits <= *kptllnd_tunables.kptl_peercredits); - spin_unlock_irqrestore(&peer->peer_lock, flags); + CDEBUG(D_NETTRACE, "%s[%d/%d]: rx %p done\n", + libcfs_id2str(peer->peer_id), + peer->peer_credits, peer->peer_outstanding_credits, rx); - CDEBUG(D_NET, "Peer=%s Credits=%d Outstanding=%d\n", - libcfs_id2str(peer->peer_id), - peer->peer_credits, peer->peer_outstanding_credits); + spin_unlock_irqrestore(&peer->peer_lock, flags); /* I might have to send back credits */ kptllnd_peer_check_sends(peer); @@ -514,11 +514,6 @@ kptllnd_rx_parse(kptl_rx_t *rx) LASSERT (rx->rx_peer == NULL); - CDEBUG (D_NET, "%s: nob=%d %08x %04x %02x %d %d\n", - kptllnd_ptlid2str(rx->rx_initiator), - rx->rx_nob, msg->ptlm_magic, msg->ptlm_version, - msg->ptlm_type, msg->ptlm_credits, msg->ptlm_nob); - if ((rx->rx_nob >= 4 && (msg->ptlm_magic == LNET_PROTO_MAGIC || msg->ptlm_magic == __swab32(LNET_PROTO_MAGIC))) || @@ -545,13 +540,12 @@ kptllnd_rx_parse(kptl_rx_t *rx) goto rx_done; } - CDEBUG(D_NET, "rx=%p type=%s(%d) nob %d cred %d\n", - rx, kptllnd_msgtype2str(msg->ptlm_type), msg->ptlm_type, - msg->ptlm_nob, msg->ptlm_credits); - srcid.nid = msg->ptlm_srcnid; srcid.pid = msg->ptlm_srcpid; + CDEBUG(D_NETTRACE, "%s: RX %s c %d %p\n", libcfs_id2str(srcid), + kptllnd_msgtype2str(msg->ptlm_type), msg->ptlm_credits, rx); + if (srcid.nid != kptllnd_ptl2lnetnid(rx->rx_initiator.nid)) { CERROR("Bad source id %s from %s\n", libcfs_id2str(srcid), diff --git a/lnet/klnds/ptllnd/ptllnd_tx.c b/lnet/klnds/ptllnd/ptllnd_tx.c index 1c086d8..a3eac04 100644 --- a/lnet/klnds/ptllnd/ptllnd_tx.c +++ b/lnet/klnds/ptllnd/ptllnd_tx.c @@ -401,10 +401,11 @@ kptllnd_tx_callback(ptl_event_t *ev) #else unlinked = (ev->type == PTL_EVENT_UNLINK); #endif - CDEBUG(D_NET, "%s(%d) tx=%p(%s) fail=%d unlinked=%d\n", + CDEBUG(D_NETTRACE, "%s[%d/%d]: %s(%d) tx=%p fail=%d unlinked=%d\n", + libcfs_id2str(peer->peer_id), + peer->peer_credits, peer->peer_outstanding_credits, kptllnd_evtype2str(ev->type), ev->type, - tx, libcfs_id2str(peer->peer_id), - ev->ni_fail_type, unlinked); + tx, ev->ni_fail_type, unlinked); switch (tx->tx_type) { default: diff --git a/lnet/klnds/viblnd/viblnd.h b/lnet/klnds/viblnd/viblnd.h index 12c8df4..aae8d1e 100644 --- a/lnet/klnds/viblnd/viblnd.h +++ b/lnet/klnds/viblnd/viblnd.h @@ -81,7 +81,7 @@ # error Invalid GCC version. Must use GCC < 3.0.0 || GCC >= 3.2.3 #endif -#if CONFIG_SMP +#ifdef CONFIG_SMP # define IBNAL_N_SCHED num_online_cpus() /* # schedulers */ #else # define IBNAL_N_SCHED 1 /* # schedulers */ @@ -559,6 +559,10 @@ kibnal_send_keepalive(kib_conn_t *conn) *kibnal_tunables.kib_keepalive*HZ); } +#ifndef IBNAL_VOIDSTAR_SGADDR +# define IBNAL_VOIDSTAR_SGADDR 0 +#endif + #if IBNAL_VOIDSTAR_SGADDR # if CONFIG_HIGHMEM # if CONFIG_X86 && CONFIG_HIGHMEM4G diff --git a/lnet/libcfs/debug.c b/lnet/libcfs/debug.c index 18bc5d5..5de53f4 100644 --- a/lnet/libcfs/debug.c +++ b/lnet/libcfs/debug.c @@ -161,6 +161,8 @@ libcfs_debug_dbg2str(int debug) return "other"; case D_DENTRY: return "dentry"; + case D_NETTRACE: + return "nettrace"; case D_PAGE: return "page"; case D_DLMTRACE: diff --git a/lnet/ulnds/ptllnd/ptllnd.c b/lnet/ulnds/ptllnd/ptllnd.c index b13f520..e36301b 100644 --- a/lnet/ulnds/ptllnd/ptllnd.c +++ b/lnet/ulnds/ptllnd/ptllnd.c @@ -33,6 +33,111 @@ lnd_t the_ptllnd = { static int ptllnd_ni_count = 0; +static struct list_head ptllnd_idle_history; +static struct list_head ptllnd_history_list; + +void +ptllnd_history_fini(void) +{ + ptllnd_he_t *he; + + while (!list_empty(&ptllnd_idle_history)) { + he = list_entry(ptllnd_idle_history.next, + ptllnd_he_t, he_list); + + list_del(&he->he_list); + LIBCFS_FREE(he, sizeof(*he)); + } + + while (!list_empty(&ptllnd_history_list)) { + he = list_entry(ptllnd_history_list.next, + ptllnd_he_t, he_list); + + list_del(&he->he_list); + LIBCFS_FREE(he, sizeof(*he)); + } +} + +int +ptllnd_history_init(void) +{ + int i; + ptllnd_he_t *he; + int n; + int rc; + + CFS_INIT_LIST_HEAD(&ptllnd_idle_history); + CFS_INIT_LIST_HEAD(&ptllnd_history_list); + + rc = ptllnd_parse_int_tunable(&n, "PTLLND_HISTORY", 0); + if (rc != 0) + return rc; + + for (i = 0; i < n; i++) { + LIBCFS_ALLOC(he, sizeof(*he)); + if (he == NULL) { + ptllnd_history_fini(); + return -ENOMEM; + } + + list_add(&he->he_list, &ptllnd_idle_history); + } + + return 0; +} + +void +ptllnd_history(const char *fn, const char *file, const int line, + const char *fmt, ...) +{ + static int seq; + + va_list ap; + ptllnd_he_t *he; + + if (!list_empty(&ptllnd_idle_history)) { + he = list_entry(ptllnd_idle_history.next, + ptllnd_he_t, he_list); + } else if (!list_empty(&ptllnd_history_list)) { + he = list_entry(ptllnd_history_list.next, + ptllnd_he_t, he_list); + } else { + return; + } + + list_del(&he->he_list); + list_add_tail(&he->he_list, &ptllnd_history_list); + + he->he_seq = seq++; + he->he_fn = fn; + he->he_file = file; + he->he_line = line; + gettimeofday(&he->he_time, NULL); + + va_start(ap, fmt); + vsnprintf(he->he_msg, sizeof(he->he_msg), fmt, ap); + va_end(ap); +} + +void +ptllnd_dump_history(void) +{ + ptllnd_he_t *he; + + while (!list_empty(&ptllnd_history_list)) { + he = list_entry(ptllnd_history_list.next, + ptllnd_he_t, he_list); + + list_del(&he->he_list); + + CDEBUG(D_WARNING, "%d %d.%06d (%s:%d:%s()) %s\n", he->he_seq, + (int)he->he_time.tv_sec, (int)he->he_time.tv_usec, + he->he_file, he->he_line, he->he_fn, he->he_msg); + + list_add_tail(&he->he_list, &ptllnd_idle_history); + } +} + void ptllnd_assert_wire_constants (void) { @@ -495,6 +600,12 @@ ptllnd_startup (lnet_ni_t *ni) ptllnd_ni_count++; + rc = ptllnd_history_init(); + if (rc != 0) { + CERROR("Can't init history\n"); + goto failed0; + } + LIBCFS_ALLOC(plni, sizeof(*plni)); if (plni == NULL) { CERROR("Can't allocate ptllnd state\n"); @@ -584,6 +695,7 @@ ptllnd_startup (lnet_ni_t *ni) failed1: LIBCFS_FREE(plni, sizeof(*plni)); failed0: + ptllnd_history_fini(); ptllnd_ni_count--; CDEBUG(D_NET, "<<< rc=%d\n",rc); return rc; diff --git a/lnet/ulnds/ptllnd/ptllnd.h b/lnet/ulnds/ptllnd/ptllnd.h index f637c7d..09c0c89 100644 --- a/lnet/ulnds/ptllnd/ptllnd.h +++ b/lnet/ulnds/ptllnd/ptllnd.h @@ -24,6 +24,7 @@ #include #include /* Depends on portals/p30.h */ +#include #define PTLLND_DEBUG_TIMING 0 @@ -39,6 +40,27 @@ #define PTLLND_WARN_LONG_WAIT 5 /* seconds */ #define PTLLND_ABORT_ON_NAK 1 /* abort app on protocol version mismatch */ + +/* Hack to record history + * This should really be done by CDEBUG(D_NETTRACE... */ + +typedef struct { + struct list_head he_list; + struct timeval he_time; + const char *he_fn; + const char *he_file; + int he_seq; + int he_line; + char he_msg[80]; +} ptllnd_he_t; + +void ptllnd_dump_history(); +void ptllnd_history(const char *fn, const char *file, const int line, + const char *fmt, ...); +#define PTLLND_HISTORY(fmt, a...) \ + ptllnd_history(__FUNCTION__, __FILE__, __LINE__, fmt, ## a) + + #define PTLLND_MD_OPTIONS (PTL_MD_LUSTRE_COMPLETION_SEMANTICS |\ PTL_MD_EVENT_START_DISABLE) typedef struct @@ -184,6 +206,7 @@ ptllnd_eventarg2obj (void *arg) # define DBGT_ARGS(tv) #endif +int ptllnd_parse_int_tunable(int *value, char *name, int dflt); void ptllnd_cull_tx_history(ptllnd_ni_t *plni); int ptllnd_startup(lnet_ni_t *ni); void ptllnd_shutdown(lnet_ni_t *ni); diff --git a/lnet/ulnds/ptllnd/ptllnd_cb.c b/lnet/ulnds/ptllnd/ptllnd_cb.c index 0114c42..d5df024 100644 --- a/lnet/ulnds/ptllnd/ptllnd_cb.c +++ b/lnet/ulnds/ptllnd/ptllnd_cb.c @@ -157,6 +157,9 @@ ptllnd_find_peer(lnet_ni_t *ni, lnet_process_id_t id, int create) tx->tx_msg.ptlm_u.hello.kptlhm_matchbits = PTL_RESERVED_MATCHBITS; tx->tx_msg.ptlm_u.hello.kptlhm_max_msg_size = plni->plni_max_msg_size; + PTLLND_HISTORY("%s[%d/%d]: post hello %p", libcfs_id2str(id), + tx->tx_peer->plp_credits, + tx->tx_peer->plp_outstanding_credits, tx); ptllnd_post_tx(tx); return plp; @@ -274,6 +277,7 @@ ptllnd_debug_peer(lnet_ni_t *ni, lnet_process_id_t id) } ptllnd_peer_decref(plp); + ptllnd_dump_history(); } void @@ -680,8 +684,8 @@ ptllnd_check_sends(ptllnd_peer_t *peer) CDEBUG(D_NET, "plp_outstanding_credits=%d\n",peer->plp_outstanding_credits); if (list_empty(&peer->plp_txq) && - peer->plp_outstanding_credits >= - PTLLND_CREDIT_HIGHWATER(plni)) { + peer->plp_outstanding_credits >= PTLLND_CREDIT_HIGHWATER(plni) && + peer->plp_credits != 0) { tx = ptllnd_new_tx(peer, PTLLND_MSG_TYPE_NOOP, 0); CDEBUG(D_NET, "NOOP tx=%p\n",tx); @@ -708,13 +712,23 @@ ptllnd_check_sends(ptllnd_peer_t *peer) LASSERT (peer->plp_credits >= 0); LASSERT (peer->plp_credits <= peer->plp_max_credits); - if (peer->plp_credits == 0) /* no credits */ + if (peer->plp_credits == 0) { /* no credits */ + PTLLND_HISTORY("%s[%d/%d]: no creds for %p", + libcfs_id2str(peer->plp_id), + peer->plp_credits, + peer->plp_outstanding_credits, tx); break; - + } + if (peer->plp_credits == 1 && /* last credit reserved for */ - peer->plp_outstanding_credits == 0) /* returning credits */ + peer->plp_outstanding_credits == 0) { /* returning credits */ + PTLLND_HISTORY("%s[%d/%d]: too few creds for %p", + libcfs_id2str(peer->plp_id), + peer->plp_credits, + peer->plp_outstanding_credits, tx); break; - + } + list_del(&tx->tx_list); list_add_tail(&tx->tx_list, &peer->plp_activeq); @@ -771,6 +785,11 @@ ptllnd_check_sends(ptllnd_peer_t *peer) tx->tx_reqmdh = mdh; PTLLND_DBGT_STAMP(tx->tx_req_posted); + PTLLND_HISTORY("%s[%d/%d]: %s %p c %d", libcfs_id2str(peer->plp_id), + peer->plp_credits, peer->plp_outstanding_credits, + ptllnd_msgtype2str(tx->tx_type), tx, + tx->tx_msg.ptlm_credits); + rc = PtlPut(mdh, PTL_NOACK_REQ, peer->plp_ptlid, plni->plni_portal, 0, LNET_MSG_MATCHBITS, 0, 0); if (rc != PTL_OK) { @@ -903,6 +922,15 @@ ptllnd_passive_rdma(ptllnd_peer_t *peer, int type, lnet_msg_t *msg, } tx->tx_lnetmsg = msg; + PTLLND_HISTORY("%s[%d/%d]: post passive %s p %d %p", + libcfs_id2str(msg->msg_target), + peer->plp_credits, peer->plp_outstanding_credits, + lnet_msgtyp2str(msg->msg_type), + (le32_to_cpu(msg->msg_type) == LNET_MSG_PUT) ? + le32_to_cpu(msg->msg_hdr.msg.put.ptl_index) : + (le32_to_cpu(msg->msg_type) == LNET_MSG_GET) ? + le32_to_cpu(msg->msg_hdr.msg.get.ptl_index) : -1, + tx); ptllnd_post_tx(tx); return 0; @@ -1080,6 +1108,15 @@ ptllnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *msg) tx->tx_msg.ptlm_u.immediate.kptlim_hdr = msg->msg_hdr; tx->tx_lnetmsg = msg; + PTLLND_HISTORY("%s[%d/%d]: post immediate %s p %d %p", + libcfs_id2str(msg->msg_target), + plp->plp_credits, plp->plp_outstanding_credits, + lnet_msgtyp2str(msg->msg_type), + (le32_to_cpu(msg->msg_type) == LNET_MSG_PUT) ? + le32_to_cpu(msg->msg_hdr.msg.put.ptl_index) : + (le32_to_cpu(msg->msg_type) == LNET_MSG_GET) ? + le32_to_cpu(msg->msg_hdr.msg.get.ptl_index) : -1, + tx); ptllnd_post_tx(tx); ptllnd_peer_decref(plp); return 0; @@ -1092,9 +1129,11 @@ ptllnd_rx_done(ptllnd_rx_t *rx) lnet_ni_t *ni = plp->plp_ni; ptllnd_ni_t *plni = ni->ni_data; - CDEBUG(D_NET, "rx=%p\n", rx); - plp->plp_outstanding_credits++; + + PTLLND_HISTORY("%s[%d/%d]: rx=%p done\n", libcfs_id2str(plp->plp_id), + plp->plp_credits, plp->plp_outstanding_credits, rx); + ptllnd_check_sends(rx->rx_peer); LASSERT (plni->plni_nrxs > 0); @@ -1283,7 +1322,10 @@ ptllnd_parse_request(lnet_ni_t *ni, ptl_process_id_t initiator, libcfs_id2str(srcid)); return; } - + + PTLLND_HISTORY("RX %s: %s %d %p", libcfs_id2str(srcid), + ptllnd_msgtype2str(msg->ptlm_type), msg->ptlm_credits, &rx); + switch (msg->ptlm_type) { case PTLLND_MSG_TYPE_PUT: case PTLLND_MSG_TYPE_GET: @@ -1543,6 +1585,13 @@ ptllnd_tx_event (lnet_ni_t *ni, ptl_event_t *event) LASSERT (!isreq != !isbulk); /* always one and only 1 match */ + PTLLND_HISTORY("%s[%d/%d]: TX done %p %s%s", + libcfs_id2str(tx->tx_peer->plp_id), + tx->tx_peer->plp_credits, + tx->tx_peer->plp_outstanding_credits, + tx, isreq ? "REQ" : "BULK", unlinked ? "(unlinked)" : ""); + + LASSERT (!isreq != !isbulk); /* always one and only 1 match */ switch (tx->tx_type) { default: LBUG(); @@ -1608,6 +1657,13 @@ ptllnd_tx_event (lnet_ni_t *ni, ptl_event_t *event) void ptllnd_wait (lnet_ni_t *ni, int milliseconds) { + static struct timeval prevt; + static int prevt_count; + static int call_count; + + struct timeval t1; + struct timeval t2; + ptllnd_ni_t *plni = ni->ni_data; ptllnd_tx_t *tx; ptl_event_t event; @@ -1621,6 +1677,9 @@ ptllnd_wait (lnet_ni_t *ni, int milliseconds) * Otherwise block for the timeout and handle all events queued * then. */ + gettimeofday(&t1, NULL); + call_count++; + for (;;) { time_t then = cfs_time_current_sec(); @@ -1647,7 +1706,8 @@ ptllnd_wait (lnet_ni_t *ni, int milliseconds) break; blocked = 1; - timeout = milliseconds; + timeout = (milliseconds < 0) ? + PTL_TIME_FOREVER : milliseconds; continue; } @@ -1681,4 +1741,14 @@ ptllnd_wait (lnet_ni_t *ni, int milliseconds) CDEBUG(D_NET, "Process ZOMBIE tx=%p\n",tx); ptllnd_tx_done(tx); } + + gettimeofday(&t2, NULL); + + if (prevt.tv_sec == 0 || + prevt.tv_sec != t2.tv_sec) { + PTLLND_HISTORY("%d wait entered at %d.%06d - prev %d %d.%06d", + call_count, (int)t1.tv_sec, (int)t1.tv_usec, + prevt_count, (int)prevt.tv_sec, (int)prevt.tv_usec); + prevt = t2; + } } diff --git a/lnet/utils/debug.c b/lnet/utils/debug.c index 6dec5b8..e4dc15e 100644 --- a/lnet/utils/debug.c +++ b/lnet/utils/debug.c @@ -69,17 +69,17 @@ static int debug_mask = ~0; static const char *libcfs_debug_subsystems[] = {"undefined", "mdc", "mds", "osc", "ost", "class", "log", "llite", - "rpc", "", "lnet", "lnd", + "rpc", "mgmt", "lnet", "lnd", "pinger", "filter", "", "echo", "ldlm", "lov", "", "", "", "", "", "lmv", - "", "sec", "gss", "", "mgc", "mgs", - "fid", "fld", NULL}; + "", "sec", "gss", "", + "mgc", "mgs", "fid", "fld", NULL}; static const char *libcfs_debug_masks[] = {"trace", "inode", "super", "ext2", "malloc", "cache", "info", "ioctl", - "blocks", "net", "warning", "buffs", - "other", "dentry", "lnet", "page", + "neterror", "net", "warning", "buffs", + "other", "dentry", "nettrace", "page", "dlmtrace", "error", "emerg", "ha", "rpctrace", "vfstrace", "reada", "mmap", "config", "console", "quota", "sec", NULL}; -- 1.8.3.1