X-Git-Url: https://git.whamcloud.com/?a=blobdiff_plain;f=lnet%2Fulnds%2Fptllnd%2Fptllnd.c;h=a3d06f20812ad68162868851233ae6a07be46b1d;hb=4e78f6f53a87801f55a7158455b38d177abc5aba;hp=e36301b3b79e60ecd2d847313bdd0045bcd4f112;hpb=5d88d521ad1abc2d94ac6a9c6a9b2e023335b757;p=fs%2Flustre-release.git diff --git a/lnet/ulnds/ptllnd/ptllnd.c b/lnet/ulnds/ptllnd/ptllnd.c index e36301b..a3d06f2 100644 --- a/lnet/ulnds/ptllnd/ptllnd.c +++ b/lnet/ulnds/ptllnd/ptllnd.c @@ -29,6 +29,7 @@ lnd_t the_ptllnd = { .lnd_eager_recv = ptllnd_eager_recv, .lnd_notify = ptllnd_notify, .lnd_wait = ptllnd_wait, + .lnd_setasync = ptllnd_setasync, }; static int ptllnd_ni_count = 0; @@ -83,6 +84,8 @@ ptllnd_history_init(void) list_add(&he->he_list, &ptllnd_idle_history); } + PTLLND_HISTORY("Init"); + return 0; } @@ -123,6 +126,8 @@ void ptllnd_dump_history(void) { ptllnd_he_t *he; + + PTLLND_HISTORY("dumping..."); while (!list_empty(&ptllnd_history_list)) { he = list_entry(ptllnd_history_list.next, @@ -136,6 +141,8 @@ ptllnd_dump_history(void) list_add_tail(&he->he_list, &ptllnd_idle_history); } + + PTLLND_HISTORY("complete"); } void @@ -244,6 +251,11 @@ ptllnd_get_tunables(lnet_ni_t *ni) int rc; int temp; + /* Other tunable defaults depend on this */ + rc = ptllnd_parse_int_tunable(&plni->plni_debug, "PTLLND_DEBUG", 0); + if (rc != 0) + return rc; + rc = ptllnd_parse_int_tunable(&plni->plni_portal, "PTLLND_PORTAL", PTLLND_PORTAL); if (rc != 0) @@ -262,31 +274,28 @@ ptllnd_get_tunables(lnet_ni_t *ni) rc = ptllnd_parse_int_tunable(&max_msg_size, "PTLLND_MAX_MSG_SIZE", - PTLLND_MAX_MSG_SIZE); + PTLLND_MAX_ULND_MSG_SIZE); if (rc != 0) return rc; rc = ptllnd_parse_int_tunable(&msgs_per_buffer, - "PTLLND_MSGS_PER_BUFFER", - PTLLND_MSGS_PER_BUFFER); + "PTLLND_MSGS_PER_BUFFER", 64); if (rc != 0) return rc; rc = ptllnd_parse_int_tunable(&plni->plni_msgs_spare, - "PTLLND_MSGS_SPARE", - PTLLND_MSGS_SPARE); + "PTLLND_MSGS_SPARE", 256); if (rc != 0) return rc; rc = ptllnd_parse_int_tunable(&plni->plni_peer_hash_size, - "PTLLND_PEER_HASH_SIZE", - PTLLND_PEER_HASH_SIZE); + "PTLLND_PEER_HASH_SIZE", 101); if (rc != 0) return rc; rc = ptllnd_parse_int_tunable(&plni->plni_eq_size, - "PTLLND_EQ_SIZE", PTLLND_EQ_SIZE); + "PTLLND_EQ_SIZE", 1024); if (rc != 0) return rc; @@ -296,19 +305,50 @@ ptllnd_get_tunables(lnet_ni_t *ni) return rc; rc = ptllnd_parse_int_tunable(&plni->plni_max_tx_history, - "PTLLND_TX_HISTORY", PTLLND_TX_HISTORY); + "PTLLND_TX_HISTORY", + plni->plni_debug ? 1024 : 0); + if (rc != 0) + return rc; + + rc = ptllnd_parse_int_tunable(&plni->plni_abort_on_protocol_mismatch, + "PTLLND_ABORT_ON_PROTOCOL_MISMATCH", 1); if (rc != 0) return rc; rc = ptllnd_parse_int_tunable(&plni->plni_abort_on_nak, - "PTLLND_ABORT_ON_NAK", - PTLLND_ABORT_ON_NAK); + "PTLLND_ABORT_ON_NAK", 0); + if (rc != 0) + return rc; + + rc = ptllnd_parse_int_tunable(&plni->plni_dump_on_nak, + "PTLLND_DUMP_ON_NAK", plni->plni_debug); + if (rc != 0) + return rc; + + rc = ptllnd_parse_int_tunable(&plni->plni_watchdog_interval, + "PTLLND_WATCHDOG_INTERVAL", 1); + if (rc != 0) + return rc; + if (plni->plni_watchdog_interval <= 0) + plni->plni_watchdog_interval = 1; + + rc = ptllnd_parse_int_tunable(&plni->plni_timeout, + "PTLLND_TIMEOUT", 50); if (rc != 0) return rc; + rc = ptllnd_parse_int_tunable(&plni->plni_long_wait, + "PTLLND_LONG_WAIT", + plni->plni_debug ? 5 : plni->plni_timeout); + if (rc != 0) + return rc; + plni->plni_long_wait *= 1000; /* convert to mS */ + plni->plni_max_msg_size = max_msg_size & ~7; - if (plni->plni_max_msg_size < sizeof(kptl_msg_t)) - plni->plni_max_msg_size = (sizeof(kptl_msg_t) + 7) & ~7; + if (plni->plni_max_msg_size < PTLLND_MIN_BUFFER_SIZE) + plni->plni_max_msg_size = PTLLND_MIN_BUFFER_SIZE; + CLASSERT ((PTLLND_MIN_BUFFER_SIZE & 7) == 0); + CLASSERT (sizeof(kptl_msg_t) <= PTLLND_MIN_BUFFER_SIZE); plni->plni_buffer_size = plni->plni_max_msg_size * msgs_per_buffer; @@ -369,7 +409,7 @@ ptllnd_destroy_buffer (ptllnd_buffer_t *buf) } int -ptllnd_grow_buffers (lnet_ni_t *ni) +ptllnd_size_buffers (lnet_ni_t *ni, int delta) { ptllnd_ni_t *plni = ni->ni_data; ptllnd_buffer_t *buf; @@ -380,8 +420,10 @@ ptllnd_grow_buffers (lnet_ni_t *ni) CDEBUG(D_NET, "nposted_buffers = %d (before)\n",plni->plni_nposted_buffers); CDEBUG(D_NET, "nbuffers = %d (before)\n",plni->plni_nbuffers); - nmsgs = plni->plni_npeers * plni->plni_peer_credits + - plni->plni_msgs_spare; + plni->plni_nmsgs += delta; + LASSERT(plni->plni_nmsgs >= 0); + + nmsgs = plni->plni_nmsgs + plni->plni_msgs_spare; nbufs = (nmsgs * plni->plni_max_msg_size + plni->plni_buffer_size - 1) / plni->plni_buffer_size; @@ -393,7 +435,7 @@ ptllnd_grow_buffers (lnet_ni_t *ni) return -ENOMEM; rc = ptllnd_post_buffer(buf); - if (rc != 0){ + if (rc != 0) { /* TODO - this path seems to orpahn the buffer * in a state where its not posted and will never be * However it does not leak the buffer as it's @@ -428,19 +470,20 @@ ptllnd_destroy_buffers (lnet_ni_t *ni) LASSERT (plni->plni_nbuffers > 0); if (buf->plb_posted) { time_t start = cfs_time_current_sec(); - int w = PTLLND_WARN_LONG_WAIT; - + int w = plni->plni_long_wait; + LASSERT (plni->plni_nposted_buffers > 0); #ifdef LUSTRE_PORTALS_UNLINK_SEMANTICS (void) PtlMDUnlink(buf->plb_md); while (buf->plb_posted) { - if (cfs_time_current_sec() > start + w) { - CWARN("Waited %ds to unlink buffer\n", w); + if (w > 0 && cfs_time_current_sec() > start + w/1000) { + CWARN("Waited %ds to unlink buffer\n", + (int)(cfs_time_current_sec() - start)); w *= 2; } - ptllnd_wait(ni, w*1000); + ptllnd_wait(ni, w); } #else while (buf->plb_posted) { @@ -451,11 +494,12 @@ ptllnd_destroy_buffers (lnet_ni_t *ni) break; } LASSERT (rc == PTL_MD_IN_USE); - if (cfs_time_current_sec() > start + w) { - CWARN("Waited %ds to unlink buffer\n", w); + if (w > 0 && cfs_time_current_sec() > start + w/1000) { + CWARN("Waited %ds to unlink buffer\n", + cfs_time_current_sec() - start); w *= 2; } - ptllnd_wait(ni, w*1000); + ptllnd_wait(ni, w); } #endif } @@ -527,7 +571,7 @@ ptllnd_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg) { switch (cmd) { case IOC_LIBCFS_DEBUG_PEER: - ptllnd_debug_peer(ni, *((lnet_process_id_t *)arg)); + ptllnd_dump_debug(ni, *((lnet_process_id_t *)arg)); return 0; default: @@ -551,22 +595,23 @@ ptllnd_shutdown (lnet_ni_t *ni) ptllnd_ni_t *plni = ni->ni_data; int rc; time_t start = cfs_time_current_sec(); - int w = PTLLND_WARN_LONG_WAIT; + int w = plni->plni_long_wait; LASSERT (ptllnd_ni_count == 1); plni->plni_max_tx_history = 0; ptllnd_cull_tx_history(plni); - ptllnd_destroy_buffers(ni); ptllnd_close_peers(ni); + ptllnd_destroy_buffers(ni); while (plni->plni_npeers > 0) { - if (cfs_time_current_sec() > start + w) { - CWARN("Waited %ds for peers to shutdown\n", w); + if (w > 0 && cfs_time_current_sec() > start + w/1000) { + CWARN("Waited %ds for peers to shutdown\n", + (int)(cfs_time_current_sec() - start)); w *= 2; } - ptllnd_wait(ni, w*1000); + ptllnd_wait(ni, w); } LASSERT (plni->plni_ntxs == 0); @@ -619,6 +664,8 @@ ptllnd_startup (lnet_ni_t *ni) plni->plni_nrxs = 0; plni->plni_ntxs = 0; plni->plni_ntx_history = 0; + plni->plni_watchdog_peeridx = 0; + plni->plni_watchdog_nextt = cfs_time_current_sec(); CFS_INIT_LIST_HEAD(&plni->plni_zombie_txs); CFS_INIT_LIST_HEAD(&plni->plni_tx_history); @@ -666,8 +713,6 @@ ptllnd_startup (lnet_ni_t *ni) goto failed4; } - CDEBUG(D_NET, "lnet nid=" LPX64 " (passed in)\n",ni->ni_nid); - /* * Create the new NID. Based on the LND network type * and the lower ni's address data. @@ -679,7 +724,7 @@ ptllnd_startup (lnet_ni_t *ni) libcfs_id2str((lnet_process_id_t) { .nid = ni->ni_nid, .pid = the_lnet.ln_pid})); - rc = ptllnd_grow_buffers(ni); + rc = ptllnd_size_buffers(ni, 0); if (rc != 0) goto failed4; @@ -717,7 +762,7 @@ const char *ptllnd_evtype2str(int type) DO_TYPE(PTL_EVENT_SEND_END); DO_TYPE(PTL_EVENT_UNLINK); default: - return ""; + return ""; } #undef DO_TYPE } @@ -735,7 +780,51 @@ const char *ptllnd_msgtype2str(int type) DO_TYPE(PTLLND_MSG_TYPE_NOOP); DO_TYPE(PTLLND_MSG_TYPE_NAK); default: - return ""; + return ""; + } +#undef DO_TYPE +} + +const char *ptllnd_errtype2str(int type) +{ +#define DO_TYPE(x) case x: return #x; + switch(type) + { + DO_TYPE(PTL_OK); + DO_TYPE(PTL_SEGV); + DO_TYPE(PTL_NO_SPACE); + DO_TYPE(PTL_ME_IN_USE); + DO_TYPE(PTL_NAL_FAILED); + DO_TYPE(PTL_NO_INIT); + DO_TYPE(PTL_IFACE_DUP); + DO_TYPE(PTL_IFACE_INVALID); + DO_TYPE(PTL_HANDLE_INVALID); + DO_TYPE(PTL_MD_INVALID); + DO_TYPE(PTL_ME_INVALID); + DO_TYPE(PTL_PROCESS_INVALID); + DO_TYPE(PTL_PT_INDEX_INVALID); + DO_TYPE(PTL_SR_INDEX_INVALID); + DO_TYPE(PTL_EQ_INVALID); + DO_TYPE(PTL_EQ_DROPPED); + DO_TYPE(PTL_EQ_EMPTY); + DO_TYPE(PTL_MD_NO_UPDATE); + DO_TYPE(PTL_FAIL); + DO_TYPE(PTL_AC_INDEX_INVALID); + DO_TYPE(PTL_MD_ILLEGAL); + DO_TYPE(PTL_ME_LIST_TOO_LONG); + DO_TYPE(PTL_MD_IN_USE); + DO_TYPE(PTL_NI_INVALID); + DO_TYPE(PTL_PID_INVALID); + DO_TYPE(PTL_PT_FULL); + DO_TYPE(PTL_VAL_FAILED); + DO_TYPE(PTL_NOT_IMPLEMENTED); + DO_TYPE(PTL_NO_ACK); + DO_TYPE(PTL_EQ_IN_USE); + DO_TYPE(PTL_PID_IN_USE); + DO_TYPE(PTL_INV_EQ_SIZE); + DO_TYPE(PTL_AGAIN); + default: + return ""; } #undef DO_TYPE }