int rc;
int temp;
+ /* Other tunable defaults depend on this */
+ rc = ptllnd_parse_int_tunable(&plni->plni_debug, "PTLLND_DEBUG", 0);
+ if (rc != 0)
+ return rc;
+
rc = ptllnd_parse_int_tunable(&plni->plni_portal,
"PTLLND_PORTAL", PTLLND_PORTAL);
if (rc != 0)
return rc;
rc = ptllnd_parse_int_tunable(&msgs_per_buffer,
- "PTLLND_MSGS_PER_BUFFER",
- PTLLND_MSGS_PER_BUFFER);
+ "PTLLND_MSGS_PER_BUFFER", 64);
if (rc != 0)
return rc;
rc = ptllnd_parse_int_tunable(&plni->plni_msgs_spare,
- "PTLLND_MSGS_SPARE",
- PTLLND_MSGS_SPARE);
+ "PTLLND_MSGS_SPARE", 256);
if (rc != 0)
return rc;
rc = ptllnd_parse_int_tunable(&plni->plni_peer_hash_size,
- "PTLLND_PEER_HASH_SIZE",
- PTLLND_PEER_HASH_SIZE);
+ "PTLLND_PEER_HASH_SIZE", 101);
if (rc != 0)
return rc;
rc = ptllnd_parse_int_tunable(&plni->plni_eq_size,
- "PTLLND_EQ_SIZE", PTLLND_EQ_SIZE);
+ "PTLLND_EQ_SIZE", 1024);
if (rc != 0)
return rc;
return rc;
rc = ptllnd_parse_int_tunable(&plni->plni_max_tx_history,
- "PTLLND_TX_HISTORY", PTLLND_TX_HISTORY);
+ "PTLLND_TX_HISTORY",
+ plni->plni_debug ? 1024 : 0);
+ if (rc != 0)
+ return rc;
+
+ rc = ptllnd_parse_int_tunable(&plni->plni_abort_on_protocol_mismatch,
+ "PTLLND_ABORT_ON_PROTOCOL_MISMATCH", 1);
if (rc != 0)
return rc;
rc = ptllnd_parse_int_tunable(&plni->plni_abort_on_nak,
- "PTLLND_ABORT_ON_NAK",
- PTLLND_ABORT_ON_NAK);
+ "PTLLND_ABORT_ON_NAK", 0);
if (rc != 0)
return rc;
rc = ptllnd_parse_int_tunable(&plni->plni_dump_on_nak,
- "PTLLND_DUMP_ON_NAK",
- PTLLND_DUMP_ON_NAK);
+ "PTLLND_DUMP_ON_NAK", plni->plni_debug);
+ if (rc != 0)
+ return rc;
+
+ rc = ptllnd_parse_int_tunable(&plni->plni_watchdog_interval,
+ "PTLLND_WATCHDOG_INTERVAL", 1);
if (rc != 0)
return rc;
+ if (plni->plni_watchdog_interval <= 0)
+ plni->plni_watchdog_interval = 1;
+
+ rc = ptllnd_parse_int_tunable(&plni->plni_timeout,
+ "PTLLND_TIMEOUT", 50);
+ if (rc != 0)
+ return rc;
+
+ rc = ptllnd_parse_int_tunable(&plni->plni_long_wait,
+ "PTLLND_LONG_WAIT",
+ plni->plni_debug ? 5 : plni->plni_timeout);
+ if (rc != 0)
+ return rc;
+ plni->plni_long_wait *= 1000; /* convert to mS */
plni->plni_max_msg_size = max_msg_size & ~7;
if (plni->plni_max_msg_size < PTLLND_MIN_BUFFER_SIZE)
LASSERT (plni->plni_nbuffers > 0);
if (buf->plb_posted) {
time_t start = cfs_time_current_sec();
- int w = PTLLND_WARN_LONG_WAIT;
-
+ int w = plni->plni_long_wait;
+
LASSERT (plni->plni_nposted_buffers > 0);
#ifdef LUSTRE_PORTALS_UNLINK_SEMANTICS
(void) PtlMDUnlink(buf->plb_md);
while (buf->plb_posted) {
- if (cfs_time_current_sec() > start + w) {
- CWARN("Waited %ds to unlink buffer\n", w);
+ if (w > 0 && cfs_time_current_sec() > start + w/1000) {
+ CWARN("Waited %ds to unlink buffer\n",
+ (int)(cfs_time_current_sec() - start));
w *= 2;
}
- ptllnd_wait(ni, w*1000);
+ ptllnd_wait(ni, w);
}
#else
while (buf->plb_posted) {
break;
}
LASSERT (rc == PTL_MD_IN_USE);
- if (cfs_time_current_sec() > start + w) {
- CWARN("Waited %ds to unlink buffer\n", w);
+ if (w > 0 && cfs_time_current_sec() > start + w/1000) {
+ CWARN("Waited %ds to unlink buffer\n",
+ cfs_time_current_sec() - start);
w *= 2;
}
- ptllnd_wait(ni, w*1000);
+ ptllnd_wait(ni, w);
}
#endif
}
{
switch (cmd) {
case IOC_LIBCFS_DEBUG_PEER:
- ptllnd_debug_peer(ni, *((lnet_process_id_t *)arg));
+ ptllnd_dump_debug(ni, *((lnet_process_id_t *)arg));
return 0;
default:
ptllnd_ni_t *plni = ni->ni_data;
int rc;
time_t start = cfs_time_current_sec();
- int w = PTLLND_WARN_LONG_WAIT;
+ int w = plni->plni_long_wait;
LASSERT (ptllnd_ni_count == 1);
plni->plni_max_tx_history = 0;
ptllnd_destroy_buffers(ni);
while (plni->plni_npeers > 0) {
- if (cfs_time_current_sec() > start + w) {
- CWARN("Waited %ds for peers to shutdown\n", w);
+ if (w > 0 && cfs_time_current_sec() > start + w/1000) {
+ CWARN("Waited %ds for peers to shutdown\n",
+ (int)(cfs_time_current_sec() - start));
w *= 2;
}
- ptllnd_wait(ni, w*1000);
+ ptllnd_wait(ni, w);
}
LASSERT (plni->plni_ntxs == 0);
plni->plni_nrxs = 0;
plni->plni_ntxs = 0;
plni->plni_ntx_history = 0;
+ plni->plni_watchdog_peeridx = 0;
+ plni->plni_watchdog_nextt = cfs_time_current_sec();
CFS_INIT_LIST_HEAD(&plni->plni_zombie_txs);
CFS_INIT_LIST_HEAD(&plni->plni_tx_history);
goto failed4;
}
- CDEBUG(D_NET, "lnet nid=" LPX64 " (passed in)\n",ni->ni_nid);
-
/*
* Create the new NID. Based on the LND network type
* and the lower ni's address data.
#include "ptllnd.h"
+void
+ptllnd_set_tx_deadline(ptllnd_tx_t *tx)
+{
+ ptllnd_peer_t *peer = tx->tx_peer;
+ lnet_ni_t *ni = peer->plp_ni;
+ ptllnd_ni_t *plni = ni->ni_data;
+
+ tx->tx_deadline = cfs_time_current_sec() + plni->plni_timeout;
+}
+
+void
+ptllnd_post_tx(ptllnd_tx_t *tx)
+{
+ ptllnd_peer_t *peer = tx->tx_peer;
+
+ ptllnd_set_tx_deadline(tx);
+ list_add_tail(&tx->tx_list, &peer->plp_txq);
+ ptllnd_check_sends(peer);
+}
+
char *
ptllnd_ptlid2str(ptl_process_id_t id)
{
if (!list_empty(&peer->plp_txq) ||
!list_empty(&peer->plp_activeq) ||
error != 0) {
- CERROR("Closing %s\n", libcfs_id2str(peer->plp_id));
- ptllnd_debug_peer(ni, peer->plp_id);
+ CWARN("Closing %s\n", libcfs_id2str(peer->plp_id));
+ if (plni->plni_debug)
+ ptllnd_dump_debug(ni, peer->plp_id);
}
ptllnd_abort_txs(plni, &peer->plp_txq);
void
ptllnd_debug_tx(ptllnd_tx_t *tx)
{
- CDEBUG(D_WARNING, "%s %s b "DBGT_FMT"/"DBGT_FMT
- " r "DBGT_FMT"/"DBGT_FMT" status %d\n",
+ CDEBUG(D_WARNING, "%s %s b %ld.%06ld/%ld.%06ld"
+ " r %ld.%06ld/%ld.%06ld status %d\n",
ptllnd_tx_typestr(tx->tx_type),
- libcfs_id2str(tx->tx_peer->plp_id)
- DBGT_ARGS(tx->tx_bulk_posted) DBGT_ARGS(tx->tx_bulk_done)
- DBGT_ARGS(tx->tx_req_posted) DBGT_ARGS(tx->tx_req_done),
+ libcfs_id2str(tx->tx_peer->plp_id),
+ tx->tx_bulk_posted.tv_sec, tx->tx_bulk_posted.tv_usec,
+ tx->tx_bulk_done.tv_sec, tx->tx_bulk_done.tv_usec,
+ tx->tx_req_posted.tv_sec, tx->tx_req_posted.tv_usec,
+ tx->tx_req_done.tv_sec, tx->tx_req_done.tv_usec,
tx->tx_status);
}
return;
}
- CDEBUG(D_WARNING, "%s %s%s [%d] "LPD64".%06d m "LPD64" q %d/%d c %d/%d+%d(%d)\n",
+ CDEBUG(D_WARNING, "%s %s%s [%d] "LPU64".%06d m "LPU64" q %d/%d c %d/%d+%d(%d)\n",
libcfs_id2str(id),
plp->plp_recvd_hello ? "H" : "_",
plp->plp_closing ? "C" : "_",
}
ptllnd_peer_decref(plp);
+}
+
+void
+ptllnd_dump_debug(lnet_ni_t *ni, lnet_process_id_t id)
+{
+ ptllnd_debug_peer(ni, id);
ptllnd_dump_history();
}
lnet_process_id_t id;
ptllnd_peer_t *peer;
time_t start = cfs_time_current_sec();
- int w = PTLLND_WARN_LONG_WAIT;
+ ptllnd_ni_t *plni = ni->ni_data;
+ int w = plni->plni_long_wait;
/* This is only actually used to connect to routers at startup! */
- if (!alive) {
- LBUG();
- return;
- }
+ LASSERT(alive);
id.nid = nid;
id.pid = LUSTRE_SRV_LNET_PID;
/* wait for the peer to reply */
while (!peer->plp_recvd_hello) {
- if (cfs_time_current_sec() > start + w) {
+ if (w > 0 && cfs_time_current_sec() > start + w/1000) {
CWARN("Waited %ds to connect to %s\n",
- w, libcfs_id2str(id));
+ (int)(cfs_time_current_sec() - start),
+ libcfs_id2str(id));
w *= 2;
}
- ptllnd_wait(ni, w*1000);
+ ptllnd_wait(ni, w);
}
ptllnd_peer_decref(peer);
tx->tx_completing = 0;
tx->tx_status = 0;
- PTLLND_DBGT_INIT(tx->tx_bulk_posted);
- PTLLND_DBGT_INIT(tx->tx_bulk_done);
- PTLLND_DBGT_INIT(tx->tx_req_posted);
- PTLLND_DBGT_INIT(tx->tx_req_done);
+ memset(&tx->tx_bulk_posted, 0, sizeof(tx->tx_bulk_posted));
+ memset(&tx->tx_bulk_done, 0, sizeof(tx->tx_bulk_done));
+ memset(&tx->tx_req_posted, 0, sizeof(tx->tx_req_posted));
+ memset(&tx->tx_req_done, 0, sizeof(tx->tx_req_done));
if (msgsize != 0) {
tx->tx_msg.ptlm_magic = PTLLND_MSG_MAGIC;
lnet_ni_t *ni = peer->plp_ni;
int rc;
time_t start = cfs_time_current_sec();
- int w = PTLLND_WARN_LONG_WAIT;
+ ptllnd_ni_t *plni = ni->ni_data;
+ int w = plni->plni_long_wait;
while (!PtlHandleIsEqual(*mdh, PTL_INVALID_HANDLE)) {
rc = PtlMDUnlink(*mdh);
return;
LASSERT (rc == PTL_MD_IN_USE);
#endif
- if (cfs_time_current_sec() > start + w) {
+ if (w > 0 && cfs_time_current_sec() > start + w/1000) {
CWARN("Waited %ds to abort tx to %s\n",
- w, libcfs_id2str(peer->plp_id));
+ (int)(cfs_time_current_sec() - start),
+ libcfs_id2str(peer->plp_id));
w *= 2;
}
/* Wait for ptllnd_tx_event() to invalidate */
- ptllnd_wait(ni, w*1000);
+ ptllnd_wait(ni, w);
}
}
list_del_init(&tx->tx_list);
if (tx->tx_status != 0) {
- CERROR("Completing tx with error\n");
- ptllnd_debug_tx(tx);
+ if (plni->plni_debug) {
+ CERROR("Completing tx for %s with error %d\n",
+ libcfs_id2str(peer->plp_id), tx->tx_status);
+ ptllnd_debug_tx(tx);
+ }
ptllnd_close_peer(peer, tx->tx_status);
}
tx->tx_type != PTLLND_RDMA_READ);
tx->tx_reqmdh = mdh;
- PTLLND_DBGT_STAMP(tx->tx_req_posted);
+ gettimeofday(&tx->tx_req_posted, NULL);
PTLLND_HISTORY("%s[%d/%d+%d(%d)]: %s %p c %d",
libcfs_id2str(peer->plp_id),
ptllnd_set_md_buffer(&md, tx);
start = cfs_time_current_sec();
- w = PTLLND_WARN_LONG_WAIT;
+ w = plni->plni_long_wait;
while (!peer->plp_recvd_hello) { /* wait to validate plp_match */
if (peer->plp_closing) {
rc = -EIO;
goto failed;
}
- if (cfs_time_current_sec() > start + w) {
+ if (w > 0 && cfs_time_current_sec() > start + w/1000) {
CWARN("Waited %ds to connect to %s\n",
- w, libcfs_id2str(peer->plp_id));
+ (int)(cfs_time_current_sec() - start),
+ libcfs_id2str(peer->plp_id));
w *= 2;
}
- ptllnd_wait(ni, w*1000);
+ ptllnd_wait(ni, w);
}
if (peer->plp_match < PTL_RESERVED_MATCHBITS)
peer->plp_match = PTL_RESERVED_MATCHBITS;
matchbits = peer->plp_match++;
- CDEBUG(D_NET, "matchbits " LPX64 " %s\n", matchbits,
- ptllnd_ptlid2str(peer->plp_ptlid));
rc = PtlMEAttach(plni->plni_nih, plni->plni_portal, peer->plp_ptlid,
matchbits, 0, PTL_UNLINK, PTL_INS_BEFORE, &meh);
goto failed;
}
- PTLLND_DBGT_STAMP(tx->tx_bulk_posted);
+ gettimeofday(&tx->tx_bulk_posted, NULL);
rc = PtlMDAttach(meh, md, LNET_UNLINK, &mdh);
if (rc != PTL_OK) {
tx->tx_bulkmdh = mdh;
tx->tx_lnetmsg = msg;
+ ptllnd_set_tx_deadline(tx);
list_add_tail(&tx->tx_list, &peer->plp_activeq);
- PTLLND_DBGT_STAMP(tx->tx_bulk_posted);
+ gettimeofday(&tx->tx_bulk_posted, NULL);
if (type == PTLLND_RDMA_READ)
rc = PtlGet(mdh, peer->plp_ptlid,
}
void
-ptllnd_abort_on_nak(lnet_ni_t *ni)
-{
- ptllnd_ni_t *plni = ni->ni_data;
-
- if (plni->plni_dump_on_nak)
- ptllnd_dump_history();
-
- if (plni->plni_abort_on_nak)
- abort();
-}
-
-void
ptllnd_parse_request(lnet_ni_t *ni, ptl_process_id_t initiator,
kptl_msg_t *msg, unsigned int nob)
{
msg_version = flip ? __swab16(msg->ptlm_version) : msg->ptlm_version;
if (msg_version != PTLLND_MSG_VERSION) {
- CERROR("Bad protocol version %04x from %s\n",
- (__u32)msg_version, ptllnd_ptlid2str(initiator));
- ptllnd_abort_on_nak(ni);
+ CERROR("Bad protocol version %04x from %s: %04x expected\n",
+ (__u32)msg_version, ptllnd_ptlid2str(initiator), PTLLND_MSG_VERSION);
+
+ if (plni->plni_abort_on_protocol_mismatch)
+ abort();
+
return;
}
CERROR("NAK from %s (%s)\n",
libcfs_id2str(srcid),
ptllnd_ptlid2str(initiator));
- ptllnd_abort_on_nak(ni);
+
+ if (plni->plni_dump_on_nak)
+ ptllnd_dump_debug(ni, srcid);
+
+ if (plni->plni_abort_on_nak)
+ abort();
+
return;
}
LASSERT (event->md.start == (void *)&tx->tx_msg);
if (unlinked) {
tx->tx_reqmdh = PTL_INVALID_HANDLE;
- PTLLND_DBGT_STAMP(tx->tx_req_done);
+ gettimeofday(&tx->tx_req_done, NULL);
}
}
isbulk = PtlHandleIsEqual(event->md_handle, tx->tx_bulkmdh);
if ( isbulk && unlinked ) {
tx->tx_bulkmdh = PTL_INVALID_HANDLE;
- PTLLND_DBGT_STAMP(tx->tx_bulk_done);
+ gettimeofday(&tx->tx_bulk_done, NULL);
}
LASSERT (!isreq != !isbulk); /* always one and only 1 match */
}
}
+ptllnd_tx_t *
+ptllnd_find_timed_out_tx(ptllnd_peer_t *peer)
+{
+ time_t now = cfs_time_current_sec();
+ struct list_head *tmp;
+
+ list_for_each(tmp, &peer->plp_txq) {
+ ptllnd_tx_t *tx = list_entry(tmp, ptllnd_tx_t, tx_list);
+
+ if (tx->tx_deadline < now)
+ return tx;
+ }
+
+ list_for_each(tmp, &peer->plp_activeq) {
+ ptllnd_tx_t *tx = list_entry(tmp, ptllnd_tx_t, tx_list);
+
+ if (tx->tx_deadline < now)
+ return tx;
+ }
+
+ return NULL;
+}
+
+void
+ptllnd_check_peer(ptllnd_peer_t *peer)
+{
+ ptllnd_tx_t *tx = ptllnd_find_timed_out_tx(peer);
+
+ if (tx == NULL)
+ return;
+
+ CERROR("%s: timed out\n", libcfs_id2str(peer->plp_id));
+ ptllnd_close_peer(peer, -ETIMEDOUT);
+}
+
+void
+ptllnd_watchdog (lnet_ni_t *ni, time_t now)
+{
+ ptllnd_ni_t *plni = ni->ni_data;
+ const int n = 4;
+ int p = plni->plni_watchdog_interval;
+ int chunk = plni->plni_peer_hash_size;
+ int interval = now - (plni->plni_watchdog_nextt - p);
+ int i;
+ struct list_head *hashlist;
+ struct list_head *tmp;
+ struct list_head *nxt;
+
+ /* Time to check for RDMA timeouts on a few more peers:
+ * I try to do checks every 'p' seconds on a proportion of the peer
+ * table and I need to check every connection 'n' times within a
+ * timeout interval, to ensure I detect a timeout on any connection
+ * within (n+1)/n times the timeout interval. */
+
+ LASSERT (now >= plni->plni_watchdog_nextt);
+
+ if (plni->plni_timeout > n * interval) { /* Scan less than the whole table? */
+ chunk = (chunk * n * interval) / plni->plni_timeout;
+ if (chunk == 0)
+ chunk = 1;
+ }
+
+ for (i = 0; i < chunk; i++) {
+ hashlist = &plni->plni_peer_hash[plni->plni_watchdog_peeridx];
+
+ list_for_each_safe(tmp, nxt, hashlist) {
+ ptllnd_check_peer(list_entry(tmp, ptllnd_peer_t, plp_list));
+ }
+
+ plni->plni_watchdog_peeridx = (plni->plni_watchdog_peeridx + 1) %
+ plni->plni_peer_hash_size;
+ }
+
+ plni->plni_watchdog_nextt = now + p;
+}
+
void
ptllnd_wait (lnet_ni_t *ni, int milliseconds)
{
static int prevt_count;
static int call_count;
- struct timeval t1;
- struct timeval t2;
+ struct timeval start;
+ struct timeval then;
+ struct timeval now;
+ struct timeval deadline;
ptllnd_ni_t *plni = ni->ni_data;
ptllnd_tx_t *tx;
ptl_event_t event;
int which;
int rc;
- int blocked = 0;
int found = 0;
int timeout = 0;
* Otherwise block for the timeout and handle all events queued
* then. */
- gettimeofday(&t1, NULL);
+ gettimeofday(&start, NULL);
call_count++;
+ if (milliseconds <= 0) {
+ deadline = start;
+ } else {
+ deadline.tv_sec = start.tv_sec + milliseconds/1000;
+ deadline.tv_usec = start.tv_usec + (milliseconds % 1000)*1000;
+
+ if (deadline.tv_usec >= 1000000) {
+ start.tv_usec -= 1000000;
+ start.tv_sec++;
+ }
+ }
+
for (;;) {
- time_t then = cfs_time_current_sec();
+ gettimeofday(&then, NULL);
+
+ rc = PtlEQPoll(&plni->plni_eqh, 1, timeout, &event, &which);
- rc = PtlEQPoll(&plni->plni_eqh, 1,
- (timeout < 0) ? PTL_TIME_FOREVER : timeout,
- &event, &which);
+ gettimeofday(&now, NULL);
- if (timeout >= 0 &&
- (cfs_time_current_sec() - then)*1000 > timeout + 1000) {
- /* 1000 mS grace.............................^ */
- CERROR("SLOW PtlEQPoll(%d): %d seconds\n", timeout,
- (int)(cfs_time_current_sec() - then));
+ if ((now.tv_sec*1000 + now.tv_usec/1000) -
+ (then.tv_sec*1000 + then.tv_usec/1000) > timeout + 1000) {
+ /* 1000 mS grace...........................^ */
+ CERROR("SLOW PtlEQPoll(%d): %dmS elapsed\n", timeout,
+ (int)(now.tv_sec*1000 + now.tv_usec/1000) -
+ (int)(then.tv_sec*1000 + then.tv_usec/1000));
}
-
- timeout = 0;
if (rc == PTL_EQ_EMPTY) {
- if (found || /* handled some events */
- milliseconds == 0 || /* just checking */
- blocked) /* blocked already */
+ if (found) /* handled some events */
break;
- blocked = 1;
- timeout = (milliseconds < 0) ?
- PTL_TIME_FOREVER : milliseconds;
+ if (now.tv_sec >= plni->plni_watchdog_nextt) { /* check timeouts? */
+ ptllnd_watchdog(ni, now.tv_sec);
+ LASSERT (now.tv_sec < plni->plni_watchdog_nextt);
+ }
+
+ if (now.tv_sec > deadline.tv_sec || /* timeout expired */
+ (now.tv_sec == deadline.tv_sec &&
+ now.tv_usec >= deadline.tv_usec))
+ break;
+
+ if (milliseconds < 0 ||
+ plni->plni_watchdog_nextt <= deadline.tv_sec) {
+ timeout = (plni->plni_watchdog_nextt - now.tv_sec)*1000;
+ } else {
+ timeout = (deadline.tv_sec - now.tv_sec)*1000 +
+ (deadline.tv_usec - now.tv_usec)/1000;
+ }
+
continue;
}
-
+
LASSERT (rc == PTL_OK || rc == PTL_EQ_DROPPED);
if (rc == PTL_EQ_DROPPED)
CERROR("Event queue: size %d is too small\n",
plni->plni_eq_size);
+ timeout = 0;
found = 1;
+
switch (ptllnd_eventarg2type(event.md.user_ptr)) {
default:
LBUG();
while (!list_empty(&plni->plni_zombie_txs)) {
tx = list_entry(plni->plni_zombie_txs.next,
ptllnd_tx_t, tx_list);
+ list_del_init(&tx->tx_list);
ptllnd_tx_done(tx);
}
- gettimeofday(&t2, NULL);
-
if (prevt.tv_sec == 0 ||
- prevt.tv_sec != t2.tv_sec) {
+ prevt.tv_sec != now.tv_sec) {
PTLLND_HISTORY("%d wait entered at %d.%06d - prev %d %d.%06d",
- call_count, (int)t1.tv_sec, (int)t1.tv_usec,
+ call_count, (int)start.tv_sec, (int)start.tv_usec,
prevt_count, (int)prevt.tv_sec, (int)prevt.tv_usec);
- prevt = t2;
+ prevt = now;
}
}