gmlnd - GM 2.1.22 and later,
mxlnd - MX 1.2.1 or later,
ptllnd - Portals 3.3 / UNICOS/lc 1.5.x, 2.0.x
+
+Severity : major
+Bugzilla : 14425
+Description: o2iblnd/ptllnd credit deadlock in a routed config.
+Details : Reserve the last credit for NOOP messages and return the credit consumed by a received NOOP, so peers in a routed config can always give credits back and cannot deadlock waiting on each other.
+
Severity : normal
Bugzilla : 14956
Description: High load after starting lnet
list_for_each(tmp, &conn->ibc_early_rxs)
kiblnd_debug_rx(list_entry(tmp, kib_rx_t, rx_list));
+ CDEBUG(D_CONSOLE, " tx_noops:\n");
+ list_for_each(tmp, &conn->ibc_tx_noops)
+ kiblnd_debug_tx(list_entry(tmp, kib_tx_t, tx_list));
+
CDEBUG(D_CONSOLE, " tx_queue_nocred:\n");
list_for_each(tmp, &conn->ibc_tx_queue_nocred)
kiblnd_debug_tx(list_entry(tmp, kib_tx_t, tx_list));
conn->ibc_cmid = cmid;
INIT_LIST_HEAD(&conn->ibc_early_rxs);
+ INIT_LIST_HEAD(&conn->ibc_tx_noops);
INIT_LIST_HEAD(&conn->ibc_tx_queue);
INIT_LIST_HEAD(&conn->ibc_tx_queue_rsrvd);
INIT_LIST_HEAD(&conn->ibc_tx_queue_nocred);
memset(init_qp_attr, 0, sizeof(*init_qp_attr));
init_qp_attr->event_handler = kiblnd_qp_event;
init_qp_attr->qp_context = conn;
- init_qp_attr->cap.max_send_wr = (*kiblnd_tunables.kib_concurrent_sends) *
- (1 + IBLND_MAX_RDMA_FRAGS);
- init_qp_attr->cap.max_recv_wr = IBLND_RX_MSGS;
+ init_qp_attr->cap.max_send_wr = IBLND_SEND_WRS;
+ init_qp_attr->cap.max_recv_wr = IBLND_RECV_WRS;
init_qp_attr->cap.max_send_sge = 1;
init_qp_attr->cap.max_recv_sge = 1;
init_qp_attr->sq_sig_type = IB_SIGNAL_REQ_WR;
LASSERT (!in_interrupt());
LASSERT (atomic_read(&conn->ibc_refcount) == 0);
LASSERT (list_empty(&conn->ibc_early_rxs));
+ LASSERT (list_empty(&conn->ibc_tx_noops));
LASSERT (list_empty(&conn->ibc_tx_queue));
LASSERT (list_empty(&conn->ibc_tx_queue_rsrvd));
LASSERT (list_empty(&conn->ibc_tx_queue_nocred));
#define IBLND_TX_MSG_PAGES() ((IBLND_TX_MSG_BYTES() + PAGE_SIZE - 1)/PAGE_SIZE)
/* RX messages (per connection) */
-#define IBLND_RX_MSGS (IBLND_MSG_QUEUE_SIZE*2)
+#define IBLND_RX_MSGS (IBLND_MSG_QUEUE_SIZE * 2)
#define IBLND_RX_MSG_BYTES (IBLND_RX_MSGS * IBLND_MSG_SIZE)
#define IBLND_RX_MSG_PAGES ((IBLND_RX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE)
-#define IBLND_CQ_ENTRIES() (IBLND_RX_MSGS + \
- (*kiblnd_tunables.kib_concurrent_sends) * \
+/* WRs and CQEs (per connection) */
+#define IBLND_RECV_WRS IBLND_RX_MSGS
+#define IBLND_SEND_WRS ((*kiblnd_tunables.kib_concurrent_sends) * \
(1 + IBLND_MAX_RDMA_FRAGS))
+#define IBLND_CQ_ENTRIES() (IBLND_RECV_WRS + IBLND_SEND_WRS)
typedef struct
{
int ibc_ready:1; /* CQ callback fired */
unsigned long ibc_last_send; /* time of last send */
struct list_head ibc_early_rxs; /* rxs completed before ESTABLISHED */
+ struct list_head ibc_tx_noops; /* IBLND_MSG_NOOPs */
struct list_head ibc_tx_queue; /* sends that need a credit */
struct list_head ibc_tx_queue_nocred;/* sends that don't need a credit */
struct list_head ibc_tx_queue_rsrvd; /* sends that need to reserve an ACK/DONE msg */
*kiblnd_tunables.kib_keepalive*HZ);
}
+static inline int
+kiblnd_send_noop(kib_conn_t *conn)
+{
+ LASSERT (conn->ibc_state >= IBLND_CONN_ESTABLISHED);
+
+ if (conn->ibc_outstanding_credits < IBLND_CREDIT_HIGHWATER &&
+ !kiblnd_send_keepalive(conn))
+ return 0; /* No need to send NOOP */
+
+ if (!list_empty(&conn->ibc_tx_noops) || /* NOOP already queued */
+ !list_empty(&conn->ibc_tx_queue_nocred) || /* can be piggybacked */
+ conn->ibc_credits == 0) /* no credit */
+ return 0;
+
+ if (conn->ibc_credits == 1 && /* last credit reserved for */
+ conn->ibc_outstanding_credits == 0) /* giving back credits */
+ return 0;
+
+ /* No tx to piggyback NOOP onto or no credit to send a tx */
+ return (list_empty(&conn->ibc_tx_queue) || conn->ibc_credits == 1);
+}
+
static inline void
kiblnd_abort_receives(kib_conn_t *conn)
{
conn->ibc_credits += credits;
+ /* This ensures the credit taken by NOOP can be returned */
+ if (msg->ibm_type == IBLND_MSG_NOOP)
+ conn->ibc_outstanding_credits++;
+
spin_unlock(&conn->ibc_lock);
kiblnd_check_sends(conn);
}
break;
case IBLND_MSG_NOOP:
- post_credit = IBLND_POSTRX_PEER_CREDIT;
+ if (credits != 0) /* credit already posted */
+ post_credit = IBLND_POSTRX_NO_CREDIT;
+ else /* a keepalive NOOP */
+ post_credit = IBLND_POSTRX_PEER_CREDIT;
break;
case IBLND_MSG_IMMEDIATE:
conn->ibc_reserved_credits--;
}
- if (list_empty(&conn->ibc_tx_queue) &&
- list_empty(&conn->ibc_tx_queue_nocred) &&
- (conn->ibc_outstanding_credits >= IBLND_CREDIT_HIGHWATER ||
- kiblnd_send_keepalive(conn))) {
+ if (kiblnd_send_noop(conn)) {
spin_unlock(&conn->ibc_lock);
tx = kiblnd_get_idle_tx(ni);
}
for (;;) {
- if (!list_empty (&conn->ibc_tx_queue_nocred)) {
- tx = list_entry (conn->ibc_tx_queue_nocred.next,
- kib_tx_t, tx_list);
+ if (!list_empty(&conn->ibc_tx_queue_nocred)) {
+ tx = list_entry(conn->ibc_tx_queue_nocred.next,
+ kib_tx_t, tx_list);
consume_cred = 0;
- } else if (!list_empty (&conn->ibc_tx_queue)) {
- tx = list_entry (conn->ibc_tx_queue.next,
- kib_tx_t, tx_list);
+ } else if (!list_empty(&conn->ibc_tx_noops)) {
+ tx = list_entry(conn->ibc_tx_noops.next,
+ kib_tx_t, tx_list);
+ consume_cred = 1;
+ } else if (!list_empty(&conn->ibc_tx_queue)) {
+ tx = list_entry(conn->ibc_tx_queue.next,
+ kib_tx_t, tx_list);
consume_cred = 1;
} else {
/* nothing to send right now */
if (conn->ibc_credits == 0) { /* no credits */
CDEBUG(D_NET, "%s: no credits\n",
libcfs_nid2str(conn->ibc_peer->ibp_nid));
- break;
+ break; /* NB ibc_tx_queue_nocred checked */
}
- if (conn->ibc_credits == 1 && /* last credit reserved for */
- conn->ibc_outstanding_credits == 0) { /* giving back credits */
+ /* Last credit reserved for NOOP */
+ if (conn->ibc_credits == 1 &&
+ tx->tx_msg->ibm_type != IBLND_MSG_NOOP) {
CDEBUG(D_NET, "%s: not using last credit\n",
libcfs_nid2str(conn->ibc_peer->ibp_nid));
- break;
+ break; /* NB ibc_tx_noops checked */
}
}
- list_del (&tx->tx_list);
+ list_del(&tx->tx_list);
tx->tx_queued = 0;
/* NB don't drop ibc_lock before bumping tx_sending */
if (tx->tx_msg->ibm_type == IBLND_MSG_NOOP &&
- (!list_empty(&conn->ibc_tx_queue) ||
- !list_empty(&conn->ibc_tx_queue_nocred) ||
- (conn->ibc_outstanding_credits < IBLND_CREDIT_HIGHWATER &&
- !kiblnd_send_keepalive(conn)))) {
+ !kiblnd_send_noop(conn)) {
/* redundant NOOP */
spin_unlock(&conn->ibc_lock);
kiblnd_tx_done(ni, tx);
break;
case IBLND_MSG_NOOP:
+ q = &conn->ibc_tx_noops;
+ break;
+
case IBLND_MSG_IMMEDIATE:
q = &conn->ibc_tx_queue;
break;
return; /* already being handled */
if (error == 0 &&
+ list_empty(&conn->ibc_tx_noops) &&
list_empty(&conn->ibc_tx_queue) &&
list_empty(&conn->ibc_tx_queue_rsrvd) &&
list_empty(&conn->ibc_tx_queue_nocred) &&
CDEBUG(D_NET, "closing conn to %s\n",
libcfs_nid2str(peer->ibp_nid));
} else {
- CDEBUG(D_NETERROR, "Closing conn to %s: error %d%s%s%s%s\n",
+ CDEBUG(D_NETERROR, "Closing conn to %s: error %d%s%s%s%s%s\n",
libcfs_nid2str(peer->ibp_nid), error,
list_empty(&conn->ibc_tx_queue) ? "" : "(sending)",
+ list_empty(&conn->ibc_tx_noops) ? "" : "(sending_noops)",
list_empty(&conn->ibc_tx_queue_rsrvd) ? "" : "(sending_rsrvd)",
list_empty(&conn->ibc_tx_queue_nocred) ? "" : "(sending_nocred)",
list_empty(&conn->ibc_active_txs) ? "" : "(waiting)");
/* Complete all tx descs not waiting for sends to complete.
* NB we should be safe from RDMA now that the QP has changed state */
+ kiblnd_abort_txs(conn, &conn->ibc_tx_noops);
kiblnd_abort_txs(conn, &conn->ibc_tx_queue);
kiblnd_abort_txs(conn, &conn->ibc_tx_queue_rsrvd);
kiblnd_abort_txs(conn, &conn->ibc_tx_queue_nocred);
/* conn now "owns" cmid, so I return success from here on to ensure the
* CM callback doesn't destroy cmid. */
- conn->ibc_incarnation = reqmsg->ibm_srcstamp;
- conn->ibc_credits = IBLND_MSG_QUEUE_SIZE;
+ conn->ibc_incarnation = reqmsg->ibm_srcstamp;
+ conn->ibc_credits = IBLND_MSG_QUEUE_SIZE;
conn->ibc_reserved_credits = IBLND_MSG_QUEUE_SIZE;
LASSERT (conn->ibc_credits + conn->ibc_reserved_credits
<= IBLND_RX_MSGS);
goto failed;
}
- conn->ibc_incarnation = msg->ibm_srcstamp;
- conn->ibc_credits = IBLND_MSG_QUEUE_SIZE;
+ conn->ibc_incarnation = msg->ibm_srcstamp;
+ conn->ibc_credits = IBLND_MSG_QUEUE_SIZE;
conn->ibc_reserved_credits = IBLND_MSG_QUEUE_SIZE;
LASSERT (conn->ibc_credits + conn->ibc_reserved_credits
<= IBLND_RX_MSGS);
kiblnd_conn_timed_out (kib_conn_t *conn)
{
return kiblnd_check_txs(conn, &conn->ibc_tx_queue) ||
+ kiblnd_check_txs(conn, &conn->ibc_tx_noops) ||
kiblnd_check_txs(conn, &conn->ibc_tx_queue_rsrvd) ||
kiblnd_check_txs(conn, &conn->ibc_tx_queue_nocred) ||
kiblnd_check_txs(conn, &conn->ibc_active_txs);
return -EINVAL;
}
+ /* kptl_msg_t::ptlm_credits is only a __u8 */
+ if (*kptllnd_tunables.kptl_peercredits > 255) {
+ CERROR("kptl_peercredits must be <= 255\n");
+ return -EINVAL;
+ }
+
*kptllnd_tunables.kptl_max_msg_size &= ~7;
if (*kptllnd_tunables.kptl_max_msg_size < PTLLND_MIN_BUFFER_SIZE)
*kptllnd_tunables.kptl_max_msg_size = PTLLND_MIN_BUFFER_SIZE;
char rx_space[0]; /* copy of incoming request */
} kptl_rx_t;
+#define PTLLND_POSTRX_DONT_POST 0 /* don't post */
+#define PTLLND_POSTRX_NO_CREDIT 1 /* post: no credits */
+#define PTLLND_POSTRX_PEER_CREDIT 2 /* post: give peer back 1 credit */
+
typedef struct kptl_rx_buffer_pool
{
spinlock_t rxbp_lock;
atomic_t peer_refcount; /* The current refrences */
enum kptllnd_peer_state peer_state;
spinlock_t peer_lock; /* serialize */
+ struct list_head peer_noops; /* PTLLND_MSG_TYPE_NOOP txs */
struct list_head peer_sendq; /* txs waiting for mh handles */
struct list_head peer_activeq; /* txs awaiting completion */
lnet_process_id_t peer_id; /* Peer's LNET id */
/*
* RX SUPPORT FUNCTIONS
*/
-void kptllnd_rx_done(kptl_rx_t *rx);
void kptllnd_rx_parse(kptl_rx_t *rx);
+void kptllnd_rx_done(kptl_rx_t *rx, int post_credit);
/*
* PEER SUPPORT FUNCTIONS
/*
* We're done with the RX
*/
- kptllnd_rx_done(rx);
+ kptllnd_rx_done(rx, PTLLND_POSTRX_PEER_CREDIT);
return rc;
}
memset(peer, 0, sizeof(*peer)); /* zero flags etc */
+ INIT_LIST_HEAD (&peer->peer_noops);
INIT_LIST_HEAD (&peer->peer_sendq);
INIT_LIST_HEAD (&peer->peer_activeq);
spin_lock_init (&peer->peer_lock);
LASSERT (atomic_read(&peer->peer_refcount) == 0);
LASSERT (peer->peer_state == PEER_STATE_ALLOCATED ||
peer->peer_state == PEER_STATE_ZOMBIE);
+ LASSERT (list_empty(&peer->peer_noops));
LASSERT (list_empty(&peer->peer_sendq));
LASSERT (list_empty(&peer->peer_activeq));
spin_lock_irqsave(&peer->peer_lock, flags);
+ kptllnd_cancel_txlist(&peer->peer_noops, txs);
kptllnd_cancel_txlist(&peer->peer_sendq, txs);
kptllnd_cancel_txlist(&peer->peer_activeq, txs);
tx->tx_msg_mdh = msg_mdh;
/* Ensure HELLO is sent first */
- if (tx->tx_msg->ptlm_type == PTLLND_MSG_TYPE_HELLO)
+ if (tx->tx_msg->ptlm_type == PTLLND_MSG_TYPE_NOOP)
+ list_add(&tx->tx_list, &peer->peer_noops);
+ else if (tx->tx_msg->ptlm_type == PTLLND_MSG_TYPE_HELLO)
list_add(&tx->tx_list, &peer->peer_sendq);
else
list_add_tail(&tx->tx_list, &peer->peer_sendq);
spin_unlock_irqrestore(&peer->peer_lock, flags);
}
+static inline int
+kptllnd_peer_send_noop (kptl_peer_t *peer)
+{
+ if (!peer->peer_sent_hello ||
+ peer->peer_credits == 0 ||
+ !list_empty(&peer->peer_noops) ||
+ peer->peer_outstanding_credits < PTLLND_CREDIT_HIGHWATER)
+ return 0;
+
+ /* No tx to piggyback NOOP onto or no credit to send a tx */
+ return (list_empty(&peer->peer_sendq) || peer->peer_credits == 1);
+}
+
void
kptllnd_peer_check_sends (kptl_peer_t *peer)
{
ptl_handle_me_t meh;
kptl_tx_t *tx;
int rc;
+ int msg_type;
unsigned long flags;
LASSERT(!in_interrupt());
peer->peer_retry_noop = 0;
- if (list_empty(&peer->peer_sendq) &&
- peer->peer_outstanding_credits >= PTLLND_CREDIT_HIGHWATER &&
- peer->peer_credits != 0) {
-
+ if (kptllnd_peer_send_noop(peer)) {
/* post a NOOP to return credits */
spin_unlock_irqrestore(&peer->peer_lock, flags);
peer->peer_retry_noop = (tx == NULL);
}
- while (!list_empty(&peer->peer_sendq)) {
- tx = list_entry (peer->peer_sendq.next, kptl_tx_t, tx_list);
+ for (;;) {
+ if (!list_empty(&peer->peer_noops)) {
+ LASSERT (peer->peer_sent_hello);
+ tx = list_entry(peer->peer_noops.next,
+ kptl_tx_t, tx_list);
+ } else if (!list_empty(&peer->peer_sendq)) {
+ tx = list_entry(peer->peer_sendq.next,
+ kptl_tx_t, tx_list);
+ } else {
+ /* nothing to send right now */
+ break;
+ }
LASSERT (tx->tx_active);
LASSERT (!PtlHandleIsEqual(tx->tx_msg_mdh, PTL_INVALID_HANDLE));
*kptllnd_tunables.kptl_peercredits);
LASSERT (peer->peer_credits >= 0);
- /* Ensure HELLO is sent first */
- if (!peer->peer_sent_hello) {
- if (tx->tx_msg->ptlm_type != PTLLND_MSG_TYPE_HELLO)
- break;
- peer->peer_sent_hello = 1;
- }
+ msg_type = tx->tx_msg->ptlm_type;
+
+ /* Ensure HELLO is sent first */
+ if (!peer->peer_sent_hello) {
+ LASSERT (list_empty(&peer->peer_noops));
+ if (msg_type != PTLLND_MSG_TYPE_HELLO)
+ break;
+ peer->peer_sent_hello = 1;
+ }
if (peer->peer_credits == 0) {
- CDEBUG(D_NETTRACE, "%s[%d/%d+%d]: no credits for %p\n",
+ CDEBUG(D_NETTRACE, "%s[%d/%d+%d]: no credits for %s[%p]\n",
libcfs_id2str(peer->peer_id),
peer->peer_credits,
peer->peer_outstanding_credits,
- peer->peer_sent_credits, tx);
+ peer->peer_sent_credits,
+ kptllnd_msgtype2str(msg_type), tx);
break;
}
- /* Don't use the last credit unless I've got credits to
- * return */
+ /* Last/Initial credit reserved for NOOP/HELLO */
if (peer->peer_credits == 1 &&
- peer->peer_outstanding_credits == 0) {
+ msg_type != PTLLND_MSG_TYPE_HELLO &&
+ msg_type != PTLLND_MSG_TYPE_NOOP) {
CDEBUG(D_NETTRACE, "%s[%d/%d+%d]: "
- "not using last credit for %p\n",
+ "not using last credit for %s[%p]\n",
libcfs_id2str(peer->peer_id),
peer->peer_credits,
peer->peer_outstanding_credits,
- peer->peer_sent_credits, tx);
+ peer->peer_sent_credits,
+ kptllnd_msgtype2str(msg_type), tx);
break;
}
/* Discard any NOOP I queued if I'm not at the high-water mark
* any more or more messages have been queued */
- if (tx->tx_msg->ptlm_type == PTLLND_MSG_TYPE_NOOP &&
- (!list_empty(&peer->peer_sendq) ||
- peer->peer_outstanding_credits < PTLLND_CREDIT_HIGHWATER)) {
-
+ if (msg_type == PTLLND_MSG_TYPE_NOOP &&
+ !kptllnd_peer_send_noop(peer)) {
tx->tx_active = 0;
spin_unlock_irqrestore(&peer->peer_lock, flags);
tx->tx_msg->ptlm_u.rdma.kptlrm_matchbits =
peer->peer_next_matchbits++;
}
-
+
peer->peer_sent_credits += peer->peer_outstanding_credits;
peer->peer_outstanding_credits = 0;
peer->peer_credits--;
CDEBUG(D_NETTRACE, "%s[%d/%d+%d]: %s tx=%p nob=%d cred=%d\n",
libcfs_id2str(peer->peer_id), peer->peer_credits,
peer->peer_outstanding_credits, peer->peer_sent_credits,
- kptllnd_msgtype2str(tx->tx_msg->ptlm_type),
- tx, tx->tx_msg->ptlm_nob,
+ kptllnd_msgtype2str(msg_type), tx, tx->tx_msg->ptlm_nob,
tx->tx_msg->ptlm_credits);
list_add_tail(&tx->tx_list, &peer->peer_activeq);
}
void
-kptllnd_rx_done(kptl_rx_t *rx)
+kptllnd_rx_done(kptl_rx_t *rx, int post_credit)
{
kptl_rx_buffer_t *rxb = rx->rx_rxb;
kptl_peer_t *peer = rx->rx_peer;
unsigned long flags;
+ LASSERT (post_credit == PTLLND_POSTRX_NO_CREDIT ||
+ post_credit == PTLLND_POSTRX_PEER_CREDIT);
+
CDEBUG(D_NET, "rx=%p rxb %p peer %p\n", rx, rxb, peer);
if (rxb != NULL)
/* Update credits (after I've decref-ed the buffer) */
spin_lock_irqsave(&peer->peer_lock, flags);
- peer->peer_outstanding_credits++;
+ if (post_credit == PTLLND_POSTRX_PEER_CREDIT)
+ peer->peer_outstanding_credits++;
+
LASSERT (peer->peer_outstanding_credits +
peer->peer_sent_credits <=
*kptllnd_tunables.kptl_peercredits);
kptllnd_rx_parse(kptl_rx_t *rx)
{
kptl_msg_t *msg = rx->rx_msg;
+ int post_credit = PTLLND_POSTRX_PEER_CREDIT;
kptl_peer_t *peer;
int rc;
unsigned long flags;
int c = peer->peer_credits;
int oc = peer->peer_outstanding_credits;
int sc = peer->peer_sent_credits;
-
+
spin_unlock_irqrestore(&peer->peer_lock, flags);
CERROR("%s: buffer overrun [%d/%d+%d]\n",
* buffers after the startup handshake. */
peer->peer_credits += msg->ptlm_credits;
+ /* This ensures the credit taken by NOOP can be returned */
+ if (msg->ptlm_type == PTLLND_MSG_TYPE_NOOP) {
+ peer->peer_outstanding_credits++;
+ post_credit = PTLLND_POSTRX_NO_CREDIT;
+ }
+
spin_unlock_irqrestore(&peer->peer_lock, flags);
/* See if something can go out now that credits have come in */
if (rx->rx_peer == NULL) /* drop ref on peer */
kptllnd_peer_decref(peer); /* unless rx_done will */
rx_done:
- kptllnd_rx_done(rx);
+ kptllnd_rx_done(rx, post_credit);
}