*
*/
-#include "vibnal.h"
+#include "viblnd.h"
-static void kibnal_cm_callback(cm_cep_handle_t cep, cm_conn_data_t *info, void *arg);
-
-/*
- * LIB functions follow
- *
- */
-static void
-kibnal_schedule_tx_done (kib_tx_t *tx)
-{
- unsigned long flags;
-
- spin_lock_irqsave (&kibnal_data.kib_sched_lock, flags);
-
- list_add_tail(&tx->tx_list, &kibnal_data.kib_sched_txq);
- wake_up (&kibnal_data.kib_sched_waitq);
-
- spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags);
-}
-
-static void
+void
kibnal_tx_done (kib_tx_t *tx)
{
- ptl_err_t ptlrc = (tx->tx_status == 0) ? PTL_OK : PTL_FAIL;
- unsigned long flags;
- int i;
- vv_return_t retval;
-
- LASSERT (tx->tx_sending == 0); /* mustn't be awaiting callback */
- LASSERT (!tx->tx_passive_rdma_wait); /* mustn't be awaiting RDMA */
-
- switch (tx->tx_mapped) {
- default:
- LBUG();
+ lnet_msg_t *lntmsg[2];
+ int rc = tx->tx_status;
+ int i;
- case KIB_TX_UNMAPPED:
- break;
+ LASSERT (!in_interrupt());
+ LASSERT (!tx->tx_queued); /* mustn't be queued for sending */
+ LASSERT (tx->tx_sending == 0); /* mustn't be awaiting sent callback */
+ LASSERT (!tx->tx_waiting); /* mustn't be awaiting peer response */
- case KIB_TX_MAPPED:
- if (in_interrupt()) {
- /* can't deregister memory in IRQ context... */
- kibnal_schedule_tx_done(tx);
- return;
- }
- retval = vv_mem_region_destroy(kibnal_data.kib_hca, tx->tx_md.md_handle);
- LASSERT (retval == vv_return_ok);
- tx->tx_mapped = KIB_TX_UNMAPPED;
- break;
+#if IBNAL_USE_FMR
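+ /* Unmap the FMR when its remap count is exhausted, or when this tx
+ * failed while the mapping was still active */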
+ if (tx->tx_md.md_fmrcount == 0 ||
+ (rc != 0 && tx->tx_md.md_active)) {
+ vv_return_t vvrc;
-#if IBNAL_FMR
- case KIB_TX_MAPPED_FMR:
- if (in_interrupt() && tx->tx_status != 0) {
- /* can't flush FMRs in IRQ context... */
- kibnal_schedule_tx_done(tx);
- return;
- }
+ /* mapping must be active (it dropped fmrcount to 0) */
+ LASSERT (tx->tx_md.md_active);
- rc = ib_fmr_deregister(tx->tx_md.md_handle.fmr);
- LASSERT (rc == 0);
+ vvrc = vv_unmap_fmr(kibnal_data.kib_hca,
+ 1, &tx->tx_md.md_fmrhandle);
+ LASSERT (vvrc == vv_return_ok);
- if (tx->tx_status != 0)
- ib_fmr_pool_force_flush(kibnal_data.kib_fmr_pool);
- tx->tx_mapped = KIB_TX_UNMAPPED;
- break;
-#endif
+ tx->tx_md.md_fmrcount = *kibnal_tunables.kib_fmr_remaps;
}
+ tx->tx_md.md_active = 0;
+#endif
- for (i = 0; i < 2; i++) {
- /* tx may have up to 2 libmsgs to finalise */
- if (tx->tx_libmsg[i] == NULL)
- continue;
+ /* tx may have up to 2 lnet msgs to finalise */
+ lntmsg[0] = tx->tx_lntmsg[0]; tx->tx_lntmsg[0] = NULL;
+ lntmsg[1] = tx->tx_lntmsg[1]; tx->tx_lntmsg[1] = NULL;
- lib_finalize (&kibnal_lib, NULL, tx->tx_libmsg[i], ptlrc);
- tx->tx_libmsg[i] = NULL;
- }
-
if (tx->tx_conn != NULL) {
- kibnal_put_conn (tx->tx_conn);
+ kibnal_conn_decref(tx->tx_conn);
tx->tx_conn = NULL;
}
- tx->tx_nsp = 0;
- tx->tx_passive_rdma = 0;
+ tx->tx_nwrq = 0;
tx->tx_status = 0;
- spin_lock_irqsave (&kibnal_data.kib_tx_lock, flags);
-
- if (tx->tx_isnblk) {
- list_add_tail (&tx->tx_list, &kibnal_data.kib_idle_nblk_txs);
- } else {
- list_add_tail (&tx->tx_list, &kibnal_data.kib_idle_txs);
- wake_up (&kibnal_data.kib_idle_tx_waitq);
- }
-
- spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags);
-}
-
-static kib_tx_t *
-kibnal_get_idle_tx (int may_block)
-{
- unsigned long flags;
- kib_tx_t *tx = NULL;
- ENTRY;
-
- for (;;) {
- spin_lock_irqsave (&kibnal_data.kib_tx_lock, flags);
-
- /* "normal" descriptor is free */
- if (!list_empty (&kibnal_data.kib_idle_txs)) {
- tx = list_entry (kibnal_data.kib_idle_txs.next,
- kib_tx_t, tx_list);
- break;
- }
+ spin_lock(&kibnal_data.kib_tx_lock);
- if (!may_block) {
- /* may dip into reserve pool */
- if (list_empty (&kibnal_data.kib_idle_nblk_txs)) {
- CERROR ("reserved tx desc pool exhausted\n");
- break;
- }
+ list_add (&tx->tx_list, &kibnal_data.kib_idle_txs);
- tx = list_entry (kibnal_data.kib_idle_nblk_txs.next,
- kib_tx_t, tx_list);
- break;
- }
+ spin_unlock(&kibnal_data.kib_tx_lock);
- /* block for idle tx */
- spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags);
+ /* delay finalize until my descs have been freed */
+ for (i = 0; i < 2; i++) {
+ if (lntmsg[i] == NULL)
+ continue;
- wait_event (kibnal_data.kib_idle_tx_waitq,
- !list_empty (&kibnal_data.kib_idle_txs) ||
- kibnal_data.kib_shutdown);
+ lnet_finalize (kibnal_data.kib_ni, lntmsg[i], rc);
}
-
- if (tx != NULL) {
- list_del (&tx->tx_list);
-
- /* Allocate a new passive RDMA completion cookie. It might
- * not be needed, but we've got a lock right now and we're
- * unlikely to wrap... */
- tx->tx_passive_rdma_cookie = kibnal_data.kib_next_tx_cookie++;
-
- LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED);
- LASSERT (tx->tx_nsp == 0);
- LASSERT (tx->tx_sending == 0);
- LASSERT (tx->tx_status == 0);
- LASSERT (tx->tx_conn == NULL);
- LASSERT (!tx->tx_passive_rdma);
- LASSERT (!tx->tx_passive_rdma_wait);
- LASSERT (tx->tx_libmsg[0] == NULL);
- LASSERT (tx->tx_libmsg[1] == NULL);
- }
-
- spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags);
-
- RETURN(tx);
}
-static int
-kibnal_dist(lib_nal_t *nal, ptl_nid_t nid, unsigned long *dist)
+void
+kibnal_txlist_done (struct list_head *txlist, int status)
{
- /* I would guess that if kibnal_get_peer (nid) == NULL,
- and we're not routing, then 'nid' is very distant :) */
- if ( nal->libnal_ni.ni_pid.nid == nid ) {
- *dist = 0;
- } else {
- *dist = 1;
- }
+ kib_tx_t *tx;
- return 0;
+ while (!list_empty (txlist)) {
+ tx = list_entry (txlist->next, kib_tx_t, tx_list);
+
+ list_del (&tx->tx_list);
+ /* complete now */
+ tx->tx_waiting = 0;
+ tx->tx_status = status;
+ kibnal_tx_done (tx);
+ }
}
-static void
-kibnal_complete_passive_rdma(kib_conn_t *conn, __u64 cookie, int status)
+kib_tx_t *
+kibnal_get_idle_tx (void)
{
- struct list_head *ttmp;
- unsigned long flags;
- int idle;
-
- spin_lock_irqsave (&conn->ibc_lock, flags);
-
- list_for_each (ttmp, &conn->ibc_active_txs) {
- kib_tx_t *tx = list_entry(ttmp, kib_tx_t, tx_list);
+ kib_tx_t *tx;
- LASSERT (tx->tx_passive_rdma ||
- !tx->tx_passive_rdma_wait);
+ spin_lock(&kibnal_data.kib_tx_lock);
- LASSERT (tx->tx_passive_rdma_wait ||
- tx->tx_sending != 0);
-
- if (!tx->tx_passive_rdma_wait ||
- tx->tx_passive_rdma_cookie != cookie)
- continue;
-
- CDEBUG(D_NET, "Complete %p "LPD64": %d\n", tx, cookie, status);
+ if (list_empty (&kibnal_data.kib_idle_txs)) {
+ spin_unlock(&kibnal_data.kib_tx_lock);
+ return NULL;
+ }
- tx->tx_status = status;
- tx->tx_passive_rdma_wait = 0;
- idle = (tx->tx_sending == 0);
+ tx = list_entry (kibnal_data.kib_idle_txs.next, kib_tx_t, tx_list);
+ list_del (&tx->tx_list);
- if (idle)
- list_del (&tx->tx_list);
+ /* Allocate a new completion cookie. It might not be needed,
+ * but we've got a lock right now and we're unlikely to
+ * wrap... */
+ tx->tx_cookie = kibnal_data.kib_next_tx_cookie++;
- spin_unlock_irqrestore (&conn->ibc_lock, flags);
+ spin_unlock(&kibnal_data.kib_tx_lock);
- /* I could be racing with tx callbacks. It's whoever
- * _makes_ tx idle that frees it */
- if (idle)
- kibnal_tx_done (tx);
- return;
- }
-
- spin_unlock_irqrestore (&conn->ibc_lock, flags);
+ LASSERT (tx->tx_nwrq == 0);
+ LASSERT (!tx->tx_queued);
+ LASSERT (tx->tx_sending == 0);
+ LASSERT (!tx->tx_waiting);
+ LASSERT (tx->tx_status == 0);
+ LASSERT (tx->tx_conn == NULL);
+ LASSERT (tx->tx_lntmsg[0] == NULL);
+ LASSERT (tx->tx_lntmsg[1] == NULL);
- CERROR ("Unmatched (late?) RDMA completion "LPX64" from "LPX64"\n",
- cookie, conn->ibc_peer->ibp_nid);
+ return tx;
}
-static void
-kibnal_post_rx (kib_rx_t *rx, int do_credits)
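+/* Repost 'rx'. 'credit' returns a flow-control credit to the peer;
+ * 'rsrvd_credit' returns an rx pre-reserved for an RDMA reply */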
+int
+kibnal_post_rx (kib_rx_t *rx, int credit, int rsrvd_credit)
{
kib_conn_t *conn = rx->rx_conn;
int rc = 0;
- unsigned long flags;
- vv_return_t retval;
+ __u64 addr = (__u64)((unsigned long)((rx)->rx_msg));
+ vv_return_t vvrc;
+
+ LASSERT (!in_interrupt());
+ /* old peers don't reserve rxs for RDMA replies */
+ LASSERT (!rsrvd_credit ||
+ conn->ibc_version != IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD);
- ENTRY;
-
rx->rx_gl = (vv_scatgat_t) {
- .v_address = (void *)rx->rx_msg,
+ .v_address = KIBNAL_ADDR2SG(addr),
+ .l_key = rx->rx_lkey,
.length = IBNAL_MSG_SIZE,
- .l_key = rx->l_key,
};
rx->rx_wrq = (vv_wr_t) {
- .wr_id = kibnal_ptr2wreqid(rx, 1),
+ .wr_id = kibnal_ptr2wreqid(rx, IBNAL_WID_RX),
.completion_notification = 1,
.scatgat_list = &rx->rx_gl,
.num_of_data_segments = 1,
.wr_type = vv_wr_receive,
};
- KIB_ASSERT_CONN_STATE_RANGE(conn, IBNAL_CONN_ESTABLISHED,
- IBNAL_CONN_DREP);
- LASSERT (!rx->rx_posted);
- rx->rx_posted = 1;
- mb();
+ LASSERT (conn->ibc_state >= IBNAL_CONN_INIT);
+ LASSERT (rx->rx_nob >= 0); /* not posted */
- if (conn->ibc_state != IBNAL_CONN_ESTABLISHED)
- rc = -ECONNABORTED;
- else {
- retval = vv_post_receive(kibnal_data.kib_hca, conn->ibc_qp, &rx->rx_wrq);
+ CDEBUG(D_NET, "posting rx [%d %x "LPX64"]\n",
+ rx->rx_wrq.scatgat_list->length,
+ rx->rx_wrq.scatgat_list->l_key,
+ KIBNAL_SG2ADDR(rx->rx_wrq.scatgat_list->v_address));
- if (retval) {
- CDEBUG(D_NET, "post failed %d\n", retval);
- rc = -EINVAL;
- }
- CDEBUG(D_NET, "posted rx %p\n", &rx->rx_wrq);
+ if (conn->ibc_state > IBNAL_CONN_ESTABLISHED) {
+ /* No more posts for this rx; so lose its ref */
+ kibnal_conn_decref(conn);
+ return 0;
}
- if (rc == 0) {
- if (do_credits) {
- spin_lock_irqsave(&conn->ibc_lock, flags);
+ rx->rx_nob = -1; /* flag posted */
+
+ spin_lock(&conn->ibc_lock);
+ /* Serialise vv_post_receive; it's not re-entrant on the same QP */
+ vvrc = vv_post_receive(kibnal_data.kib_hca,
+ conn->ibc_qp, &rx->rx_wrq);
+
+ if (vvrc == vv_return_ok) {
+ if (credit)
conn->ibc_outstanding_credits++;
- spin_unlock_irqrestore(&conn->ibc_lock, flags);
+ if (rsrvd_credit)
+ conn->ibc_reserved_credits++;
+ spin_unlock(&conn->ibc_lock);
+
+ if (credit || rsrvd_credit)
kibnal_check_sends(conn);
- }
- EXIT;
- return;
- }
- if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) {
- CERROR ("Error posting receive -> "LPX64": %d\n",
- conn->ibc_peer->ibp_nid, rc);
- kibnal_close_conn (rx->rx_conn, rc);
- } else {
- CDEBUG (D_NET, "Error posting receive -> "LPX64": %d\n",
- conn->ibc_peer->ibp_nid, rc);
+ return 0;
}
- /* Drop rx's ref */
- kibnal_put_conn (conn);
- EXIT;
+ spin_unlock(&conn->ibc_lock);
+
+ CERROR ("post rx -> %s failed %d\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid), vvrc);
+ rc = -EIO;
+ kibnal_close_conn(conn, rc);
+ /* No more posts for this rx; so lose its ref */
+ kibnal_conn_decref(conn);
+ return rc;
}
-#if IBNAL_CKSUM
-static inline __u32 kibnal_cksum (void *ptr, int nob)
+int
+kibnal_post_receives (kib_conn_t *conn)
{
- char *c = ptr;
- __u32 sum = 0;
+ int i;
+ int rc;
+
+ LASSERT (conn->ibc_state < IBNAL_CONN_ESTABLISHED);
+ LASSERT (conn->ibc_comms_error == 0);
+
+ for (i = 0; i < IBNAL_RX_MSGS; i++) {
+ /* +1 ref for rx desc. This ref remains until kibnal_post_rx
+ * fails (i.e. actual failure or we're disconnecting) */
+ kibnal_conn_addref(conn);
+ rc = kibnal_post_rx (&conn->ibc_rxs[i], 0, 0);
+ if (rc != 0)
+ return rc;
+ }
- while (nob-- > 0)
- sum = ((sum << 1) | (sum >> 31)) + *c++;
-
- return (sum);
+ return 0;
}
-#endif
-static void
-kibnal_rx_callback (vv_wc_t *wc)
+kib_tx_t *
+kibnal_find_waiting_tx_locked(kib_conn_t *conn, int txtype, __u64 cookie)
{
- kib_rx_t *rx = (kib_rx_t *)kibnal_wreqid2ptr(wc->wr_id);
- kib_msg_t *msg = rx->rx_msg;
- kib_conn_t *conn = rx->rx_conn;
- int nob = wc->num_bytes_transfered;
- const int base_nob = offsetof(kib_msg_t, ibm_u);
- int credits;
- int flipped;
- unsigned long flags;
- __u32 i;
-#if IBNAL_CKSUM
- __u32 msg_cksum;
- __u32 computed_cksum;
-#endif
+ struct list_head *tmp;
- /* we set the QP to erroring after we've finished disconnecting,
- * maybe we should do so sooner. */
- KIB_ASSERT_CONN_STATE_RANGE(conn, IBNAL_CONN_ESTABLISHED,
- IBNAL_CONN_DISCONNECTED);
+ list_for_each(tmp, &conn->ibc_active_txs) {
+ kib_tx_t *tx = list_entry(tmp, kib_tx_t, tx_list);
- CDEBUG(D_NET, "rx %p conn %p, nob=%d\n", rx, conn, nob);
+ LASSERT (!tx->tx_queued);
+ LASSERT (tx->tx_sending != 0 || tx->tx_waiting);
- LASSERT (rx->rx_posted);
- rx->rx_posted = 0;
- mb();
+ if (tx->tx_cookie != cookie)
+ continue;
- /* receives complete with error in any case after we've started
- * disconnecting */
- if (conn->ibc_state > IBNAL_CONN_ESTABLISHED)
- goto failed;
+ if (tx->tx_waiting &&
+ tx->tx_msg->ibm_type == txtype)
+ return tx;
- if (wc->completion_status != vv_comp_status_success) {
- CERROR("Rx from "LPX64" failed: %d\n",
- conn->ibc_peer->ibp_nid, wc->completion_status);
- goto failed;
+ CWARN("Bad completion: %swaiting, type %x (wanted %x)\n",
+ tx->tx_waiting ? "" : "NOT ",
+ tx->tx_msg->ibm_type, txtype);
}
+ return NULL;
+}
- if (nob < base_nob) {
- CERROR ("Short rx from "LPX64": %d < expected %d\n",
- conn->ibc_peer->ibp_nid, nob, base_nob);
- goto failed;
- }
+void
+kibnal_handle_completion(kib_conn_t *conn, int txtype, int status, __u64 cookie)
+{
+ kib_tx_t *tx;
+ int idle;
- /* Receiver does any byte flipping if necessary... */
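+
+ /* Match the tx awaiting this completion by cookie and type; it can
+ * only be finalised once it has no send WRs still outstanding */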
+ spin_lock(&conn->ibc_lock);
- if (msg->ibm_magic == IBNAL_MSG_MAGIC) {
- flipped = 0;
- } else {
- if (msg->ibm_magic != __swab32(IBNAL_MSG_MAGIC)) {
- CERROR ("Unrecognised magic: %08x from "LPX64"\n",
- msg->ibm_magic, conn->ibc_peer->ibp_nid);
- goto failed;
- }
- flipped = 1;
- __swab16s (&msg->ibm_version);
- LASSERT (sizeof(msg->ibm_type) == 1);
- LASSERT (sizeof(msg->ibm_credits) == 1);
- }
+ tx = kibnal_find_waiting_tx_locked(conn, txtype, cookie);
+ if (tx == NULL) {
+ spin_unlock(&conn->ibc_lock);
- if (msg->ibm_version != IBNAL_MSG_VERSION) {
- CERROR ("Incompatible msg version %d (%d expected)\n",
- msg->ibm_version, IBNAL_MSG_VERSION);
- goto failed;
+ CWARN("Unmatched completion type %x cookie "LPX64" from %s\n",
+ txtype, cookie, libcfs_nid2str(conn->ibc_peer->ibp_nid));
+ kibnal_close_conn (conn, -EPROTO);
+ return;
}
-#if IBNAL_CKSUM
- if (nob != msg->ibm_nob) {
- CERROR ("Unexpected # bytes %d (%d expected)\n", nob, msg->ibm_nob);
- goto failed;
+ if (tx->tx_status == 0) { /* success so far */
+ if (status < 0) { /* failed? */
+ tx->tx_status = status;
+ } else if (txtype == IBNAL_MSG_GET_REQ) {
+ lnet_set_reply_msg_len(kibnal_data.kib_ni,
+ tx->tx_lntmsg[1], status);
+ }
}
- msg_cksum = le32_to_cpu(msg->ibm_cksum);
- msg->ibm_cksum = 0;
- computed_cksum = kibnal_cksum (msg, nob);
-
- if (msg_cksum != computed_cksum) {
- CERROR ("Checksum failure %d: (%d expected)\n",
- computed_cksum, msg_cksum);
-// goto failed;
+ tx->tx_waiting = 0;
+
+ idle = !tx->tx_queued && (tx->tx_sending == 0);
+ if (idle)
+ list_del(&tx->tx_list);
+
+ spin_unlock(&conn->ibc_lock);
+
+ if (idle)
+ kibnal_tx_done(tx);
+}
+
+void
+kibnal_send_completion (kib_conn_t *conn, int type, int status, __u64 cookie)
+{
+ kib_tx_t *tx = kibnal_get_idle_tx();
+
+ if (tx == NULL) {
+ CERROR("Can't get tx for completion %x for %s\n",
+ type, libcfs_nid2str(conn->ibc_peer->ibp_nid));
+ return;
}
- CDEBUG(D_NET, "cksum %x, nob %d\n", computed_cksum, nob);
-#endif
- /* Have I received credits that will let me send? */
- credits = msg->ibm_credits;
+ tx->tx_msg->ibm_u.completion.ibcm_status = status;
+ tx->tx_msg->ibm_u.completion.ibcm_cookie = cookie;
+ kibnal_init_tx_msg(tx, type, sizeof(kib_completion_msg_t));
+
+ kibnal_queue_tx(tx, conn);
+}
+
+void
+kibnal_handle_rx (kib_rx_t *rx)
+{
+ kib_msg_t *msg = rx->rx_msg;
+ kib_conn_t *conn = rx->rx_conn;
+ int credits = msg->ibm_credits;
+ kib_tx_t *tx;
+ int rc = 0;
+ int repost = 1;
+ int rsrvd_credit = 0;
+ int rc2;
+
+ LASSERT (conn->ibc_state >= IBNAL_CONN_ESTABLISHED);
+
+ CDEBUG (D_NET, "Received %x[%d] from %s\n",
+ msg->ibm_type, credits, libcfs_nid2str(conn->ibc_peer->ibp_nid));
+
if (credits != 0) {
- spin_lock_irqsave(&conn->ibc_lock, flags);
+ /* Have I received credits that will let me send? */
+ spin_lock(&conn->ibc_lock);
conn->ibc_credits += credits;
- spin_unlock_irqrestore(&conn->ibc_lock, flags);
-
+ spin_unlock(&conn->ibc_lock);
+
kibnal_check_sends(conn);
}
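+
+ /* Dispatch on message type; 'rc', 'repost' and 'rsrvd_credit' set
+ * below control error handling and how this rx buffer is returned */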
switch (msg->ibm_type) {
+ default:
+ CERROR("Bad IBNAL message type %x from %s\n",
+ msg->ibm_type, libcfs_nid2str(conn->ibc_peer->ibp_nid));
+ rc = -EPROTO;
+ break;
+
case IBNAL_MSG_NOOP:
- kibnal_post_rx (rx, 1);
- return;
+ break;
case IBNAL_MSG_IMMEDIATE:
- if (nob < base_nob + sizeof (kib_immediate_msg_t)) {
- CERROR ("Short IMMEDIATE from "LPX64": %d\n",
- conn->ibc_peer->ibp_nid, nob);
- goto failed;
- }
+ rc = lnet_parse(kibnal_data.kib_ni, &msg->ibm_u.immediate.ibim_hdr,
+ msg->ibm_srcnid, rx, 0);
+ repost = rc < 0; /* repost on error */
break;
-
- case IBNAL_MSG_PUT_RDMA:
- case IBNAL_MSG_GET_RDMA:
- if (nob < base_nob + sizeof (kib_rdma_msg_t)) {
- CERROR ("Short RDMA msg from "LPX64": %d\n",
- conn->ibc_peer->ibp_nid, nob);
- goto failed;
- }
- if (flipped)
- __swab32(msg->ibm_u.rdma.ibrm_num_descs);
- CDEBUG(D_NET, "%d RDMA: cookie "LPX64":\n",
- msg->ibm_type, msg->ibm_u.rdma.ibrm_cookie);
+ case IBNAL_MSG_PUT_REQ:
+ rc = lnet_parse(kibnal_data.kib_ni, &msg->ibm_u.putreq.ibprm_hdr,
+ msg->ibm_srcnid, rx, 1);
+ repost = rc < 0; /* repost on error */
+ break;
- if ((msg->ibm_u.rdma.ibrm_num_descs > PTL_MD_MAX_IOV) ||
- (kib_rdma_msg_len(msg->ibm_u.rdma.ibrm_num_descs) >
- min(nob, IBNAL_MSG_SIZE))) {
- CERROR ("num_descs %d too large\n",
- msg->ibm_u.rdma.ibrm_num_descs);
- goto failed;
- }
+ case IBNAL_MSG_PUT_NAK:
+ rsrvd_credit = 1; /* rdma reply (was pre-reserved) */
- if (flipped) {
- __swab32(msg->ibm_u.rdma.rd_key);
- }
+ CWARN ("PUT_NACK from %s\n", libcfs_nid2str(conn->ibc_peer->ibp_nid));
+ kibnal_handle_completion(conn, IBNAL_MSG_PUT_REQ,
+ msg->ibm_u.completion.ibcm_status,
+ msg->ibm_u.completion.ibcm_cookie);
+ break;
- for(i = 0; i < msg->ibm_u.rdma.ibrm_num_descs; i++) {
- kib_rdma_desc_t *desc = &msg->ibm_u.rdma.ibrm_desc[i];
+ case IBNAL_MSG_PUT_ACK:
+ rsrvd_credit = 1; /* rdma reply (was pre-reserved) */
- if (flipped) {
- __swab32(desc->rd_nob);
- __swab64(desc->rd_addr);
- }
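+ /* Find the PUT_REQ tx this ACK matches and take it off the active
+ * list while the RDMA is set up */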
+ spin_lock(&conn->ibc_lock);
+ tx = kibnal_find_waiting_tx_locked(conn, IBNAL_MSG_PUT_REQ,
+ msg->ibm_u.putack.ibpam_src_cookie);
+ if (tx != NULL)
+ list_del(&tx->tx_list);
+ spin_unlock(&conn->ibc_lock);
- CDEBUG(D_NET, " key %x, " "addr "LPX64", nob %u\n",
- msg->ibm_u.rdma.rd_key, desc->rd_addr, desc->rd_nob);
+ if (tx == NULL) {
+ CERROR("Unmatched PUT_ACK from %s\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid));
+ rc = -EPROTO;
+ break;
}
+
+ LASSERT (tx->tx_waiting);
+ /* CAVEAT EMPTOR: I could be racing with tx_complete, but...
+ * (a) I can overwrite tx_msg since my peer has received it!
+ * (b) tx_waiting set tells tx_complete() it's not done. */
+
+ tx->tx_nwrq = 0; /* overwrite PUT_REQ */
+
+ rc2 = kibnal_init_rdma(tx, IBNAL_MSG_PUT_DONE,
+ kibnal_rd_size(&msg->ibm_u.putack.ibpam_rd),
+ &msg->ibm_u.putack.ibpam_rd,
+ msg->ibm_u.putack.ibpam_dst_cookie);
+ if (rc2 < 0)
+ CERROR("Can't setup rdma for PUT to %s: %d\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid), rc2);
+
+ spin_lock(&conn->ibc_lock);
+ if (tx->tx_status == 0 && rc2 < 0)
+ tx->tx_status = rc2;
+ tx->tx_waiting = 0; /* clear waiting and queue atomically */
+ kibnal_queue_tx_locked(tx, conn);
+ spin_unlock(&conn->ibc_lock);
break;
-
+
case IBNAL_MSG_PUT_DONE:
+ /* This buffer was pre-reserved by not returning the credit
+ * when the PUT_REQ's buffer was reposted, so I just return it
+ * now */
+ kibnal_handle_completion(conn, IBNAL_MSG_PUT_ACK,
+ msg->ibm_u.completion.ibcm_status,
+ msg->ibm_u.completion.ibcm_cookie);
+ break;
+
+ case IBNAL_MSG_GET_REQ:
+ rc = lnet_parse(kibnal_data.kib_ni, &msg->ibm_u.get.ibgm_hdr,
+ msg->ibm_srcnid, rx, 1);
+ repost = rc < 0; /* repost on error */
+ break;
+
case IBNAL_MSG_GET_DONE:
- if (nob < base_nob + sizeof (kib_completion_msg_t)) {
- CERROR ("Short COMPLETION msg from "LPX64": %d\n",
- conn->ibc_peer->ibp_nid, nob);
- goto failed;
- }
- if (flipped)
- __swab32s(&msg->ibm_u.completion.ibcm_status);
-
- CDEBUG(D_NET, "%d DONE: cookie "LPX64", status %d\n",
- msg->ibm_type, msg->ibm_u.completion.ibcm_cookie,
- msg->ibm_u.completion.ibcm_status);
-
- kibnal_complete_passive_rdma (conn,
- msg->ibm_u.completion.ibcm_cookie,
- msg->ibm_u.completion.ibcm_status);
- kibnal_post_rx (rx, 1);
- return;
-
- default:
- CERROR ("Can't parse type from "LPX64": %d\n",
- conn->ibc_peer->ibp_nid, msg->ibm_type);
- goto failed;
+ rsrvd_credit = 1; /* rdma reply (was pre-reserved) */
+
+ kibnal_handle_completion(conn, IBNAL_MSG_GET_REQ,
+ msg->ibm_u.completion.ibcm_status,
+ msg->ibm_u.completion.ibcm_cookie);
+ break;
}
- /* schedule for kibnal_rx() in thread context */
- spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
-
- list_add_tail (&rx->rx_list, &kibnal_data.kib_sched_rxq);
- wake_up (&kibnal_data.kib_sched_waitq);
-
- spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags);
+ if (rc < 0) /* protocol error */
+ kibnal_close_conn(conn, rc);
- return;
-
- failed:
- CDEBUG(D_NET, "rx %p conn %p\n", rx, conn);
- kibnal_close_conn(conn, -ECONNABORTED);
+ if (repost) {
+ if (conn->ibc_version == IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD)
+ rsrvd_credit = 0; /* peer isn't pre-reserving */
- /* Don't re-post rx & drop its ref on conn */
- kibnal_put_conn(conn);
+ kibnal_post_rx(rx, !rsrvd_credit, rsrvd_credit);
+ }
}
-static void
-kibnal_rx (kib_rx_t *rx)
+void
+kibnal_rx_complete (kib_rx_t *rx, vv_comp_status_t vvrc, int nob, __u64 rxseq)
{
- kib_msg_t *msg = rx->rx_msg;
+ kib_msg_t *msg = rx->rx_msg;
+ kib_conn_t *conn = rx->rx_conn;
+ unsigned long flags;
+ int rc;
- /* Clear flag so I can detect if I've sent an RDMA completion */
- rx->rx_rdma = 0;
+ CDEBUG(D_NET, "rx %p conn %p\n", rx, conn);
+ LASSERT (rx->rx_nob < 0); /* was posted */
+ rx->rx_nob = 0; /* isn't now */
- switch (msg->ibm_type) {
- case IBNAL_MSG_GET_RDMA:
- lib_parse(&kibnal_lib, &msg->ibm_u.rdma.ibrm_hdr, rx);
- /* If the incoming get was matched, I'll have initiated the
- * RDMA and the completion message... */
- if (rx->rx_rdma)
- break;
+ if (conn->ibc_state > IBNAL_CONN_ESTABLISHED)
+ goto ignore;
- /* Otherwise, I'll send a failed completion now to prevent
- * the peer's GET blocking for the full timeout. */
- CERROR ("Completing unmatched RDMA GET from "LPX64"\n",
- rx->rx_conn->ibc_peer->ibp_nid);
- kibnal_start_active_rdma (IBNAL_MSG_GET_DONE, -EIO,
- rx, NULL, 0, NULL, NULL, 0, 0);
- break;
-
- case IBNAL_MSG_PUT_RDMA:
- lib_parse(&kibnal_lib, &msg->ibm_u.rdma.ibrm_hdr, rx);
- if (rx->rx_rdma)
- break;
- /* This is most unusual, since even if lib_parse() didn't
- * match anything, it should have asked us to read (and
- * discard) the payload. The portals header must be
- * inconsistent with this message type, so it's the
- * sender's fault for sending garbage and she can time
- * herself out... */
- CERROR ("Uncompleted RMDA PUT from "LPX64"\n",
- rx->rx_conn->ibc_peer->ibp_nid);
- break;
+ if (vvrc != vv_comp_status_success) {
+ CERROR("Rx from %s failed: %d\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid), vvrc);
+ goto failed;
+ }
- case IBNAL_MSG_IMMEDIATE:
- lib_parse(&kibnal_lib, &msg->ibm_u.immediate.ibim_hdr, rx);
- LASSERT (!rx->rx_rdma);
- break;
-
- default:
- LBUG();
- break;
+ rc = kibnal_unpack_msg(msg, conn->ibc_version, nob);
+ if (rc != 0) {
+ CERROR ("Error %d unpacking rx from %s\n",
+ rc, libcfs_nid2str(conn->ibc_peer->ibp_nid));
+ goto failed;
+ }
+
+ rx->rx_nob = nob; /* Can trust 'nob' now */
+
+ if (!lnet_ptlcompat_matchnid(conn->ibc_peer->ibp_nid,
+ msg->ibm_srcnid) ||
+ !lnet_ptlcompat_matchnid(kibnal_data.kib_ni->ni_nid,
+ msg->ibm_dstnid) ||
+ msg->ibm_srcstamp != conn->ibc_incarnation ||
+ msg->ibm_dststamp != kibnal_data.kib_incarnation) {
+ CERROR ("Stale rx from %s\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid));
+ goto failed;
+ }
+
+ if (msg->ibm_seq != rxseq) {
+ CERROR ("Out-of-sequence rx from %s"
+ ": got "LPD64" but expected "LPD64"\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid),
+ msg->ibm_seq, rxseq);
+ goto failed;
+ }
+
+ /* set time last known alive */
+ kibnal_peer_alive(conn->ibc_peer);
+
+ /* racing with connection establishment/teardown! */
+
+ if (conn->ibc_state < IBNAL_CONN_ESTABLISHED) {
+ write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
+ /* must check holding global lock to eliminate race */
+ if (conn->ibc_state < IBNAL_CONN_ESTABLISHED) {
+ list_add_tail(&rx->rx_list, &conn->ibc_early_rxs);
+ write_unlock_irqrestore(&kibnal_data.kib_global_lock,
+ flags);
+ return;
+ }
+ write_unlock_irqrestore(&kibnal_data.kib_global_lock,
+ flags);
}
+ kibnal_handle_rx(rx);
+ return;
- kibnal_post_rx (rx, 1);
+ failed:
+ CDEBUG(D_NET, "rx %p conn %p\n", rx, conn);
+ kibnal_close_conn(conn, -EIO);
+ ignore:
+ /* Don't re-post rx & drop its ref on conn */
+ kibnal_conn_decref(conn);
}
-static struct page *
+struct page *
kibnal_kvaddr_to_page (unsigned long vaddr)
{
struct page *page;
if (vaddr >= VMALLOC_START &&
- vaddr < VMALLOC_END)
+ vaddr < VMALLOC_END) {
page = vmalloc_to_page ((void *)vaddr);
-#if CONFIG_HIGHMEM
- else if (vaddr >= PKMAP_BASE &&
- vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE))
- page = vmalloc_to_page ((void *)vaddr);
- /* in 2.4 ^ just walks the page tables */
+ LASSERT (page != NULL);
+ return page;
+ }
+#ifdef CONFIG_HIGHMEM
+ if (vaddr >= PKMAP_BASE &&
+ vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE)) {
+ /* Highmem addresses are only used for bulk (kiov) I/O, so we
+ * should never be asked to find a page for one here */
+ CERROR("Unexpected highmem address %lx\n", vaddr);
+ LBUG();
+ }
#endif
- else
- page = virt_to_page (vaddr);
-
- if (!VALID_PAGE (page))
- page = NULL;
-
+ page = virt_to_page (vaddr);
+ LASSERT (page != NULL);
return page;
}
-static void
-kibnal_fill_ibrm(kib_tx_t *tx, struct page *page, unsigned long page_offset,
- unsigned long len, int active)
+#if !IBNAL_USE_FMR
+int
+kibnal_append_rdfrag(kib_rdma_desc_t *rd, int active, struct page *page,
+ unsigned long page_offset, unsigned long len)
{
- kib_rdma_msg_t *ibrm = &tx->tx_msg->ibm_u.rdma;
- kib_rdma_desc_t *desc;
- vv_l_key_t l_key;
- vv_r_key_t r_key;
- void *addr;
- vv_mem_reg_h_t mem_h;
- vv_return_t retval;
-
- LASSERTF(ibrm->ibrm_num_descs < PTL_MD_MAX_IOV, "%u\n",
- ibrm->ibrm_num_descs);
-
- desc = &ibrm->ibrm_desc[ibrm->ibrm_num_descs];
-
- addr = page_address(page) + page_offset;
-
- /* TODO: This next step is only needed to get either the lkey
- * or the rkey. However they should be the same than for the
- * tx buffer, so we might as well use it. */
- retval = vv_get_gen_mr_attrib(kibnal_data.kib_hca,
- addr,
- len,
- &mem_h,
- &l_key,
- &r_key);
- if (retval) {
- CERROR("vv_get_gen_mr_attrib failed: %d", retval);
- /* TODO: this shouldn't really fail, but what if? */
- return;
- }
+ kib_rdma_frag_t *frag = &rd->rd_frags[rd->rd_nfrag];
+ vv_l_key_t l_key;
+ vv_r_key_t r_key;
+ __u64 addr;
+ __u64 frag_addr;
+ vv_mem_reg_h_t mem_h;
+ vv_return_t vvrc;
+
+ if (rd->rd_nfrag >= IBNAL_MAX_RDMA_FRAGS) {
+ CERROR ("Too many RDMA fragments\n");
+ return -EMSGSIZE;
+ }
+
+ /* Try to create an address that adaptor-tavor will munge into a valid
+ * network address, given how it maps all phys mem into 1 region */
+ addr = lnet_page2phys(page) + page_offset + PAGE_OFFSET;
+
+ /* NB this relies entirely on there being a single region for the whole
+ * of memory, since "high" memory will wrap in the (void *) cast! */
+ vvrc = vv_get_gen_mr_attrib(kibnal_data.kib_hca,
+ (void *)((unsigned long)addr),
+ len, &mem_h, &l_key, &r_key);
+ LASSERT (vvrc == vv_return_ok);
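+
+ /* All frags in a single RDMA descriptor must share one key */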
if (active) {
- ibrm->rd_key = l_key;
+ if (rd->rd_nfrag == 0) {
+ rd->rd_key = l_key;
+ } else if (l_key != rd->rd_key) {
+ CERROR ("> 1 key for single RDMA desc\n");
+ return -EINVAL;
+ }
+ frag_addr = addr;
} else {
- ibrm->rd_key = r_key;
+ if (rd->rd_nfrag == 0) {
+ rd->rd_key = r_key;
+ } else if (r_key != rd->rd_key) {
+ CERROR ("> 1 key for single RDMA desc\n");
+ return -EINVAL;
+ }
- vv_va2advertise_addr(kibnal_data.kib_hca, addr, &addr);
+ frag_addr = kibnal_addr2net(addr);
}
- desc->rd_addr = (__u64)(unsigned long)addr;
- desc->rd_nob = len; /*PAGE_SIZE - kiov->kiov_offset; */
-
- ibrm->ibrm_num_descs++;
-}
-
-static int
-kibnal_map_rdma_iov(kib_tx_t *tx, unsigned long vaddr, int nob, int active)
-{
- struct page *page;
- int page_offset, len;
-
- while (nob > 0) {
- page = kibnal_kvaddr_to_page(vaddr);
- if (page == NULL)
- return -EFAULT;
+ kibnal_rf_set(frag, frag_addr, len);
- page_offset = vaddr & (PAGE_SIZE - 1);
- len = min(nob, (int)PAGE_SIZE - page_offset);
-
- kibnal_fill_ibrm(tx, page, page_offset, len, active);
- nob -= len;
- vaddr += len;
- }
+ CDEBUG(D_NET,"map frag [%d][%d %x %08x%08x] "LPX64"\n",
+ rd->rd_nfrag, frag->rf_nob, rd->rd_key,
+ frag->rf_addr_hi, frag->rf_addr_lo, frag_addr);
+ rd->rd_nfrag++;
return 0;
}
-static int
-kibnal_map_iov (kib_tx_t *tx, vv_access_con_bit_mask_t access,
- int niov, struct iovec *iov, int offset, int nob, int active)
-
+int
+kibnal_setup_rd_iov(kib_tx_t *tx, kib_rdma_desc_t *rd,
+ vv_access_con_bit_mask_t access,
+ unsigned int niov, struct iovec *iov, int offset, int nob)
{
- void *vaddr;
- vv_return_t retval;
+ /* active if I'm sending */
+ int active = ((access & vv_acc_r_mem_write) == 0);
+ int fragnob;
+ int rc;
+ unsigned long vaddr;
+ struct page *page;
+ int page_offset;
LASSERT (nob > 0);
LASSERT (niov > 0);
- LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED);
+ LASSERT ((rd != tx->tx_rd) == !active);
while (offset >= iov->iov_len) {
offset -= iov->iov_len;
LASSERT (niov > 0);
}
- if (nob > iov->iov_len - offset) {
- CERROR ("Can't map multiple vaddr fragments\n");
- return (-EMSGSIZE);
- }
+ rd->rd_nfrag = 0;
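+
+ /* Walk the iov, appending one RDMA frag for each (part) page
+ * the payload covers */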
+ do {
+ LASSERT (niov > 0);
- /* our large contiguous iov could be backed by multiple physical
- * pages. */
- if (kibnal_whole_mem()) {
- int rc;
- tx->tx_msg->ibm_u.rdma.ibrm_num_descs = 0;
- rc = kibnal_map_rdma_iov(tx, (unsigned long)iov->iov_base +
- offset, nob, active);
- if (rc != 0) {
- CERROR ("Can't map iov: %d\n", rc);
- return rc;
+ vaddr = ((unsigned long)iov->iov_base) + offset;
+ page_offset = vaddr & (PAGE_SIZE - 1);
+ page = kibnal_kvaddr_to_page(vaddr);
+ if (page == NULL) {
+ CERROR ("Can't find page\n");
+ return -EFAULT;
}
- return 0;
- }
- vaddr = (void *)(((unsigned long)iov->iov_base) + offset);
- tx->tx_md.md_addr = (__u64)((unsigned long)vaddr);
+ fragnob = min((int)(iov->iov_len - offset), nob);
+ fragnob = min(fragnob, (int)PAGE_SIZE - page_offset);
- retval = vv_mem_region_register(kibnal_data.kib_hca, vaddr, nob,
- kibnal_data.kib_pd, access,
- &tx->tx_md.md_handle, &tx->tx_md.md_lkey,
- &tx->tx_md.md_rkey);
- if (retval != 0) {
- CERROR ("Can't map vaddr %p: %d\n", vaddr, retval);
- return -EINVAL;
- }
+ rc = kibnal_append_rdfrag(rd, active, page,
+ page_offset, fragnob);
+ if (rc != 0)
+ return rc;
- tx->tx_mapped = KIB_TX_MAPPED;
- return (0);
+ if (offset + fragnob < iov->iov_len) {
+ offset += fragnob;
+ } else {
+ offset = 0;
+ iov++;
+ niov--;
+ }
+ nob -= fragnob;
+ } while (nob > 0);
+
+ return 0;
}
-static int
-kibnal_map_kiov (kib_tx_t *tx, vv_access_con_bit_mask_t access,
- int nkiov, ptl_kiov_t *kiov,
- int offset, int nob, int active)
+int
+kibnal_setup_rd_kiov (kib_tx_t *tx, kib_rdma_desc_t *rd,
+ vv_access_con_bit_mask_t access,
+ int nkiov, lnet_kiov_t *kiov, int offset, int nob)
{
- vv_phy_list_t phys_pages;
- vv_phy_buf_t *phys_buf = NULL;
- int page_offset;
- int nphys;
- int resid;
- int phys_size = 0;
- int i, rc = 0;
- vv_return_t retval;
+ /* active if I'm sending */
+ int active = ((access & vv_acc_r_mem_write) == 0);
+ int fragnob;
+ int rc;
CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob);
LASSERT (nob > 0);
LASSERT (nkiov > 0);
- LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED);
+ LASSERT ((rd != tx->tx_rd) == !active);
while (offset >= kiov->kiov_len) {
offset -= kiov->kiov_len;
LASSERT (nkiov > 0);
}
- page_offset = kiov->kiov_offset + offset;
- nphys = 1;
+ rd->rd_nfrag = 0;
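+
+ /* Walk the kiov, appending one RDMA frag per page fragment */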
+ do {
+ LASSERT (nkiov > 0);
+ fragnob = min((int)(kiov->kiov_len - offset), nob);
- if (!kibnal_whole_mem()) {
- phys_size = nkiov * sizeof(vv_phy_buf_t);
- PORTAL_ALLOC(phys_buf, phys_size);
+ rc = kibnal_append_rdfrag(rd, active, kiov->kiov_page,
+ kiov->kiov_offset + offset,
+ fragnob);
+ if (rc != 0)
+ return rc;
- if (phys_buf == NULL) {
- CERROR ("Can't allocate phys_buf\n");
- return (-ENOMEM);
- }
+ offset = 0;
+ kiov++;
+ nkiov--;
+ nob -= fragnob;
+ } while (nob > 0);
- phys_buf[0].start = kibnal_page2phys(kiov->kiov_page);
- phys_buf[0].size = PAGE_SIZE;
+ return 0;
+}
+#else
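+/* Map the physical pages collected in tx_pages through the tx's FMR and
+ * record the resulting key and mapped address in the RDMA descriptor */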
+int
+kibnal_map_tx (kib_tx_t *tx, kib_rdma_desc_t *rd, int active,
+ int npages, unsigned long page_offset, int nob)
+{
+ vv_return_t vvrc;
+ vv_fmr_map_t map_props;
- } else {
- tx->tx_msg->ibm_u.rdma.ibrm_num_descs = 0;
- kibnal_fill_ibrm(tx, kiov->kiov_page, kiov->kiov_offset,
- kiov->kiov_len, active);
+ LASSERT ((rd != tx->tx_rd) == !active);
+ LASSERT (!tx->tx_md.md_active);
+ LASSERT (tx->tx_md.md_fmrcount > 0);
+ LASSERT (page_offset < PAGE_SIZE);
+ LASSERT (npages >= (1 + ((page_offset + nob - 1) >> PAGE_SHIFT)));
+ LASSERT (npages <= LNET_MAX_IOV);
+
+ memset(&map_props, 0, sizeof(map_props));
+
+ map_props.start = (void *)page_offset;
+ map_props.size = nob;
+ map_props.page_array_len = npages;
+ map_props.page_array = tx->tx_pages;
+
+ vvrc = vv_map_fmr(kibnal_data.kib_hca, tx->tx_md.md_fmrhandle,
+ &map_props, &tx->tx_md.md_lkey, &tx->tx_md.md_rkey);
+ if (vvrc != vv_return_ok) {
+ CERROR ("Can't map vaddr %p for %d in %d pages: %d\n",
+ map_props.start, nob, npages, vvrc);
+ return -EFAULT;
}
- resid = nob - (kiov->kiov_len - offset);
+ tx->tx_md.md_addr = (unsigned long)map_props.start;
+ tx->tx_md.md_active = 1;
+ tx->tx_md.md_fmrcount--;
- while (resid > 0) {
- kiov++;
- nkiov--;
- LASSERT (nkiov > 0);
+ rd->rd_key = active ? tx->tx_md.md_lkey : tx->tx_md.md_rkey;
+ rd->rd_nob = nob;
+ rd->rd_addr = tx->tx_md.md_addr;
- if (kiov->kiov_offset != 0 ||
- ((resid > PAGE_SIZE) &&
- kiov->kiov_len < PAGE_SIZE)) {
- /* Can't have gaps */
- CERROR ("Can't make payload contiguous in I/O VM:"
- "page %d, offset %d, len %d \n", nphys,
- kiov->kiov_offset, kiov->kiov_len);
+ /* Compensate for adaptor-tavor's munging of gatherlist addresses */
+ if (active)
+ rd->rd_addr += PAGE_OFFSET;
- for (i = -nphys; i < nkiov; i++)
- {
- CERROR("kiov[%d] %p +%d for %d\n",
- i, kiov[i].kiov_page, kiov[i].kiov_offset, kiov[i].kiov_len);
- }
-
- rc = -EINVAL;
- goto out;
- }
+ return 0;
+}
- if (nphys == PTL_MD_MAX_IOV) {
- CERROR ("payload too big (%d)\n", nphys);
- rc = -EMSGSIZE;
- goto out;
- }
+int
+kibnal_setup_rd_iov (kib_tx_t *tx, kib_rdma_desc_t *rd,
+ vv_access_con_bit_mask_t access,
+ unsigned int niov, struct iovec *iov, int offset, int nob)
+{
+ /* active if I'm sending */
+ int active = ((access & vv_acc_r_mem_write) == 0);
+ int resid;
+ int fragnob;
+ struct page *page;
+ int npages;
+ unsigned long page_offset;
+ unsigned long vaddr;
- if (!kibnal_whole_mem()) {
- LASSERT (nphys * sizeof (vv_phy_buf_t) < phys_size);
- phys_buf[nphys].start = kibnal_page2phys(kiov->kiov_page);
- phys_buf[nphys].size = PAGE_SIZE;
+ LASSERT (nob > 0);
+ LASSERT (niov > 0);
- } else {
- if (kib_rdma_msg_len(nphys) > IBNAL_MSG_SIZE) {
- CERROR ("payload too big (%d)\n", nphys);
- rc = -EMSGSIZE;
- goto out;
- }
- kibnal_fill_ibrm(tx, kiov->kiov_page,
- kiov->kiov_offset, kiov->kiov_len,
- active);
- }
+ while (offset >= iov->iov_len) {
+ offset -= iov->iov_len;
+ niov--;
+ iov++;
+ LASSERT (niov > 0);
+ }
- nphys ++;
- resid -= PAGE_SIZE;
+ if (nob > iov->iov_len - offset) {
+ CERROR ("Can't map multiple vaddr fragments\n");
+ return (-EMSGSIZE);
}
- if (kibnal_whole_mem())
- goto out;
+ vaddr = ((unsigned long)iov->iov_base) + offset;
-#if 0
- CWARN ("nphys %d, nob %d, page_offset %d\n", nphys, nob, page_offset);
- for (i = 0; i < nphys; i++)
- CWARN (" [%d] "LPX64"\n", i, phys[i]);
-#endif
+ page_offset = vaddr & (PAGE_SIZE - 1);
+ resid = nob;
+ npages = 0;
-#if IBNAL_FMR
-#error "vibnal hasn't learned about FMR yet"
- rc = ib_fmr_register_physical (kibnal_data.kib_fmr_pool,
- phys_pages, nphys,
- &tx->tx_md.md_addr,
- page_offset,
- &tx->tx_md.md_handle.fmr,
- &tx->tx_md.md_lkey,
- &tx->tx_md.md_rkey);
-#else
- retval = vv_phy_mem_region_register(kibnal_data.kib_hca,
- &phys_pages,
- IBNAL_RDMA_BASE,
- nphys,
- 0, /* offset */
- kibnal_data.kib_pd,
- vv_acc_l_mem_write | vv_acc_r_mem_write | vv_acc_r_mem_read | vv_acc_mem_bind, /* TODO: translated as-is, but seems incorrect or too much */
- &tx->tx_md.md_handle,
- &tx->tx_md.md_addr,
- &tx->tx_md.md_lkey,
- &tx->tx_md.md_rkey);
-#endif
- if (retval == vv_return_ok) {
- CDEBUG(D_NET, "Mapped %d pages %d bytes @ offset %d: lkey %x, rkey %x\n",
- nphys, nob, page_offset, tx->tx_md.md_lkey, tx->tx_md.md_rkey);
-#if IBNAL_FMR
- tx->tx_mapped = KIB_TX_MAPPED_FMR;
-#else
- tx->tx_mapped = KIB_TX_MAPPED;
-#endif
- } else {
- CERROR ("Can't map phys_pages: %d\n", retval);
- rc = -EFAULT;
+ do {
+ LASSERT (npages < LNET_MAX_IOV);
+
+ page = kibnal_kvaddr_to_page(vaddr);
+ if (page == NULL) {
+ CERROR("Can't find page for %lu\n", vaddr);
+ return -EFAULT;
+ }
+
+ tx->tx_pages[npages++] = lnet_page2phys(page);
+
+ fragnob = PAGE_SIZE - (vaddr & (PAGE_SIZE - 1));
+ vaddr += fragnob;
+ resid -= fragnob;
+
+ } while (resid > 0);
+
+ return kibnal_map_tx(tx, rd, active, npages, page_offset, nob);
+}
+
+int
+kibnal_setup_rd_kiov (kib_tx_t *tx, kib_rdma_desc_t *rd,
+ vv_access_con_bit_mask_t access,
+ int nkiov, lnet_kiov_t *kiov, int offset, int nob)
+{
+ /* active if I'm sending */
+ int active = ((access & vv_acc_r_mem_write) == 0);
+ int resid;
+ int npages;
+ unsigned long page_offset;
+
+ CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob);
+
+ LASSERT (nob > 0);
+ LASSERT (nkiov > 0);
+ LASSERT (nkiov <= LNET_MAX_IOV);
+ LASSERT (!tx->tx_md.md_active);
+ LASSERT ((rd != tx->tx_rd) == !active);
+
+ while (offset >= kiov->kiov_len) {
+ offset -= kiov->kiov_len;
+ nkiov--;
+ kiov++;
+ LASSERT (nkiov > 0);
}
- out:
- if (phys_buf != NULL)
- PORTAL_FREE(phys_buf, phys_size);
+ page_offset = kiov->kiov_offset + offset;
+
+ resid = offset + nob;
+ npages = 0;
+
+ do {
+ LASSERT (npages < LNET_MAX_IOV);
+ LASSERT (nkiov > 0);
+
+ if ((npages > 0 && kiov->kiov_offset != 0) ||
+ (resid > kiov->kiov_len &&
+ (kiov->kiov_offset + kiov->kiov_len) != PAGE_SIZE)) {
+ /* Can't have gaps */
+ CERROR ("Can't make payload contiguous in I/O VM:"
+ "page %d, offset %d, len %d \n",
+ npages, kiov->kiov_offset, kiov->kiov_len);
+
+ return -EINVAL;
+ }
+
+ tx->tx_pages[npages++] = lnet_page2phys(kiov->kiov_page);
+ resid -= kiov->kiov_len;
+ kiov++;
+ nkiov--;
+ } while (resid > 0);
- return (rc);
+ return kibnal_map_tx(tx, rd, active, npages, page_offset, nob);
}
+#endif
-static kib_conn_t *
+kib_conn_t *
kibnal_find_conn_locked (kib_peer_t *peer)
{
struct list_head *tmp;
void
kibnal_check_sends (kib_conn_t *conn)
{
- unsigned long flags;
kib_tx_t *tx;
+ vv_return_t vvrc;
int rc;
- int i;
+ int consume_cred;
int done;
- int nwork;
- ENTRY;
+ /* Don't send anything until after the connection is established */
+ if (conn->ibc_state < IBNAL_CONN_ESTABLISHED) {
+ CDEBUG(D_NET, "%s too soon\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid));
+ return;
+ }
+
+ spin_lock(&conn->ibc_lock);
- spin_lock_irqsave (&conn->ibc_lock, flags);
+ LASSERT (conn->ibc_nsends_posted <=
+ *kibnal_tunables.kib_concurrent_sends);
+ LASSERT (conn->ibc_reserved_credits >= 0);
- LASSERT (conn->ibc_nsends_posted <= IBNAL_MSG_QUEUE_SIZE);
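+ /* Move txs blocked on reserved (RDMA reply) credits onto the normal
+ * send queue as reserved rx buffers become available */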
+ while (conn->ibc_reserved_credits > 0 &&
+ !list_empty(&conn->ibc_tx_queue_rsrvd)) {
+ LASSERT (conn->ibc_version !=
+ IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD);
+ tx = list_entry(conn->ibc_tx_queue_rsrvd.next,
+ kib_tx_t, tx_list);
+ list_del(&tx->tx_list);
+ list_add_tail(&tx->tx_list, &conn->ibc_tx_queue);
+ conn->ibc_reserved_credits--;
+ }
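+
+ /* Queue a NOOP if I need to return credits to the peer or it's time
+ * to send a keepalive */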
if (list_empty(&conn->ibc_tx_queue) &&
- conn->ibc_outstanding_credits >= IBNAL_CREDIT_HIGHWATER) {
- spin_unlock_irqrestore(&conn->ibc_lock, flags);
-
- tx = kibnal_get_idle_tx(0); /* don't block */
+ list_empty(&conn->ibc_tx_queue_nocred) &&
+ (conn->ibc_outstanding_credits >= IBNAL_CREDIT_HIGHWATER ||
+ kibnal_send_keepalive(conn))) {
+ spin_unlock(&conn->ibc_lock);
+
+ tx = kibnal_get_idle_tx();
if (tx != NULL)
kibnal_init_tx_msg(tx, IBNAL_MSG_NOOP, 0);
- spin_lock_irqsave(&conn->ibc_lock, flags);
-
- if (tx != NULL) {
- atomic_inc(&conn->ibc_refcount);
+ spin_lock(&conn->ibc_lock);
+
+ if (tx != NULL)
kibnal_queue_tx_locked(tx, conn);
- }
}
- while (!list_empty (&conn->ibc_tx_queue)) {
- tx = list_entry (conn->ibc_tx_queue.next, kib_tx_t, tx_list);
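+ /* Txs on the no-credit queue (RDMA replies) consume an rx the peer
+ * pre-reserved rather than a send credit */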
+ for (;;) {
+ if (!list_empty(&conn->ibc_tx_queue_nocred)) {
+ LASSERT (conn->ibc_version !=
+ IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD);
+ tx = list_entry (conn->ibc_tx_queue_nocred.next,
+ kib_tx_t, tx_list);
+ consume_cred = 0;
+ } else if (!list_empty (&conn->ibc_tx_queue)) {
+ tx = list_entry (conn->ibc_tx_queue.next,
+ kib_tx_t, tx_list);
+ consume_cred = 1;
+ } else {
+ /* nothing waiting */
+ break;
+ }
+ LASSERT (tx->tx_queued);
/* We rely on this for QP sizing */
- LASSERT (tx->tx_nsp > 0 && tx->tx_nsp <= IBNAL_TX_MAX_SG);
+ LASSERT (tx->tx_nwrq > 0 && tx->tx_nwrq <= 1 + IBNAL_MAX_RDMA_FRAGS);
LASSERT (conn->ibc_outstanding_credits >= 0);
LASSERT (conn->ibc_outstanding_credits <= IBNAL_MSG_QUEUE_SIZE);
LASSERT (conn->ibc_credits >= 0);
LASSERT (conn->ibc_credits <= IBNAL_MSG_QUEUE_SIZE);
- /* Not on ibc_rdma_queue */
- LASSERT (!tx->tx_passive_rdma_wait);
+ if (conn->ibc_nsends_posted ==
+ *kibnal_tunables.kib_concurrent_sends) {
+ /* We've got some tx completions outstanding... */
+ CDEBUG(D_NET, "%s: posted enough\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid));
+ break;
+ }
- if (conn->ibc_nsends_posted == IBNAL_MSG_QUEUE_SIZE)
- GOTO(out, 0);
+ if (consume_cred) {
+ if (conn->ibc_credits == 0) { /* no credits */
+ CDEBUG(D_NET, "%s: no credits\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid));
+ break;
+ }
- if (conn->ibc_credits == 0) /* no credits */
- GOTO(out, 1);
-
- if (conn->ibc_credits == 1 && /* last credit reserved for */
- conn->ibc_outstanding_credits == 0) /* giving back credits */
- GOTO(out, 2);
+ if (conn->ibc_credits == 1 && /* last credit reserved for */
+ conn->ibc_outstanding_credits == 0) { /* giving back credits */
+ CDEBUG(D_NET, "%s: not using last credit\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid));
+ break;
+ }
+ }
list_del (&tx->tx_list);
+ tx->tx_queued = 0;
+
+ /* NB don't drop ibc_lock before bumping tx_sending */
if (tx->tx_msg->ibm_type == IBNAL_MSG_NOOP &&
(!list_empty(&conn->ibc_tx_queue) ||
- conn->ibc_outstanding_credits < IBNAL_CREDIT_HIGHWATER)) {
+ !list_empty(&conn->ibc_tx_queue_nocred) ||
+ (conn->ibc_outstanding_credits < IBNAL_CREDIT_HIGHWATER &&
+ !kibnal_send_keepalive(conn)))) {
/* redundant NOOP */
- spin_unlock_irqrestore(&conn->ibc_lock, flags);
+ spin_unlock(&conn->ibc_lock);
kibnal_tx_done(tx);
- spin_lock_irqsave(&conn->ibc_lock, flags);
+ spin_lock(&conn->ibc_lock);
+ CDEBUG(D_NET, "%s: redundant noop\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid));
continue;
}
- tx->tx_msg->ibm_credits = conn->ibc_outstanding_credits;
- conn->ibc_outstanding_credits = 0;
+ kibnal_pack_msg(tx->tx_msg, conn->ibc_version,
+ conn->ibc_outstanding_credits,
+ conn->ibc_peer->ibp_nid, conn->ibc_incarnation,
+ conn->ibc_txseq);
+ conn->ibc_txseq++;
+ conn->ibc_outstanding_credits = 0;
conn->ibc_nsends_posted++;
- conn->ibc_credits--;
+ if (consume_cred)
+ conn->ibc_credits--;
+
+ /* CAVEAT EMPTOR! This tx could be the PUT_DONE of an RDMA
+ * PUT. If so, it was first queued here as a PUT_REQ, sent and
+ * stashed on ibc_active_txs, matched by an incoming PUT_ACK,
+ * and then re-queued here. It's (just) possible that
+ * tx_sending is non-zero if we've not done the tx_complete() from
+ * the first send; hence the ++ rather than = below. */
+ tx->tx_sending++;
- /* we only get a tx completion for the final rdma op */
- tx->tx_sending = 0;
- tx->tx_passive_rdma_wait = tx->tx_passive_rdma;
list_add (&tx->tx_list, &conn->ibc_active_txs);
-#if IBNAL_CKSUM
- tx->tx_msg->ibm_cksum = 0;
- tx->tx_msg->ibm_cksum = kibnal_cksum(tx->tx_msg, tx->tx_msg->ibm_nob);
- CDEBUG(D_NET, "cksum %x, nob %d\n", tx->tx_msg->ibm_cksum, tx->tx_msg->ibm_nob);
-#endif
- /* NB the gap between removing tx from the queue and sending it
- * allows message re-ordering to occur */
- LASSERT (tx->tx_nsp > 0);
+ /* Keep holding ibc_lock while posting sends on this
+ * connection; vv_post_send() isn't re-entrant on the same
+ * QP!! */
+ LASSERT (tx->tx_nwrq > 0);
+#if 0
+ if (tx->tx_wrq[0].wr_type == vv_wr_rdma_write)
+ CDEBUG(D_NET, "WORK[0]: RDMA gl %p for %d k %x -> "LPX64" k %x\n",
+ tx->tx_wrq[0].scatgat_list->v_address,
+ tx->tx_wrq[0].scatgat_list->length,
+ tx->tx_wrq[0].scatgat_list->l_key,
+ tx->tx_wrq[0].type.send.send_qp_type.rc_type.r_addr,
+ tx->tx_wrq[0].type.send.send_qp_type.rc_type.r_r_key);
+ else
+ CDEBUG(D_NET, "WORK[0]: %s gl %p for %d k %x\n",
+ tx->tx_wrq[0].wr_type == vv_wr_send ? "SEND" : "????",
+ tx->tx_wrq[0].scatgat_list->v_address,
+ tx->tx_wrq[0].scatgat_list->length,
+ tx->tx_wrq[0].scatgat_list->l_key);
+
+ if (tx->tx_nwrq > 1) {
+ if (tx->tx_wrq[1].wr_type == vv_wr_rdma_write)
+ CDEBUG(D_NET, "WORK[1]: RDMA gl %p for %d k %x -> "LPX64" k %x\n",
+ tx->tx_wrq[1].scatgat_list->v_address,
+ tx->tx_wrq[1].scatgat_list->length,
+ tx->tx_wrq[1].scatgat_list->l_key,
+ tx->tx_wrq[1].type.send.send_qp_type.rc_type.r_addr,
+ tx->tx_wrq[1].type.send.send_qp_type.rc_type.r_r_key);
+ else
+ CDEBUG(D_NET, "WORK[1]: %s gl %p for %d k %x\n",
+ tx->tx_wrq[1].wr_type == vv_wr_send ? "SEND" : "????",
+ tx->tx_wrq[1].scatgat_list->v_address,
+ tx->tx_wrq[1].scatgat_list->length,
+ tx->tx_wrq[1].scatgat_list->l_key);
+ }
+#endif
rc = -ECONNABORTED;
- nwork = 0;
+ vvrc = vv_return_ok;
if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) {
- vv_return_t retval;
-
tx->tx_status = 0;
- rc = 0;
-
- retval = vv_post_send_list(kibnal_data.kib_hca, conn->ibc_qp, tx->tx_nsp, tx->tx_wrq, vv_operation_type_send_rc);
-
- if (retval != 0) {
- CERROR("post send failed with %d\n", retval);
- rc = -ECONNABORTED;
- break;
- }
-
- tx->tx_sending = tx->tx_nsp;
+ vvrc = vv_post_send_list(kibnal_data.kib_hca,
+ conn->ibc_qp,
+ tx->tx_nwrq,
+ tx->tx_wrq,
+ vv_operation_type_send_rc);
+ rc = (vvrc == vv_return_ok) ? 0 : -EIO;
}
+ conn->ibc_last_send = jiffies;
+
if (rc != 0) {
/* NB credits are transferred in the actual
* message, which can only be the last work item */
conn->ibc_outstanding_credits += tx->tx_msg->ibm_credits;
- conn->ibc_credits++;
+ if (consume_cred)
+ conn->ibc_credits++;
conn->ibc_nsends_posted--;
tx->tx_status = rc;
- tx->tx_passive_rdma_wait = 0;
+ tx->tx_waiting = 0;
+ tx->tx_sending--;
- /* TODO: I think this is buggy if vv_post_send_list failed. */
done = (tx->tx_sending == 0);
if (done)
list_del (&tx->tx_list);
-
- spin_unlock_irqrestore (&conn->ibc_lock, flags);
-
+
+ spin_unlock(&conn->ibc_lock);
+
if (conn->ibc_state == IBNAL_CONN_ESTABLISHED)
- CERROR ("Error %d posting transmit to "LPX64"\n",
- rc, conn->ibc_peer->ibp_nid);
+ CERROR ("Error %d posting transmit to %s\n",
+ vvrc, libcfs_nid2str(conn->ibc_peer->ibp_nid));
else
- CDEBUG (D_NET, "Error %d posting transmit to "
- LPX64"\n", rc, conn->ibc_peer->ibp_nid);
+ CDEBUG (D_NET, "Error %d posting transmit to %s\n",
+ rc, libcfs_nid2str(conn->ibc_peer->ibp_nid));
kibnal_close_conn (conn, rc);
kibnal_tx_done (tx);
return;
}
-
}
- EXIT;
-out:
- spin_unlock_irqrestore (&conn->ibc_lock, flags);
+ spin_unlock(&conn->ibc_lock);
}
-static void
-kibnal_tx_callback (vv_wc_t *wc)
+void
+kibnal_tx_complete (kib_tx_t *tx, vv_comp_status_t vvrc)
{
- kib_tx_t *tx = (kib_tx_t *)kibnal_wreqid2ptr(wc->wr_id);
- kib_conn_t *conn;
- unsigned long flags;
+ kib_conn_t *conn = tx->tx_conn;
+ int failed = (vvrc != vv_comp_status_success);
int idle;
- conn = tx->tx_conn;
- LASSERT (conn != NULL);
- LASSERT (tx->tx_sending != 0);
+ CDEBUG(D_NET, "tx %p conn %p sending %d nwrq %d vvrc %d\n",
+ tx, conn, tx->tx_sending, tx->tx_nwrq, vvrc);
+
+ LASSERT (tx->tx_sending > 0);
- CDEBUG(D_NET, "conn %p tx %p [%d/%d]: %d\n", conn, tx,
- tx->tx_sending, tx->tx_nsp, wc->completion_status);
+ if (failed &&
+ tx->tx_status == 0 &&
+ conn->ibc_state == IBNAL_CONN_ESTABLISHED)
+ CDEBUG(D_NETERROR, "tx -> %s type %x cookie "LPX64
+ "sending %d waiting %d: failed %d\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid),
+ tx->tx_msg->ibm_type, tx->tx_cookie,
+ tx->tx_sending, tx->tx_waiting, vvrc);
- spin_lock_irqsave(&conn->ibc_lock, flags);
+ spin_lock(&conn->ibc_lock);
/* I could be racing with rdma completion. Whoever makes 'tx' idle
- * gets to free it, which also drops its ref on 'conn'. If it's
- * not me, then I take an extra ref on conn so it can't disappear
- * under me. */
+ * gets to free it, which also drops its ref on 'conn'. */
tx->tx_sending--;
+ conn->ibc_nsends_posted--;
+
+ if (failed) {
+ tx->tx_waiting = 0;
+ tx->tx_status = -EIO;
+ }
+
idle = (tx->tx_sending == 0) && /* This is the final callback */
- (!tx->tx_passive_rdma_wait); /* Not waiting for RDMA completion */
+ !tx->tx_waiting && /* Not waiting for peer */
+ !tx->tx_queued; /* Not re-queued (PUT_DONE) */
if (idle)
list_del(&tx->tx_list);
- CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
- conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
- atomic_read (&conn->ibc_refcount));
- atomic_inc (&conn->ibc_refcount);
-
- if (tx->tx_sending == 0)
- conn->ibc_nsends_posted--;
-
- if (wc->completion_status != vv_comp_status_success &&
- tx->tx_status == 0)
- tx->tx_status = -ECONNABORTED;
+ kibnal_conn_addref(conn); /* 1 ref for me.... */
- spin_unlock_irqrestore(&conn->ibc_lock, flags);
+ spin_unlock(&conn->ibc_lock);
if (idle)
kibnal_tx_done (tx);
- if (wc->completion_status != vv_comp_status_success) {
- CERROR ("Tx completion to "LPX64" failed: %d\n",
- conn->ibc_peer->ibp_nid, wc->completion_status);
- kibnal_close_conn (conn, -ENETDOWN);
+ if (failed) {
+ kibnal_close_conn (conn, -EIO);
} else {
- /* can I shovel some more sends out the door? */
+ kibnal_peer_alive(conn->ibc_peer);
kibnal_check_sends(conn);
}
- kibnal_put_conn (conn);
-}
-
-void
-kibnal_ca_async_callback(vv_event_record_t ev)
-{
- /* XXX flesh out. this seems largely for async errors */
- CERROR("type: %d, port: %d, data: "LPX64"\n", ev.event_type, ev.port_num, ev.type.data);
-}
-
-void
-kibnal_ca_callback (unsigned long unused_context)
-{
- vv_wc_t wc;
- int armed = 0;
- vv_return_t retval;
-
- for(;;) {
-
- while (vv_poll_for_completion(kibnal_data.kib_hca, kibnal_data.kib_cq, &wc) == vv_return_ok) {
-
- /* We will need to rearm the CQ to avoid a potential race. */
- armed = 0;
-
- if (kibnal_wreqid_is_rx(wc.wr_id))
- kibnal_rx_callback(&wc);
- else
- kibnal_tx_callback(&wc);
- }
-
- if (armed)
- return;
-
- retval = vv_request_completion_notification(kibnal_data.kib_hca, kibnal_data.kib_cq, vv_next_solicit_unsolicit_event);
- if (retval != 0) {
- CERROR ("Failed to re-arm completion queue: %d\n", retval);
- return;
- }
-
- armed = 1;
- }
+ kibnal_conn_decref(conn); /* ...until here */
}
void
kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob)
{
- vv_scatgat_t *gl = &tx->tx_gl[tx->tx_nsp];
- vv_wr_t *wrq = &tx->tx_wrq[tx->tx_nsp];
- int fence;
+ vv_scatgat_t *gl = &tx->tx_gl[tx->tx_nwrq];
+ vv_wr_t *wrq = &tx->tx_wrq[tx->tx_nwrq];
int nob = offsetof (kib_msg_t, ibm_u) + body_nob;
+ __u64 addr = (__u64)((unsigned long)((tx)->tx_msg));
- LASSERT (tx->tx_nsp >= 0 &&
- tx->tx_nsp < sizeof(tx->tx_wrq)/sizeof(tx->tx_wrq[0]));
+ LASSERT (tx->tx_nwrq >= 0 &&
+ tx->tx_nwrq < (1 + IBNAL_MAX_RDMA_FRAGS));
LASSERT (nob <= IBNAL_MSG_SIZE);
-
- tx->tx_msg->ibm_magic = IBNAL_MSG_MAGIC;
- tx->tx_msg->ibm_version = IBNAL_MSG_VERSION;
- tx->tx_msg->ibm_type = type;
-#if IBNAL_CKSUM
- tx->tx_msg->ibm_nob = nob;
-#endif
- /* Fence the message if it's bundled with an RDMA read */
- fence = (tx->tx_nsp > 0) &&
- (type == IBNAL_MSG_PUT_DONE);
+
+ kibnal_init_msg(tx->tx_msg, type, body_nob);
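+
+ /* Describe the message as a single-segment solicited send in the
+ * tx's next work request slot */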
*gl = (vv_scatgat_t) {
- .v_address = (void *)tx->tx_msg,
+ .v_address = KIBNAL_ADDR2SG(addr),
+ .l_key = tx->tx_lkey,
.length = nob,
- .l_key = tx->l_key,
};
- wrq->wr_id = kibnal_ptr2wreqid(tx, 0);
- wrq->completion_notification = 1;
+ memset(wrq, 0, sizeof(*wrq));
+
+ wrq->wr_id = kibnal_ptr2wreqid(tx, IBNAL_WID_TX);
+ wrq->wr_type = vv_wr_send;
wrq->scatgat_list = gl;
wrq->num_of_data_segments = 1;
- wrq->wr_type = vv_wr_send;
-
+ wrq->completion_notification = 1;
wrq->type.send.solicited_event = 1;
+ wrq->type.send.immidiate_data_indicator = 0;
+ wrq->type.send.send_qp_type.rc_type.fance_indicator = 0;
- wrq->type.send.send_qp_type.rc_type.fance_indicator = fence;
-
- tx->tx_nsp++;
+ tx->tx_nwrq++;
}
-static void
-kibnal_queue_tx (kib_tx_t *tx, kib_conn_t *conn)
+int
+kibnal_init_rdma (kib_tx_t *tx, int type, int nob,
+ kib_rdma_desc_t *dstrd, __u64 dstcookie)
{
- unsigned long flags;
+ kib_msg_t *ibmsg = tx->tx_msg;
+ kib_rdma_desc_t *srcrd = tx->tx_rd;
+ vv_scatgat_t *gl;
+ vv_wr_t *wrq;
+ int rc;
- spin_lock_irqsave(&conn->ibc_lock, flags);
+#if IBNAL_USE_FMR
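+ /* With FMR, source and destination are each mapped as one virtually
+ * contiguous region, so a single RDMA write does the whole transfer */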
+ LASSERT (tx->tx_nwrq == 0);
- kibnal_queue_tx_locked (tx, conn);
-
- spin_unlock_irqrestore(&conn->ibc_lock, flags);
-
- kibnal_check_sends(conn);
-}
+ gl = &tx->tx_gl[0];
+ gl->length = nob;
+ gl->v_address = KIBNAL_ADDR2SG(srcrd->rd_addr);
+ gl->l_key = srcrd->rd_key;
-static void
-kibnal_launch_tx (kib_tx_t *tx, ptl_nid_t nid)
-{
- unsigned long flags;
- kib_peer_t *peer;
- kib_conn_t *conn;
- rwlock_t *g_lock = &kibnal_data.kib_global_lock;
+ wrq = &tx->tx_wrq[0];
- /* If I get here, I've committed to send, so I complete the tx with
- * failure on any problems */
-
- LASSERT (tx->tx_conn == NULL); /* only set when assigned a conn */
- LASSERT (tx->tx_nsp > 0); /* work items have been set up */
-
- read_lock (g_lock);
-
- peer = kibnal_find_peer_locked (nid);
- if (peer == NULL) {
- read_unlock (g_lock);
- tx->tx_status = -EHOSTUNREACH;
- kibnal_tx_done (tx);
- return;
- }
+ wrq->wr_id = kibnal_ptr2wreqid(tx, IBNAL_WID_RDMA);
+ wrq->completion_notification = 0;
+ wrq->scatgat_list = gl;
+ wrq->num_of_data_segments = 1;
+ wrq->wr_type = vv_wr_rdma_write;
+ wrq->type.send.solicited_event = 0;
+ wrq->type.send.send_qp_type.rc_type.fance_indicator = 0;
+ wrq->type.send.send_qp_type.rc_type.r_addr = dstrd->rd_addr;
+ wrq->type.send.send_qp_type.rc_type.r_r_key = dstrd->rd_key;
+
+ tx->tx_nwrq = 1;
+ rc = nob;
+#else
+ /* CAVEAT EMPTOR: this 'consumes' the frags in 'dstrd' */
+ int resid = nob;
+ kib_rdma_frag_t *srcfrag;
+ int srcidx;
+ kib_rdma_frag_t *dstfrag;
+ int dstidx;
+ int wrknob;
- conn = kibnal_find_conn_locked (peer);
- if (conn != NULL) {
- CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
- conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
- atomic_read (&conn->ibc_refcount));
- atomic_inc (&conn->ibc_refcount); /* 1 ref for the tx */
- read_unlock (g_lock);
-
- kibnal_queue_tx (tx, conn);
- return;
- }
-
- /* Making one or more connections; I'll need a write lock... */
- read_unlock (g_lock);
- write_lock_irqsave (g_lock, flags);
+ /* Called by scheduler */
+ LASSERT (!in_interrupt());
- peer = kibnal_find_peer_locked (nid);
- if (peer == NULL) {
- write_unlock_irqrestore (g_lock, flags);
- tx->tx_status = -EHOSTUNREACH;
- kibnal_tx_done (tx);
- return;
- }
+ LASSERT (type == IBNAL_MSG_GET_DONE ||
+ type == IBNAL_MSG_PUT_DONE);
- conn = kibnal_find_conn_locked (peer);
- if (conn != NULL) {
- /* Connection exists; queue message on it */
- CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
- conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
- atomic_read (&conn->ibc_refcount));
- atomic_inc (&conn->ibc_refcount); /* 1 ref for the tx */
- write_unlock_irqrestore (g_lock, flags);
-
- kibnal_queue_tx (tx, conn);
- return;
- }
+ srcidx = dstidx = 0;
+ srcfrag = &srcrd->rd_frags[0];
+ dstfrag = &dstrd->rd_frags[0];
+ rc = resid;
- if (peer->ibp_connecting == 0) {
- if (!time_after_eq(jiffies, peer->ibp_reconnect_time)) {
- write_unlock_irqrestore (g_lock, flags);
- tx->tx_status = -EHOSTUNREACH;
- kibnal_tx_done (tx);
- return;
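+ /* Walk the source and destination frag lists in step, emitting one
+ * RDMA write work request per contiguous overlap */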
+ while (resid > 0) {
+ if (srcidx >= srcrd->rd_nfrag) {
+ CERROR("Src buffer exhausted: %d frags\n", srcidx);
+ rc = -EPROTO;
+ break;
}
-
- peer->ibp_connecting = 1;
-
- kib_peer_addref(peer); /* extra ref for connd */
-
- spin_lock (&kibnal_data.kib_connd_lock);
-
- list_add_tail (&peer->ibp_connd_list,
- &kibnal_data.kib_connd_peers);
- wake_up (&kibnal_data.kib_connd_waitq);
-
- spin_unlock (&kibnal_data.kib_connd_lock);
- }
-
- /* A connection is being established; queue the message... */
- list_add_tail (&tx->tx_list, &peer->ibp_tx_queue);
- write_unlock_irqrestore (g_lock, flags);
-}
+ if (dstidx == dstrd->rd_nfrag) {
+ CERROR("Dst buffer exhausted: %d frags\n", dstidx);
+ rc = -EPROTO;
+ break;
+ }
-static ptl_err_t
-kibnal_start_passive_rdma (int type, ptl_nid_t nid,
- lib_msg_t *libmsg, ptl_hdr_t *hdr)
-{
- int nob = libmsg->md->length;
- kib_tx_t *tx;
- kib_msg_t *ibmsg;
- int rc;
- vv_access_con_bit_mask_t access;
-
- LASSERT (type == IBNAL_MSG_PUT_RDMA || type == IBNAL_MSG_GET_RDMA);
- LASSERT (nob > 0);
- LASSERT (!in_interrupt()); /* Mapping could block */
+ if (tx->tx_nwrq == IBNAL_MAX_RDMA_FRAGS) {
+ CERROR("RDMA too fragmented: %d/%d src %d/%d dst frags\n",
+ srcidx, srcrd->rd_nfrag,
+ dstidx, dstrd->rd_nfrag);
+ rc = -EMSGSIZE;
+ break;
+ }
- access = vv_acc_l_mem_write | vv_acc_r_mem_write | vv_acc_r_mem_read | vv_acc_mem_bind;
+ wrknob = MIN(MIN(srcfrag->rf_nob, dstfrag->rf_nob), resid);
- tx = kibnal_get_idle_tx (1); /* May block; caller is an app thread */
- LASSERT (tx != NULL);
+ gl = &tx->tx_gl[tx->tx_nwrq];
+ gl->v_address = KIBNAL_ADDR2SG(kibnal_rf_addr(srcfrag));
+ gl->length = wrknob;
+ gl->l_key = srcrd->rd_key;
- if ((libmsg->md->options & PTL_MD_KIOV) == 0)
- rc = kibnal_map_iov (tx, access,
- libmsg->md->md_niov,
- libmsg->md->md_iov.iov,
- 0, nob, 0);
- else
- rc = kibnal_map_kiov (tx, access,
- libmsg->md->md_niov,
- libmsg->md->md_iov.kiov,
- 0, nob, 0);
+ wrq = &tx->tx_wrq[tx->tx_nwrq];
- if (rc != 0) {
- CERROR ("Can't map RDMA for "LPX64": %d\n", nid, rc);
- goto failed;
- }
-
- if (type == IBNAL_MSG_GET_RDMA) {
- /* reply gets finalized when tx completes */
- tx->tx_libmsg[1] = lib_create_reply_msg(&kibnal_lib,
- nid, libmsg);
- if (tx->tx_libmsg[1] == NULL) {
- CERROR ("Can't create reply for GET -> "LPX64"\n",
- nid);
- rc = -ENOMEM;
- goto failed;
+ wrq->wr_id = kibnal_ptr2wreqid(tx, IBNAL_WID_RDMA);
+ wrq->completion_notification = 0;
+ wrq->scatgat_list = gl;
+ wrq->num_of_data_segments = 1;
+ wrq->wr_type = vv_wr_rdma_write;
+ wrq->type.send.solicited_event = 0;
+ wrq->type.send.send_qp_type.rc_type.fance_indicator = 0;
+ wrq->type.send.send_qp_type.rc_type.r_addr = kibnal_rf_addr(dstfrag);
+ wrq->type.send.send_qp_type.rc_type.r_r_key = dstrd->rd_key;
+
+ resid -= wrknob;
+ if (wrknob < srcfrag->rf_nob) {
+ kibnal_rf_set(srcfrag,
+ kibnal_rf_addr(srcfrag) + wrknob,
+ srcfrag->rf_nob - wrknob);
+ } else {
+ srcfrag++;
+ srcidx++;
}
- }
-
- tx->tx_passive_rdma = 1;
- ibmsg = tx->tx_msg;
+ if (wrknob < dstfrag->rf_nob) {
+ kibnal_rf_set(dstfrag,
+ kibnal_rf_addr(dstfrag) + wrknob,
+ dstfrag->rf_nob - wrknob);
+ } else {
+ dstfrag++;
+ dstidx++;
+ }
- ibmsg->ibm_u.rdma.ibrm_hdr = *hdr;
- ibmsg->ibm_u.rdma.ibrm_cookie = tx->tx_passive_rdma_cookie;
- /* map_kiov alrady filled the rdma descs for the whole_mem case */
- if (!kibnal_whole_mem()) {
- ibmsg->ibm_u.rdma.rd_key = tx->tx_md.md_rkey;
- ibmsg->ibm_u.rdma.ibrm_desc[0].rd_addr = tx->tx_md.md_addr;
- ibmsg->ibm_u.rdma.ibrm_desc[0].rd_nob = nob;
- ibmsg->ibm_u.rdma.ibrm_num_descs = 1;
+ tx->tx_nwrq++;
}
- kibnal_init_tx_msg (tx, type,
- kib_rdma_msg_len(ibmsg->ibm_u.rdma.ibrm_num_descs));
-
- CDEBUG(D_NET, "Passive: %p cookie "LPX64", key %x, addr "
- LPX64", nob %d\n",
- tx, tx->tx_passive_rdma_cookie, tx->tx_md.md_rkey,
- tx->tx_md.md_addr, nob);
-
- /* libmsg gets finalized when tx completes. */
- tx->tx_libmsg[0] = libmsg;
+ if (rc < 0) /* no RDMA if completing with failure */
+ tx->tx_nwrq = 0;
+#endif
- kibnal_launch_tx(tx, nid);
- return (PTL_OK);
+ ibmsg->ibm_u.completion.ibcm_status = rc;
+ ibmsg->ibm_u.completion.ibcm_cookie = dstcookie;
+ kibnal_init_tx_msg(tx, type, sizeof (kib_completion_msg_t));
- failed:
- tx->tx_status = rc;
- kibnal_tx_done (tx);
- return (PTL_FAIL);
+ return rc;
}
void
-kibnal_start_active_rdma (int type, int status,
- kib_rx_t *rx, lib_msg_t *libmsg,
- unsigned int niov,
- struct iovec *iov, ptl_kiov_t *kiov,
- size_t offset, size_t nob)
+kibnal_queue_tx (kib_tx_t *tx, kib_conn_t *conn)
{
- kib_msg_t *rxmsg = rx->rx_msg;
- kib_msg_t *txmsg;
- kib_tx_t *tx;
- vv_access_con_bit_mask_t access;
- vv_wr_operation_t rdma_op;
- int rc;
- __u32 i;
-
- CDEBUG(D_NET, "type %d, status %d, niov %d, offset %d, nob %d\n",
- type, status, niov, offset, nob);
+ spin_lock(&conn->ibc_lock);
+ kibnal_queue_tx_locked (tx, conn);
+ spin_unlock(&conn->ibc_lock);
- /* Called by scheduler */
- LASSERT (!in_interrupt ());
+ kibnal_check_sends(conn);
+}
- /* Either all pages or all vaddrs */
- LASSERT (!(kiov != NULL && iov != NULL));
+void
+kibnal_schedule_peer_arp (kib_peer_t *peer)
+{
+ unsigned long flags;
- /* No data if we're completing with failure */
- LASSERT (status == 0 || nob == 0);
+ LASSERT (peer->ibp_connecting != 0);
+ LASSERT (peer->ibp_arp_count > 0);
- LASSERT (type == IBNAL_MSG_GET_DONE ||
- type == IBNAL_MSG_PUT_DONE);
+ kibnal_peer_addref(peer); /* extra ref for connd */
- /* Flag I'm completing the RDMA. Even if I fail to send the
- * completion message, I will have tried my best so further
- * attempts shouldn't be tried. */
- LASSERT (!rx->rx_rdma);
- rx->rx_rdma = 1;
+ spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
- if (type == IBNAL_MSG_GET_DONE) {
- access = 0;
- rdma_op = vv_wr_rdma_write;
- LASSERT (rxmsg->ibm_type == IBNAL_MSG_GET_RDMA);
- } else {
- access = vv_acc_l_mem_write;
- rdma_op = vv_wr_rdma_read;
- LASSERT (rxmsg->ibm_type == IBNAL_MSG_PUT_RDMA);
- }
+ list_add_tail (&peer->ibp_connd_list, &kibnal_data.kib_connd_peers);
+ wake_up (&kibnal_data.kib_connd_waitq);
- tx = kibnal_get_idle_tx (0); /* Mustn't block */
- if (tx == NULL) {
- CERROR ("tx descs exhausted on RDMA from "LPX64
- " completing locally with failure\n",
- rx->rx_conn->ibc_peer->ibp_nid);
- lib_finalize (&kibnal_lib, NULL, libmsg, PTL_NO_SPACE);
- return;
- }
- LASSERT (tx->tx_nsp == 0);
+ spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
+}
- if (nob == 0)
- GOTO(init_tx, 0);
+void
+kibnal_launch_tx (kib_tx_t *tx, lnet_nid_t nid)
+{
+ kib_peer_t *peer;
+ kib_conn_t *conn;
+ unsigned long flags;
+ rwlock_t *g_lock = &kibnal_data.kib_global_lock;
+ int retry;
+ int rc;
- /* We actually need to transfer some data (the transfer
- * size could get truncated to zero when the incoming
- * message is matched) */
- if (kiov != NULL)
- rc = kibnal_map_kiov (tx, access, niov, kiov, offset, nob, 1);
- else
- rc = kibnal_map_iov (tx, access, niov, iov, offset, nob, 1);
-
- if (rc != 0) {
- CERROR ("Can't map RDMA -> "LPX64": %d\n",
- rx->rx_conn->ibc_peer->ibp_nid, rc);
- /* We'll skip the RDMA and complete with failure. */
- status = rc;
- nob = 0;
- GOTO(init_tx, rc);
- }
-
- if (!kibnal_whole_mem()) {
- tx->tx_msg->ibm_u.rdma.rd_key = tx->tx_md.md_lkey;
- tx->tx_msg->ibm_u.rdma.ibrm_desc[0].rd_addr = tx->tx_md.md_addr;
- tx->tx_msg->ibm_u.rdma.ibrm_desc[0].rd_nob = nob;
- tx->tx_msg->ibm_u.rdma.ibrm_num_descs = 1;
- }
-
- /* XXX ugh. different page-sized hosts. */
- if (tx->tx_msg->ibm_u.rdma.ibrm_num_descs !=
- rxmsg->ibm_u.rdma.ibrm_num_descs) {
- CERROR("tx descs (%u) != rx descs (%u)\n",
- tx->tx_msg->ibm_u.rdma.ibrm_num_descs,
- rxmsg->ibm_u.rdma.ibrm_num_descs);
- /* We'll skip the RDMA and complete with failure. */
- status = rc;
- nob = 0;
- GOTO(init_tx, rc);
- }
-
- /* map_kiov filled in the rdma descs which describe our side of the
- * rdma transfer. */
- /* ibrm_num_descs was verified in rx_callback */
- for(i = 0; i < rxmsg->ibm_u.rdma.ibrm_num_descs; i++) {
- kib_rdma_desc_t *ldesc, *rdesc; /* local, remote */
- vv_scatgat_t *ds = &tx->tx_gl[i];
- vv_wr_t *wrq = &tx->tx_wrq[i];
-
- ldesc = &tx->tx_msg->ibm_u.rdma.ibrm_desc[i];
- rdesc = &rxmsg->ibm_u.rdma.ibrm_desc[i];
-
- ds->v_address = (void *)(unsigned long)ldesc->rd_addr;
- ds->length = ldesc->rd_nob;
- ds->l_key = tx->tx_msg->ibm_u.rdma.rd_key;
-
- wrq->wr_id = kibnal_ptr2wreqid(tx, 0);
+ /* If I get here, I've committed to send, so I complete the tx with
+ * failure on any problems */
-#if 0
- /* only the last rdma post triggers tx completion */
- if (i == rxmsg->ibm_u.rdma.ibrm_num_descs - 1)
- wrq->completion_notification = 1;
- else
- wrq->completion_notification = 0;
+ LASSERT (tx->tx_conn == NULL); /* only set when assigned a conn */
+ LASSERT (tx->tx_nwrq > 0); /* work items have been set up */
-#else
- /* TODO: hack. Right now complete everything, else the
- * driver will deadlock. This is less efficient than
- * requestion a notification for only a few of the
- * WQE. */
- wrq->completion_notification = 1;
-#endif
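+        /* first pass may have to create the peer; the retry pass must find it */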
+ for (retry = 0; ; retry = 1) {
+ read_lock_irqsave(g_lock, flags);
- wrq->scatgat_list = ds;
- wrq->num_of_data_segments = 1;
- wrq->wr_type = rdma_op;
+ peer = kibnal_find_peer_locked (nid);
+ if (peer != NULL) {
+ conn = kibnal_find_conn_locked (peer);
+ if (conn != NULL) {
+ kibnal_conn_addref(conn); /* 1 ref for me... */
+ read_unlock_irqrestore(g_lock, flags);
- wrq->type.send.solicited_event = 0;
+ kibnal_queue_tx (tx, conn);
+ kibnal_conn_decref(conn); /* ...to here */
+ return;
+ }
+ }
- wrq->type.send.send_qp_type.rc_type.fance_indicator = 0;
- wrq->type.send.send_qp_type.rc_type.r_addr = rdesc->rd_addr;
- wrq->type.send.send_qp_type.rc_type.r_r_key = rxmsg->ibm_u.rdma.rd_key;
+ /* Making one or more connections; I'll need a write lock... */
+ read_unlock(g_lock);
+ write_lock(g_lock);
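+        /* NB read_unlock/write_lock leave irq state alone, so 'flags' still
+         * holds the state saved by read_lock_irqsave above */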
+
+ peer = kibnal_find_peer_locked (nid);
+ if (peer != NULL)
+ break;
+
+ write_unlock_irqrestore(g_lock, flags);
- CDEBUG(D_NET, "prepared RDMA with r_addr=%llx r_key=%x\n",
- wrq->type.send.send_qp_type.rc_type.r_addr,
- wrq->type.send.send_qp_type.rc_type.r_r_key);
+ if (retry) {
+ CERROR("Can't find peer %s\n", libcfs_nid2str(nid));
+
+ tx->tx_status = -EHOSTUNREACH;
+ tx->tx_waiting = 0;
+ kibnal_tx_done (tx);
+ return;
+ }
+
+ rc = kibnal_add_persistent_peer(nid, LNET_NIDADDR(nid));
+ if (rc != 0) {
+ CERROR("Can't add peer %s: %d\n",
+ libcfs_nid2str(nid), rc);
- tx->tx_nsp++;
+ tx->tx_status = -EHOSTUNREACH;
+ tx->tx_waiting = 0;
+ kibnal_tx_done (tx);
+ return;
+ }
}
-init_tx:
- txmsg = tx->tx_msg;
+ conn = kibnal_find_conn_locked (peer);
+ if (conn != NULL) {
+ /* Connection exists; queue message on it */
+ kibnal_conn_addref(conn); /* 1 ref for me... */
+ write_unlock_irqrestore(g_lock, flags);
- txmsg->ibm_u.completion.ibcm_cookie = rxmsg->ibm_u.rdma.ibrm_cookie;
- txmsg->ibm_u.completion.ibcm_status = status;
-
- kibnal_init_tx_msg(tx, type, sizeof (kib_completion_msg_t));
+ kibnal_queue_tx (tx, conn);
+ kibnal_conn_decref(conn); /* ...until here */
+ return;
+ }
- if (status == 0 && nob != 0) {
- LASSERT (tx->tx_nsp > 1);
- /* RDMA: libmsg gets finalized when the tx completes. This
- * is after the completion message has been sent, which in
- * turn is after the RDMA has finished. */
- tx->tx_libmsg[0] = libmsg;
- } else {
- LASSERT (tx->tx_nsp == 1);
- /* No RDMA: local completion happens now! */
- CDEBUG(D_WARNING,"No data: immediate completion\n");
- lib_finalize (&kibnal_lib, NULL, libmsg,
- status == 0 ? PTL_OK : PTL_FAIL);
- }
-
- /* +1 ref for this tx... */
- CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
- rx->rx_conn, rx->rx_conn->ibc_state,
- rx->rx_conn->ibc_peer->ibp_nid,
- atomic_read (&rx->rx_conn->ibc_refcount));
- atomic_inc (&rx->rx_conn->ibc_refcount);
- /* ...and queue it up */
- kibnal_queue_tx(tx, rx->rx_conn);
+ if (peer->ibp_connecting == 0 &&
+ peer->ibp_accepting == 0) {
+ if (!(peer->ibp_reconnect_interval == 0 || /* first attempt */
+ time_after_eq(jiffies, peer->ibp_reconnect_time))) {
+ write_unlock_irqrestore(g_lock, flags);
+ tx->tx_status = -EHOSTUNREACH;
+ tx->tx_waiting = 0;
+ kibnal_tx_done (tx);
+ return;
+ }
+
+ peer->ibp_connecting = 1;
+ peer->ibp_arp_count = 1 + *kibnal_tunables.kib_arp_retries;
+ kibnal_schedule_peer_arp(peer);
+ }
+
+ /* A connection is being established; queue the message... */
+ list_add_tail (&tx->tx_list, &peer->ibp_tx_queue);
+
+ write_unlock_irqrestore(g_lock, flags);
}
-static ptl_err_t
-kibnal_sendmsg(lib_nal_t *nal,
- void *private,
- lib_msg_t *libmsg,
- ptl_hdr_t *hdr,
- int type,
- ptl_nid_t nid,
- ptl_pid_t pid,
- unsigned int payload_niov,
- struct iovec *payload_iov,
- ptl_kiov_t *payload_kiov,
- size_t payload_offset,
- size_t payload_nob)
+int
+kibnal_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg)
{
- kib_msg_t *ibmsg;
- kib_tx_t *tx;
- int nob;
+ lnet_hdr_t *hdr = &lntmsg->msg_hdr;
+ int type = lntmsg->msg_type;
+ lnet_process_id_t target = lntmsg->msg_target;
+ int target_is_router = lntmsg->msg_target_is_router;
+ int routing = lntmsg->msg_routing;
+ unsigned int payload_niov = lntmsg->msg_niov;
+ struct iovec *payload_iov = lntmsg->msg_iov;
+ lnet_kiov_t *payload_kiov = lntmsg->msg_kiov;
+ unsigned int payload_offset = lntmsg->msg_offset;
+ unsigned int payload_nob = lntmsg->msg_len;
+ kib_msg_t *ibmsg;
+ kib_tx_t *tx;
+ int nob;
+ int rc;
/* NB 'private' is different depending on what we're sending.... */
- CDEBUG(D_NET, "sending "LPSZ" bytes in %d frags to nid:"LPX64
- " pid %d\n", payload_nob, payload_niov, nid , pid);
+ CDEBUG(D_NET, "sending %d bytes in %d frags to %s\n",
+ payload_nob, payload_niov, libcfs_id2str(target));
LASSERT (payload_nob == 0 || payload_niov > 0);
- LASSERT (payload_niov <= PTL_MD_MAX_IOV);
+ LASSERT (payload_niov <= LNET_MAX_IOV);
- /* Thread context if we're sending payload */
- LASSERT (!in_interrupt() || payload_niov == 0);
+ /* Thread context */
+ LASSERT (!in_interrupt());
/* payload is either all vaddrs or all pages */
LASSERT (!(payload_kiov != NULL && payload_iov != NULL));
switch (type) {
default:
LBUG();
- return (PTL_FAIL);
-
- case PTL_MSG_REPLY: {
- /* reply's 'private' is the incoming receive */
- kib_rx_t *rx = private;
-
- /* RDMA reply expected? */
- if (rx->rx_msg->ibm_type == IBNAL_MSG_GET_RDMA) {
- kibnal_start_active_rdma(IBNAL_MSG_GET_DONE, 0,
- rx, libmsg, payload_niov,
- payload_iov, payload_kiov,
- payload_offset, payload_nob);
- return (PTL_OK);
+ return (-EIO);
+
+ case LNET_MSG_ACK:
+ LASSERT (payload_nob == 0);
+ break;
+
+ case LNET_MSG_GET:
+ if (routing || target_is_router)
+ break; /* send IMMEDIATE */
+
+ /* is the REPLY message too small for RDMA? */
+ nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[lntmsg->msg_md->md_length]);
+ if (nob <= IBNAL_MSG_SIZE)
+ break; /* send IMMEDIATE */
+
+ tx = kibnal_get_idle_tx();
+ if (tx == NULL) {
+                        CERROR("Can't allocate txd for GET to %s\n",
+ libcfs_nid2str(target.nid));
+ return -ENOMEM;
}
-
- /* Incoming message consistent with immediate reply? */
- if (rx->rx_msg->ibm_type != IBNAL_MSG_IMMEDIATE) {
- CERROR ("REPLY to "LPX64" bad opbm type %d!!!\n",
- nid, rx->rx_msg->ibm_type);
- return (PTL_FAIL);
+
+ ibmsg = tx->tx_msg;
+ ibmsg->ibm_u.get.ibgm_hdr = *hdr;
+ ibmsg->ibm_u.get.ibgm_cookie = tx->tx_cookie;
+
+ if ((lntmsg->msg_md->md_options & LNET_MD_KIOV) == 0)
+ rc = kibnal_setup_rd_iov(tx, &ibmsg->ibm_u.get.ibgm_rd,
+ vv_acc_r_mem_write,
+ lntmsg->msg_md->md_niov,
+ lntmsg->msg_md->md_iov.iov,
+ 0, lntmsg->msg_md->md_length);
+ else
+ rc = kibnal_setup_rd_kiov(tx, &ibmsg->ibm_u.get.ibgm_rd,
+ vv_acc_r_mem_write,
+ lntmsg->msg_md->md_niov,
+ lntmsg->msg_md->md_iov.kiov,
+ 0, lntmsg->msg_md->md_length);
+ if (rc != 0) {
+ CERROR("Can't setup GET sink for %s: %d\n",
+ libcfs_nid2str(target.nid), rc);
+ kibnal_tx_done(tx);
+ return -EIO;
}
- /* Will it fit in a message? */
- nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]);
- if (nob > IBNAL_MSG_SIZE) {
- CERROR("REPLY for "LPX64" too big (RDMA not requested): %d (max for message is %d)\n",
- nid, payload_nob, IBNAL_MSG_SIZE);
- return (PTL_FAIL);
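+                /* with FMR the rd is fixed-size; without it the GET message
+                 * must carry every rd fragment */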
+#if IBNAL_USE_FMR
+ nob = sizeof(kib_get_msg_t);
+#else
+ {
+ int n = ibmsg->ibm_u.get.ibgm_rd.rd_nfrag;
+
+ nob = offsetof(kib_get_msg_t, ibgm_rd.rd_frags[n]);
}
- break;
- }
+#endif
+ kibnal_init_tx_msg(tx, IBNAL_MSG_GET_REQ, nob);
- case PTL_MSG_GET:
- /* might the REPLY message be big enough to need RDMA? */
- nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[libmsg->md->length]);
- if (nob > IBNAL_MSG_SIZE)
- return (kibnal_start_passive_rdma(IBNAL_MSG_GET_RDMA,
- nid, libmsg, hdr));
- break;
+ tx->tx_lntmsg[1] = lnet_create_reply_msg(kibnal_data.kib_ni,
+ lntmsg);
+ if (tx->tx_lntmsg[1] == NULL) {
+ CERROR("Can't create reply for GET -> %s\n",
+ libcfs_nid2str(target.nid));
+ kibnal_tx_done(tx);
+ return -EIO;
+ }
- case PTL_MSG_ACK:
- LASSERT (payload_nob == 0);
- break;
+ tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg[0,1] on completion */
+ tx->tx_waiting = 1; /* waiting for GET_DONE */
+ kibnal_launch_tx(tx, target.nid);
+ return 0;
- case PTL_MSG_PUT:
- /* Is the payload big enough to need RDMA? */
+ case LNET_MSG_REPLY:
+ case LNET_MSG_PUT:
+ /* Is the payload small enough not to need RDMA? */
nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]);
- if (nob > IBNAL_MSG_SIZE)
- return (kibnal_start_passive_rdma(IBNAL_MSG_PUT_RDMA,
- nid, libmsg, hdr));
-
- break;
+ if (nob <= IBNAL_MSG_SIZE)
+ break; /* send IMMEDIATE */
+
+ tx = kibnal_get_idle_tx();
+ if (tx == NULL) {
+ CERROR("Can't allocate %s txd for %s\n",
+ type == LNET_MSG_PUT ? "PUT" : "REPLY",
+ libcfs_nid2str(target.nid));
+ return -ENOMEM;
+ }
+
+ if (payload_kiov == NULL)
+ rc = kibnal_setup_rd_iov(tx, tx->tx_rd, 0,
+ payload_niov, payload_iov,
+ payload_offset, payload_nob);
+ else
+ rc = kibnal_setup_rd_kiov(tx, tx->tx_rd, 0,
+ payload_niov, payload_kiov,
+ payload_offset, payload_nob);
+ if (rc != 0) {
+ CERROR("Can't setup PUT src for %s: %d\n",
+ libcfs_nid2str(target.nid), rc);
+ kibnal_tx_done(tx);
+ return -EIO;
+ }
+
+ ibmsg = tx->tx_msg;
+ ibmsg->ibm_u.putreq.ibprm_hdr = *hdr;
+ ibmsg->ibm_u.putreq.ibprm_cookie = tx->tx_cookie;
+ kibnal_init_tx_msg(tx, IBNAL_MSG_PUT_REQ, sizeof(kib_putreq_msg_t));
+
+ tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg on completion */
+ tx->tx_waiting = 1; /* waiting for PUT_{ACK,NAK} */
+ kibnal_launch_tx(tx, target.nid);
+ return 0;
}
- tx = kibnal_get_idle_tx(!(type == PTL_MSG_ACK ||
- type == PTL_MSG_REPLY ||
- in_interrupt()));
+ /* send IMMEDIATE */
+
+ LASSERT (offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob])
+ <= IBNAL_MSG_SIZE);
+
+ tx = kibnal_get_idle_tx();
if (tx == NULL) {
- CERROR ("Can't send %d to "LPX64": tx descs exhausted%s\n",
- type, nid, in_interrupt() ? " (intr)" : "");
- return (PTL_NO_SPACE);
+ CERROR ("Can't send %d to %s: tx descs exhausted\n",
+ type, libcfs_nid2str(target.nid));
+ return -ENOMEM;
}
ibmsg = tx->tx_msg;
ibmsg->ibm_u.immediate.ibim_hdr = *hdr;
- if (payload_nob > 0) {
- if (payload_kiov != NULL)
- lib_copy_kiov2buf(ibmsg->ibm_u.immediate.ibim_payload,
- payload_niov, payload_kiov,
- payload_offset, payload_nob);
- else
- lib_copy_iov2buf(ibmsg->ibm_u.immediate.ibim_payload,
- payload_niov, payload_iov,
- payload_offset, payload_nob);
- }
-
- kibnal_init_tx_msg (tx, IBNAL_MSG_IMMEDIATE,
- offsetof(kib_immediate_msg_t,
- ibim_payload[payload_nob]));
+ if (payload_kiov != NULL)
+ lnet_copy_kiov2flat(IBNAL_MSG_SIZE, ibmsg,
+ offsetof(kib_msg_t, ibm_u.immediate.ibim_payload),
+ payload_niov, payload_kiov,
+ payload_offset, payload_nob);
+ else
+ lnet_copy_iov2flat(IBNAL_MSG_SIZE, ibmsg,
+ offsetof(kib_msg_t, ibm_u.immediate.ibim_payload),
+ payload_niov, payload_iov,
+ payload_offset, payload_nob);
- /* libmsg gets finalized when tx completes */
- tx->tx_libmsg[0] = libmsg;
+ nob = offsetof(kib_immediate_msg_t, ibim_payload[payload_nob]);
+ kibnal_init_tx_msg (tx, IBNAL_MSG_IMMEDIATE, nob);
- kibnal_launch_tx(tx, nid);
- return (PTL_OK);
+ tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg on completion */
+ kibnal_launch_tx(tx, target.nid);
+ return 0;
}
-static ptl_err_t
-kibnal_send (lib_nal_t *nal, void *private, lib_msg_t *cookie,
- ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
- unsigned int payload_niov, struct iovec *payload_iov,
- size_t payload_offset, size_t payload_len)
+void
+kibnal_reply (lnet_ni_t *ni, kib_rx_t *rx, lnet_msg_t *lntmsg)
{
- CDEBUG(D_NET, " pid = %d, nid="LPU64"\n",
- pid, nid);
- return (kibnal_sendmsg(nal, private, cookie,
- hdr, type, nid, pid,
- payload_niov, payload_iov, NULL,
- payload_offset, payload_len));
+ lnet_process_id_t target = lntmsg->msg_target;
+ unsigned int niov = lntmsg->msg_niov;
+ struct iovec *iov = lntmsg->msg_iov;
+ lnet_kiov_t *kiov = lntmsg->msg_kiov;
+ unsigned int offset = lntmsg->msg_offset;
+ unsigned int nob = lntmsg->msg_len;
+ kib_tx_t *tx;
+ int rc;
+
+ tx = kibnal_get_idle_tx();
+ if (tx == NULL) {
+ CERROR("Can't get tx for REPLY to %s\n",
+ libcfs_nid2str(target.nid));
+ goto failed_0;
+ }
+
+ if (nob == 0)
+ rc = 0;
+ else if (kiov == NULL)
+ rc = kibnal_setup_rd_iov(tx, tx->tx_rd, 0,
+ niov, iov, offset, nob);
+ else
+ rc = kibnal_setup_rd_kiov(tx, tx->tx_rd, 0,
+ niov, kiov, offset, nob);
+
+ if (rc != 0) {
+ CERROR("Can't setup GET src for %s: %d\n",
+ libcfs_nid2str(target.nid), rc);
+ goto failed_1;
+ }
+
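+        /* kibnal_init_rdma() returns the #bytes it set up to send, 0 if the
+         * GET completes with no data, or -ve on error */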
+ rc = kibnal_init_rdma(tx, IBNAL_MSG_GET_DONE, nob,
+ &rx->rx_msg->ibm_u.get.ibgm_rd,
+ rx->rx_msg->ibm_u.get.ibgm_cookie);
+ if (rc < 0) {
+ CERROR("Can't setup rdma for GET from %s: %d\n",
+ libcfs_nid2str(target.nid), rc);
+ goto failed_1;
+ }
+
+ if (rc == 0) {
+ /* No RDMA: local completion may happen now! */
+ lnet_finalize(ni, lntmsg, 0);
+ } else {
+                /* RDMA: lnet_finalize(lntmsg) when it completes */
+ tx->tx_lntmsg[0] = lntmsg;
+ }
+
+ kibnal_queue_tx(tx, rx->rx_conn);
+ return;
+
+ failed_1:
+ kibnal_tx_done(tx);
+ failed_0:
+ lnet_finalize(ni, lntmsg, -EIO);
}
-static ptl_err_t
-kibnal_send_pages (lib_nal_t *nal, void *private, lib_msg_t *cookie,
- ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
- unsigned int payload_niov, ptl_kiov_t *payload_kiov,
- size_t payload_offset, size_t payload_len)
+int
+kibnal_eager_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg,
+ void **new_private)
{
- return (kibnal_sendmsg(nal, private, cookie,
- hdr, type, nid, pid,
- payload_niov, NULL, payload_kiov,
- payload_offset, payload_len));
+ kib_rx_t *rx = private;
+ kib_conn_t *conn = rx->rx_conn;
+
+ if (conn->ibc_version == IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD) {
+ /* Can't block if RDMA completions need normal credits */
+ LCONSOLE_ERROR_MSG(0x129, "Dropping message from %s: no buffers"
+ " free. %s is running an old version of LNET "
+ "that may deadlock if messages wait for"
+                                   " buffers\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid),
+ libcfs_nid2str(conn->ibc_peer->ibp_nid));
+ return -EDEADLK;
+ }
+
+ *new_private = private;
+ return 0;
}
-static ptl_err_t
-kibnal_recvmsg (lib_nal_t *nal, void *private, lib_msg_t *libmsg,
- unsigned int niov, struct iovec *iov, ptl_kiov_t *kiov,
- size_t offset, size_t mlen, size_t rlen)
+int
+kibnal_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, int delayed,
+ unsigned int niov, struct iovec *iov, lnet_kiov_t *kiov,
+ unsigned int offset, unsigned int mlen, unsigned int rlen)
{
kib_rx_t *rx = private;
kib_msg_t *rxmsg = rx->rx_msg;
- int msg_nob;
-
+ kib_conn_t *conn = rx->rx_conn;
+ kib_tx_t *tx;
+ kib_msg_t *txmsg;
+ int nob;
+ int post_cred = 1;
+ int rc = 0;
+
LASSERT (mlen <= rlen);
- LASSERT (!in_interrupt ());
+ LASSERT (!in_interrupt());
/* Either all pages or all vaddrs */
LASSERT (!(kiov != NULL && iov != NULL));
switch (rxmsg->ibm_type) {
default:
LBUG();
- return (PTL_FAIL);
-
+
case IBNAL_MSG_IMMEDIATE:
- msg_nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[rlen]);
- if (msg_nob > IBNAL_MSG_SIZE) {
- CERROR ("Immediate message from "LPX64" too big: %d\n",
- rxmsg->ibm_u.immediate.ibim_hdr.src_nid, rlen);
- return (PTL_FAIL);
+ nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[rlen]);
+ if (nob > rx->rx_nob) {
+ CERROR ("Immediate message from %s too big: %d(%d)\n",
+ libcfs_nid2str(rxmsg->ibm_u.immediate.ibim_hdr.src_nid),
+ nob, rx->rx_nob);
+ rc = -EPROTO;
+ break;
}
if (kiov != NULL)
- lib_copy_buf2kiov(niov, kiov, offset,
- rxmsg->ibm_u.immediate.ibim_payload,
- mlen);
+ lnet_copy_flat2kiov(niov, kiov, offset,
+ IBNAL_MSG_SIZE, rxmsg,
+ offsetof(kib_msg_t, ibm_u.immediate.ibim_payload),
+ mlen);
else
- lib_copy_buf2iov(niov, iov, offset,
- rxmsg->ibm_u.immediate.ibim_payload,
- mlen);
-
- lib_finalize (nal, NULL, libmsg, PTL_OK);
- return (PTL_OK);
-
- case IBNAL_MSG_GET_RDMA:
- /* We get called here just to discard any junk after the
- * GET hdr. */
- LASSERT (libmsg == NULL);
- lib_finalize (nal, NULL, libmsg, PTL_OK);
- return (PTL_OK);
-
- case IBNAL_MSG_PUT_RDMA:
- kibnal_start_active_rdma (IBNAL_MSG_PUT_DONE, 0,
- rx, libmsg,
- niov, iov, kiov, offset, mlen);
- return (PTL_OK);
- }
-}
+ lnet_copy_flat2iov(niov, iov, offset,
+ IBNAL_MSG_SIZE, rxmsg,
+ offsetof(kib_msg_t, ibm_u.immediate.ibim_payload),
+ mlen);
+ lnet_finalize (ni, lntmsg, 0);
+ break;
-static ptl_err_t
-kibnal_recv (lib_nal_t *nal, void *private, lib_msg_t *msg,
- unsigned int niov, struct iovec *iov,
- size_t offset, size_t mlen, size_t rlen)
-{
- return (kibnal_recvmsg (nal, private, msg, niov, iov, NULL,
- offset, mlen, rlen));
-}
+ case IBNAL_MSG_PUT_REQ:
+ if (mlen == 0) {
+ lnet_finalize(ni, lntmsg, 0);
+ kibnal_send_completion(conn, IBNAL_MSG_PUT_NAK, 0,
+ rxmsg->ibm_u.putreq.ibprm_cookie);
+ break;
+ }
-static ptl_err_t
-kibnal_recv_pages (lib_nal_t *nal, void *private, lib_msg_t *msg,
- unsigned int niov, ptl_kiov_t *kiov,
- size_t offset, size_t mlen, size_t rlen)
-{
- return (kibnal_recvmsg (nal, private, msg, niov, NULL, kiov,
- offset, mlen, rlen));
-}
+ tx = kibnal_get_idle_tx();
+ if (tx == NULL) {
+ CERROR("Can't allocate tx for %s\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid));
+ /* Not replying will break the connection */
+ rc = -ENOMEM;
+ break;
+ }
-/*****************************************************************************
- * the rest of this file concerns connection management. active connetions
- * start with connect_peer, passive connections start with passive_callback.
- * active disconnects start with conn_close, cm_callback starts passive
- * disconnects and contains the guts of how the disconnect state machine
- * progresses.
- *****************************************************************************/
+ txmsg = tx->tx_msg;
+ if (kiov == NULL)
+ rc = kibnal_setup_rd_iov(tx,
+ &txmsg->ibm_u.putack.ibpam_rd,
+ vv_acc_r_mem_write,
+ niov, iov, offset, mlen);
+ else
+ rc = kibnal_setup_rd_kiov(tx,
+ &txmsg->ibm_u.putack.ibpam_rd,
+ vv_acc_r_mem_write,
+ niov, kiov, offset, mlen);
+ if (rc != 0) {
+ CERROR("Can't setup PUT sink for %s: %d\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid), rc);
+ kibnal_tx_done(tx);
+ /* tell peer it's over */
+ kibnal_send_completion(conn, IBNAL_MSG_PUT_NAK, rc,
+ rxmsg->ibm_u.putreq.ibprm_cookie);
+ break;
+ }
+
+ txmsg->ibm_u.putack.ibpam_src_cookie = rxmsg->ibm_u.putreq.ibprm_cookie;
+ txmsg->ibm_u.putack.ibpam_dst_cookie = tx->tx_cookie;
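+        /* PUT_ACK carries the sink's rd; without FMR that means all its frags */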
+#if IBNAL_USE_FMR
+ nob = sizeof(kib_putack_msg_t);
+#else
+ {
+ int n = tx->tx_msg->ibm_u.putack.ibpam_rd.rd_nfrag;
+
+ nob = offsetof(kib_putack_msg_t, ibpam_rd.rd_frags[n]);
+ }
+#endif
+ kibnal_init_tx_msg(tx, IBNAL_MSG_PUT_ACK, nob);
+
+ tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg on completion */
+ tx->tx_waiting = 1; /* waiting for PUT_DONE */
+ kibnal_queue_tx(tx, conn);
+
+ if (conn->ibc_version != IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD)
+ post_cred = 0; /* peer still owns 'rx' for sending PUT_DONE */
+ break;
+
+ case IBNAL_MSG_GET_REQ:
+ if (lntmsg != NULL) {
+ /* Optimized GET; RDMA lntmsg's payload */
+ kibnal_reply(ni, rx, lntmsg);
+ } else {
+ /* GET didn't match anything */
+ kibnal_send_completion(conn, IBNAL_MSG_GET_DONE, -ENODATA,
+ rxmsg->ibm_u.get.ibgm_cookie);
+ }
+ break;
+ }
+
+ kibnal_post_rx(rx, post_cred, 0);
+ return rc;
+}
int
kibnal_thread_start (int (*fn)(void *arg), void *arg)
return (0);
}
-static void
+void
kibnal_thread_fini (void)
{
atomic_dec (&kibnal_data.kib_nthreads);
}
-/* this can be called by anyone at any time to close a connection. if
- * the connection is still established it heads to the connd to start
- * the disconnection in a safe context. It has no effect if called
- * on a connection that is already disconnecting */
void
-kibnal_close_conn_locked (kib_conn_t *conn, int error)
+kibnal_peer_alive (kib_peer_t *peer)
{
- /* This just does the immmediate housekeeping, and schedules the
- * connection for the connd to finish off.
- * Caller holds kib_global_lock exclusively in irq context */
- kib_peer_t *peer = conn->ibc_peer;
+ /* This is racy, but everyone's only writing cfs_time_current() */
+ peer->ibp_last_alive = cfs_time_current();
+ mb();
+}
- KIB_ASSERT_CONN_STATE_RANGE(conn, IBNAL_CONN_CONNECTING,
- IBNAL_CONN_DISCONNECTED);
+void
+kibnal_peer_notify (kib_peer_t *peer)
+{
+ time_t last_alive = 0;
+ int error = 0;
+ unsigned long flags;
- if (conn->ibc_state > IBNAL_CONN_ESTABLISHED)
- return; /* already disconnecting */
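+        /* tell LNET a peer has died, but only once its last conn is gone and
+         * nothing is trying to (re)establish one */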
+ read_lock_irqsave(&kibnal_data.kib_global_lock, flags);
- CDEBUG (error == 0 ? D_NET : D_ERROR,
- "closing conn to "LPX64": error %d\n", peer->ibp_nid, error);
+ if (list_empty(&peer->ibp_conns) &&
+ peer->ibp_accepting == 0 &&
+ peer->ibp_connecting == 0 &&
+ peer->ibp_error != 0) {
+ error = peer->ibp_error;
+ peer->ibp_error = 0;
- if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) {
- /* kib_connd_conns takes ibc_list's ref */
- list_del (&conn->ibc_list);
- } else {
- /* new ref for kib_connd_conns */
- CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
- conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
- atomic_read (&conn->ibc_refcount));
- atomic_inc (&conn->ibc_refcount);
- }
-
- if (list_empty (&peer->ibp_conns) &&
- peer->ibp_persistence == 0) {
- /* Non-persistent peer with no more conns... */
- kibnal_unlink_peer_locked (peer);
+ last_alive = cfs_time_current_sec() -
+ cfs_duration_sec(cfs_time_current() -
+ peer->ibp_last_alive);
}
- conn->ibc_state = IBNAL_CONN_SEND_DREQ;
+ read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
+
+ if (error != 0)
+ lnet_notify(kibnal_data.kib_ni, peer->ibp_nid, 0, last_alive);
+}
+
+void
+kibnal_schedule_conn (kib_conn_t *conn)
+{
+ unsigned long flags;
+
+ kibnal_conn_addref(conn); /* ++ref for connd */
- spin_lock (&kibnal_data.kib_connd_lock);
+ spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
list_add_tail (&conn->ibc_list, &kibnal_data.kib_connd_conns);
wake_up (&kibnal_data.kib_connd_waitq);
-
- spin_unlock (&kibnal_data.kib_connd_lock);
+
+ spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
+}
+
+void
+kibnal_close_conn_locked (kib_conn_t *conn, int error)
+{
+ /* This just does the immediate housekeeping. 'error' is zero for a
+ * normal shutdown which can happen only after the connection has been
+ * established. If the connection is established, schedule the
+ * connection to be finished off by the connd. Otherwise the connd is
+ * already dealing with it (either to set it up or tear it down).
+ * Caller holds kib_global_lock exclusively in irq context */
+ kib_peer_t *peer = conn->ibc_peer;
+
+ LASSERT (error != 0 || conn->ibc_state >= IBNAL_CONN_ESTABLISHED);
+
+ if (error != 0 && conn->ibc_comms_error == 0)
+ conn->ibc_comms_error = error;
+
+ if (conn->ibc_state != IBNAL_CONN_ESTABLISHED)
+ return; /* already being handled */
+
+ /* NB Can't take ibc_lock here (could be in IRQ context), without
+         * risking deadlock, so access to ibc_{tx_queue,active_txs} is racy */
+
+ if (error == 0 &&
+ list_empty(&conn->ibc_tx_queue) &&
+ list_empty(&conn->ibc_tx_queue_rsrvd) &&
+ list_empty(&conn->ibc_tx_queue_nocred) &&
+ list_empty(&conn->ibc_active_txs)) {
+ CDEBUG(D_NET, "closing conn to %s"
+ " rx# "LPD64" tx# "LPD64"\n",
+ libcfs_nid2str(peer->ibp_nid),
+ conn->ibc_txseq, conn->ibc_rxseq);
+ } else {
+ CDEBUG(D_NETERROR, "Closing conn to %s: error %d%s%s%s%s"
+ " rx# "LPD64" tx# "LPD64"\n",
+ libcfs_nid2str(peer->ibp_nid), error,
+ list_empty(&conn->ibc_tx_queue) ? "" : "(sending)",
+ list_empty(&conn->ibc_tx_queue_rsrvd) ? "" : "(sending_rsrvd)",
+ list_empty(&conn->ibc_tx_queue_nocred) ? "" : "(sending_nocred)",
+ list_empty(&conn->ibc_active_txs) ? "" : "(waiting)",
+ conn->ibc_txseq, conn->ibc_rxseq);
+ }
+
+ list_del (&conn->ibc_list);
+
+ if (list_empty (&peer->ibp_conns)) { /* no more conns */
+ if (peer->ibp_persistence == 0 && /* non-persistent peer */
+ kibnal_peer_active(peer)) /* still in peer table */
+ kibnal_unlink_peer_locked (peer);
+
+ /* set/clear error on last conn */
+ peer->ibp_error = conn->ibc_comms_error;
+ }
+
+ kibnal_set_conn_state(conn, IBNAL_CONN_DISCONNECT1);
+
+ kibnal_schedule_conn(conn);
+ kibnal_conn_decref(conn); /* lose ibc_list's ref */
}
void
kibnal_close_conn (kib_conn_t *conn, int error)
{
- unsigned long flags;
+ unsigned long flags;
- write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
+ write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
kibnal_close_conn_locked (conn, error);
-
- write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
+
+ write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
+}
+
+void
+kibnal_handle_early_rxs(kib_conn_t *conn)
+{
+ unsigned long flags;
+ kib_rx_t *rx;
+
+ LASSERT (!in_interrupt());
+ LASSERT (conn->ibc_state >= IBNAL_CONN_ESTABLISHED);
+
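+        /* replay rxs that arrived before the conn was established, dropping
+         * the global lock across each one */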
+ write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
+ while (!list_empty(&conn->ibc_early_rxs)) {
+ rx = list_entry(conn->ibc_early_rxs.next,
+ kib_rx_t, rx_list);
+ list_del(&rx->rx_list);
+ write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
+
+ kibnal_handle_rx(rx);
+
+ write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
+ }
+ write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
+}
+
+void
+kibnal_abort_txs(kib_conn_t *conn, struct list_head *txs)
+{
+ LIST_HEAD (zombies);
+ struct list_head *tmp;
+ struct list_head *nxt;
+ kib_tx_t *tx;
+
+ spin_lock(&conn->ibc_lock);
+
+ list_for_each_safe (tmp, nxt, txs) {
+ tx = list_entry (tmp, kib_tx_t, tx_list);
+
+ if (txs == &conn->ibc_active_txs) {
+ LASSERT (!tx->tx_queued);
+ LASSERT (tx->tx_waiting || tx->tx_sending != 0);
+ } else {
+ LASSERT (tx->tx_queued);
+ }
+
+ tx->tx_status = -ECONNABORTED;
+ tx->tx_queued = 0;
+ tx->tx_waiting = 0;
+
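+                /* txs still on the wire complete via the CQ once the QP is in
+                 * the error state; only idle txs can be finalised here */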
+ if (tx->tx_sending == 0) {
+ list_del (&tx->tx_list);
+ list_add (&tx->tx_list, &zombies);
+ }
+ }
+
+ spin_unlock(&conn->ibc_lock);
+
+ kibnal_txlist_done(&zombies, -ECONNABORTED);
+}
+
+void
+kibnal_conn_disconnected(kib_conn_t *conn)
+{
+ /* I'm the connd */
+ LASSERT (!in_interrupt());
+ LASSERT (current == kibnal_data.kib_connd);
+ LASSERT (conn->ibc_state >= IBNAL_CONN_INIT);
+
+ kibnal_set_conn_state(conn, IBNAL_CONN_DISCONNECTED);
+
+ /* move QP to error state to make posted work items complete */
+ kibnal_set_qp_state(conn, vv_qp_state_error);
+
+ /* Complete all tx descs not waiting for sends to complete.
+ * NB we should be safe from RDMA now that the QP has changed state */
+
+ kibnal_abort_txs(conn, &conn->ibc_tx_queue);
+ kibnal_abort_txs(conn, &conn->ibc_tx_queue_rsrvd);
+ kibnal_abort_txs(conn, &conn->ibc_tx_queue_nocred);
+ kibnal_abort_txs(conn, &conn->ibc_active_txs);
+
+ kibnal_handle_early_rxs(conn);
+
+ kibnal_peer_notify(conn->ibc_peer);
}
-static void
-kibnal_peer_connect_failed (kib_peer_t *peer, int active, int rc)
+void
+kibnal_peer_connect_failed (kib_peer_t *peer, int active, int error)
{
LIST_HEAD (zombies);
- kib_tx_t *tx;
unsigned long flags;
- LASSERT (rc != 0);
- LASSERT (peer->ibp_reconnect_interval >= IBNAL_MIN_RECONNECT_INTERVAL);
+ /* Only the connd creates conns => single threaded */
+ LASSERT (error != 0);
+ LASSERT (!in_interrupt());
+ LASSERT (current == kibnal_data.kib_connd);
- write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
+ write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
- LASSERT (peer->ibp_connecting != 0);
- peer->ibp_connecting--;
- if (peer->ibp_connecting != 0) {
+ if (active) {
+ LASSERT (peer->ibp_connecting != 0);
+ peer->ibp_connecting--;
+ } else {
+ LASSERT (peer->ibp_accepting != 0);
+ peer->ibp_accepting--;
+ }
+
+ if (peer->ibp_connecting != 0 ||
+ peer->ibp_accepting != 0) {
/* another connection attempt under way (loopback?)... */
- write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
+ write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
return;
}
if (list_empty(&peer->ibp_conns)) {
/* Say when active connection can be re-attempted */
- peer->ibp_reconnect_time = jiffies + peer->ibp_reconnect_interval;
- /* Increase reconnection interval */
- peer->ibp_reconnect_interval = MIN (peer->ibp_reconnect_interval * 2,
- IBNAL_MAX_RECONNECT_INTERVAL);
-
- /* Take peer's blocked blocked transmits; I'll complete
- * them with error */
- while (!list_empty (&peer->ibp_tx_queue)) {
- tx = list_entry (peer->ibp_tx_queue.next,
- kib_tx_t, tx_list);
-
- list_del (&tx->tx_list);
- list_add_tail (&tx->tx_list, &zombies);
- }
-
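+                /* exponential backoff, clamped to the reconnect tunables */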
+ peer->ibp_reconnect_interval *= 2;
+ peer->ibp_reconnect_interval =
+ MAX(peer->ibp_reconnect_interval,
+ *kibnal_tunables.kib_min_reconnect_interval);
+ peer->ibp_reconnect_interval =
+ MIN(peer->ibp_reconnect_interval,
+ *kibnal_tunables.kib_max_reconnect_interval);
+
+ peer->ibp_reconnect_time = jiffies +
+ peer->ibp_reconnect_interval * HZ;
+
+ /* Take peer's blocked transmits to complete with error */
+ list_add(&zombies, &peer->ibp_tx_queue);
+ list_del_init(&peer->ibp_tx_queue);
+
if (kibnal_peer_active(peer) &&
(peer->ibp_persistence == 0)) {
/* failed connection attempt on non-persistent peer */
kibnal_unlink_peer_locked (peer);
}
+
+ peer->ibp_error = error;
} else {
/* Can't have blocked transmits if there are connections */
LASSERT (list_empty(&peer->ibp_tx_queue));
}
-
- write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
- if (!list_empty (&zombies))
- CERROR ("Deleting messages for "LPX64": connection failed\n",
- peer->ibp_nid);
+ write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
- while (!list_empty (&zombies)) {
- tx = list_entry (zombies.next, kib_tx_t, tx_list);
+ kibnal_peer_notify(peer);
- list_del (&tx->tx_list);
- /* complete now */
- tx->tx_status = -EHOSTUNREACH;
- kibnal_tx_done (tx);
- }
+ if (list_empty (&zombies))
+ return;
+
+ CDEBUG (D_NETERROR, "Deleting messages for %s: connection failed\n",
+ libcfs_nid2str(peer->ibp_nid));
+
+ kibnal_txlist_done(&zombies, -EHOSTUNREACH);
}
-static void
-kibnal_connreq_done (kib_conn_t *conn, int active, int status)
+void
+kibnal_reject(cm_cep_handle_t cep, int why)
{
- int state = conn->ibc_state;
- kib_peer_t *peer = conn->ibc_peer;
- kib_tx_t *tx;
- unsigned long flags;
- int i;
+ static cm_reject_data_t rejs[3];
+ cm_reject_data_t *rej = &rejs[why];
+
+ LASSERT (why >= 0 && why < sizeof(rejs)/sizeof(rejs[0]));
+
+        /* If I wasn't so lazy, I'd initialise this only once; it's effectively
+ * read-only */
+ rej->reason = cm_rej_code_usr_rej;
+ rej->priv_data[0] = (IBNAL_MSG_MAGIC) & 0xff;
+ rej->priv_data[1] = (IBNAL_MSG_MAGIC >> 8) & 0xff;
+ rej->priv_data[2] = (IBNAL_MSG_MAGIC >> 16) & 0xff;
+ rej->priv_data[3] = (IBNAL_MSG_MAGIC >> 24) & 0xff;
+ rej->priv_data[4] = (IBNAL_MSG_VERSION) & 0xff;
+ rej->priv_data[5] = (IBNAL_MSG_VERSION >> 8) & 0xff;
+ rej->priv_data[6] = why;
+
+ cm_reject(cep, rej);
+}
+
+void
+kibnal_connreq_done(kib_conn_t *conn, int active, int status)
+{
+ struct list_head txs;
+ kib_peer_t *peer = conn->ibc_peer;
+ unsigned long flags;
+ kib_tx_t *tx;
- CDEBUG(D_NET, "Enter kibnal_connreq_done for conn=%p, active=%d, status=%d\n",
- conn, active, status);
+ CDEBUG(D_NET,"%d\n", status);
- /* passive connection has no connreq & vice versa */
- LASSERTF(!active == !(conn->ibc_connreq != NULL),
- "%d %p\n", active, conn->ibc_connreq);
+ /* Only the connd creates conns => single threaded */
+ LASSERT (!in_interrupt());
+ LASSERT (current == kibnal_data.kib_connd);
+ LASSERT (conn->ibc_state < IBNAL_CONN_ESTABLISHED);
if (active) {
- PORTAL_FREE (conn->ibc_connreq, sizeof (*conn->ibc_connreq));
- conn->ibc_connreq = NULL;
+ LASSERT (peer->ibp_connecting > 0);
+ } else {
+ LASSERT (peer->ibp_accepting > 0);
}
- write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
+ LIBCFS_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars));
+ conn->ibc_connvars = NULL;
- LASSERT (peer->ibp_connecting != 0);
-
- if (status == 0) {
- /* connection established... */
- KIB_ASSERT_CONN_STATE(conn, IBNAL_CONN_CONNECTING);
- conn->ibc_state = IBNAL_CONN_ESTABLISHED;
+ if (status != 0) {
+ /* failed to establish connection */
+ switch (conn->ibc_state) {
+ default:
+ LBUG();
- if (!kibnal_peer_active(peer)) {
- /* ...but peer deleted meantime */
- status = -ECONNABORTED;
- }
- } else {
- KIB_ASSERT_CONN_STATE_RANGE(conn, IBNAL_CONN_INIT_QP,
- IBNAL_CONN_CONNECTING);
- }
+ case IBNAL_CONN_ACTIVE_CHECK_REPLY:
+ /* got a connection reply but failed checks */
+ LASSERT (active);
+ kibnal_reject(conn->ibc_cep, IBNAL_REJECT_FATAL);
+ break;
- if (status == 0) {
- /* Everything worked! */
+ case IBNAL_CONN_ACTIVE_CONNECT:
+ LASSERT (active);
+ cm_cancel(conn->ibc_cep);
+ cfs_pause(cfs_time_seconds(1)/10);
+ /* cm_connect() failed immediately or
+ * callback returned failure */
+ break;
- peer->ibp_connecting--;
+ case IBNAL_CONN_ACTIVE_ARP:
+ LASSERT (active);
+ /* ibat_get_ib_data() failed immediately
+ * or callback returned failure */
+ break;
- /* +1 ref for ibc_list; caller(== CM)'s ref remains until
- * the IB_CM_IDLE callback */
- CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
- conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
- atomic_read (&conn->ibc_refcount));
- atomic_inc (&conn->ibc_refcount);
- list_add (&conn->ibc_list, &peer->ibp_conns);
-
- /* reset reconnect interval for next attempt */
- peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL;
-
- /* post blocked sends to the new connection */
- spin_lock (&conn->ibc_lock);
-
- while (!list_empty (&peer->ibp_tx_queue)) {
- tx = list_entry (peer->ibp_tx_queue.next,
- kib_tx_t, tx_list);
-
- list_del (&tx->tx_list);
+ case IBNAL_CONN_INIT:
+ break;
- /* +1 ref for each tx */
- CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
- conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
- atomic_read (&conn->ibc_refcount));
- atomic_inc (&conn->ibc_refcount);
- kibnal_queue_tx_locked (tx, conn);
+ case IBNAL_CONN_PASSIVE_WAIT:
+ LASSERT (!active);
+ /* cm_accept callback returned failure */
+ break;
}
-
- spin_unlock (&conn->ibc_lock);
- /* Nuke any dangling conns from a different peer instance... */
- kibnal_close_stale_conns_locked (conn->ibc_peer,
- conn->ibc_incarnation);
+ kibnal_peer_connect_failed(peer, active, status);
+ kibnal_conn_disconnected(conn);
+ return;
+ }
- write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
+ /* connection established */
+ write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
- /* queue up all the receives */
- for (i = 0; i < IBNAL_RX_MSGS; i++) {
- /* +1 ref for rx desc */
- CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
- conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
- atomic_read (&conn->ibc_refcount));
- atomic_inc (&conn->ibc_refcount);
+ if (active) {
+ LASSERT(conn->ibc_state == IBNAL_CONN_ACTIVE_RTU);
+ } else {
+ LASSERT(conn->ibc_state == IBNAL_CONN_PASSIVE_WAIT);
+ }
- CDEBUG(D_NET, "RX[%d] %p->%p\n",
- i, &conn->ibc_rxs[i], conn->ibc_rxs[i].rx_msg);
+ conn->ibc_last_send = jiffies;
+ kibnal_set_conn_state(conn, IBNAL_CONN_ESTABLISHED);
+ kibnal_peer_alive(peer);
- kibnal_post_rx (&conn->ibc_rxs[i], 0);
- }
+ /* Add conn to peer's list and nuke any dangling conns from a different
+ * peer instance... */
+ kibnal_conn_addref(conn); /* +1 ref for ibc_list */
+ list_add(&conn->ibc_list, &peer->ibp_conns);
+ kibnal_close_stale_conns_locked (peer, conn->ibc_incarnation);
+
+ if (!kibnal_peer_active(peer) || /* peer has been deleted */
+ conn->ibc_comms_error != 0 || /* comms error */
+ conn->ibc_disconnect) { /* need to disconnect */
+
+ /* start to shut down connection */
+ kibnal_close_conn_locked(conn, -ECONNABORTED);
- kibnal_check_sends (conn);
+ write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
+ kibnal_peer_connect_failed(peer, active, -ECONNABORTED);
return;
}
- /* connection failed */
- if (state == IBNAL_CONN_CONNECTING) {
- /* schedule for connd to close */
- kibnal_close_conn_locked (conn, status);
- } else {
- /* Don't have a CM comm_id; just wait for refs to drain */
- conn->ibc_state = IBNAL_CONN_DISCONNECTED;
- }
+ if (active)
+ peer->ibp_connecting--;
+ else
+ peer->ibp_accepting--;
+
+ /* grab pending txs while I have the lock */
+ list_add(&txs, &peer->ibp_tx_queue);
+ list_del_init(&peer->ibp_tx_queue);
- write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
+ peer->ibp_reconnect_interval = 0; /* OK to reconnect at any time */
- kibnal_peer_connect_failed (conn->ibc_peer, active, status);
+ write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
+
+ /* Schedule blocked txs */
+ spin_lock (&conn->ibc_lock);
+ while (!list_empty (&txs)) {
+ tx = list_entry (txs.next, kib_tx_t, tx_list);
+ list_del (&tx->tx_list);
+
+ kibnal_queue_tx_locked (tx, conn);
+ }
+ spin_unlock (&conn->ibc_lock);
+ kibnal_check_sends (conn);
- /* If we didn't establish the connection we don't have to pass
- * through the disconnect protocol before dropping the CM ref */
- if (state < IBNAL_CONN_CONNECTING)
- kibnal_put_conn (conn);
+ /* schedule blocked rxs */
+ kibnal_handle_early_rxs(conn);
}
-static int
-kibnal_accept (kib_conn_t **connp, cm_cep_handle_t *cep,
- ptl_nid_t nid, __u64 incarnation, int queue_depth)
+void
+kibnal_cm_callback(cm_cep_handle_t cep, cm_conn_data_t *cmdata, void *arg)
{
- kib_conn_t *conn = kibnal_create_conn();
- kib_peer_t *peer;
- kib_peer_t *peer2;
- unsigned long flags;
+ static cm_dreply_data_t drep; /* just zeroed space */
- if (conn == NULL)
- return (-ENOMEM);
-
- if (queue_depth != IBNAL_MSG_QUEUE_SIZE) {
- CERROR("Can't accept "LPX64": bad queue depth %d (%d expected)\n",
- nid, queue_depth, IBNAL_MSG_QUEUE_SIZE);
- atomic_dec (&conn->ibc_refcount);
- kibnal_destroy_conn(conn);
- return (-EPROTO);
- }
-
- /* assume 'nid' is a new peer */
- peer = kibnal_create_peer (nid);
- if (peer == NULL) {
- CDEBUG(D_NET, "--conn[%p] state %d -> "LPX64" (%d)\n",
- conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
- atomic_read (&conn->ibc_refcount));
- atomic_dec (&conn->ibc_refcount);
- kibnal_destroy_conn(conn);
- return (-ENOMEM);
- }
-
- write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
-
- peer2 = kibnal_find_peer_locked(nid);
- if (peer2 == NULL) {
- /* peer table takes my ref on peer */
- list_add_tail (&peer->ibp_list, kibnal_nid2peerlist(nid));
- } else {
- kib_peer_decref (peer);
- peer = peer2;
- }
+ kib_conn_t *conn = (kib_conn_t *)arg;
+ unsigned long flags;
- kib_peer_addref(peer); /* +1 ref for conn */
- peer->ibp_connecting++;
+ /* CAVEAT EMPTOR: tasklet context */
- write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
+ switch (cmdata->status) {
+ default:
+ LBUG();
- conn->ibc_peer = peer;
- conn->ibc_state = IBNAL_CONN_CONNECTING;
- /* conn->ibc_cep is set when cm_accept is called */
- conn->ibc_incarnation = incarnation;
- conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE;
+ case cm_event_disconn_request:
+ /* IBNAL_CONN_ACTIVE_RTU: gets closed in kibnal_connreq_done
+ * IBNAL_CONN_ESTABLISHED: I start it closing
+ * otherwise: it's closing anyway */
+ cm_disconnect(conn->ibc_cep, NULL, &drep);
+ cm_cancel(conn->ibc_cep);
- *connp = conn;
- return (0);
+ write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
+ LASSERT (!conn->ibc_disconnect);
+ conn->ibc_disconnect = 1;
+
+ switch (conn->ibc_state) {
+ default:
+ LBUG();
+
+ case IBNAL_CONN_ACTIVE_RTU:
+                /* kibnal_connreq_done is getting there; it'll see
+ * ibc_disconnect set... */
+ break;
+
+ case IBNAL_CONN_ESTABLISHED:
+ /* kibnal_connreq_done got there already; get
+ * disconnect going... */
+ kibnal_close_conn_locked(conn, 0);
+ break;
+
+ case IBNAL_CONN_DISCONNECT1:
+                /* kibnal_disconnect_conn is getting there; it'll see
+ * ibc_disconnect set... */
+ break;
+
+ case IBNAL_CONN_DISCONNECT2:
+ /* kibnal_disconnect_conn got there already; complete
+ * the disconnect. */
+ kibnal_schedule_conn(conn);
+ break;
+ }
+ write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
+ break;
+
+ case cm_event_disconn_timeout:
+ case cm_event_disconn_reply:
+ write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
+ LASSERT (conn->ibc_state == IBNAL_CONN_DISCONNECT2);
+ LASSERT (!conn->ibc_disconnect);
+ conn->ibc_disconnect = 1;
+
+ /* kibnal_disconnect_conn sent the disconnect request. */
+ kibnal_schedule_conn(conn);
+
+ write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
+ break;
+
+ case cm_event_connected:
+ case cm_event_conn_timeout:
+ case cm_event_conn_reject:
+ LASSERT (conn->ibc_state == IBNAL_CONN_PASSIVE_WAIT);
+ conn->ibc_connvars->cv_conndata = *cmdata;
+
+ kibnal_schedule_conn(conn);
+ break;
+ }
+
+ kibnal_conn_decref(conn); /* lose my ref */
}
-static void kibnal_move_qp_to_error(kib_conn_t *conn)
+void
+kibnal_check_passive_wait(kib_conn_t *conn)
{
- vv_qp_attr_t qp_attr;
- vv_return_t retval;
+ int rc;
+
+ switch (conn->ibc_connvars->cv_conndata.status) {
+ default:
+ LBUG();
- qp_attr.modify.qp_modify_into_state = vv_qp_state_error;
- qp_attr.modify.vv_qp_attr_mask = VV_QP_AT_STATE;
- qp_attr.modify.qp_type = vv_qp_type_r_conn;
+ case cm_event_connected:
+ kibnal_conn_addref(conn); /* ++ ref for CM callback */
+ rc = kibnal_set_qp_state(conn, vv_qp_state_rts);
+ if (rc != 0)
+ conn->ibc_comms_error = rc;
+ /* connection _has_ been established; it's just that we've had
+ * an error immediately... */
+ kibnal_connreq_done(conn, 0, 0);
+ break;
- retval = vv_qp_modify(kibnal_data.kib_hca, conn->ibc_qp, &qp_attr, &conn->ibc_qp_attrs);
- if (retval)
- CERROR("couldn't move qp into error state, error %d\n", retval);
+ case cm_event_conn_timeout:
+ kibnal_connreq_done(conn, 0, -ETIMEDOUT);
+ break;
+
+ case cm_event_conn_reject:
+ kibnal_connreq_done(conn, 0, -ECONNRESET);
+ break;
+ }
}
-static void kibnal_flush_pending(kib_conn_t *conn)
+void
+kibnal_recv_connreq(cm_cep_handle_t *cep, cm_request_data_t *cmreq)
{
- LIST_HEAD (zombies);
- struct list_head *tmp;
- struct list_head *nxt;
- kib_tx_t *tx;
- unsigned long flags;
- int done;
+ static kib_msg_t txmsg;
+ static kib_msg_t rxmsg;
+ static cm_reply_data_t reply;
- /* NB we wait until the connection has closed before completing
- * outstanding passive RDMAs so we can be sure the network can't
- * touch the mapped memory any more. */
- KIB_ASSERT_CONN_STATE(conn, IBNAL_CONN_DISCONNECTED);
+ kib_conn_t *conn = NULL;
+ int rc = 0;
+ int reason;
+ int rxmsgnob;
+ rwlock_t *g_lock = &kibnal_data.kib_global_lock;
+ kib_peer_t *peer;
+ kib_peer_t *peer2;
+ unsigned long flags;
+ kib_connvars_t *cv;
+ cm_return_t cmrc;
+ vv_return_t vvrc;
+
+ /* I'm the connd executing in thread context
+ * No concurrency problems with static data! */
+ LASSERT (!in_interrupt());
+ LASSERT (current == kibnal_data.kib_connd);
+
+ if (cmreq->sid != (__u64)(*kibnal_tunables.kib_service_number)) {
+ CERROR(LPX64" != IBNAL_SERVICE_NUMBER("LPX64")\n",
+ cmreq->sid, (__u64)(*kibnal_tunables.kib_service_number));
+ reason = IBNAL_REJECT_FATAL;
+ goto reject;
+ }
+
+ /* copy into rxmsg to avoid alignment issues */
+ rxmsgnob = MIN(cm_REQ_priv_data_len, sizeof(rxmsg));
+ memcpy(&rxmsg, cmreq->priv_data, rxmsgnob);
+
+ rc = kibnal_unpack_msg(&rxmsg, 0, rxmsgnob);
+ if (rc != 0) {
+ /* SILENT! kibnal_unpack_msg() complains if required */
+ reason = IBNAL_REJECT_FATAL;
+ goto reject;
+ }
- /* set the QP to the error state so that we get flush callbacks
- * on our posted receives which can then drop their conn refs */
- kibnal_move_qp_to_error(conn);
+ if (rxmsg.ibm_version != IBNAL_MSG_VERSION)
+ CWARN("Connection from %s: old protocol version 0x%x\n",
+ libcfs_nid2str(rxmsg.ibm_srcnid), rxmsg.ibm_version);
- spin_lock_irqsave (&conn->ibc_lock, flags);
+ if (rxmsg.ibm_type != IBNAL_MSG_CONNREQ) {
+ CERROR("Unexpected connreq msg type: %x from %s\n",
+ rxmsg.ibm_type, libcfs_nid2str(rxmsg.ibm_srcnid));
+ reason = IBNAL_REJECT_FATAL;
+ goto reject;
+ }
- /* grab passive RDMAs not waiting for the tx callback */
- list_for_each_safe (tmp, nxt, &conn->ibc_active_txs) {
- tx = list_entry (tmp, kib_tx_t, tx_list);
+ if (!lnet_ptlcompat_matchnid(kibnal_data.kib_ni->ni_nid,
+ rxmsg.ibm_dstnid)) {
+ CERROR("Can't accept %s: bad dst nid %s\n",
+ libcfs_nid2str(rxmsg.ibm_srcnid),
+ libcfs_nid2str(rxmsg.ibm_dstnid));
+ reason = IBNAL_REJECT_FATAL;
+ goto reject;
+ }
- LASSERT (tx->tx_passive_rdma ||
- !tx->tx_passive_rdma_wait);
+ if (rxmsg.ibm_u.connparams.ibcp_queue_depth != IBNAL_MSG_QUEUE_SIZE) {
+ CERROR("Can't accept %s: incompatible queue depth %d (%d wanted)\n",
+ libcfs_nid2str(rxmsg.ibm_srcnid),
+ rxmsg.ibm_u.connparams.ibcp_queue_depth,
+ IBNAL_MSG_QUEUE_SIZE);
+ reason = IBNAL_REJECT_FATAL;
+ goto reject;
+ }
- LASSERT (tx->tx_passive_rdma_wait ||
- tx->tx_sending != 0);
+ if (rxmsg.ibm_u.connparams.ibcp_max_msg_size > IBNAL_MSG_SIZE) {
+ CERROR("Can't accept %s: message size %d too big (%d max)\n",
+ libcfs_nid2str(rxmsg.ibm_srcnid),
+ rxmsg.ibm_u.connparams.ibcp_max_msg_size,
+ IBNAL_MSG_SIZE);
+ reason = IBNAL_REJECT_FATAL;
+ goto reject;
+ }
- /* still waiting for tx callback? */
- if (!tx->tx_passive_rdma_wait)
- continue;
+ if (rxmsg.ibm_u.connparams.ibcp_max_frags > IBNAL_MAX_RDMA_FRAGS) {
+ CERROR("Can't accept %s: max frags %d too big (%d max)\n",
+ libcfs_nid2str(rxmsg.ibm_srcnid),
+ rxmsg.ibm_u.connparams.ibcp_max_frags,
+ IBNAL_MAX_RDMA_FRAGS);
+ reason = IBNAL_REJECT_FATAL;
+ goto reject;
+ }
- tx->tx_status = -ECONNABORTED;
- tx->tx_passive_rdma_wait = 0;
- done = (tx->tx_sending == 0);
+ /* assume 'rxmsg.ibm_srcnid' is a new peer; create */
+ rc = kibnal_create_peer (&peer, rxmsg.ibm_srcnid);
+ if (rc != 0) {
+ CERROR("Can't create peer for %s\n",
+ libcfs_nid2str(rxmsg.ibm_srcnid));
+ reason = IBNAL_REJECT_NO_RESOURCES;
+ goto reject;
+ }
- if (!done)
- continue;
+ write_lock_irqsave(g_lock, flags);
- list_del (&tx->tx_list);
- list_add (&tx->tx_list, &zombies);
+ if (kibnal_data.kib_listen_handle == NULL) {
+ write_unlock_irqrestore(g_lock, flags);
+
+ CWARN ("Shutdown has started, rejecting connreq from %s\n",
+ libcfs_nid2str(rxmsg.ibm_srcnid));
+ kibnal_peer_decref(peer);
+ reason = IBNAL_REJECT_FATAL;
+ goto reject;
}
- /* grab all blocked transmits */
- list_for_each_safe (tmp, nxt, &conn->ibc_tx_queue) {
- tx = list_entry (tmp, kib_tx_t, tx_list);
-
- list_del (&tx->tx_list);
- list_add (&tx->tx_list, &zombies);
+ peer2 = kibnal_find_peer_locked(rxmsg.ibm_srcnid);
+ if (peer2 != NULL) {
+ /* tie-break connection race in favour of the higher NID */
+ if (peer2->ibp_connecting != 0 &&
+ rxmsg.ibm_srcnid < kibnal_data.kib_ni->ni_nid) {
+ write_unlock_irqrestore(g_lock, flags);
+
+ CWARN("Conn race %s\n",
+ libcfs_nid2str(rxmsg.ibm_srcnid));
+
+ kibnal_peer_decref(peer);
+ reason = IBNAL_REJECT_CONN_RACE;
+ goto reject;
+ }
+
+ peer2->ibp_accepting++;
+ kibnal_peer_addref(peer2);
+
+ write_unlock_irqrestore(g_lock, flags);
+ kibnal_peer_decref(peer);
+ peer = peer2;
+ } else {
+ /* Brand new peer */
+ LASSERT (peer->ibp_accepting == 0);
+ peer->ibp_accepting = 1;
+
+ kibnal_peer_addref(peer);
+ list_add_tail(&peer->ibp_list, kibnal_nid2peerlist(rxmsg.ibm_srcnid));
+
+ write_unlock_irqrestore(g_lock, flags);
}
-
- spin_unlock_irqrestore (&conn->ibc_lock, flags);
- while (!list_empty(&zombies)) {
- tx = list_entry (zombies.next, kib_tx_t, tx_list);
+ conn = kibnal_create_conn(cep);
+ if (conn == NULL) {
+ CERROR("Can't create conn for %s\n",
+ libcfs_nid2str(rxmsg.ibm_srcnid));
+ kibnal_peer_connect_failed(peer, 0, -ENOMEM);
+ kibnal_peer_decref(peer);
+ reason = IBNAL_REJECT_NO_RESOURCES;
+ goto reject;
+ }
- list_del(&tx->tx_list);
- kibnal_tx_done (tx);
+ conn->ibc_version = rxmsg.ibm_version;
+
+ conn->ibc_peer = peer; /* conn takes over my ref */
+ conn->ibc_incarnation = rxmsg.ibm_srcstamp;
+ conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE;
+ conn->ibc_reserved_credits = IBNAL_MSG_QUEUE_SIZE;
+ LASSERT (conn->ibc_credits + conn->ibc_reserved_credits
+ <= IBNAL_RX_MSGS);
+
+ cv = conn->ibc_connvars;
+
+ cv->cv_txpsn = cmreq->cep_data.start_psn;
+ cv->cv_remote_qpn = cmreq->cep_data.qpn;
+ cv->cv_path = cmreq->path_data.path;
+ cv->cv_rnr_count = cmreq->cep_data.rtr_retry_cnt;
+ // XXX cmreq->cep_data.retry_cnt;
+ cv->cv_port = cmreq->cep_data.local_port_num;
+
+ vvrc = gid2gid_index(kibnal_data.kib_hca, cv->cv_port,
+ &cv->cv_path.sgid, &cv->cv_sgid_index);
+ if (vvrc != vv_return_ok) {
+ CERROR("gid2gid_index failed for %s: %d\n",
+ libcfs_nid2str(rxmsg.ibm_srcnid), vvrc);
+ rc = -EIO;
+ reason = IBNAL_REJECT_FATAL;
+ goto reject;
+ }
+
+ vvrc = pkey2pkey_index(kibnal_data.kib_hca, cv->cv_port,
+ cv->cv_path.pkey, &cv->cv_pkey_index);
+ if (vvrc != vv_return_ok) {
+ CERROR("pkey2pkey_index failed for %s: %d\n",
+ libcfs_nid2str(rxmsg.ibm_srcnid), vvrc);
+ rc = -EIO;
+ reason = IBNAL_REJECT_FATAL;
+ goto reject;
+ }
+
+ rc = kibnal_set_qp_state(conn, vv_qp_state_init);
+ if (rc != 0) {
+ reason = IBNAL_REJECT_FATAL;
+ goto reject;
}
-}
-static void
-kibnal_reject (cm_cep_handle_t cep, cm_rej_code_t reason)
-{
- cm_reject_data_t *rej;
+ rc = kibnal_post_receives(conn);
+ if (rc != 0) {
+ CERROR("Can't post receives for %s\n",
+ libcfs_nid2str(rxmsg.ibm_srcnid));
+ reason = IBNAL_REJECT_FATAL;
+ goto reject;
+ }
- PORTAL_ALLOC(rej, sizeof(*rej));
- if (rej == NULL) /* PORTAL_ALLOC() will CERROR on failure */
- return;
+ rc = kibnal_set_qp_state(conn, vv_qp_state_rtr);
+ if (rc != 0) {
+ reason = IBNAL_REJECT_FATAL;
+ goto reject;
+ }
+
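+        /* build the CM reply: QP/PSN parameters, plus our CONNACK message
+         * carried in the private data */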
+ memset(&reply, 0, sizeof(reply));
+ reply.qpn = cv->cv_local_qpn;
+ reply.qkey = IBNAL_QKEY;
+ reply.start_psn = cv->cv_rxpsn;
+ reply.arb_initiator_depth = IBNAL_ARB_INITIATOR_DEPTH;
+ reply.arb_resp_res = IBNAL_ARB_RESP_RES;
+ reply.failover_accepted = IBNAL_FAILOVER_ACCEPTED;
+ reply.rnr_retry_count = cv->cv_rnr_count;
+ reply.targ_ack_delay = kibnal_data.kib_hca_attrs.ack_delay;
+
+ /* setup txmsg... */
+ memset(&txmsg, 0, sizeof(txmsg));
+ kibnal_init_msg(&txmsg, IBNAL_MSG_CONNACK,
+ sizeof(txmsg.ibm_u.connparams));
+ LASSERT (txmsg.ibm_nob <= cm_REP_priv_data_len);
+ txmsg.ibm_u.connparams.ibcp_queue_depth = IBNAL_MSG_QUEUE_SIZE;
+ txmsg.ibm_u.connparams.ibcp_max_msg_size = IBNAL_MSG_SIZE;
+ txmsg.ibm_u.connparams.ibcp_max_frags = IBNAL_MAX_RDMA_FRAGS;
+ kibnal_pack_msg(&txmsg, conn->ibc_version,
+ 0, rxmsg.ibm_srcnid, rxmsg.ibm_srcstamp, 0);
+
+ /* ...and copy into reply to avoid alignment issues */
+ memcpy(&reply.priv_data, &txmsg, txmsg.ibm_nob);
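+ /* (priv_data is a raw byte array with no alignment
+ * guarantee, so the message is built in the aligned
+ * txmsg and then moved as bytes) */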
+
+ kibnal_set_conn_state(conn, IBNAL_CONN_PASSIVE_WAIT);
+
+ cmrc = cm_accept(conn->ibc_cep, &reply, NULL,
+ kibnal_cm_callback, conn);
+
+ if (cmrc == cm_stat_success)
+ return; /* callback has got my ref on conn */
+
+ /* back out state change (no callback happening) */
+ kibnal_set_conn_state(conn, IBNAL_CONN_INIT);
+ rc = -EIO;
+ reason = IBNAL_REJECT_FATAL;
+
+ reject:
+ CDEBUG(D_NET, "Rejecting connreq from %s\n",
+ libcfs_nid2str(rxmsg.ibm_srcnid));
- rej->reason = reason;
- cm_reject(cep, rej);
- PORTAL_FREE(rej, sizeof(*rej));
+ kibnal_reject(cep, reason);
+
+ if (conn != NULL) {
+ LASSERT (rc != 0);
+ kibnal_connreq_done(conn, 0, rc);
+ } else {
+ cm_destroy_cep(cep);
+ }
}
-static void get_av_from_path(ib_path_record_v2_t *path, vv_add_vec_t *av)
+void
+kibnal_listen_callback(cm_cep_handle_t cep, cm_conn_data_t *data, void *arg)
{
- av->service_level = path->sl;
- av->grh_flag = 0; /* TODO: correct? */
- av->dlid = path->dlid;
- av->pmtu = path->mtu;
-
- /* From sdp-hca-params.h. */
- switch(path->rate) {
- case 2:
- av->max_static_rate = 1;
- break;
- case 3:
- case 4:
- default:
- av->max_static_rate = 0;
- break;
- }
+ cm_request_data_t *cmreq = &data->data.request;
+ kib_pcreq_t *pcr;
+ unsigned long flags;
- av->l_ack_timeout = IBNAL_ACK_TIMEOUT;
- av->retry_count = IBNAL_RETRY;
- av->rnr_retry_count = IBNAL_RNR_RETRY;
- av->source_path_bit = 0;
+ LASSERT (arg == NULL);
- av->global_dest.flow_lable = path->flow_label;
- av->global_dest.hope_limit = path->hop_limut;
- av->global_dest.traffic_class = path->traffic_class;
- av->global_dest.s_gid_index = 0;
- av->global_dest.d_gid = path->dgid;
-};
+ if (data->status != cm_event_conn_request) {
+ CERROR("status %d is not cm_event_conn_request\n",
+ data->status);
+ return;
+ }
-static vv_return_t
-kibnal_qp_rts(vv_qp_h_t qp_handle, __u32 qpn, __u8 resp_res,
- ib_path_record_v2_t *path, __u8 init_depth, __u32 send_psn)
-{
- vv_qp_attr_t qp_attr;
- vv_return_t retval;
+ LIBCFS_ALLOC_ATOMIC(pcr, sizeof(*pcr));
+ if (pcr == NULL) {
+ CERROR("Can't allocate passive connreq\n");
- ENTRY;
+ kibnal_reject(cep, IBNAL_REJECT_NO_RESOURCES);
+ cm_destroy_cep(cep);
+ return;
+ }
-#if 1
- /* TODO - Hack. I don't know whether I get bad values from the
- * stack or if I'm using the wrong names. */
- resp_res = 8;
- init_depth = 8;
-#endif
+ pcr->pcr_cep = cep;
+ pcr->pcr_cmreq = *cmreq;
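+ /* take a copy: cmreq points into CM's own buffer,
+ * which can't be assumed valid after this callback
+ * returns, and the connd processes it later */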
- /* RTR */
- qp_attr.modify.qp_modify_into_state = vv_qp_state_rtr;
- qp_attr.modify.vv_qp_attr_mask =
- VV_QP_AT_STATE |
- VV_QP_AT_ADD_VEC |
- VV_QP_AT_DEST_QP |
- VV_QP_AT_R_PSN |
- VV_QP_AT_RESP_RDMA_ATOM_OUT_NUM |
- VV_QP_AT_MIN_RNR_NAK_T | VV_QP_AT_OP_F;
-
- qp_attr.modify.qp_type = vv_qp_type_r_conn;
-
- get_av_from_path(path, &qp_attr.modify.params.rtr.remote_add_vec);
- qp_attr.modify.params.rtr.destanation_qp = qpn;
- qp_attr.modify.params.rtr.receive_psn = IBNAL_STARTING_PSN;
- qp_attr.modify.params.rtr.responder_rdma_r_atom_num = resp_res;
- qp_attr.modify.params.rtr.opt_min_rnr_nak_timer = 16; /* 20 ms */
-
- /* For now, force MTU to 1KB (Voltaire's advice). */
- qp_attr.modify.params.rtr.remote_add_vec.pmtu = vv_mtu_1024;
-
- retval = vv_qp_modify(kibnal_data.kib_hca, qp_handle, &qp_attr, NULL);
- if (retval) {
- CERROR("Cannot modify QP to RTR: %d\n", retval);
- RETURN(retval);
- }
-
- /* RTS */
- qp_attr.modify.qp_modify_into_state = vv_qp_state_rts;
- qp_attr.modify.vv_qp_attr_mask =
- VV_QP_AT_STATE |
- VV_QP_AT_L_ACK_T |
- VV_QP_AT_RETRY_NUM |
- VV_QP_AT_RNR_NUM |
- VV_QP_AT_S_PSN |
- VV_QP_AT_DEST_RDMA_ATOM_OUT_NUM;
- qp_attr.modify.qp_type = vv_qp_type_r_conn;
-
- qp_attr.modify.params.rts.local_ack_timeout = path->pkt_life_time + 2; /* 2 or 1? */
- qp_attr.modify.params.rts.retry_num = IBNAL_RETRY;
- qp_attr.modify.params.rts.rnr_num = IBNAL_RNR_RETRY;
- qp_attr.modify.params.rts.send_psn = send_psn;
- qp_attr.modify.params.rts.dest_out_rdma_r_atom_num = init_depth;
- qp_attr.modify.params.rts.flow_control = 1; /* Stack does not use it. */
-
- retval = vv_qp_modify(kibnal_data.kib_hca, qp_handle, &qp_attr, NULL);
- if (retval) {
- CERROR("Cannot modify QP to RTS: %d\n", retval);
- }
-
- RETURN(retval);
+ spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
+
+ list_add_tail(&pcr->pcr_list, &kibnal_data.kib_connd_pcreqs);
+ wake_up(&kibnal_data.kib_connd_waitq);
+ spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
}
-static void
-kibnal_connect_reply (cm_cep_handle_t cep, cm_conn_data_t *info, kib_conn_t *conn)
+
+void
+kibnal_active_connect_callback (cm_cep_handle_t cep, cm_conn_data_t *cd,
+ void *arg)
{
- vv_hca_attrib_t *ca_attr = &kibnal_data.kib_hca_attrs;
- kib_wire_connreq_t *wcr;
- cm_reply_data_t *rep = &info->data.reply;
- cm_rej_code_t reason;
- vv_return_t retval;
-
- wcr = (kib_wire_connreq_t *)info->data.reply.priv_data;
-
- if (wcr->wcr_magic != cpu_to_le32(IBNAL_MSG_MAGIC)) {
- CERROR ("Can't connect "LPX64": bad magic %08x\n",
- conn->ibc_peer->ibp_nid, le32_to_cpu(wcr->wcr_magic));
- GOTO(reject, reason = cm_rej_code_usr_rej);
- }
-
- if (wcr->wcr_version != cpu_to_le16(IBNAL_MSG_VERSION)) {
- CERROR ("Can't connect "LPX64": bad version %d\n",
- conn->ibc_peer->ibp_nid, le16_to_cpu(wcr->wcr_magic));
- GOTO(reject, reason = cm_rej_code_usr_rej);
- }
-
- if (wcr->wcr_queue_depth != cpu_to_le16(IBNAL_MSG_QUEUE_SIZE)) {
- CERROR ("Can't connect "LPX64": bad queue depth %d\n",
- conn->ibc_peer->ibp_nid,
- le16_to_cpu(wcr->wcr_queue_depth));
- GOTO(reject, reason = cm_rej_code_usr_rej);
- }
-
- if (le64_to_cpu(wcr->wcr_nid) != conn->ibc_peer->ibp_nid) {
- CERROR ("Unexpected NID "LPX64" from "LPX64"\n",
- le64_to_cpu(wcr->wcr_nid), conn->ibc_peer->ibp_nid);
- GOTO(reject, reason = cm_rej_code_usr_rej);
- }
-
- CDEBUG(D_NET, "Connection %p -> "LPX64" REP_RECEIVED.\n",
- conn, conn->ibc_peer->ibp_nid);
-
- conn->ibc_incarnation = le64_to_cpu(wcr->wcr_incarnation);
- conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE;
+ /* CAVEAT EMPTOR: tasklet context */
+ kib_conn_t *conn = (kib_conn_t *)arg;
+ kib_connvars_t *cv = conn->ibc_connvars;
- retval = kibnal_qp_rts(conn->ibc_qp, rep->qpn,
- min_t(__u8, rep->arb_initiator_depth,
- ca_attr->max_read_atom_qp_outstanding),
- &conn->ibc_connreq->cr_path,
- min_t(__u8, rep->arb_resp_res,
- ca_attr->max_qp_depth_for_init_read_atom),
- rep->start_psn);
-
- if (retval) {
- CERROR("Connection %p -> "LPX64" QP RTS/RTR failed: %d\n",
- conn, conn->ibc_peer->ibp_nid, retval);
- GOTO(reject, reason = cm_rej_code_no_qp);
- }
-
- dump_qp(conn);
-
- /* the callback arguments are ignored for an active accept */
- /* TODO: memset cmrtu? */
- retval = cm_accept(cep, NULL, &conn->ibc_connreq->cr_cm_rtu, kibnal_cm_callback, conn);
- if (retval) {
- CERROR("Connection %p -> "LPX64" CMAccept RTU failed: %d\n",
- conn, conn->ibc_peer->ibp_nid, retval);
- kibnal_connreq_done (conn, 1, -ECONNABORTED);
- /* XXX don't call reject after accept fails? */
- return;
+ LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_CONNECT);
+ cv->cv_conndata = *cd;
+
+ kibnal_schedule_conn(conn);
+ kibnal_conn_decref(conn);
+}
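+/* NB the callback above runs in tasklet context where blocking is forbidden:
+ * it only stashes the event and hands the real work to the connd thread */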
+
+void
+kibnal_connect_conn (kib_conn_t *conn)
+{
+ static cm_request_data_t cmreq;
+ static kib_msg_t msg;
+
+ kib_connvars_t *cv = conn->ibc_connvars;
+ kib_peer_t *peer = conn->ibc_peer;
+ cm_return_t cmrc;
+
+ /* Only called by connd => statics OK */
+ LASSERT (!in_interrupt());
+ LASSERT (current == kibnal_data.kib_connd);
+ LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_ARP);
+
+ memset(&cmreq, 0, sizeof(cmreq));
+
+ cmreq.sid = (__u64)(*kibnal_tunables.kib_service_number);
+
+ cmreq.cep_data.ca_guid = kibnal_data.kib_hca_attrs.guid;
+ cmreq.cep_data.qpn = cv->cv_local_qpn;
+ cmreq.cep_data.retry_cnt = *kibnal_tunables.kib_retry_cnt;
+ cmreq.cep_data.rtr_retry_cnt = *kibnal_tunables.kib_rnr_cnt;
+ cmreq.cep_data.start_psn = cv->cv_rxpsn;
+ cmreq.cep_data.end_to_end_flow_ctrl = IBNAL_EE_FLOW_CNT;
+ // XXX ack_timeout?
+ // offered_resp_res
+ // offered_initiator_depth
+
+ cmreq.path_data.subn_local = IBNAL_LOCAL_SUB;
+ cmreq.path_data.path = cv->cv_path;
+
+ /* setup msg... */
+ memset(&msg, 0, sizeof(msg));
+ kibnal_init_msg(&msg, IBNAL_MSG_CONNREQ, sizeof(msg.ibm_u.connparams));
+ LASSERT(msg.ibm_nob <= cm_REQ_priv_data_len);
+ msg.ibm_u.connparams.ibcp_queue_depth = IBNAL_MSG_QUEUE_SIZE;
+ msg.ibm_u.connparams.ibcp_max_msg_size = IBNAL_MSG_SIZE;
+ msg.ibm_u.connparams.ibcp_max_frags = IBNAL_MAX_RDMA_FRAGS;
+ kibnal_pack_msg(&msg, conn->ibc_version, 0, peer->ibp_nid, 0, 0);
+
+ if (the_lnet.ln_testprotocompat != 0) {
+ /* single-shot proto check */
+ LNET_LOCK();
+ if ((the_lnet.ln_testprotocompat & 1) != 0) {
+ msg.ibm_version++;
+ the_lnet.ln_testprotocompat &= ~1;
+ }
+ if ((the_lnet.ln_testprotocompat & 2) != 0) {
+ msg.ibm_magic = LNET_PROTO_MAGIC;
+ the_lnet.ln_testprotocompat &= ~2;
+ }
+ LNET_UNLOCK();
}
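+ /* a corrupted version or magic above makes the peer
+ * reject this REQ, exercising the reject handling in
+ * kibnal_check_connreply() exactly once */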
- CDEBUG(D_NET, "Connection %p -> "LPX64" Established\n",
- conn, conn->ibc_peer->ibp_nid);
+ /* ...and copy into cmreq to avoid alignment issues */
+ memcpy(&cmreq.priv_data, &msg, msg.ibm_nob);
- kibnal_connreq_done (conn, 1, 0);
+ CDEBUG(D_NET, "Connecting %p to %s\n", conn,
+ libcfs_nid2str(peer->ibp_nid));
- return;
+ kibnal_conn_addref(conn); /* ++ref for CM callback */
+ kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_CONNECT);
-reject:
- kibnal_reject(cep, reason);
- kibnal_connreq_done (conn, 1, -EPROTO);
+ cmrc = cm_connect(conn->ibc_cep, &cmreq,
+ kibnal_active_connect_callback, conn);
+ if (cmrc == cm_stat_success) {
+ CDEBUG(D_NET, "connection REQ sent to %s\n",
+ libcfs_nid2str(peer->ibp_nid));
+ return;
+ }
+
+ CERROR ("Connect %s failed: %d\n", libcfs_nid2str(peer->ibp_nid), cmrc);
+ kibnal_conn_decref(conn); /* drop callback's ref */
+ kibnal_connreq_done(conn, 1, -EHOSTUNREACH);
}
-/* Off level CM callback */
-static void
-_kibnal_cm_callback(void * arg)
+void
+kibnal_reconnect (kib_conn_t *conn, int why)
{
- struct cm_off_level *cm_tq = arg;
- cm_cep_handle_t cep = cm_tq->cep;
- cm_conn_data_t *info = cm_tq->info;
- kib_conn_t *conn = cm_tq->conn;
- vv_return_t retval;
+ kib_peer_t *peer = conn->ibc_peer;
+ int retry;
+ unsigned long flags;
+ cm_return_t cmrc;
+ cm_cep_handle_t cep;
- CDEBUG(D_NET, "CM event 0x%x for CEP %p\n", info->status, cep);
+ LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_CONNECT);
- PORTAL_FREE(cm_tq, sizeof(*cm_tq));
+ read_lock_irqsave(&kibnal_data.kib_global_lock, flags);
- /* Established Connection Notifier */
- switch (info->status) {
- case cm_event_connected:
- CDEBUG(D_NET, "Connection %p -> "LPX64" Established\n",
- conn, conn->ibc_peer->ibp_nid);
- kibnal_connreq_done (conn, 0, 0);
- break;
+ LASSERT (peer->ibp_connecting > 0); /* 'conn' at least */
- case cm_event_conn_timeout:
- case cm_event_conn_reject:
- /* TODO: be sure this is called only if REQ times out. */
- CERROR("connection timed out\n");
- LASSERT(conn->ibc_state == IBNAL_CONN_CONNECTING);
- conn->ibc_state = IBNAL_CONN_INIT_QP;
- kibnal_connreq_done (conn, 1, -EINVAL);
- break;
+ /* retry connection if it's still needed and no other connection
+ * attempts (active or passive) are in progress.
+ * Immediate reconnect is required, so I don't even look at the
+ * reconnection timeout etc */
- case cm_event_conn_reply:
- kibnal_connect_reply(cep, info, conn);
- break;
+ retry = (!list_empty(&peer->ibp_tx_queue) &&
+ peer->ibp_connecting == 1 &&
+ peer->ibp_accepting == 0);
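+ /* i.e. traffic is still queued for her, mine is the
+ * only connection attempt in flight, and no passive
+ * attempt has overtaken it */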
- case cm_event_disconn_request:
- /* XXX lock around these state management bits? */
- if (conn->ibc_state == IBNAL_CONN_ESTABLISHED)
- kibnal_close_conn (conn, 0);
- conn->ibc_state = IBNAL_CONN_DREP;
-
- retval = cm_disconnect(conn->ibc_cep, NULL, &kibnal_data.cm_data.drep_data);
- if (retval)
- CERROR("disconnect rep failed: %d\n", retval);
-
- /* Fall through ... */
-
- /* these both guarantee that no more cm callbacks will occur */
- case cm_event_disconnected: /* aka cm_event_disconn_timeout */
- case cm_event_disconn_reply:
- CDEBUG(D_NET, "Connection %p -> "LPX64" disconnect done.\n",
- conn, conn->ibc_peer->ibp_nid);
+ read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
- conn->ibc_state = IBNAL_CONN_DISCONNECTED;
- kibnal_flush_pending(conn);
- kibnal_put_conn(conn); /* Lose CM's ref */
- break;
+ if (!retry) {
+ kibnal_connreq_done(conn, 1, why);
+ return;
+ }
- default:
- CERROR("unknown status %d on Connection %p -> "LPX64"\n",
- info->status, conn, conn->ibc_peer->ibp_nid);
- LBUG();
- break;
+ cep = cm_create_cep(cm_cep_transp_rc);
+ if (cep == NULL) {
+ CERROR("Can't create new CEP\n");
+ kibnal_connreq_done(conn, 1, -ENOMEM);
+ return;
}
- return;
+ cmrc = cm_cancel(conn->ibc_cep);
+ LASSERT (cmrc == cm_stat_success);
+ cmrc = cm_destroy_cep(conn->ibc_cep);
+ LASSERT (cmrc == cm_stat_success);
+
+ conn->ibc_cep = cep;
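+ /* the old CEP can't be reused after the reject or
+ * stale event: it is cancelled and destroyed above,
+ * and the fresh one swapped in before re-issuing
+ * the REQ */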
+
+ /* reuse conn; no need to peer->ibp_connecting++ */
+ kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_ARP);
+ kibnal_connect_conn(conn);
}
-static void
-kibnal_cm_callback(cm_cep_handle_t cep, cm_conn_data_t *info, void *arg)
-{
- struct cm_off_level *cm_tq;
+void
+kibnal_check_connreply (kib_conn_t *conn)
+{
+ static cm_rtu_data_t rtu;
+ static kib_msg_t msg;
+
+ kib_connvars_t *cv = conn->ibc_connvars;
+ cm_reply_data_t *reply = &cv->cv_conndata.data.reply;
+ kib_peer_t *peer = conn->ibc_peer;
+ int msgnob;
+ cm_return_t cmrc;
+ unsigned long flags;
+ int rc;
+
+ /* Only called by connd => statics OK */
+ LASSERT (!in_interrupt());
+ LASSERT (current == kibnal_data.kib_connd);
+ LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_CONNECT);
- LASSERT(cep);
- LASSERT(info);
+ if (cv->cv_conndata.status == cm_event_conn_reply) {
+ cv->cv_remote_qpn = reply->qpn;
+ cv->cv_txpsn = reply->start_psn;
+ // XXX reply->targ_ack_delay;
+ cv->cv_rnr_count = reply->rnr_retry_count;
- CDEBUG(D_NET, "CM event 0x%x for CEP %p\n", info->status, cep);
+ kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_CHECK_REPLY);
- PORTAL_ALLOC_ATOMIC(cm_tq, sizeof(*cm_tq));
- if (cm_tq == NULL) {
- CERROR("Failed to allocate a CM off level structure\n");
- return;
- }
+ /* copy into msg to avoid alignment issues */
+ msgnob = MIN(cm_REP_priv_data_len, sizeof(msg));
+ memcpy(&msg, &reply->priv_data, msgnob);
- cm_tq->tq.sync = 0;
- cm_tq->tq.routine = _kibnal_cm_callback;
- cm_tq->tq.data = cm_tq;
+ rc = kibnal_unpack_msg(&msg, conn->ibc_version, msgnob);
+ if (rc != 0) {
+ CERROR("Can't unpack reply from %s\n",
+ libcfs_nid2str(peer->ibp_nid));
+ kibnal_connreq_done(conn, 1, rc);
+ return;
+ }
- cm_tq->cep = cep;
- cm_tq->info = info;
- cm_tq->conn = (kib_conn_t *)arg;
+ if (msg.ibm_type != IBNAL_MSG_CONNACK) {
+ CERROR("Unexpected message type %d from %s\n",
+ msg.ibm_type, libcfs_nid2str(peer->ibp_nid));
+ kibnal_connreq_done(conn, 1, -EPROTO);
+ return;
+ }
- schedule_task(&cm_tq->tq);
-}
+ if (msg.ibm_u.connparams.ibcp_queue_depth != IBNAL_MSG_QUEUE_SIZE) {
+ CERROR("%s has incompatible queue depth %d(%d wanted)\n",
+ libcfs_nid2str(peer->ibp_nid),
+ msg.ibm_u.connparams.ibcp_queue_depth,
+ IBNAL_MSG_QUEUE_SIZE);
+ kibnal_connreq_done(conn, 1, -EPROTO);
+ return;
+ }
-static int
-kibnal_set_cm_flags(cm_cep_handle_t cep)
-{
-#ifdef TODO
-voltaire cm doesnot appear to have that functionnality
- FSTATUS frc;
- uint32 value = 1;
+ if (msg.ibm_u.connparams.ibcp_max_msg_size > IBNAL_MSG_SIZE) {
+ CERROR("%s max message size %d too big (%d max)\n",
+ libcfs_nid2str(peer->ibp_nid),
+ msg.ibm_u.connparams.ibcp_max_msg_size,
+ IBNAL_MSG_SIZE);
+ kibnal_connreq_done(conn, 1, -EPROTO);
+ return;
+ }
- frc = iibt_cm_modify_cep(cep, CM_FLAG_TIMEWAIT_CALLBACK,
- (char *)&value, sizeof(value), 0);
- if (frc != FSUCCESS) {
- CERROR("error setting timeout callback: %d\n", frc);
- return -1;
- }
+ if (msg.ibm_u.connparams.ibcp_max_frags > IBNAL_MAX_RDMA_FRAGS) {
+ CERROR("%s max frags %d too big (%d max)\n",
+ libcfs_nid2str(peer->ibp_nid),
+ msg.ibm_u.connparams.ibcp_max_frags,
+ IBNAL_MAX_RDMA_FRAGS);
+ kibnal_connreq_done(conn, 1, -EPROTO);
+ return;
+ }
-#if 0
- frc = iibt_cm_modify_cep(cep, CM_FLAG_ASYNC_ACCEPT, (char *)&value,
- sizeof(value), 0);
- if (frc != FSUCCESS) {
- CERROR("error setting async accept: %d\n", frc);
- return -1;
- }
-#endif
-#endif
+ read_lock_irqsave(&kibnal_data.kib_global_lock, flags);
+ if (lnet_ptlcompat_matchnid(kibnal_data.kib_ni->ni_nid,
+ msg.ibm_dstnid) &&
+ msg.ibm_dststamp == kibnal_data.kib_incarnation)
+ rc = 0;
+ else
+ rc = -ESTALE;
+ read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
+ if (rc != 0) {
+ CERROR("Stale connection reply from %s\n",
+ libcfs_nid2str(peer->ibp_nid));
+ kibnal_connreq_done(conn, 1, rc);
+ return;
+ }
- return 0;
-}
+ conn->ibc_incarnation = msg.ibm_srcstamp;
+ conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE;
+ conn->ibc_reserved_credits = IBNAL_MSG_QUEUE_SIZE;
+ LASSERT (conn->ibc_credits + conn->ibc_reserved_credits
+ <= IBNAL_RX_MSGS);
-/* Off level listen callback */
-static void
-_kibnal_listen_callback(void *arg)
-{
- struct cm_off_level *cm_tq = arg;
- cm_cep_handle_t cep = cm_tq->cep;
- cm_conn_data_t *info = cm_tq->info;
- vv_hca_attrib_t *ca_attr = &kibnal_data.kib_hca_attrs;
- cm_request_data_t *req;
- cm_reply_data_t *rep = NULL;
- kib_wire_connreq_t *wcr;
- kib_conn_t *conn = NULL;
- cm_rej_code_t reason = 0;
- int rc = 0;
- vv_return_t retval;
- vv_qp_attr_t *query;
- void *qp_context;
+ rc = kibnal_post_receives(conn);
+ if (rc != 0) {
+ CERROR("Can't post receives for %s\n",
+ libcfs_nid2str(peer->ibp_nid));
+ kibnal_connreq_done(conn, 1, rc);
+ return;
+ }
- LASSERT(cep);
- LASSERT(info);
+ rc = kibnal_set_qp_state(conn, vv_qp_state_rtr);
+ if (rc != 0) {
+ kibnal_connreq_done(conn, 1, rc);
+ return;
+ }
- CDEBUG(D_NET, "LISTEN status 0x%x for CEP %p\n", info->status, cep);
+ rc = kibnal_set_qp_state(conn, vv_qp_state_rts);
+ if (rc != 0) {
+ kibnal_connreq_done(conn, 1, rc);
+ return;
+ }
- PORTAL_FREE(cm_tq, sizeof(*cm_tq));
+ kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_RTU);
+ kibnal_conn_addref(conn); /* ++for CM callback */
- req = &info->data.request;
- wcr = (kib_wire_connreq_t *)req->priv_data;
+ memset(&rtu, 0, sizeof(rtu));
+ cmrc = cm_accept(conn->ibc_cep, NULL, &rtu,
+ kibnal_cm_callback, conn);
+ if (cmrc == cm_stat_success) {
+ /* Now I'm racing with disconnect signalled by
+ * kibnal_cm_callback */
+ kibnal_connreq_done(conn, 1, 0);
+ return;
+ }
- CDEBUG(D_NET, "%d from "LPX64"\n", info->status,
- le64_to_cpu(wcr->wcr_nid));
-
-#ifdef TODO
- is there an equivalent?
- if (info->status == FCM_CONNECT_CANCEL)
+ CERROR("cm_accept %s failed: %d\n",
+ libcfs_nid2str(peer->ibp_nid), cmrc);
+ /* Back out of RTU: no callback coming */
+ kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_CHECK_REPLY);
+ kibnal_conn_decref(conn);
+ kibnal_connreq_done(conn, 1, -EIO);
return;
-#endif
-
- LASSERT (info->status == cm_event_conn_request);
-
- if (wcr->wcr_magic != cpu_to_le32(IBNAL_MSG_MAGIC)) {
- CERROR ("Can't accept: bad magic %08x\n",
- le32_to_cpu(wcr->wcr_magic));
- GOTO(out, reason = cm_rej_code_usr_rej);
- }
-
- if (wcr->wcr_version != cpu_to_le16(IBNAL_MSG_VERSION)) {
- CERROR ("Can't accept: bad version %d\n",
- le16_to_cpu(wcr->wcr_magic));
- GOTO(out, reason = cm_rej_code_usr_rej);
}
- rc = kibnal_accept(&conn, cep,
- le64_to_cpu(wcr->wcr_nid),
- le64_to_cpu(wcr->wcr_incarnation),
- le16_to_cpu(wcr->wcr_queue_depth));
- if (rc != 0) {
- CERROR ("Can't accept "LPX64": %d\n",
- le64_to_cpu(wcr->wcr_nid), rc);
- GOTO(out, reason = cm_rej_code_no_res);
- }
-
- /* TODO: I hope I got the ca_attr names correctly. */
- retval = kibnal_qp_rts(conn->ibc_qp, req->cep_data.qpn,
- min_t(__u8, req->cep_data.offered_initiator_depth,
- ca_attr->max_read_atom_qp_outstanding),
- &req->path_data.path,
- min_t(__u8, req->cep_data.offered_resp_res,
- ca_attr->max_qp_depth_for_init_read_atom),
- req->cep_data.start_psn);
-
- if (retval) {
- CERROR ("Can't mark QP RTS/RTR "LPX64": %d\n",
- le64_to_cpu(wcr->wcr_nid), retval);
- GOTO(out, reason = cm_rej_code_no_qp);
- }
-
- dump_qp(conn);
-
- retval = vv_qp_query(kibnal_data.kib_hca, conn->ibc_qp, &qp_context, &conn->ibc_qp_attrs);
- if (retval) {
- CERROR ("Couldn't query qp attributes "LPX64": %d\n",
- le64_to_cpu(wcr->wcr_nid), retval);
- GOTO(out, reason = cm_rej_code_no_qp);
- }
- query = &conn->ibc_qp_attrs;
-
- PORTAL_ALLOC(rep, sizeof(*rep));
- if (rep == NULL) {
- CERROR ("can't reply and receive buffers\n");
- GOTO(out, reason = cm_rej_code_insuff_resp_res);
- }
-
- /* don't try to deref this into the incoming wcr :) */
- wcr = (kib_wire_connreq_t *)rep->priv_data;
-
- *rep = (cm_reply_data_t) {
- .qpn = query->query.qp_num,
- .start_psn = query->query.receve_psn,
- .arb_resp_res = query->query.rdma_r_atom_outstand_num,
- .arb_initiator_depth = query->query.rdma_r_atom_outstand_num,
- .targ_ack_delay = 0,
- .failover_accepted = 0,
- .end_to_end_flow_ctrl = 1, /* (query->query.flow_control is never set) */
- .rnr_retry_count = req->cep_data.rtr_retry_cnt,
- };
-
- *wcr = (kib_wire_connreq_t) {
- .wcr_magic = cpu_to_le32(IBNAL_MSG_MAGIC),
- .wcr_version = cpu_to_le16(IBNAL_MSG_VERSION),
- .wcr_queue_depth = cpu_to_le32(IBNAL_MSG_QUEUE_SIZE),
- .wcr_nid = cpu_to_le64(kibnal_data.kib_nid),
- .wcr_incarnation = cpu_to_le64(kibnal_data.kib_incarnation),
- };
-
- retval = cm_accept(cep, rep, NULL, kibnal_cm_callback, conn);
-
- PORTAL_FREE(rep, sizeof(*rep));
+ if (cv->cv_conndata.status == cm_event_conn_reject) {
+
+ if (cv->cv_conndata.data.reject.reason == cm_rej_code_usr_rej) {
+ unsigned char *bytes =
+ cv->cv_conndata.data.reject.priv_data;
+ int magic = (bytes[0]) |
+ (bytes[1] << 8) |
+ (bytes[2] << 16) |
+ (bytes[3] << 24);
+ int version = (bytes[4]) |
+ (bytes[5] << 8);
+ int why = (bytes[6]);
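+ /* the reject priv_data needn't be aligned, so
+ * decode little-endian bytewise; e.g. bytes 0x78
+ * 0x56 0x34 0x12 assemble to magic 0x12345678 */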
+
+ /* Expected proto/version: she just doesn't like me (or
+ * ran out of resources) */
+ if (magic == IBNAL_MSG_MAGIC &&
+ version == conn->ibc_version) {
+ CERROR("conn -> %s rejected: fatal error %d\n",
+ libcfs_nid2str(peer->ibp_nid), why);
+
+ if (why == IBNAL_REJECT_CONN_RACE)
+ kibnal_reconnect(conn, -EALREADY);
+ else
+ kibnal_connreq_done(conn, 1, -ECONNREFUSED);
+ return;
+ }
- if (retval) {
- /* XXX it seems we don't call reject after this point? */
- CERROR("cm_accept() failed: %d, aborting\n", retval);
- rc = -ECONNABORTED;
- goto out;
- }
+ /* Fail unless it's worth retrying with an old proto
+ * version */
+ if (!(magic == IBNAL_MSG_MAGIC &&
+ version == IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD &&
+ conn->ibc_version == IBNAL_MSG_VERSION)) {
+ CERROR("conn -> %s rejected: bad protocol "
+ "magic/ver %08x/%x why %d\n",
+ libcfs_nid2str(peer->ibp_nid),
+ magic, version, why);
+
+ kibnal_connreq_done(conn, 1, -ECONNREFUSED);
+ return;
+ }
- if (kibnal_set_cm_flags(conn->ibc_cep)) {
- rc = -ECONNABORTED;
- goto out;
- }
+ conn->ibc_version = version;
+ CWARN ("Connection to %s refused: "
+ "retrying with old protocol version 0x%x\n",
+ libcfs_nid2str(peer->ibp_nid), version);
- conn->ibc_cep = cep;
+ kibnal_reconnect(conn, -ECONNREFUSED);
+ return;
+ } else if (cv->cv_conndata.data.reject.reason ==
+ cm_rej_code_stale_conn) {
- CDEBUG(D_WARNING, "Connection %p -> "LPX64" ESTABLISHED.\n",
- conn, conn->ibc_peer->ibp_nid);
+ CWARN ("conn -> %s stale: retrying\n",
+ libcfs_nid2str(peer->ibp_nid));
-out:
- if (reason) {
- kibnal_reject(cep, reason);
- rc = -ECONNABORTED;
+ kibnal_reconnect(conn, -ESTALE);
+ return;
+ } else {
+ CDEBUG(D_NETERROR, "conn -> %s rejected: reason %d\n",
+ libcfs_nid2str(peer->ibp_nid),
+ cv->cv_conndata.data.reject.reason);
+ kibnal_connreq_done(conn, 1, -ECONNREFUSED);
+ return;
+ }
+ /* NOT REACHED */
}
- return;
+ CDEBUG(D_NETERROR, "conn -> %s failed: %d\n",
+ libcfs_nid2str(peer->ibp_nid), cv->cv_conndata.status);
+ kibnal_connreq_done(conn, 1, -ECONNABORTED);
}
void
-kibnal_listen_callback(cm_cep_handle_t cep, cm_conn_data_t *info, void *arg)
+kibnal_arp_done (kib_conn_t *conn)
{
- struct cm_off_level *cm_tq;
+ kib_peer_t *peer = conn->ibc_peer;
+ kib_connvars_t *cv = conn->ibc_connvars;
+ ibat_arp_data_t *arp = &cv->cv_arp;
+ ib_path_record_v2_t *path = &cv->cv_path;
+ vv_return_t vvrc;
+ int rc;
+ unsigned long flags;
- LASSERT(cep);
- LASSERT(info);
- LASSERT(arg == NULL); /* no conn yet for passive */
+ LASSERT (!in_interrupt());
+ LASSERT (current == kibnal_data.kib_connd);
+ LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_ARP);
+ LASSERT (peer->ibp_arp_count > 0);
- PORTAL_ALLOC_ATOMIC(cm_tq, sizeof(*cm_tq));
- if (cm_tq == NULL) {
- CERROR("Failed to allocate a CM off level structure\n");
- return;
+ if (cv->cv_arprc != ibat_stat_ok) {
+ CDEBUG(D_NETERROR, "Arp %s @ %u.%u.%u.%u failed: %d\n",
+ libcfs_nid2str(peer->ibp_nid), HIPQUAD(peer->ibp_ip),
+ cv->cv_arprc);
+ goto failed;
}
- cm_tq->tq.sync = 0;
- cm_tq->tq.routine = _kibnal_listen_callback;
- cm_tq->tq.data = cm_tq;
-
- cm_tq->cep = cep;
- cm_tq->info = info;
- cm_tq->conn = NULL;
-
- schedule_task(&cm_tq->tq);
-}
-
-static void
-kibnal_pathreq_callback (struct sa_request *request)
-{
- vv_hca_attrib_t *ca_attr = &kibnal_data.kib_hca_attrs;
- kib_conn_t *conn = request->context;
- gsi_dtgrm_t *dtgrm;
- sa_mad_v2_t *mad;
- ib_path_record_v2_t *path;
- u64 component_mask;
- cm_return_t cmret;
-
- if (request->status) {
- CERROR ("status %d\n", request->status);
- free_sa_request(request);
- kibnal_connreq_done (conn, 1, -EINVAL);
- return;
- }
+ if ((arp->mask & IBAT_PRI_PATH_VALID) != 0) {
+ CDEBUG(D_NET, "Got valid path for %s\n",
+ libcfs_nid2str(peer->ibp_nid));
- dtgrm = request->dtgrm_resp;
- mad = (sa_mad_v2_t *) dtgrm->mad;
- path = (ib_path_record_v2_t *) mad->payload;
-
- /* Put the path record in host order for that stack. */
- gid_swap(&path->sgid);
- gid_swap(&path->dgid);
- path->slid = be16_to_cpu(path->slid);
- path->dlid = be16_to_cpu(path->dlid);
- path->flow_label = be32_to_cpu(path->flow_label);
- path->pkey = be16_to_cpu(path->pkey);
- path->sl = be16_to_cpu(path->sl);
-
- CDEBUG(D_NET, "sgid "LPX64":"LPX64" dgid "
- LPX64":"LPX64" pkey %x\n",
- path->sgid.scope.g.subnet,
- path->sgid.scope.g.eui64,
- path->dgid.scope.g.subnet,
- path->dgid.scope.g.eui64,
- path->pkey);
-
-#if TODO
- component_mask = be64_to_cpu(mad->component_mask);
- if ((component_mask && (1ull << 1)) == 0) {
- CERROR ("no servivce GID in SR: "LPX64"\n", component_mask);
- free_sa_request(request);
- kibnal_connreq_done (conn, 1, -EINVAL);
- return;
- }
-#endif
+ *path = *arp->primary_path;
- conn->ibc_connreq->cr_path = *path;
+ vvrc = base_gid2port_num(kibnal_data.kib_hca, &path->sgid,
+ &cv->cv_port);
+ if (vvrc != vv_return_ok) {
+ CWARN("base_gid2port_num failed for %s @ %u.%u.%u.%u: %d\n",
+ libcfs_nid2str(peer->ibp_nid),
+ HIPQUAD(peer->ibp_ip), vvrc);
+ goto failed;
+ }
- free_sa_request(request);
+ vvrc = gid2gid_index(kibnal_data.kib_hca, cv->cv_port,
+ &path->sgid, &cv->cv_sgid_index);
+ if (vvrc != vv_return_ok) {
+ CWARN("gid2gid_index failed for %s @ %u.%u.%u.%u: %d\n",
+ libcfs_nid2str(peer->ibp_nid),
+ HIPQUAD(peer->ibp_ip), vvrc);
+ goto failed;
+ }
- conn->ibc_cep = cm_create_cep(cm_cep_transp_rc);
- if (conn->ibc_cep == NULL) {
- CERROR ("Can't create CEP\n");
- kibnal_connreq_done (conn, 1, -EINVAL);
- return;
- }
+ vvrc = pkey2pkey_index(kibnal_data.kib_hca, cv->cv_port,
+ path->pkey, &cv->cv_pkey_index);
+ if (vvrc != vv_return_ok) {
+ CWARN("pkey2pkey_index failed for %s @ %u.%u.%u.%u: %d\n",
+ libcfs_nid2str(peer->ibp_nid),
+ HIPQUAD(peer->ibp_ip), vvrc);
+ goto failed;
+ }
- if (kibnal_set_cm_flags(conn->ibc_cep)) {
- kibnal_connreq_done (conn, 1, -EINVAL);
- return;
- }
+ path->mtu = IBNAL_IB_MTU;
- conn->ibc_connreq->cr_wcr = (kib_wire_connreq_t) {
- .wcr_magic = cpu_to_le32(IBNAL_MSG_MAGIC),
- .wcr_version = cpu_to_le16(IBNAL_MSG_VERSION),
- .wcr_queue_depth = cpu_to_le16(IBNAL_MSG_QUEUE_SIZE),
- .wcr_nid = cpu_to_le64(kibnal_data.kib_nid),
- .wcr_incarnation = cpu_to_le64(kibnal_data.kib_incarnation),
- };
+ } else if ((arp->mask & IBAT_LID_VALID) != 0) {
+ CWARN("Creating new path record for %s @ %u.%u.%u.%u\n",
+ libcfs_nid2str(peer->ibp_nid), HIPQUAD(peer->ibp_ip));
- conn->ibc_connreq->cr_cm_req = (cm_request_data_t) {
- .sid = kibnal_data.kib_service_id,
- .cep_data = (cm_cep_data_t) {
- .ca_guid = kibnal_data.kib_hca_attrs.guid,
- .end_to_end_flow_ctrl = 1,
- .port_guid = kibnal_data.kib_port_gid.scope.g.eui64,
- .local_port_num = kibnal_data.kib_port,
- .start_psn = IBNAL_STARTING_PSN,
- .qpn = conn->ibc_qp_attrs.query.qp_num,
- .retry_cnt = IBNAL_RETRY,
- .rtr_retry_cnt = IBNAL_RNR_RETRY,
- .ack_timeout = IBNAL_ACK_TIMEOUT,
- .offered_resp_res = ca_attr->max_read_atom_qp_outstanding,
- .offered_initiator_depth = ca_attr->max_qp_depth_for_init_read_atom,
- },
- .path_data = (cm_cep_path_data_t) {
- .subn_local = TRUE,
- .path = conn->ibc_connreq->cr_path,
- },
- };
+ cv->cv_pkey_index = IBNAL_PKEY_IDX;
+ cv->cv_sgid_index = IBNAL_SGID_IDX;
+ cv->cv_port = arp->local_port_num;
-#if 0
- /* XXX set timeout just like SDP!!!*/
- conn->ibc_connreq->cr_path.packet_life = 13;
-#endif
- /* Flag I'm getting involved with the CM... */
- conn->ibc_state = IBNAL_CONN_CONNECTING;
+ memset(path, 0, sizeof(*path));
-#if 0
- CDEBUG(D_NET, "Connecting to, service id "LPX64", on "LPX64"\n",
- conn->ibc_connreq->cr_service.RID.ServiceID,
- *kibnal_service_nid_field(&conn->ibc_connreq->cr_service));
-#endif
+ vvrc = port_num2base_gid(kibnal_data.kib_hca, cv->cv_port,
+ &path->sgid);
+ if (vvrc != vv_return_ok) {
+ CWARN("port_num2base_gid failed for %s @ %u.%u.%u.%u: %d\n",
+ libcfs_nid2str(peer->ibp_nid),
+ HIPQUAD(peer->ibp_ip), vvrc);
+ goto failed;
+ }
- memset(conn->ibc_connreq->cr_cm_req.priv_data, 0,
- cm_REQ_priv_data_len);
- memcpy(conn->ibc_connreq->cr_cm_req.priv_data,
- &conn->ibc_connreq->cr_wcr, sizeof(conn->ibc_connreq->cr_wcr));
+ vvrc = port_num2base_lid(kibnal_data.kib_hca, cv->cv_port,
+ &path->slid);
+ if (vvrc != vv_return_ok) {
+ CWARN("port_num2base_lid failed for %s @ %u.%u.%u.%u: %d\n",
+ libcfs_nid2str(peer->ibp_nid),
+ HIPQUAD(peer->ibp_ip), vvrc);
+ goto failed;
+ }
- /* kibnal_cm_callback gets my conn ref */
- cmret = cm_connect(conn->ibc_cep, &conn->ibc_connreq->cr_cm_req,
- kibnal_cm_callback, conn);
+ path->dgid = arp->gid;
+ path->sl = IBNAL_SERVICE_LEVEL;
+ path->dlid = arp->lid;
+ path->mtu = IBNAL_IB_MTU;
+ path->rate = IBNAL_STATIC_RATE;
+ path->pkt_life_time = IBNAL_PKT_LIFETIME;
+ path->pkey = IBNAL_PKEY;
+ path->traffic_class = IBNAL_TRAFFIC_CLASS;
+ } else {
+ CWARN("Arp for %s @ %u.%u.%u.%u returned neither PATH nor LID\n",
+ libcfs_nid2str(peer->ibp_nid), HIPQUAD(peer->ibp_ip));
+ goto failed;
+ }
- if (cmret) {
- CERROR ("Connect failed: %d\n", cmret);
- /* Back out state change as connect failed */
- conn->ibc_state = IBNAL_CONN_INIT_QP;
- kibnal_connreq_done (conn, 1, -EINVAL);
+ rc = kibnal_set_qp_state(conn, vv_qp_state_init);
+ if (rc != 0) {
+ kibnal_connreq_done(conn, 1, rc);
+ return;
}
- CDEBUG(D_NET, "connection REQ sent\n");
+ /* do the actual connection request */
+ kibnal_connect_conn(conn);
+ return;
+
+ failed:
+ write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
+ peer->ibp_arp_count--;
+ if (peer->ibp_arp_count == 0) {
+ /* final ARP attempt failed */
+ write_unlock_irqrestore(&kibnal_data.kib_global_lock,
+ flags);
+ CDEBUG(D_NETERROR, "Arp %s @ %u.%u.%u.%u failed (final attempt)\n",
+ libcfs_nid2str(peer->ibp_nid), HIPQUAD(peer->ibp_ip));
+ } else {
+ /* Retry ARP: ibp_connecting++ so terminating conn
+ * doesn't end peer's connection attempt */
+ peer->ibp_connecting++;
+ write_unlock_irqrestore(&kibnal_data.kib_global_lock,
+ flags);
+ CDEBUG(D_NETERROR, "Arp %s @ %u.%u.%u.%u failed (%d attempts left)\n",
+ libcfs_nid2str(peer->ibp_nid), HIPQUAD(peer->ibp_ip),
+ peer->ibp_arp_count);
+
+ kibnal_schedule_peer_arp(peer);
+ }
+ kibnal_connreq_done(conn, 1, -ENETUNREACH);
}
-static void
-kibnal_service_get_callback (struct sa_request *request)
+void
+kibnal_arp_callback (ibat_stat_t arprc, ibat_arp_data_t *arp_data, void *arg)
{
- kib_conn_t *conn = request->context;
- gsi_dtgrm_t *dtgrm;
- sa_mad_v2_t *mad;
- ib_service_record_v2_t *sr;
- u64 component_mask;
- int ret;
-
- if (request->status) {
- CERROR ("status %d\n", request->status);
- free_sa_request(request);
- kibnal_connreq_done (conn, 1, -EINVAL);
- return;
- }
-
- dtgrm = request->dtgrm_resp;
- mad = (sa_mad_v2_t *) dtgrm->mad;
- sr = (ib_service_record_v2_t *) mad->payload;
+ /* CAVEAT EMPTOR: tasklet context */
+ kib_peer_t *peer;
+ kib_conn_t *conn = (kib_conn_t *)arg;
- CDEBUG(D_NET, "sid "LPX64" gid "LPX64":"LPX64" pkey %x\n",
- sr->service_id,
- sr->service_gid.scope.g.subnet,
- sr->service_gid.scope.g.eui64,
- sr->service_pkey);
-
- component_mask = be64_to_cpu(mad->component_mask);
- if ((component_mask && (1ull << 1)) == 0) {
- CERROR ("no service GID in SR: "LPX64"\n", component_mask);
- free_sa_request(request);
- kibnal_connreq_done (conn, 1, -EINVAL);
- return;
- }
+ LASSERT (conn != NULL);
+ LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_ARP);
- //conn->ibc_connreq->cr_service = sr;
+ peer = conn->ibc_peer;
- /* Return the response datagram to its pool. We don't need it anymore. */
- gsi_dtgrm_pool_put(request->dtgrm_resp);
- request->dtgrm_resp = NULL;
+ if (arprc != ibat_stat_ok)
+ CDEBUG(D_NETERROR, "Arp %s at %u.%u.%u.%u failed: %d\n",
+ libcfs_nid2str(peer->ibp_nid), HIPQUAD(peer->ibp_ip), arprc);
+ else
+ CDEBUG(D_NET, "Arp %s at %u.%u.%u.%u OK: LID %s PATH %s\n",
+ libcfs_nid2str(peer->ibp_nid), HIPQUAD(peer->ibp_ip),
+ (arp_data->mask & IBAT_LID_VALID) == 0 ? "invalid" : "valid",
+ (arp_data->mask & IBAT_PRI_PATH_VALID) == 0 ? "invalid" : "valid");
- /* kibnal_pathreq_callback gets my conn ref */
- ret = kibnal_pathrecord_op(request, sr->service_gid, kibnal_pathreq_callback, conn);
- if (ret) {
- CERROR ("Path record request failed: %d\n", ret);
- kibnal_connreq_done (conn, 1, -EINVAL);
- }
+ conn->ibc_connvars->cv_arprc = arprc;
+ if (arprc == ibat_stat_ok)
+ conn->ibc_connvars->cv_arp = *arp_data;
- return;
+ kibnal_schedule_conn(conn);
+ kibnal_conn_decref(conn);
}
-static void
-kibnal_connect_peer (kib_peer_t *peer)
+void
+kibnal_arp_peer (kib_peer_t *peer)
{
- kib_conn_t *conn = kibnal_create_conn();
- struct sa_request *request;
- int ret;
+ cm_cep_handle_t cep;
+ kib_conn_t *conn;
+ int ibatrc;
+ /* Only the connd does this (i.e. single threaded) */
+ LASSERT (current == kibnal_data.kib_connd);
LASSERT (peer->ibp_connecting != 0);
+ LASSERT (peer->ibp_arp_count > 0);
+
+ cep = cm_create_cep(cm_cep_transp_rc);
+ if (cep == NULL) {
+ CERROR ("Can't create cep for conn->%s\n",
+ libcfs_nid2str(peer->ibp_nid));
+ kibnal_peer_connect_failed(peer, 1, -ENOMEM);
+ return;
+ }
+ conn = kibnal_create_conn(cep);
if (conn == NULL) {
- CERROR ("Can't allocate conn\n");
- kibnal_peer_connect_failed (peer, 1, -ENOMEM);
+ CERROR ("Can't allocate conn->%s\n",
+ libcfs_nid2str(peer->ibp_nid));
+ cm_destroy_cep(cep);
+ kibnal_peer_connect_failed(peer, 1, -ENOMEM);
return;
}
conn->ibc_peer = peer;
- kib_peer_addref(peer);
+ kibnal_peer_addref(peer);
- PORTAL_ALLOC (conn->ibc_connreq, sizeof (*conn->ibc_connreq));
- if (conn->ibc_connreq == NULL) {
- CERROR ("Can't allocate connreq\n");
- kibnal_connreq_done (conn, 1, -ENOMEM);
- return;
- }
+ kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_ARP);
- memset(conn->ibc_connreq, 0, sizeof (*conn->ibc_connreq));
+ ibatrc = ibat_get_ib_data(htonl(peer->ibp_ip), INADDR_ANY,
+ ibat_paths_primary,
+ &conn->ibc_connvars->cv_arp,
+ kibnal_arp_callback, conn, 0);
+ CDEBUG(D_NET,"ibatrc %d\n", ibatrc);
+ switch (ibatrc) {
+ default:
+ LBUG();
- /* kibnal_service_get_callback gets my conn ref */
- ret = kibnal_advertize_op(peer->ibp_nid, SUBN_ADM_GET, kibnal_service_get_callback, conn);
+ case ibat_stat_pending:
+ /* NB callback has my ref on conn */
+ break;
- if (ret) {
- CERROR("kibnal_advertize_op failed for op %d NID "LPX64"\n", SUBN_ADM_GET, peer->ibp_nid);
- /* TODO: I'm unsure yet whether ret contains a
- * consistent error type, so I return -EIO in the
- * meantime. */
- kibnal_connreq_done (conn, 1, -EIO);
+ case ibat_stat_ok:
+ case ibat_stat_error:
+ case ibat_stat_timeout:
+ case ibat_stat_not_found:
+ /* Immediate return (ARP cache hit or failure) == no callback.
+ * Do the next stage directly... */
+ conn->ibc_connvars->cv_arprc = ibatrc;
+ kibnal_arp_done(conn);
+ kibnal_conn_decref(conn);
+ break;
}
-
- return;
}
-static int
-kibnal_conn_timed_out (kib_conn_t *conn)
+int
+kibnal_check_txs (kib_conn_t *conn, struct list_head *txs)
{
kib_tx_t *tx;
struct list_head *ttmp;
- unsigned long flags;
+ int timed_out = 0;
- spin_lock_irqsave (&conn->ibc_lock, flags);
+ spin_lock(&conn->ibc_lock);
- list_for_each (ttmp, &conn->ibc_tx_queue) {
+ list_for_each (ttmp, txs) {
tx = list_entry (ttmp, kib_tx_t, tx_list);
- LASSERT (!tx->tx_passive_rdma_wait);
- LASSERT (tx->tx_sending == 0);
-
- if (time_after_eq (jiffies, tx->tx_deadline)) {
- spin_unlock_irqrestore (&conn->ibc_lock, flags);
- return 1;
+ if (txs == &conn->ibc_active_txs) {
+ LASSERT (!tx->tx_queued);
+ LASSERT (tx->tx_waiting || tx->tx_sending != 0);
+ } else {
+ LASSERT (tx->tx_queued);
}
- }
-
- list_for_each (ttmp, &conn->ibc_active_txs) {
- tx = list_entry (ttmp, kib_tx_t, tx_list);
-
- LASSERT (tx->tx_passive_rdma ||
- !tx->tx_passive_rdma_wait);
-
- LASSERT (tx->tx_passive_rdma_wait ||
- tx->tx_sending != 0);
if (time_after_eq (jiffies, tx->tx_deadline)) {
- spin_unlock_irqrestore (&conn->ibc_lock, flags);
- return 1;
+ timed_out = 1;
+ break;
}
}
- spin_unlock_irqrestore (&conn->ibc_lock, flags);
+ spin_unlock(&conn->ibc_lock);
+ return timed_out;
+}
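+/* NB a tx is on exactly one list: a tx queue while tx_queued, or
+ * ibc_active_txs once its work requests are in flight; the asserts above
+ * catch any tx whose state flags disagree with the list it sits on */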
- return 0;
+int
+kibnal_conn_timed_out (kib_conn_t *conn)
+{
+ return kibnal_check_txs(conn, &conn->ibc_tx_queue) ||
+ kibnal_check_txs(conn, &conn->ibc_tx_queue_rsrvd) ||
+ kibnal_check_txs(conn, &conn->ibc_tx_queue_nocred) ||
+ kibnal_check_txs(conn, &conn->ibc_active_txs);
}
-static void
+void
kibnal_check_conns (int idx)
{
struct list_head *peers = &kibnal_data.kib_peers[idx];
struct list_head *ptmp;
kib_peer_t *peer;
kib_conn_t *conn;
struct list_head *ctmp;
+ unsigned long flags;
again:
/* NB. We expect to have a look at all the peers and not find any
* rdmas to time out, so we just use a shared lock while we
* take a look... */
- read_lock (&kibnal_data.kib_global_lock);
+ read_lock_irqsave(&kibnal_data.kib_global_lock, flags);
list_for_each (ptmp, peers) {
peer = list_entry (ptmp, kib_peer_t, ibp_list);
list_for_each (ctmp, &peer->ibp_conns) {
conn = list_entry (ctmp, kib_conn_t, ibc_list);
- KIB_ASSERT_CONN_STATE(conn, IBNAL_CONN_ESTABLISHED);
+ LASSERT (conn->ibc_state == IBNAL_CONN_ESTABLISHED);
/* In case we have enough credits to return via a
* NOOP, but there were no non-blocking tx descs
* free to do it last time... */
kibnal_check_sends(conn);
if (!kibnal_conn_timed_out(conn))
continue;
-
- CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
- conn, conn->ibc_state, peer->ibp_nid,
- atomic_read (&conn->ibc_refcount));
- atomic_inc (&conn->ibc_refcount);
- read_unlock (&kibnal_data.kib_global_lock);
+ /* Handle timeout by closing the whole connection. We
+ * can only be sure RDMA activity has ceased once the
+ * QP has been modified. */
+
+ kibnal_conn_addref(conn); /* 1 ref for me... */
+
+ read_unlock_irqrestore(&kibnal_data.kib_global_lock,
+ flags);
- CERROR("Timed out RDMA with "LPX64"\n",
- peer->ibp_nid);
+ CERROR("Timed out RDMA with %s\n",
+ libcfs_nid2str(peer->ibp_nid));
kibnal_close_conn (conn, -ETIMEDOUT);
- kibnal_put_conn (conn);
+ kibnal_conn_decref(conn); /* ...until here */
/* start again now I've dropped the lock */
goto again;
}
}
- read_unlock (&kibnal_data.kib_global_lock);
+ read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
}
-static void
-kib_connd_handle_state(kib_conn_t *conn)
+void
+kibnal_disconnect_conn (kib_conn_t *conn)
{
- vv_return_t retval;
-
- switch (conn->ibc_state) {
- /* all refs have gone, free and be done with it */
- case IBNAL_CONN_DISCONNECTED:
- kibnal_destroy_conn (conn);
- return; /* avoid put_conn */
-
- case IBNAL_CONN_SEND_DREQ:
-
- retval = cm_disconnect(conn->ibc_cep, &kibnal_data.cm_data.dreq_data, NULL);
- if (retval) /* XXX do real things */
- CERROR("disconnect failed: %d\n", retval);
-
- conn->ibc_state = IBNAL_CONN_DREQ;
- break;
+ static cm_drequest_data_t dreq; /* just for the space */
- /* a callback got to the conn before we did */
- case IBNAL_CONN_DREP:
- break;
-
- default:
- CERROR ("Bad conn %p state: %d\n", conn,
- conn->ibc_state);
- LBUG();
- break;
+ cm_return_t cmrc;
+ unsigned long flags;
+
+ LASSERT (!in_interrupt());
+ LASSERT (current == kibnal_data.kib_connd);
+
+ write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
+
+ if (conn->ibc_disconnect) {
+ /* Had the CM callback already */
+ write_unlock_irqrestore(&kibnal_data.kib_global_lock,
+ flags);
+ kibnal_conn_disconnected(conn);
+ return;
+ }
+
+ LASSERT (conn->ibc_state == IBNAL_CONN_DISCONNECT1);
+
+ /* active disconnect */
+ cmrc = cm_disconnect(conn->ibc_cep, &dreq, NULL);
+ if (cmrc == cm_stat_success) {
+ /* waiting for CM */
+ conn->ibc_state = IBNAL_CONN_DISCONNECT2;
+ write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
+ return;
}
- /* drop ref from close_conn */
- kibnal_put_conn(conn);
+ write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
+
+ cm_cancel(conn->ibc_cep);
+ cfs_pause(cfs_time_seconds(1)/10);
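+ /* cm_cancel() is asynchronous: pause briefly so an
+ * in-flight callback (which sets ibc_disconnect)
+ * can drain before deciding whether the callback's
+ * conn ref must be dropped here */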
+
+ if (!conn->ibc_disconnect) /* CM callback will never happen now */
+ kibnal_conn_decref(conn);
+
+ LASSERT (atomic_read(&conn->ibc_refcount) > 0);
+ LASSERT (conn->ibc_state == IBNAL_CONN_DISCONNECT1);
+
+ kibnal_conn_disconnected(conn);
}
int
kibnal_connd (void *arg)
{
wait_queue_t wait;
unsigned long flags;
+ kib_pcreq_t *pcr;
kib_conn_t *conn;
kib_peer_t *peer;
int timeout;
int i;
+ int dropped_lock;
int peer_index = 0;
unsigned long deadline = jiffies;
-
- kportal_daemonize ("kibnal_connd");
- kportal_blockallsigs ();
+
+ cfs_daemonize ("kibnal_connd");
+ cfs_block_allsigs ();
init_waitqueue_entry (&wait, current);
+ kibnal_data.kib_connd = current;
- spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
+ spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
- for (;;) {
- if (!list_empty (&kibnal_data.kib_connd_conns)) {
- conn = list_entry (kibnal_data.kib_connd_conns.next,
+ while (!kibnal_data.kib_shutdown) {
+
+ dropped_lock = 0;
+
+ if (!list_empty (&kibnal_data.kib_connd_zombies)) {
+ conn = list_entry (kibnal_data.kib_connd_zombies.next,
kib_conn_t, ibc_list);
list_del (&conn->ibc_list);
-
+
spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
- kib_connd_handle_state(conn);
+ dropped_lock = 1;
+
+ kibnal_destroy_conn(conn);
spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
- continue;
+ }
+
+ if (!list_empty (&kibnal_data.kib_connd_pcreqs)) {
+ pcr = list_entry(kibnal_data.kib_connd_pcreqs.next,
+ kib_pcreq_t, pcr_list);
+ list_del(&pcr->pcr_list);
+
+ spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
+ dropped_lock = 1;
+
+ kibnal_recv_connreq(pcr->pcr_cep, &pcr->pcr_cmreq);
+ LIBCFS_FREE(pcr, sizeof(*pcr));
+
+ spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
}
if (!list_empty (&kibnal_data.kib_connd_peers)) {
peer = list_entry (kibnal_data.kib_connd_peers.next,
kib_peer_t, ibp_connd_list);
-
+
list_del_init (&peer->ibp_connd_list);
spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
+ dropped_lock = 1;
- kibnal_connect_peer (peer);
- kib_peer_decref (peer);
+ kibnal_arp_peer (peer);
+ kibnal_peer_decref (peer);
spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
}
- /* shut down and nobody left to reap... */
- if (kibnal_data.kib_shutdown &&
- atomic_read(&kibnal_data.kib_nconns) == 0)
- break;
+ if (!list_empty (&kibnal_data.kib_connd_conns)) {
+ conn = list_entry (kibnal_data.kib_connd_conns.next,
+ kib_conn_t, ibc_list);
+ list_del (&conn->ibc_list);
- spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
+ spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
+ dropped_lock = 1;
+
+ switch (conn->ibc_state) {
+ default:
+ LBUG();
+
+ case IBNAL_CONN_ACTIVE_ARP:
+ kibnal_arp_done(conn);
+ break;
+
+ case IBNAL_CONN_ACTIVE_CONNECT:
+ kibnal_check_connreply(conn);
+ break;
+
+ case IBNAL_CONN_PASSIVE_WAIT:
+ kibnal_check_passive_wait(conn);
+ break;
+
+ case IBNAL_CONN_DISCONNECT1:
+ case IBNAL_CONN_DISCONNECT2:
+ kibnal_disconnect_conn(conn);
+ break;
+ }
+ kibnal_conn_decref(conn);
+
+ spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
+ }
/* careful with the jiffy wrap... */
- while ((timeout = (int)(deadline - jiffies)) <= 0) {
+ timeout = (int)(deadline - jiffies);
+ if (timeout <= 0) {
const int n = 4;
const int p = 1;
int chunk = kibnal_data.kib_peer_hash_size;
-
+
+ spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
+ dropped_lock = 1;
+
/* Time to check for RDMA timeouts on a few more
* peers: I do checks every 'p' seconds on a
* proportion of the peer table and I need to check
+ * every connection within (n+1)/n times the timeout
* interval. */
- if (kibnal_tunables.kib_io_timeout > n * p)
- chunk = (chunk * n * p) /
- kibnal_tunables.kib_io_timeout;
+ if (*kibnal_tunables.kib_timeout > n * p)
+ chunk = (chunk * n * p) /
+ *kibnal_tunables.kib_timeout;
if (chunk == 0)
chunk = 1;
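+ /* e.g. a 1024-bucket peer table with a 50s timeout
+ * scans 1024*4*1/50 = 81 buckets per 1s pass, so a
+ * full sweep completes in ~13s, well inside the
+ * timeout interval */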
for (i = 0; i < chunk; i++) {
kibnal_check_conns (peer_index);
- peer_index = (peer_index + 1) %
+ peer_index = (peer_index + 1) %
kibnal_data.kib_peer_hash_size;
}
deadline += p * HZ;
+ spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
}
- kibnal_data.kib_connd_waketime = jiffies + timeout;
+ if (dropped_lock)
+ continue;
+ /* Nothing to do: sleep for 'timeout' or until woken */
set_current_state (TASK_INTERRUPTIBLE);
add_wait_queue (&kibnal_data.kib_connd_waitq, &wait);
+ spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
- if (!kibnal_data.kib_shutdown &&
- list_empty (&kibnal_data.kib_connd_conns) &&
- list_empty (&kibnal_data.kib_connd_peers))
- schedule_timeout (timeout);
+ schedule_timeout (timeout);
set_current_state (TASK_RUNNING);
remove_wait_queue (&kibnal_data.kib_connd_waitq, &wait);
-
spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
}
return (0);
}
+void
+kibnal_async_callback(vv_event_record_t ev)
+{
+ CERROR("type: %d, port: %d, data: "LPX64"\n",
+ ev.event_type, ev.port_num, ev.type.data);
+}
+
+void
+kibnal_cq_callback (unsigned long unused_context)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
+ kibnal_data.kib_ready = 1;
+ wake_up(&kibnal_data.kib_sched_waitq);
+ spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags);
+}
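+/* NB kib_ready is latched under kib_sched_lock: a scheduler clears it
+ * before polling, so a callback that fires mid-poll still forces one more
+ * poll and no completion is missed between poll and re-arm */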
+
int
kibnal_scheduler(void *arg)
{
long id = (long)arg;
+ wait_queue_t wait;
char name[16];
- kib_rx_t *rx;
- kib_tx_t *tx;
+ vv_wc_t wc;
+ vv_return_t vvrc;
+ vv_return_t vvrc2;
unsigned long flags;
- int rc;
- int counter = 0;
- int did_something;
+ kib_rx_t *rx;
+ __u64 rxseq = 0;
+ int busy_loops = 0;
snprintf(name, sizeof(name), "kibnal_sd_%02ld", id);
- kportal_daemonize(name);
- kportal_blockallsigs();
+ cfs_daemonize(name);
+ cfs_block_allsigs();
- spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
+ init_waitqueue_entry(&wait, current);
- for (;;) {
- did_something = 0;
+ spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
- while (!list_empty(&kibnal_data.kib_sched_txq)) {
- tx = list_entry(kibnal_data.kib_sched_txq.next,
- kib_tx_t, tx_list);
- list_del(&tx->tx_list);
+ while (!kibnal_data.kib_shutdown) {
+ if (busy_loops++ >= IBNAL_RESCHED) {
spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
flags);
- kibnal_tx_done(tx);
- spin_lock_irqsave(&kibnal_data.kib_sched_lock,
- flags);
+ our_cond_resched();
+ busy_loops = 0;
+
+ spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
}
- if (!list_empty(&kibnal_data.kib_sched_rxq)) {
- rx = list_entry(kibnal_data.kib_sched_rxq.next,
- kib_rx_t, rx_list);
- list_del(&rx->rx_list);
+ if (kibnal_data.kib_ready &&
+ !kibnal_data.kib_checking_cq) {
+ /* take ownership of completion polling */
+ kibnal_data.kib_checking_cq = 1;
+ /* Assume I'll exhaust the CQ */
+ kibnal_data.kib_ready = 0;
spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
flags);
- kibnal_rx(rx);
+ vvrc = vv_poll_for_completion(kibnal_data.kib_hca,
+ kibnal_data.kib_cq, &wc);
+ if (vvrc == vv_return_err_cq_empty) {
+ vvrc2 = vv_request_completion_notification(
+ kibnal_data.kib_hca,
+ kibnal_data.kib_cq,
+ vv_next_solicit_unsolicit_event);
+ LASSERT (vvrc2 == vv_return_ok);
+ }
- did_something = 1;
- spin_lock_irqsave(&kibnal_data.kib_sched_lock,
- flags);
- }
+ if (vvrc == vv_return_ok &&
+ kibnal_wreqid2type(wc.wr_id) == IBNAL_WID_RX) {
+ rx = (kib_rx_t *)kibnal_wreqid2ptr(wc.wr_id);
- /* shut down and no receives to complete... */
- if (kibnal_data.kib_shutdown &&
- atomic_read(&kibnal_data.kib_nconns) == 0)
- break;
+ /* Grab the RX sequence number NOW before
+ * anyone else can get an RX completion */
+ rxseq = rx->rx_conn->ibc_rxseq++;
+ }
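+ /* the sequence number is claimed while this thread
+ * is still the only poller; once kib_checking_cq
+ * drops, another scheduler may reap a later rx
+ * first, and rxseq lets the rx path restore order */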
+
+ spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
+ /* give up ownership of completion polling */
+ kibnal_data.kib_checking_cq = 0;
+
+ if (vvrc == vv_return_err_cq_empty)
+ continue;
+
+ LASSERT (vvrc == vv_return_ok);
+ /* Assume there's more: get another scheduler to check
+ * while I handle this completion... */
+
+ kibnal_data.kib_ready = 1;
+ wake_up(&kibnal_data.kib_sched_waitq);
- /* nothing to do or hogging CPU */
- if (!did_something || counter++ == IBNAL_RESCHED) {
spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
flags);
- counter = 0;
-
- if (!did_something) {
- rc = wait_event_interruptible(
- kibnal_data.kib_sched_waitq,
- !list_empty(&kibnal_data.kib_sched_txq) ||
- !list_empty(&kibnal_data.kib_sched_rxq) ||
- (kibnal_data.kib_shutdown &&
- atomic_read (&kibnal_data.kib_nconns) == 0));
- } else {
- our_cond_resched();
+
+ switch (kibnal_wreqid2type(wc.wr_id)) {
+ case IBNAL_WID_RX:
+ kibnal_rx_complete(
+ (kib_rx_t *)kibnal_wreqid2ptr(wc.wr_id),
+ wc.completion_status,
+ wc.num_bytes_transfered,
+ rxseq);
+ break;
+
+ case IBNAL_WID_TX:
+ kibnal_tx_complete(
+ (kib_tx_t *)kibnal_wreqid2ptr(wc.wr_id),
+ wc.completion_status);
+ break;
+
+ case IBNAL_WID_RDMA:
+ /* We only get RDMA completion notification if
+ * it fails. So we just ignore them completely
+ * because...
+ *
+ * 1) If an RDMA fails, all subsequent work
+ * items, including the final SEND will fail
+ * too, so I'm still guaranteed to notice that
+ * this connection is hosed.
+ *
+ * 2) It's positively dangerous to look inside
+ * the tx descriptor obtained from an RDMA work
+ * item. As soon as I drop the kib_sched_lock,
+ * I give a scheduler on another CPU a chance
+ * to get the final SEND completion, so the tx
+ * descriptor can get freed as I inspect it. */
+ CDEBUG(D_NETERROR, "RDMA failed: %d\n",
+ wc.completion_status);
+ break;
+
+ default:
+ LBUG();
}
- spin_lock_irqsave(&kibnal_data.kib_sched_lock,
- flags);
+ spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
+ continue;
}
+
+ /* Nothing to do; sleep... */
+
+ set_current_state(TASK_INTERRUPTIBLE);
+ add_wait_queue_exclusive(&kibnal_data.kib_sched_waitq, &wait);
+ spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
+ flags);
+
+ schedule();
+
+ remove_wait_queue(&kibnal_data.kib_sched_waitq, &wait);
+ set_current_state(TASK_RUNNING);
+ spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
}
spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags);
kibnal_thread_fini();
return (0);
}
-
-
-lib_nal_t kibnal_lib = {
- .libnal_data = &kibnal_data, /* NAL private data */
- .libnal_send = kibnal_send,
- .libnal_send_pages = kibnal_send_pages,
- .libnal_recv = kibnal_recv,
- .libnal_recv_pages = kibnal_recv_pages,
- .libnal_dist = kibnal_dist
-};