X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=blobdiff_plain;f=lnet%2Fklnds%2Fiiblnd%2Fiiblnd_cb.c;h=fb4bba027063fa96597bd0a6bb6c49fa659e343c;hp=eb9e6fab1a29a076cbc5235306a81a1659cdf857;hb=ed88907a96ba81d3558e71ade9def98bdc785169;hpb=439addad84514e7ff6452710e6a7f15b80d7b589 diff --git a/lnet/klnds/iiblnd/iiblnd_cb.c b/lnet/klnds/iiblnd/iiblnd_cb.c index eb9e6fa..fb4bba0 100644 --- a/lnet/klnds/iiblnd/iiblnd_cb.c +++ b/lnet/klnds/iiblnd/iiblnd_cb.c @@ -21,658 +21,698 @@ * */ -#include "iibnal.h" +#include "iiblnd.h" -/* - * LIB functions follow - * - */ -static void -kibnal_schedule_tx_done (kib_tx_t *tx) +void +hexdump(char *string, void *ptr, int len) { - unsigned long flags; + unsigned char *c = ptr; + int i; + + return; - spin_lock_irqsave (&kibnal_data.kib_sched_lock, flags); + if (len < 0 || len > 2048) { + printk("XXX what the hell? %d\n",len); + return; + } - list_add_tail(&tx->tx_list, &kibnal_data.kib_sched_txq); - wake_up (&kibnal_data.kib_sched_waitq); + printk("%d bytes of '%s' from 0x%p\n", len, string, ptr); - spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags); + for (i = 0; i < len;) { + printk("%02x",*(c++)); + i++; + if (!(i & 15)) { + printk("\n"); + } else if (!(i&1)) { + printk(" "); + } + } + + if(len & 15) { + printk("\n"); + } } -static void +void kibnal_tx_done (kib_tx_t *tx) { - ptl_err_t ptlrc = (tx->tx_status == 0) ? PTL_OK : PTL_FAIL; - unsigned long flags; - int i; - FSTATUS frc; - - LASSERT (tx->tx_sending == 0); /* mustn't be awaiting callback */ - LASSERT (!tx->tx_passive_rdma_wait); /* mustn't be awaiting RDMA */ + lnet_msg_t *lntmsg[2]; + int rc = tx->tx_status; + int i; - switch (tx->tx_mapped) { - default: - LBUG(); + LASSERT (!in_interrupt()); + LASSERT (!tx->tx_queued); /* mustn't be queued for sending */ + LASSERT (tx->tx_sending == 0); /* mustn't be awaiting sent callback */ + LASSERT (!tx->tx_waiting); /* mustn't be awaiting peer response */ - case KIB_TX_UNMAPPED: - break; +#if IBNAL_USE_FMR + /* Handle unmapping if required */ +#endif + /* tx may have up to 2 lnet msgs to finalise */ + lntmsg[0] = tx->tx_lntmsg[0]; tx->tx_lntmsg[0] = NULL; + lntmsg[1] = tx->tx_lntmsg[1]; tx->tx_lntmsg[1] = NULL; + + if (tx->tx_conn != NULL) { + kibnal_conn_decref(tx->tx_conn); + tx->tx_conn = NULL; + } - case KIB_TX_MAPPED: - if (in_interrupt()) { - /* can't deregister memory in IRQ context... */ - kibnal_schedule_tx_done(tx); - return; - } - frc = iibt_deregister_memory(tx->tx_md.md_handle); - LASSERT (frc == FSUCCESS); - tx->tx_mapped = KIB_TX_UNMAPPED; - break; + tx->tx_nwrq = 0; + tx->tx_status = 0; -#if IBNAL_FMR - case KIB_TX_MAPPED_FMR: - if (in_interrupt() && tx->tx_status != 0) { - /* can't flush FMRs in IRQ context... 
*/ - kibnal_schedule_tx_done(tx); - return; - } + spin_lock(&kibnal_data.kib_tx_lock); - rc = ib_fmr_deregister(tx->tx_md.md_handle.fmr); - LASSERT (rc == 0); + list_add (&tx->tx_list, &kibnal_data.kib_idle_txs); - if (tx->tx_status != 0) - ib_fmr_pool_force_flush(kibnal_data.kib_fmr_pool); - tx->tx_mapped = KIB_TX_UNMAPPED; - break; -#endif - } + spin_unlock(&kibnal_data.kib_tx_lock); + /* delay finalize until my descs have been freed */ for (i = 0; i < 2; i++) { - /* tx may have up to 2 libmsgs to finalise */ - if (tx->tx_libmsg[i] == NULL) + if (lntmsg[i] == NULL) continue; - lib_finalize (&kibnal_lib, NULL, tx->tx_libmsg[i], ptlrc); - tx->tx_libmsg[i] = NULL; + lnet_finalize (kibnal_data.kib_ni, lntmsg[i], rc); } +} + +kib_tx_t * +kibnal_get_idle_tx (void) +{ + kib_tx_t *tx; - if (tx->tx_conn != NULL) { - kibnal_put_conn (tx->tx_conn); - tx->tx_conn = NULL; + spin_lock(&kibnal_data.kib_tx_lock); + + if (list_empty (&kibnal_data.kib_idle_txs)) { + spin_unlock(&kibnal_data.kib_tx_lock); + return NULL; } - tx->tx_nsp = 0; - tx->tx_passive_rdma = 0; - tx->tx_status = 0; + tx = list_entry (kibnal_data.kib_idle_txs.next, kib_tx_t, tx_list); + list_del (&tx->tx_list); - spin_lock_irqsave (&kibnal_data.kib_tx_lock, flags); + /* Allocate a new completion cookie. It might not be needed, + * but we've got a lock right now and we're unlikely to + * wrap... */ + tx->tx_cookie = kibnal_data.kib_next_tx_cookie++; - if (tx->tx_isnblk) { - list_add_tail (&tx->tx_list, &kibnal_data.kib_idle_nblk_txs); - } else { - list_add_tail (&tx->tx_list, &kibnal_data.kib_idle_txs); - wake_up (&kibnal_data.kib_idle_tx_waitq); - } + spin_unlock(&kibnal_data.kib_tx_lock); - spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags); + LASSERT (tx->tx_nwrq == 0); + LASSERT (!tx->tx_queued); + LASSERT (tx->tx_sending == 0); + LASSERT (!tx->tx_waiting); + LASSERT (tx->tx_status == 0); + LASSERT (tx->tx_conn == NULL); + LASSERT (tx->tx_lntmsg[0] == NULL); + LASSERT (tx->tx_lntmsg[1] == NULL); + + return tx; } -static kib_tx_t * -kibnal_get_idle_tx (int may_block) +int +kibnal_post_rx (kib_rx_t *rx, int credit, int rsrvd_credit) { - unsigned long flags; - kib_tx_t *tx = NULL; - ENTRY; - - for (;;) { - spin_lock_irqsave (&kibnal_data.kib_tx_lock, flags); + kib_conn_t *conn = rx->rx_conn; + int rc = 0; + FSTATUS frc; - /* "normal" descriptor is free */ - if (!list_empty (&kibnal_data.kib_idle_txs)) { - tx = list_entry (kibnal_data.kib_idle_txs.next, - kib_tx_t, tx_list); - break; - } + LASSERT (!in_interrupt()); + /* old peers don't reserve rxs for RDMA replies */ + LASSERT (!rsrvd_credit || + conn->ibc_version != IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD); + + rx->rx_gl = (IB_LOCAL_DATASEGMENT) { + .Address = rx->rx_hca_msg, + .Lkey = kibnal_data.kib_whole_mem.md_lkey, + .Length = IBNAL_MSG_SIZE, + }; - if (!may_block) { - /* may dip into reserve pool */ - if (list_empty (&kibnal_data.kib_idle_nblk_txs)) { - CERROR ("reserved tx desc pool exhausted\n"); - break; - } + rx->rx_wrq = (IB_WORK_REQ2) { + .Next = NULL, + .WorkReqId = kibnal_ptr2wreqid(rx, IBNAL_WID_RX), + .MessageLen = IBNAL_MSG_SIZE, + .DSList = &rx->rx_gl, + .DSListDepth = 1, + .Operation = WROpRecv, + }; - tx = list_entry (kibnal_data.kib_idle_nblk_txs.next, - kib_tx_t, tx_list); - break; - } + LASSERT (conn->ibc_state >= IBNAL_CONN_CONNECTING); + LASSERT (rx->rx_nob >= 0); /* not posted */ - /* block for idle tx */ - spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags); + CDEBUG(D_NET, "posting rx [%d %x "LPX64"]\n", + rx->rx_wrq.DSList->Length, + 
rx->rx_wrq.DSList->Lkey, + rx->rx_wrq.DSList->Address); - wait_event (kibnal_data.kib_idle_tx_waitq, - !list_empty (&kibnal_data.kib_idle_txs) || - kibnal_data.kib_shutdown); + if (conn->ibc_state > IBNAL_CONN_ESTABLISHED) { + /* No more posts for this rx; so lose its ref */ + kibnal_conn_decref(conn); + return 0; } + + rx->rx_nob = -1; /* flag posted */ + mb(); - if (tx != NULL) { - list_del (&tx->tx_list); + frc = iba_post_recv2(conn->ibc_qp, &rx->rx_wrq, NULL); + if (frc == FSUCCESS) { + if (credit || rsrvd_credit) { + spin_lock(&conn->ibc_lock); - /* Allocate a new passive RDMA completion cookie. It might - * not be needed, but we've got a lock right now and we're - * unlikely to wrap... */ - tx->tx_passive_rdma_cookie = kibnal_data.kib_next_tx_cookie++; + if (credit) + conn->ibc_outstanding_credits++; + if (rsrvd_credit) + conn->ibc_reserved_credits++; - LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED); - LASSERT (tx->tx_nsp == 0); - LASSERT (tx->tx_sending == 0); - LASSERT (tx->tx_status == 0); - LASSERT (tx->tx_conn == NULL); - LASSERT (!tx->tx_passive_rdma); - LASSERT (!tx->tx_passive_rdma_wait); - LASSERT (tx->tx_libmsg[0] == NULL); - LASSERT (tx->tx_libmsg[1] == NULL); - } + spin_unlock(&conn->ibc_lock); - spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags); + kibnal_check_sends(conn); + } + return 0; + } - RETURN(tx); + CERROR ("post rx -> %s failed %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), frc); + rc = -EIO; + kibnal_close_conn(rx->rx_conn, rc); + /* No more posts for this rx; so lose its ref */ + kibnal_conn_decref(conn); + return rc; } -static int -kibnal_dist(lib_nal_t *nal, ptl_nid_t nid, unsigned long *dist) +int +kibnal_post_receives (kib_conn_t *conn) { - /* I would guess that if kibnal_get_peer (nid) == NULL, - and we're not routing, then 'nid' is very distant :) */ - if ( nal->libnal_ni.ni_pid.nid == nid ) { - *dist = 0; - } else { - *dist = 1; + int i; + int rc; + + LASSERT (conn->ibc_state == IBNAL_CONN_CONNECTING); + + for (i = 0; i < IBNAL_RX_MSGS; i++) { + /* +1 ref for rx desc. This ref remains until kibnal_post_rx + * fails (i.e. actual failure or we're disconnecting) */ + kibnal_conn_addref(conn); + rc = kibnal_post_rx (&conn->ibc_rxs[i], 0, 0); + if (rc != 0) + return rc; } return 0; } -static void -kibnal_complete_passive_rdma(kib_conn_t *conn, __u64 cookie, int status) +kib_tx_t * +kibnal_find_waiting_tx_locked(kib_conn_t *conn, int txtype, __u64 cookie) { - struct list_head *ttmp; - unsigned long flags; - int idle; - - spin_lock_irqsave (&conn->ibc_lock, flags); - - list_for_each (ttmp, &conn->ibc_active_txs) { - kib_tx_t *tx = list_entry(ttmp, kib_tx_t, tx_list); - - LASSERT (tx->tx_passive_rdma || - !tx->tx_passive_rdma_wait); - - LASSERT (tx->tx_passive_rdma_wait || - tx->tx_sending != 0); + struct list_head *tmp; + + list_for_each(tmp, &conn->ibc_active_txs) { + kib_tx_t *tx = list_entry(tmp, kib_tx_t, tx_list); + + LASSERT (!tx->tx_queued); + LASSERT (tx->tx_sending != 0 || tx->tx_waiting); - if (!tx->tx_passive_rdma_wait || - tx->tx_passive_rdma_cookie != cookie) + if (tx->tx_cookie != cookie) continue; - CDEBUG(D_NET, "Complete %p "LPD64": %d\n", tx, cookie, status); + if (tx->tx_waiting && + tx->tx_msg->ibm_type == txtype) + return tx; - tx->tx_status = status; - tx->tx_passive_rdma_wait = 0; - idle = (tx->tx_sending == 0); + CWARN("Bad completion: %swaiting, type %x (wanted %x)\n", + tx->tx_waiting ? 
"" : "NOT ", + tx->tx_msg->ibm_type, txtype); + } + return NULL; +} - if (idle) - list_del (&tx->tx_list); +void +kibnal_handle_completion(kib_conn_t *conn, int txtype, int status, __u64 cookie) +{ + kib_tx_t *tx; + int idle; - spin_unlock_irqrestore (&conn->ibc_lock, flags); + spin_lock(&conn->ibc_lock); - /* I could be racing with tx callbacks. It's whoever - * _makes_ tx idle that frees it */ - if (idle) - kibnal_tx_done (tx); + tx = kibnal_find_waiting_tx_locked(conn, txtype, cookie); + if (tx == NULL) { + spin_unlock(&conn->ibc_lock); + + CWARN("Unmatched completion type %x cookie "LPX64" from %s\n", + txtype, cookie, libcfs_nid2str(conn->ibc_peer->ibp_nid)); + kibnal_close_conn (conn, -EPROTO); return; } - - spin_unlock_irqrestore (&conn->ibc_lock, flags); - CERROR ("Unmatched (late?) RDMA completion "LPX64" from "LPX64"\n", - cookie, conn->ibc_peer->ibp_nid); + if (tx->tx_status == 0) { /* success so far */ + if (status < 0) { /* failed? */ + tx->tx_status = status; + } else if (txtype == IBNAL_MSG_GET_REQ) { + lnet_set_reply_msg_len(kibnal_data.kib_ni, + tx->tx_lntmsg[1], status); + } + } + + tx->tx_waiting = 0; + + idle = !tx->tx_queued && (tx->tx_sending == 0); + if (idle) + list_del(&tx->tx_list); + + spin_unlock(&conn->ibc_lock); + + if (idle) + kibnal_tx_done(tx); } -static __u32 -kibnal_lkey(kib_pages_t *ibp) +void +kibnal_send_completion (kib_conn_t *conn, int type, int status, __u64 cookie) { - if (kibnal_whole_mem()) - return kibnal_data.kib_md.md_lkey; - - return ibp->ibp_lkey; + kib_tx_t *tx = kibnal_get_idle_tx(); + + if (tx == NULL) { + CERROR("Can't get tx for completion %x for %s\n", + type, libcfs_nid2str(conn->ibc_peer->ibp_nid)); + return; + } + + tx->tx_msg->ibm_u.completion.ibcm_status = status; + tx->tx_msg->ibm_u.completion.ibcm_cookie = cookie; + kibnal_init_tx_msg(tx, type, sizeof(kib_completion_msg_t)); + + kibnal_queue_tx(tx, conn); } -static void -kibnal_post_rx (kib_rx_t *rx, int do_credits) +void +kibnal_handle_rx (kib_rx_t *rx) { + kib_msg_t *msg = rx->rx_msg; kib_conn_t *conn = rx->rx_conn; + int credits = msg->ibm_credits; + kib_tx_t *tx; int rc = 0; - unsigned long flags; - FSTATUS frc; - ENTRY; - - rx->rx_gl = (IB_LOCAL_DATASEGMENT) { - .Address = rx->rx_vaddr, - .Length = IBNAL_MSG_SIZE, - .Lkey = kibnal_lkey(conn->ibc_rx_pages), - }; + int repost = 1; + int rsrvd_credit = 0; + int rc2; - rx->rx_wrq = (IB_WORK_REQ) { - .Operation = WROpRecv, - .DSListDepth = 1, - .MessageLen = IBNAL_MSG_SIZE, - .WorkReqId = kibnal_ptr2wreqid(rx, 1), - .DSList = &rx->rx_gl, - }; + LASSERT (conn->ibc_state >= IBNAL_CONN_ESTABLISHED); - KIB_ASSERT_CONN_STATE_RANGE(conn, IBNAL_CONN_ESTABLISHED, - IBNAL_CONN_DREP); - LASSERT (!rx->rx_posted); - rx->rx_posted = 1; - mb(); + CDEBUG (D_NET, "Received %x[%d] from %s\n", + msg->ibm_type, credits, libcfs_nid2str(conn->ibc_peer->ibp_nid)); + + if (credits != 0) { + /* Have I received credits that will let me send? 
*/ + spin_lock(&conn->ibc_lock); + conn->ibc_credits += credits; + spin_unlock(&conn->ibc_lock); - if (conn->ibc_state != IBNAL_CONN_ESTABLISHED) - rc = -ECONNABORTED; - else { - frc = iibt_postrecv(conn->ibc_qp, &rx->rx_wrq); - if (frc != FSUCCESS) { - CDEBUG(D_NET, "post failed %d\n", frc); - rc = -EINVAL; - } - CDEBUG(D_NET, "posted rx %p\n", &rx->rx_wrq); + kibnal_check_sends(conn); } - if (rc == 0) { - if (do_credits) { - spin_lock_irqsave(&conn->ibc_lock, flags); - conn->ibc_outstanding_credits++; - spin_unlock_irqrestore(&conn->ibc_lock, flags); + switch (msg->ibm_type) { + default: + CERROR("Bad IBNAL message type %x from %s\n", + msg->ibm_type, libcfs_nid2str(conn->ibc_peer->ibp_nid)); + rc = -EPROTO; + break; - kibnal_check_sends(conn); - } - EXIT; - return; - } + case IBNAL_MSG_NOOP: + break; - if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) { - CERROR ("Error posting receive -> "LPX64": %d\n", - conn->ibc_peer->ibp_nid, rc); - kibnal_close_conn (rx->rx_conn, rc); - } else { - CDEBUG (D_NET, "Error posting receive -> "LPX64": %d\n", - conn->ibc_peer->ibp_nid, rc); - } + case IBNAL_MSG_IMMEDIATE: + rc = lnet_parse(kibnal_data.kib_ni, &msg->ibm_u.immediate.ibim_hdr, + msg->ibm_srcnid, rx, 0); + repost = rc < 0; /* repost on error */ + break; + + case IBNAL_MSG_PUT_REQ: + rc = lnet_parse(kibnal_data.kib_ni, &msg->ibm_u.putreq.ibprm_hdr, + msg->ibm_srcnid, rx, 1); + repost = rc < 0; /* repost on error */ + break; - /* Drop rx's ref */ - kibnal_put_conn (conn); - EXIT; -} + case IBNAL_MSG_PUT_NAK: + rsrvd_credit = 1; /* rdma reply (was pre-reserved) */ -#if IBNAL_CKSUM -static inline __u32 kibnal_cksum (void *ptr, int nob) -{ - char *c = ptr; - __u32 sum = 0; + CWARN ("PUT_NACK from %s\n", libcfs_nid2str(conn->ibc_peer->ibp_nid)); + kibnal_handle_completion(conn, IBNAL_MSG_PUT_REQ, + msg->ibm_u.completion.ibcm_status, + msg->ibm_u.completion.ibcm_cookie); + break; - while (nob-- > 0) - sum = ((sum << 1) | (sum >> 31)) + *c++; - - return (sum); -} -#endif + case IBNAL_MSG_PUT_ACK: + rsrvd_credit = 1; /* rdma reply (was pre-reserved) */ -static void hexdump(char *string, void *ptr, int len) -{ - unsigned char *c = ptr; - int i; + spin_lock(&conn->ibc_lock); + tx = kibnal_find_waiting_tx_locked(conn, IBNAL_MSG_PUT_REQ, + msg->ibm_u.putack.ibpam_src_cookie); + if (tx != NULL) + list_del(&tx->tx_list); + spin_unlock(&conn->ibc_lock); - return; + if (tx == NULL) { + CERROR("Unmatched PUT_ACK from %s\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + rc = -EPROTO; + break; + } - if (len < 0 || len > 2048) { - printk("XXX what the hell? %d\n",len); - return; - } + LASSERT (tx->tx_waiting); + /* CAVEAT EMPTOR: I could be racing with tx_complete, but... + * (a) I can overwrite tx_msg since my peer has received it! + * (b) tx_waiting set tells tx_complete() it's not done. 
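+         *
+         * For orientation, the PUT handshake being completed here is
+         * (sketch; message names as in this switch):
+         *
+         *   initiator                     target
+         *   PUT_REQ      ------------->   (lnet_parse matches the PUT)
+         *                <-------------   PUT_ACK (sink rd + cookies)
+         *   RDMA write   ------------->
+         *   PUT_DONE     ------------->   (completion status)
+         *
+         * We are the initiator: the tx that carried PUT_REQ is reused
+         * in place to carry the RDMA write and the PUT_DONE.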
*/ + + tx->tx_nwrq = 0; /* overwrite PUT_REQ */ + + rc2 = kibnal_init_rdma(tx, IBNAL_MSG_PUT_DONE, + kibnal_rd_size(&msg->ibm_u.putack.ibpam_rd), + &msg->ibm_u.putack.ibpam_rd, + msg->ibm_u.putack.ibpam_dst_cookie); + if (rc2 < 0) + CERROR("Can't setup rdma for PUT to %s: %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), rc2); + + spin_lock(&conn->ibc_lock); + if (tx->tx_status == 0 && rc2 < 0) + tx->tx_status = rc2; + tx->tx_waiting = 0; /* clear waiting and queue atomically */ + kibnal_queue_tx_locked(tx, conn); + spin_unlock(&conn->ibc_lock); + break; + + case IBNAL_MSG_PUT_DONE: + /* This buffer was pre-reserved by not returning the credit + * when the PUT_REQ's buffer was reposted, so I just return it + * now */ + kibnal_handle_completion(conn, IBNAL_MSG_PUT_ACK, + msg->ibm_u.completion.ibcm_status, + msg->ibm_u.completion.ibcm_cookie); + break; - printk("%d bytes of '%s' from 0x%p\n", len, string, ptr); + case IBNAL_MSG_GET_REQ: + rc = lnet_parse(kibnal_data.kib_ni, &msg->ibm_u.get.ibgm_hdr, + msg->ibm_srcnid, rx, 1); + repost = rc < 0; /* repost on error */ + break; - for (i = 0; i < len;) { - printk("%02x",*(c++)); - i++; - if (!(i & 15)) { - printk("\n"); - } else if (!(i&1)) { - printk(" "); - } + case IBNAL_MSG_GET_DONE: + rsrvd_credit = 1; /* rdma reply (was pre-reserved) */ + + kibnal_handle_completion(conn, IBNAL_MSG_GET_REQ, + msg->ibm_u.completion.ibcm_status, + msg->ibm_u.completion.ibcm_cookie); + break; } - if(len & 15) { - printk("\n"); + if (rc < 0) /* protocol error */ + kibnal_close_conn(conn, rc); + + if (repost) { + if (conn->ibc_version == IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD) + rsrvd_credit = 0; /* peer isn't pre-reserving */ + + kibnal_post_rx(rx, !rsrvd_credit, rsrvd_credit); } } -static void -kibnal_rx_callback (IB_WORK_COMPLETION *wc) +void +kibnal_rx_complete (IB_WORK_COMPLETION *wc, __u64 rxseq) { kib_rx_t *rx = (kib_rx_t *)kibnal_wreqid2ptr(wc->WorkReqId); + int nob = wc->Length; kib_msg_t *msg = rx->rx_msg; kib_conn_t *conn = rx->rx_conn; - int nob = wc->Length; - const int base_nob = offsetof(kib_msg_t, ibm_u); - int credits; - int flipped; unsigned long flags; - __u32 i; -#if IBNAL_CKSUM - __u32 msg_cksum; - __u32 computed_cksum; -#endif - - /* we set the QP to erroring after we've finished disconnecting, - * maybe we should do so sooner. */ - KIB_ASSERT_CONN_STATE_RANGE(conn, IBNAL_CONN_ESTABLISHED, - IBNAL_CONN_DISCONNECTED); + int rc; + int err = -EIO; - CDEBUG(D_NET, "rx %p conn %p\n", rx, conn); - LASSERT (rx->rx_posted); - rx->rx_posted = 0; + LASSERT (rx->rx_nob < 0); /* was posted */ + rx->rx_nob = 0; /* isn't now */ mb(); /* receives complete with error in any case after we've started * disconnecting */ if (conn->ibc_state > IBNAL_CONN_ESTABLISHED) - goto failed; + goto ignore; if (wc->Status != WRStatusSuccess) { - CERROR("Rx from "LPX64" failed: %d\n", - conn->ibc_peer->ibp_nid, wc->Status); + CERROR("Rx from %s failed: %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), wc->Status); goto failed; } - if (nob < base_nob) { - CERROR ("Short rx from "LPX64": %d < expected %d\n", - conn->ibc_peer->ibp_nid, nob, base_nob); + rc = kibnal_unpack_msg(msg, conn->ibc_version, nob); + if (rc != 0) { + CERROR ("Error %d unpacking rx from %s\n", + rc, libcfs_nid2str(conn->ibc_peer->ibp_nid)); goto failed; } - hexdump("rx", rx->rx_msg, sizeof(kib_msg_t)); - - /* Receiver does any byte flipping if necessary... 
*/ - - if (msg->ibm_magic == IBNAL_MSG_MAGIC) { - flipped = 0; - } else { - if (msg->ibm_magic != __swab32(IBNAL_MSG_MAGIC)) { - CERROR ("Unrecognised magic: %08x from "LPX64"\n", - msg->ibm_magic, conn->ibc_peer->ibp_nid); - goto failed; - } - flipped = 1; - __swab16s (&msg->ibm_version); - LASSERT (sizeof(msg->ibm_type) == 1); - LASSERT (sizeof(msg->ibm_credits) == 1); - } + rx->rx_nob = nob; /* Now I know nob > 0 */ + mb(); - if (msg->ibm_version != IBNAL_MSG_VERSION) { - CERROR ("Incompatible msg version %d (%d expected)\n", - msg->ibm_version, IBNAL_MSG_VERSION); + if (msg->ibm_srcnid != conn->ibc_peer->ibp_nid || + msg->ibm_dstnid != kibnal_data.kib_ni->ni_nid || + msg->ibm_srcstamp != conn->ibc_incarnation || + msg->ibm_dststamp != kibnal_data.kib_incarnation) { + CERROR ("Stale rx from %s\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + err = -ESTALE; goto failed; } -#if IBNAL_CKSUM - if (nob != msg->ibm_nob) { - CERROR ("Unexpected # bytes %d (%d expected)\n", nob, msg->ibm_nob); + if (msg->ibm_seq != rxseq) { + CERROR ("Out-of-sequence rx from %s" + ": got "LPD64" but expected "LPD64"\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), + msg->ibm_seq, rxseq); goto failed; } - msg_cksum = le32_to_cpu(msg->ibm_cksum); - msg->ibm_cksum = 0; - computed_cksum = kibnal_cksum (msg, nob); - - if (msg_cksum != computed_cksum) { - CERROR ("Checksum failure %d: (%d expected)\n", - computed_cksum, msg_cksum); -// goto failed; - } - CDEBUG(D_NET, "cksum %x, nob %d\n", computed_cksum, nob); -#endif - - /* Have I received credits that will let me send? */ - credits = msg->ibm_credits; - if (credits != 0) { - spin_lock_irqsave(&conn->ibc_lock, flags); - conn->ibc_credits += credits; - spin_unlock_irqrestore(&conn->ibc_lock, flags); - - kibnal_check_sends(conn); - } - - switch (msg->ibm_type) { - case IBNAL_MSG_NOOP: - kibnal_post_rx (rx, 1); - return; - - case IBNAL_MSG_IMMEDIATE: - if (nob < base_nob + sizeof (kib_immediate_msg_t)) { - CERROR ("Short IMMEDIATE from "LPX64": %d\n", - conn->ibc_peer->ibp_nid, nob); - goto failed; - } - break; - - case IBNAL_MSG_PUT_RDMA: - case IBNAL_MSG_GET_RDMA: - if (nob < base_nob + sizeof (kib_rdma_msg_t)) { - CERROR ("Short RDMA msg from "LPX64": %d\n", - conn->ibc_peer->ibp_nid, nob); - goto failed; - } - if (flipped) - __swab32(msg->ibm_u.rdma.ibrm_num_descs); - - CDEBUG(D_NET, "%d RDMA: cookie "LPX64":\n", - msg->ibm_type, msg->ibm_u.rdma.ibrm_cookie); - - if ((msg->ibm_u.rdma.ibrm_num_descs > PTL_MD_MAX_IOV) || - (kib_rdma_msg_len(msg->ibm_u.rdma.ibrm_num_descs) > - min(nob, IBNAL_MSG_SIZE))) { - CERROR ("num_descs %d too large\n", - msg->ibm_u.rdma.ibrm_num_descs); - goto failed; - } - - if (flipped) { - __swab32(msg->ibm_u.rdma.rd_key); - } - - for(i = 0; i < msg->ibm_u.rdma.ibrm_num_descs; i++) { - kib_rdma_desc_t *desc = &msg->ibm_u.rdma.ibrm_desc[i]; + /* set time last known alive */ + kibnal_peer_alive(conn->ibc_peer); - if (flipped) { - __swab32(desc->rd_nob); - __swab64(desc->rd_addr); - } + /* racing with connection establishment/teardown! 
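+         *
+         * What follows is the usual double-checked pattern: peek at
+         * ibc_state unlocked and, if the connection still appears to be
+         * establishing, re-check under kib_global_lock before parking
+         * the rx on ibc_early_rxs (early rxs are replayed once the conn
+         * reaches ESTABLISHED).  The shape, illustratively:
+         *
+         *   if (maybe_early) {
+         *           write_lock();
+         *           if (really_early) { stash rx; unlock; return; }
+         *           write_unlock();
+         *   }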
*/ - CDEBUG(D_NET, " key %x, " "addr "LPX64", nob %u\n", - msg->ibm_u.rdma.rd_key, desc->rd_addr, desc->rd_nob); - } - break; - - case IBNAL_MSG_PUT_DONE: - case IBNAL_MSG_GET_DONE: - if (nob < base_nob + sizeof (kib_completion_msg_t)) { - CERROR ("Short COMPLETION msg from "LPX64": %d\n", - conn->ibc_peer->ibp_nid, nob); - goto failed; + if (conn->ibc_state < IBNAL_CONN_ESTABLISHED) { + write_lock_irqsave(&kibnal_data.kib_global_lock, flags); + /* must check holding global lock to eliminate race */ + if (conn->ibc_state < IBNAL_CONN_ESTABLISHED) { + list_add_tail(&rx->rx_list, &conn->ibc_early_rxs); + write_unlock_irqrestore(&kibnal_data.kib_global_lock, + flags); + return; } - if (flipped) - __swab32s(&msg->ibm_u.completion.ibcm_status); - - CDEBUG(D_NET, "%d DONE: cookie "LPX64", status %d\n", - msg->ibm_type, msg->ibm_u.completion.ibcm_cookie, - msg->ibm_u.completion.ibcm_status); - - kibnal_complete_passive_rdma (conn, - msg->ibm_u.completion.ibcm_cookie, - msg->ibm_u.completion.ibcm_status); - kibnal_post_rx (rx, 1); - return; - - default: - CERROR ("Can't parse type from "LPX64": %d\n", - conn->ibc_peer->ibp_nid, msg->ibm_type); - goto failed; + write_unlock_irqrestore(&kibnal_data.kib_global_lock, + flags); } - - /* schedule for kibnal_rx() in thread context */ - spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags); - - list_add_tail (&rx->rx_list, &kibnal_data.kib_sched_rxq); - wake_up (&kibnal_data.kib_sched_waitq); - - spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags); + kibnal_handle_rx(rx); return; failed: - CDEBUG(D_NET, "rx %p conn %p\n", rx, conn); - kibnal_close_conn(conn, -ECONNABORTED); - + kibnal_close_conn(conn, err); + ignore: /* Don't re-post rx & drop its ref on conn */ - kibnal_put_conn(conn); + kibnal_conn_decref(conn); } -void -kibnal_rx (kib_rx_t *rx) +struct page * +kibnal_kvaddr_to_page (unsigned long vaddr) { - kib_msg_t *msg = rx->rx_msg; - - /* Clear flag so I can detect if I've sent an RDMA completion */ - rx->rx_rdma = 0; + struct page *page; - switch (msg->ibm_type) { - case IBNAL_MSG_GET_RDMA: - lib_parse(&kibnal_lib, &msg->ibm_u.rdma.ibrm_hdr, rx); - /* If the incoming get was matched, I'll have initiated the - * RDMA and the completion message... */ - if (rx->rx_rdma) - break; - - /* Otherwise, I'll send a failed completion now to prevent - * the peer's GET blocking for the full timeout. */ - CERROR ("Completing unmatched RDMA GET from "LPX64"\n", - rx->rx_conn->ibc_peer->ibp_nid); - kibnal_start_active_rdma (IBNAL_MSG_GET_DONE, -EIO, - rx, NULL, 0, NULL, NULL, 0, 0); - break; - - case IBNAL_MSG_PUT_RDMA: - lib_parse(&kibnal_lib, &msg->ibm_u.rdma.ibrm_hdr, rx); - if (rx->rx_rdma) - break; - /* This is most unusual, since even if lib_parse() didn't - * match anything, it should have asked us to read (and - * discard) the payload. The portals header must be - * inconsistent with this message type, so it's the - * sender's fault for sending garbage and she can time - * herself out... 
*/ - CERROR ("Uncompleted RMDA PUT from "LPX64"\n", - rx->rx_conn->ibc_peer->ibp_nid); - break; - - case IBNAL_MSG_IMMEDIATE: - lib_parse(&kibnal_lib, &msg->ibm_u.immediate.ibim_hdr, rx); - LASSERT (!rx->rx_rdma); - break; - - default: + if (vaddr >= VMALLOC_START && + vaddr < VMALLOC_END) { + page = vmalloc_to_page ((void *)vaddr); + LASSERT (page != NULL); + return page; + } +#if CONFIG_HIGHMEM + if (vaddr >= PKMAP_BASE && + vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE)) { + /* No highmem pages only used for bulk (kiov) I/O */ + CERROR("find page for address in highmem\n"); LBUG(); - break; } - - kibnal_post_rx (rx, 1); +#endif + page = virt_to_page (vaddr); + LASSERT (page != NULL); + return page; } -static struct page * -kibnal_kvaddr_to_page (unsigned long vaddr) +#if !IBNAL_USE_FMR +int +kibnal_append_rdfrag(kib_rdma_desc_t *rd, int active, struct page *page, + unsigned long page_offset, unsigned long len) { - struct page *page; + kib_rdma_frag_t *frag = &rd->rd_frags[rd->rd_nfrag]; - if (vaddr >= VMALLOC_START && - vaddr < VMALLOC_END) - page = vmalloc_to_page ((void *)vaddr); -#if CONFIG_HIGHMEM - else if (vaddr >= PKMAP_BASE && - vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE)) - page = vmalloc_to_page ((void *)vaddr); - /* in 2.4 ^ just walks the page tables */ -#endif - else - page = virt_to_page (vaddr); + if (rd->rd_nfrag >= IBNAL_MAX_RDMA_FRAGS) { + CERROR ("Too many RDMA fragments\n"); + return -EMSGSIZE; + } + + if (active) { + if (rd->rd_nfrag == 0) + rd->rd_key = kibnal_data.kib_whole_mem.md_lkey; + } else { + if (rd->rd_nfrag == 0) + rd->rd_key = kibnal_data.kib_whole_mem.md_rkey; + } - if (!VALID_PAGE (page)) - page = NULL; + frag->rf_nob = len; + frag->rf_addr = kibnal_data.kib_whole_mem.md_addr + + lnet_page2phys(page) + page_offset; - return page; + CDEBUG(D_NET,"map key %x frag [%d]["LPX64" for %d]\n", + rd->rd_key, rd->rd_nfrag, frag->rf_addr, frag->rf_nob); + + rd->rd_nfrag++; + return 0; } -static void -kibnal_fill_ibrm(kib_tx_t *tx, struct page *page, unsigned long page_offset, - unsigned long len, int active) +int +kibnal_setup_rd_iov(kib_tx_t *tx, kib_rdma_desc_t *rd, int active, + unsigned int niov, struct iovec *iov, int offset, int nob) + { - kib_rdma_msg_t *ibrm = &tx->tx_msg->ibm_u.rdma; - kib_rdma_desc_t *desc; + int fragnob; + int rc; + unsigned long vaddr; + struct page *page; + int page_offset; - LASSERTF(ibrm->ibrm_num_descs < PTL_MD_MAX_IOV, "%u\n", - ibrm->ibrm_num_descs); + LASSERT (nob > 0); + LASSERT (niov > 0); + LASSERT ((rd != tx->tx_rd) == !active); - desc = &ibrm->ibrm_desc[ibrm->ibrm_num_descs]; - if (active) - ibrm->rd_key = kibnal_data.kib_md.md_lkey; - else - ibrm->rd_key = kibnal_data.kib_md.md_rkey; - desc->rd_nob = len; /*PAGE_SIZE - kiov->kiov_offset; */ - desc->rd_addr = kibnal_page2phys(page) + page_offset + - kibnal_data.kib_md.md_addr; + while (offset >= iov->iov_len) { + offset -= iov->iov_len; + niov--; + iov++; + LASSERT (niov > 0); + } + + rd->rd_nfrag = 0; + do { + LASSERT (niov > 0); + + vaddr = ((unsigned long)iov->iov_base) + offset; + page_offset = vaddr & (PAGE_SIZE - 1); + page = kibnal_kvaddr_to_page(vaddr); + if (page == NULL) { + CERROR ("Can't find page\n"); + return -EFAULT; + } - ibrm->ibrm_num_descs++; + fragnob = min((int)(iov->iov_len - offset), nob); + fragnob = min(fragnob, (int)PAGE_SIZE - page_offset); + + rc = kibnal_append_rdfrag(rd, active, page, + page_offset, fragnob); + if (rc != 0) + return rc; + + if (offset + fragnob < iov->iov_len) { + offset += fragnob; + } else { + offset = 0; + iov++; + niov--; 
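+                        /* this iov entry is exhausted; step to the next
+                         * one (the LASSERT at the top of the loop catches
+                         * running off the end while nob is still > 0) */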
+ } + nob -= fragnob; + } while (nob > 0); + + return 0; } -static int -kibnal_map_rdma_iov(kib_tx_t *tx, unsigned long vaddr, int nob, int active) +int +kibnal_setup_rd_kiov (kib_tx_t *tx, kib_rdma_desc_t *rd, int active, + int nkiov, lnet_kiov_t *kiov, int offset, int nob) { - struct page *page; - int page_offset, len; + int fragnob; + int rc; - while (nob > 0) { - page = kibnal_kvaddr_to_page(vaddr); - if (page == NULL) - return -EFAULT; + CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob); - page_offset = vaddr & (PAGE_SIZE - 1); - len = min(nob, (int)PAGE_SIZE - page_offset); - - kibnal_fill_ibrm(tx, page, page_offset, len, active); - nob -= len; - vaddr += len; + LASSERT (nob > 0); + LASSERT (nkiov > 0); + LASSERT ((rd != tx->tx_rd) == !active); + + while (offset >= kiov->kiov_len) { + offset -= kiov->kiov_len; + nkiov--; + kiov++; + LASSERT (nkiov > 0); } + + rd->rd_nfrag = 0; + do { + LASSERT (nkiov > 0); + fragnob = min((int)(kiov->kiov_len - offset), nob); + + rc = kibnal_append_rdfrag(rd, active, kiov->kiov_page, + kiov->kiov_offset + offset, + fragnob); + if (rc != 0) + return rc; + + offset = 0; + kiov++; + nkiov--; + nob -= fragnob; + } while (nob > 0); + return 0; } +#else +int +kibnal_map_tx (kib_tx_t *tx, kib_rdma_desc_t *rd, int active, + int npages, unsigned long page_offset, int nob) +{ + IB_ACCESS_CONTROL access = {0,}; + FSTATUS frc; + + LASSERT ((rd != tx->tx_rd) == !active); + LASSERT (!tx->tx_md.md_active); + LASSERT (tx->tx_md.md_fmrcount > 0); + LASSERT (page_offset < PAGE_SIZE); + LASSERT (npages >= (1 + ((page_offset + nob - 1)>>PAGE_SHIFT))); + LASSERT (npages <= LNET_MAX_IOV); + + if (!active) { + // access.s.MWBindable = 1; + access.s.LocalWrite = 1; + access.s.RdmaWrite = 1; + } + + /* Map the memory described by tx->tx_pages + frc = iibt_register_physical_memory(kibnal_data.kib_hca, + IBNAL_RDMA_BASE, + tx->tx_pages, npages, + page_offset, + kibnal_data.kib_pd, + access, + &tx->tx_md.md_handle, + &tx->tx_md.md_addr, + &tx->tx_md.md_lkey, + &tx->tx_md.md_rkey); + */ + return -EINVAL; +} -static int -kibnal_map_iov (kib_tx_t *tx, IB_ACCESS_CONTROL access, - int niov, struct iovec *iov, int offset, int nob, int active) +int +kibnal_setup_rd_iov (kib_tx_t *tx, kib_rdma_desc_t *rd, int active, + unsigned int niov, struct iovec *iov, int offset, int nob) { - void *vaddr; - FSTATUS frc; + int resid; + int fragnob; + struct page *page; + int npages; + unsigned long page_offset; + unsigned long vaddr; LASSERT (nob > 0); LASSERT (niov > 0); - LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED); while (offset >= iov->iov_len) { offset -= iov->iov_len; @@ -686,54 +726,47 @@ kibnal_map_iov (kib_tx_t *tx, IB_ACCESS_CONTROL access, return (-EMSGSIZE); } - /* our large contiguous iov could be backed by multiple physical - * pages. 
*/ - if (kibnal_whole_mem()) { - int rc; - tx->tx_msg->ibm_u.rdma.ibrm_num_descs = 0; - rc = kibnal_map_rdma_iov(tx, (unsigned long)iov->iov_base + - offset, nob, active); - if (rc != 0) { - CERROR ("Can't map iov: %d\n", rc); - return rc; + vaddr = ((unsigned long)iov->iov_base) + offset; + + page_offset = vaddr & (PAGE_SIZE - 1); + resid = nob; + npages = 0; + + do { + LASSERT (npages < LNET_MAX_IOV); + + page = kibnal_kvaddr_to_page(vaddr); + if (page == NULL) { + CERROR("Can't find page for %lu\n", vaddr); + return -EFAULT; } - return 0; - } - vaddr = (void *)(((unsigned long)iov->iov_base) + offset); - tx->tx_md.md_addr = (__u64)((unsigned long)vaddr); + tx->tx_pages[npages++] = lnet_page2phys(page); - frc = iibt_register_memory(kibnal_data.kib_hca, vaddr, nob, - kibnal_data.kib_pd, access, - &tx->tx_md.md_handle, &tx->tx_md.md_lkey, - &tx->tx_md.md_rkey); - if (frc != 0) { - CERROR ("Can't map vaddr %p: %d\n", vaddr, frc); - return -EINVAL; - } + fragnob = PAGE_SIZE - (vaddr & (PAGE_SIZE - 1)); + vaddr += fragnob; + resid -= fragnob; - tx->tx_mapped = KIB_TX_MAPPED; - return (0); + } while (resid > 0); + + return kibnal_map_tx(tx, rd, active, npages, page_offset, nob); } -static int -kibnal_map_kiov (kib_tx_t *tx, IB_ACCESS_CONTROL access, - int nkiov, ptl_kiov_t *kiov, - int offset, int nob, int active) +int +kibnal_setup_rd_kiov (kib_tx_t *tx, kib_rdma_desc_t *rd, int active, + int nkiov, lnet_kiov_t *kiov, int offset, int nob) { - __u64 *phys = NULL; - int page_offset; - int nphys; - int resid; - int phys_size = 0; - FSTATUS frc; - int i, rc = 0; - + int resid; + int npages; + unsigned long page_offset; + CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob); LASSERT (nob > 0); LASSERT (nkiov > 0); - LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED); + LASSERT (nkiov <= LNET_MAX_IOV); + LASSERT (!tx->tx_md.md_active); + LASSERT ((rd != tx->tx_rd) == !active); while (offset >= kiov->kiov_len) { offset -= kiov->kiov_len; @@ -743,122 +776,36 @@ kibnal_map_kiov (kib_tx_t *tx, IB_ACCESS_CONTROL access, } page_offset = kiov->kiov_offset + offset; - nphys = 1; - - if (!kibnal_whole_mem()) { - phys_size = nkiov * sizeof (*phys); - PORTAL_ALLOC(phys, phys_size); - if (phys == NULL) { - CERROR ("Can't allocate tmp phys\n"); - return (-ENOMEM); - } - - phys[0] = kibnal_page2phys(kiov->kiov_page); - } else { - tx->tx_msg->ibm_u.rdma.ibrm_num_descs = 0; - kibnal_fill_ibrm(tx, kiov->kiov_page, kiov->kiov_offset, - kiov->kiov_len, active); - } - - resid = nob - (kiov->kiov_len - offset); + + resid = offset + nob; + npages = 0; - while (resid > 0) { - kiov++; - nkiov--; + do { + LASSERT (npages < LNET_MAX_IOV); LASSERT (nkiov > 0); - if (kiov->kiov_offset != 0 || - ((resid > PAGE_SIZE) && - kiov->kiov_len < PAGE_SIZE)) { + if ((npages > 0 && kiov->kiov_offset != 0) || + (resid > kiov->kiov_len && + (kiov->kiov_offset + kiov->kiov_len) != PAGE_SIZE)) { /* Can't have gaps */ CERROR ("Can't make payload contiguous in I/O VM:" - "page %d, offset %d, len %d \n", nphys, - kiov->kiov_offset, kiov->kiov_len); - - for (i = -nphys; i < nkiov; i++) - { - CERROR("kiov[%d] %p +%d for %d\n", - i, kiov[i].kiov_page, kiov[i].kiov_offset, kiov[i].kiov_len); - } + "page %d, offset %d, len %d \n", + npages, kiov->kiov_offset, kiov->kiov_len); - rc = -EINVAL; - goto out; - } - - if (nphys == PTL_MD_MAX_IOV) { - CERROR ("payload too big (%d)\n", nphys); - rc = -EMSGSIZE; - goto out; - } - - if (!kibnal_whole_mem()) { - LASSERT (nphys * sizeof (*phys) < phys_size); - phys[nphys] = kibnal_page2phys(kiov->kiov_page); 
- } else { - if (kib_rdma_msg_len(nphys) > IBNAL_MSG_SIZE) { - CERROR ("payload too big (%d)\n", nphys); - rc = -EMSGSIZE; - goto out; - } - kibnal_fill_ibrm(tx, kiov->kiov_page, - kiov->kiov_offset, kiov->kiov_len, - active); + return -EINVAL; } - nphys ++; - resid -= PAGE_SIZE; - } - - if (kibnal_whole_mem()) - goto out; - -#if 0 - CWARN ("nphys %d, nob %d, page_offset %d\n", nphys, nob, page_offset); - for (i = 0; i < nphys; i++) - CWARN (" [%d] "LPX64"\n", i, phys[i]); -#endif - -#if IBNAL_FMR -#error "iibnal hasn't learned about FMR yet" - rc = ib_fmr_register_physical (kibnal_data.kib_fmr_pool, - phys, nphys, - &tx->tx_md.md_addr, - page_offset, - &tx->tx_md.md_handle.fmr, - &tx->tx_md.md_lkey, - &tx->tx_md.md_rkey); -#else - frc = iibt_register_physical_memory(kibnal_data.kib_hca, - IBNAL_RDMA_BASE, - phys, nphys, - 0, /* offset */ - kibnal_data.kib_pd, - access, - &tx->tx_md.md_handle, - &tx->tx_md.md_addr, - &tx->tx_md.md_lkey, - &tx->tx_md.md_rkey); -#endif - if (frc == FSUCCESS) { - CDEBUG(D_NET, "Mapped %d pages %d bytes @ offset %d: lkey %x, rkey %x\n", - nphys, nob, page_offset, tx->tx_md.md_lkey, tx->tx_md.md_rkey); -#if IBNAL_FMR - tx->tx_mapped = KIB_TX_MAPPED_FMR; -#else - tx->tx_mapped = KIB_TX_MAPPED; -#endif - } else { - CERROR ("Can't map phys: %d\n", frc); - rc = -EFAULT; - } + tx->tx_pages[npages++] = lnet_page2phys(kiov->kiov_page); + resid -= kiov->kiov_len; + kiov++; + nkiov--; + } while (resid > 0); - out: - if (phys != NULL) - PORTAL_FREE(phys, phys_size); - return (rc); + return kibnal_map_tx(tx, rd, active, npages, page_offset, nob); } +#endif -static kib_conn_t * +kib_conn_t * kibnal_find_conn_locked (kib_peer_t *peer) { struct list_head *tmp; @@ -874,134 +821,173 @@ kibnal_find_conn_locked (kib_peer_t *peer) void kibnal_check_sends (kib_conn_t *conn) { - unsigned long flags; kib_tx_t *tx; + FSTATUS frc; int rc; - int i; + int consume_cred; int done; - int nwork; - ENTRY; - spin_lock_irqsave (&conn->ibc_lock, flags); + LASSERT (conn->ibc_state >= IBNAL_CONN_ESTABLISHED); + + spin_lock(&conn->ibc_lock); - LASSERT (conn->ibc_nsends_posted <= IBNAL_MSG_QUEUE_SIZE); + LASSERT (conn->ibc_nsends_posted <= + *kibnal_tunables.kib_concurrent_sends); + LASSERT (conn->ibc_reserved_credits >= 0); + + while (conn->ibc_reserved_credits > 0 && + !list_empty(&conn->ibc_tx_queue_rsrvd)) { + LASSERT (conn->ibc_version != + IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD); + tx = list_entry(conn->ibc_tx_queue_rsrvd.next, + kib_tx_t, tx_list); + list_del(&tx->tx_list); + list_add_tail(&tx->tx_list, &conn->ibc_tx_queue); + conn->ibc_reserved_credits--; + } if (list_empty(&conn->ibc_tx_queue) && - conn->ibc_outstanding_credits >= IBNAL_CREDIT_HIGHWATER) { - spin_unlock_irqrestore(&conn->ibc_lock, flags); + list_empty(&conn->ibc_tx_queue_nocred) && + (conn->ibc_outstanding_credits >= IBNAL_CREDIT_HIGHWATER || + kibnal_send_keepalive(conn))) { + spin_unlock(&conn->ibc_lock); - tx = kibnal_get_idle_tx(0); /* don't block */ + tx = kibnal_get_idle_tx(); if (tx != NULL) kibnal_init_tx_msg(tx, IBNAL_MSG_NOOP, 0); - spin_lock_irqsave(&conn->ibc_lock, flags); + spin_lock(&conn->ibc_lock); - if (tx != NULL) { - atomic_inc(&conn->ibc_refcount); + if (tx != NULL) kibnal_queue_tx_locked(tx, conn); - } } - while (!list_empty (&conn->ibc_tx_queue)) { - tx = list_entry (conn->ibc_tx_queue.next, kib_tx_t, tx_list); + for (;;) { + if (!list_empty(&conn->ibc_tx_queue_nocred)) { + LASSERT (conn->ibc_version != + IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD); + tx = list_entry (conn->ibc_tx_queue_nocred.next, + kib_tx_t, 
tx_list); + consume_cred = 0; + } else if (!list_empty (&conn->ibc_tx_queue)) { + tx = list_entry (conn->ibc_tx_queue.next, + kib_tx_t, tx_list); + consume_cred = 1; + } else { + /* nothing waiting */ + break; + } + LASSERT (tx->tx_queued); /* We rely on this for QP sizing */ - LASSERT (tx->tx_nsp > 0 && tx->tx_nsp <= IBNAL_TX_MAX_SG); + LASSERT (tx->tx_nwrq > 0 && tx->tx_nwrq <= 1 + IBNAL_MAX_RDMA_FRAGS); LASSERT (conn->ibc_outstanding_credits >= 0); LASSERT (conn->ibc_outstanding_credits <= IBNAL_MSG_QUEUE_SIZE); LASSERT (conn->ibc_credits >= 0); LASSERT (conn->ibc_credits <= IBNAL_MSG_QUEUE_SIZE); - /* Not on ibc_rdma_queue */ - LASSERT (!tx->tx_passive_rdma_wait); - - if (conn->ibc_nsends_posted == IBNAL_MSG_QUEUE_SIZE) - GOTO(out, 0); + if (conn->ibc_nsends_posted == + *kibnal_tunables.kib_concurrent_sends) { + /* We've got some tx completions outstanding... */ + CDEBUG(D_NET, "%s: posted enough\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + break; + } - if (conn->ibc_credits == 0) /* no credits */ - GOTO(out, 1); + if (consume_cred) { + if (conn->ibc_credits == 0) { /* no credits */ + CDEBUG(D_NET, "%s: no credits\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + break; + } + + if (conn->ibc_credits == 1 && /* last credit reserved for */ + conn->ibc_outstanding_credits == 0) { /* giving back credits */ + CDEBUG(D_NET, "%s: not using last credit\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + break; + } + } - if (conn->ibc_credits == 1 && /* last credit reserved for */ - conn->ibc_outstanding_credits == 0) /* giving back credits */ - GOTO(out, 2); - list_del (&tx->tx_list); + tx->tx_queued = 0; + + /* NB don't drop ibc_lock before bumping tx_sending */ if (tx->tx_msg->ibm_type == IBNAL_MSG_NOOP && (!list_empty(&conn->ibc_tx_queue) || - conn->ibc_outstanding_credits < IBNAL_CREDIT_HIGHWATER)) { + !list_empty(&conn->ibc_tx_queue_nocred) || + (conn->ibc_outstanding_credits < IBNAL_CREDIT_HIGHWATER && + !kibnal_send_keepalive(conn)))) { /* redundant NOOP */ - spin_unlock_irqrestore(&conn->ibc_lock, flags); + spin_unlock(&conn->ibc_lock); kibnal_tx_done(tx); - spin_lock_irqsave(&conn->ibc_lock, flags); + spin_lock(&conn->ibc_lock); + CDEBUG(D_NET, "%s: redundant noop\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); continue; } - tx->tx_msg->ibm_credits = conn->ibc_outstanding_credits; - conn->ibc_outstanding_credits = 0; + kibnal_pack_msg(tx->tx_msg, conn->ibc_version, + conn->ibc_outstanding_credits, + conn->ibc_peer->ibp_nid, conn->ibc_incarnation, + conn->ibc_txseq); + conn->ibc_txseq++; + conn->ibc_outstanding_credits = 0; conn->ibc_nsends_posted++; - conn->ibc_credits--; + if (consume_cred) + conn->ibc_credits--; + + /* CAVEAT EMPTOR! This tx could be the PUT_DONE of an RDMA + * PUT. If so, it was first queued here as a PUT_REQ, sent and + * stashed on ibc_active_txs, matched by an incoming PUT_ACK, + * and then re-queued here. It's (just) possible that + * tx_sending is non-zero if we've not done the tx_complete() from + * the first send; hence the ++ rather than = below. 
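+                 *
+                 * Either way, a tx is only freed once it goes fully
+                 * idle; the predicate (as used by tx_complete() and
+                 * handle_completion()) is, illustratively:
+                 *
+                 *   idle = !tx_queued          (not on any send queue)
+                 *          && tx_sending == 0  (all send CQEs seen)
+                 *          && !tx_waiting      (no peer response expected)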
+                 */
+                tx->tx_sending++;
 
-                /* we only get a tx completion for the final rdma op */
-                tx->tx_sending = min(tx->tx_nsp, 2);
-                tx->tx_passive_rdma_wait = tx->tx_passive_rdma;
                 list_add (&tx->tx_list, &conn->ibc_active_txs);
-#if IBNAL_CKSUM
-                tx->tx_msg->ibm_cksum = 0;
-                tx->tx_msg->ibm_cksum = kibnal_cksum(tx->tx_msg, tx->tx_msg->ibm_nob);
-                CDEBUG(D_NET, "cksum %x, nob %d\n", tx->tx_msg->ibm_cksum, tx->tx_msg->ibm_nob);
-#endif
-                spin_unlock_irqrestore (&conn->ibc_lock, flags);
-
-                /* NB the gap between removing tx from the queue and sending it
-                 * allows message re-ordering to occur */
-
-                LASSERT (tx->tx_nsp > 0);
-
-                rc = -ECONNABORTED;
-                nwork = 0;
-                if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) {
-                        tx->tx_status = 0;
-                        /* Driver only accepts 1 item at a time */
-                        for (i = 0; i < tx->tx_nsp; i++) {
-                                hexdump("tx", tx->tx_msg, sizeof(kib_msg_t));
-                                rc = iibt_postsend(conn->ibc_qp,
-                                                   &tx->tx_wrq[i]);
-                                if (rc != 0)
-                                        break;
-                                if (wrq_signals_completion(&tx->tx_wrq[i]))
-                                        nwork++;
-                                CDEBUG(D_NET, "posted tx wrq %p\n",
-                                       &tx->tx_wrq[i]);
-                        }
+
+                LASSERT (tx->tx_nwrq > 0);
+
+                rc = 0;
+                frc = FSUCCESS;
+                if (conn->ibc_state != IBNAL_CONN_ESTABLISHED) {
+                        rc = -ECONNABORTED;
+                } else {
+                        frc = iba_post_send2(conn->ibc_qp, tx->tx_wrq, NULL);
+                        if (frc != FSUCCESS)
+                                rc = -EIO;
                 }
 
-                spin_lock_irqsave (&conn->ibc_lock, flags);
+                conn->ibc_last_send = jiffies;
+
                 if (rc != 0) {
                         /* NB credits are transferred in the actual
                          * message, which can only be the last work item */
                         conn->ibc_outstanding_credits += tx->tx_msg->ibm_credits;
-                        conn->ibc_credits++;
+                        if (consume_cred)
+                                conn->ibc_credits++;
                         conn->ibc_nsends_posted--;
 
                         tx->tx_status = rc;
-                        tx->tx_passive_rdma_wait = 0;
-                        tx->tx_sending -= tx->tx_nsp - nwork;
-
+                        tx->tx_waiting = 0;
+                        tx->tx_sending--;
+
                         done = (tx->tx_sending == 0);
                         if (done)
                                 list_del (&tx->tx_list);
 
-                        spin_unlock_irqrestore (&conn->ibc_lock, flags);
+                        spin_unlock(&conn->ibc_lock);
 
                         if (conn->ibc_state == IBNAL_CONN_ESTABLISHED)
-                                CERROR ("Error %d posting transmit to "LPX64"\n",
-                                        rc, conn->ibc_peer->ibp_nid);
+                                CERROR ("Error %d posting transmit to %s\n",
+                                        frc, libcfs_nid2str(conn->ibc_peer->ibp_nid));
                         else
-                                CDEBUG (D_NET, "Error %d posting transmit to "
-                                        LPX64"\n", rc, conn->ibc_peer->ibp_nid);
 
+                                CDEBUG (D_NET, "Error %d posting transmit to %s\n",
+                                        rc, libcfs_nid2str(conn->ibc_peer->ibp_nid));
 
                         kibnal_close_conn (conn, rc);
 
@@ -1009,138 +995,172 @@ kibnal_check_sends (kib_conn_t *conn)
                         kibnal_tx_done (tx);
                         return;
                 }
-        }
 
-        EXIT;
-out:
-        spin_unlock_irqrestore (&conn->ibc_lock, flags);
+        spin_unlock(&conn->ibc_lock);
 }
 
-static void
-kibnal_tx_callback (IB_WORK_COMPLETION *wc)
+void
+kibnal_tx_complete (IB_WORK_COMPLETION *wc)
 {
         kib_tx_t     *tx = (kib_tx_t *)kibnal_wreqid2ptr(wc->WorkReqId);
-        kib_conn_t   *conn;
-        unsigned long flags;
+        kib_conn_t   *conn = tx->tx_conn;
+        int           failed = wc->Status != WRStatusSuccess;
         int           idle;
 
-        conn = tx->tx_conn;
-        LASSERT (conn != NULL);
-        LASSERT (tx->tx_sending != 0);
+        CDEBUG(D_NET, "%s: sending %d nwrq %d status %d\n",
+               libcfs_nid2str(conn->ibc_peer->ibp_nid),
+               tx->tx_sending, tx->tx_nwrq, wc->Status);
+
+        LASSERT (tx->tx_sending > 0);
 
-        spin_lock_irqsave(&conn->ibc_lock, flags);
+        if (failed &&
+            tx->tx_status == 0 &&
+            conn->ibc_state == IBNAL_CONN_ESTABLISHED) {
+#if KIBLND_DETAILED_DEBUG
+                int                   i;
+                IB_WORK_REQ2         *wrq = &tx->tx_wrq[0];
+                IB_LOCAL_DATASEGMENT *gl = &tx->tx_gl[0];
+                lnet_msg_t           *lntmsg = tx->tx_lntmsg[0];
+#endif
+                CDEBUG(D_NETERROR, "tx -> %s type %x cookie "LPX64
+                       " sending %d waiting %d failed %d nwrq %d\n",
+
libcfs_nid2str(conn->ibc_peer->ibp_nid), + tx->tx_msg->ibm_type, tx->tx_cookie, + tx->tx_sending, tx->tx_waiting, wc->Status, + tx->tx_nwrq); +#if KIBLND_DETAILED_DEBUG + for (i = 0; i < tx->tx_nwrq; i++, wrq++, gl++) { + switch (wrq->Operation) { + default: + CDEBUG(D_NETERROR, " [%3d] Addr %p Next %p OP %d " + "DSList %p(%p)/%d: "LPX64"/%d K %x\n", + i, wrq, wrq->Next, wrq->Operation, + wrq->DSList, gl, wrq->DSListDepth, + gl->Address, gl->Length, gl->Lkey); + break; + case WROpSend: + CDEBUG(D_NETERROR, " [%3d] Addr %p Next %p SEND " + "DSList %p(%p)/%d: "LPX64"/%d K %x\n", + i, wrq, wrq->Next, + wrq->DSList, gl, wrq->DSListDepth, + gl->Address, gl->Length, gl->Lkey); + break; + case WROpRdmaWrite: + CDEBUG(D_NETERROR, " [%3d] Addr %p Next %p DMA " + "DSList: %p(%p)/%d "LPX64"/%d K %x -> " + LPX64" K %x\n", + i, wrq, wrq->Next, + wrq->DSList, gl, wrq->DSListDepth, + gl->Address, gl->Length, gl->Lkey, + wrq->Req.SendRC.RemoteDS.Address, + wrq->Req.SendRC.RemoteDS.Rkey); + break; + } + } + + switch (tx->tx_msg->ibm_type) { + default: + CDEBUG(D_NETERROR, " msg type %x %p/%d, No RDMA\n", + tx->tx_msg->ibm_type, + tx->tx_msg, tx->tx_msg->ibm_nob); + break; - CDEBUG(D_NET, "conn %p tx %p [%d/%d]: %d\n", conn, tx, - tx->tx_sending, tx->tx_nsp, wc->Status); + case IBNAL_MSG_PUT_DONE: + case IBNAL_MSG_GET_DONE: + CDEBUG(D_NETERROR, " msg type %x %p/%d, RDMA key %x frags %d...\n", + tx->tx_msg->ibm_type, + tx->tx_msg, tx->tx_msg->ibm_nob, + tx->tx_rd->rd_key, tx->tx_rd->rd_nfrag); + for (i = 0; i < tx->tx_rd->rd_nfrag; i++) + CDEBUG(D_NETERROR, " [%d] "LPX64"/%d\n", i, + tx->tx_rd->rd_frags[i].rf_addr, + tx->tx_rd->rd_frags[i].rf_nob); + if (lntmsg == NULL) { + CDEBUG(D_NETERROR, " No lntmsg\n"); + } else if (lntmsg->msg_iov != NULL) { + CDEBUG(D_NETERROR, " lntmsg in %d VIRT frags...\n", + lntmsg->msg_niov); + for (i = 0; i < lntmsg->msg_niov; i++) + CDEBUG(D_NETERROR, " [%d] %p/%d\n", i, + lntmsg->msg_iov[i].iov_base, + lntmsg->msg_iov[i].iov_len); + } else if (lntmsg->msg_kiov != NULL) { + CDEBUG(D_NETERROR, " lntmsg in %d PAGE frags...\n", + lntmsg->msg_niov); + for (i = 0; i < lntmsg->msg_niov; i++) + CDEBUG(D_NETERROR, " [%d] %p+%d/%d\n", i, + lntmsg->msg_kiov[i].kiov_page, + lntmsg->msg_kiov[i].kiov_offset, + lntmsg->msg_kiov[i].kiov_len); + } else { + CDEBUG(D_NETERROR, " lntmsg in %d frags\n", + lntmsg->msg_niov); + } + + break; + } +#endif + } + + spin_lock(&conn->ibc_lock); /* I could be racing with rdma completion. Whoever makes 'tx' idle - * gets to free it, which also drops its ref on 'conn'. If it's - * not me, then I take an extra ref on conn so it can't disappear - * under me. */ + * gets to free it, which also drops its ref on 'conn'. */ tx->tx_sending--; + conn->ibc_nsends_posted--; + + if (failed) { + tx->tx_waiting = 0; + tx->tx_status = -EIO; + } + idle = (tx->tx_sending == 0) && /* This is the final callback */ - (!tx->tx_passive_rdma_wait); /* Not waiting for RDMA completion */ + !tx->tx_waiting && /* Not waiting for peer */ + !tx->tx_queued; /* Not re-queued (PUT_DONE) */ if (idle) list_del(&tx->tx_list); - CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", - conn, conn->ibc_state, conn->ibc_peer->ibp_nid, - atomic_read (&conn->ibc_refcount)); - atomic_inc (&conn->ibc_refcount); + kibnal_conn_addref(conn); /* 1 ref for me.... 
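+         *
+         * (taking a ref here because finalising an idle tx below drops
+         * the ref the tx held on 'conn', possibly the last one, while
+         * the close/keepalive/check_sends calls that follow still need
+         * the conn; the matching decref closes out this function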
*/ - if (tx->tx_sending == 0) - conn->ibc_nsends_posted--; - - if (wc->Status != WRStatusSuccess && - tx->tx_status == 0) - tx->tx_status = -ECONNABORTED; - - spin_unlock_irqrestore(&conn->ibc_lock, flags); + spin_unlock(&conn->ibc_lock); if (idle) kibnal_tx_done (tx); - if (wc->Status != WRStatusSuccess) { - CERROR ("Tx completion to "LPX64" failed: %d\n", - conn->ibc_peer->ibp_nid, wc->Status); - kibnal_close_conn (conn, -ENETDOWN); + if (failed) { + kibnal_close_conn (conn, -EIO); } else { - /* can I shovel some more sends out the door? */ + kibnal_peer_alive(conn->ibc_peer); kibnal_check_sends(conn); } - kibnal_put_conn (conn); -} - -void -kibnal_ca_async_callback (void *ca_arg, IB_EVENT_RECORD *ev) -{ - /* XXX flesh out. this seems largely for async errors */ - CERROR("type: %d code: %u\n", ev->EventType, ev->EventCode); -} - -void -kibnal_ca_callback (void *ca_arg, void *cq_arg) -{ - IB_HANDLE cq = *(IB_HANDLE *)cq_arg; - IB_HANDLE ca = *(IB_HANDLE *)ca_arg; - IB_WORK_COMPLETION wc; - int armed = 0; - - CDEBUG(D_NET, "ca %p cq %p\n", ca, cq); - - for(;;) { - while (iibt_cq_poll(cq, &wc) == FSUCCESS) { - - /* We will need to rearm the CQ to avoid a potential race. */ - armed = 0; - - if (kibnal_wreqid_is_rx(wc.WorkReqId)) - kibnal_rx_callback(&wc); - else - kibnal_tx_callback(&wc); - } - if (armed) - return; - if (iibt_cq_rearm(cq, CQEventSelNextWC) != FSUCCESS) { - CERROR("rearm failed?\n"); - return; - } - armed = 1; - } + kibnal_conn_decref(conn); /* ...until here */ } void kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob) { - IB_LOCAL_DATASEGMENT *gl = &tx->tx_gl[tx->tx_nsp]; - IB_WORK_REQ *wrq = &tx->tx_wrq[tx->tx_nsp]; - int fence; - int nob = offsetof (kib_msg_t, ibm_u) + body_nob; + IB_LOCAL_DATASEGMENT *gl = &tx->tx_gl[tx->tx_nwrq]; + IB_WORK_REQ2 *wrq = &tx->tx_wrq[tx->tx_nwrq]; + int nob = offsetof (kib_msg_t, ibm_u) + body_nob; - LASSERT (tx->tx_nsp >= 0 && - tx->tx_nsp < sizeof(tx->tx_wrq)/sizeof(tx->tx_wrq[0])); + LASSERT (tx->tx_nwrq >= 0 && + tx->tx_nwrq < (1 + IBNAL_MAX_RDMA_FRAGS)); LASSERT (nob <= IBNAL_MSG_SIZE); - - tx->tx_msg->ibm_magic = IBNAL_MSG_MAGIC; - tx->tx_msg->ibm_version = IBNAL_MSG_VERSION; - tx->tx_msg->ibm_type = type; -#if IBNAL_CKSUM - tx->tx_msg->ibm_nob = nob; -#endif - /* Fence the message if it's bundled with an RDMA read */ - fence = (tx->tx_nsp > 0) && - (type == IBNAL_MSG_PUT_DONE); + + kibnal_init_msg(tx->tx_msg, type, body_nob); *gl = (IB_LOCAL_DATASEGMENT) { - .Address = tx->tx_vaddr, + .Address = tx->tx_hca_msg, .Length = IBNAL_MSG_SIZE, - .Lkey = kibnal_lkey(kibnal_data.kib_tx_pages), + .Lkey = kibnal_data.kib_whole_mem.md_lkey, }; - wrq->WorkReqId = kibnal_ptr2wreqid(tx, 0); + wrq->Next = NULL; /* This is the last one */ + + wrq->WorkReqId = kibnal_ptr2wreqid(tx, IBNAL_WID_TX); wrq->Operation = WROpSend; wrq->DSList = gl; wrq->DSListDepth = 1; @@ -1149,869 +1169,1339 @@ kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob) wrq->Req.SendRC.Options.s.SolicitedEvent = 1; wrq->Req.SendRC.Options.s.SignaledCompletion = 1; wrq->Req.SendRC.Options.s.ImmediateData = 0; - wrq->Req.SendRC.Options.s.Fence = fence; - - tx->tx_nsp++; + wrq->Req.SendRC.Options.s.Fence = 0; + /* fence only needed on RDMA reads */ + + tx->tx_nwrq++; } -static void -kibnal_queue_tx (kib_tx_t *tx, kib_conn_t *conn) +int +kibnal_init_rdma (kib_tx_t *tx, int type, int nob, + kib_rdma_desc_t *dstrd, __u64 dstcookie) { - unsigned long flags; + kib_msg_t *ibmsg = tx->tx_msg; + kib_rdma_desc_t *srcrd = tx->tx_rd; + IB_LOCAL_DATASEGMENT *gl; + IB_WORK_REQ2 *wrq; + 
int rc; - spin_lock_irqsave(&conn->ibc_lock, flags); +#if IBNAL_USE_FMR + LASSERT (tx->tx_nwrq == 0); - kibnal_queue_tx_locked (tx, conn); - - spin_unlock_irqrestore(&conn->ibc_lock, flags); - + gl = &tx->tx_gl[0]; + gl->Length = nob; + gl->Address = srcrd->rd_addr; + gl->Lkey = srcrd->rd_key; + + wrq = &tx->tx_wrq[0]; + + wrq->Next = wrq + 1; + wrq->WorkReqId = kibnal_ptr2wreqid(tx, IBNAL_WID_RDMA); + wrq->Operation = WROpRdmaWrite; + wrq->DSList = gl; + wrq->DSListDepth = 1; + wrq->MessageLen = nob; + + wrq->Req.SendRC.ImmediateData = 0; + wrq->Req.SendRC.Options.s.SolicitedEvent = 0; + wrq->Req.SendRC.Options.s.SignaledCompletion = 0; + wrq->Req.SendRC.Options.s.ImmediateData = 0; + wrq->Req.SendRC.Options.s.Fence = 0; + + wrq->Req.SendRC.RemoteDS.Address = dstrd->rd_addr; + wrq->Req.SendRC.RemoteDS.Rkey = dstrd->rd_key; + + tx->tx_nwrq = 1; + rc = nob; +#else + /* CAVEAT EMPTOR: this 'consumes' the frags in 'dstrd' */ + int resid = nob; + kib_rdma_frag_t *srcfrag; + int srcidx; + kib_rdma_frag_t *dstfrag; + int dstidx; + int wrknob; + + /* Called by scheduler */ + LASSERT (!in_interrupt()); + + LASSERT (type == IBNAL_MSG_GET_DONE || + type == IBNAL_MSG_PUT_DONE); + + srcidx = dstidx = 0; + srcfrag = &srcrd->rd_frags[0]; + dstfrag = &dstrd->rd_frags[0]; + rc = resid; + + while (resid > 0) { + if (srcidx >= srcrd->rd_nfrag) { + CERROR("Src buffer exhausted: %d frags\n", srcidx); + rc = -EPROTO; + break; + } + + if (dstidx == dstrd->rd_nfrag) { + CERROR("Dst buffer exhausted: %d frags\n", dstidx); + rc = -EPROTO; + break; + } + + if (tx->tx_nwrq == IBNAL_MAX_RDMA_FRAGS) { + CERROR("RDMA too fragmented: %d/%d src %d/%d dst frags\n", + srcidx, srcrd->rd_nfrag, + dstidx, dstrd->rd_nfrag); + rc = -EMSGSIZE; + break; + } + + wrknob = MIN(MIN(srcfrag->rf_nob, dstfrag->rf_nob), resid); + + gl = &tx->tx_gl[tx->tx_nwrq]; + gl->Length = wrknob; + gl->Address = srcfrag->rf_addr; + gl->Lkey = srcrd->rd_key; + + wrq = &tx->tx_wrq[tx->tx_nwrq]; + + wrq->Next = wrq + 1; + wrq->WorkReqId = kibnal_ptr2wreqid(tx, IBNAL_WID_RDMA); + wrq->Operation = WROpRdmaWrite; + wrq->DSList = gl; + wrq->DSListDepth = 1; + wrq->MessageLen = nob; + + wrq->Req.SendRC.ImmediateData = 0; + wrq->Req.SendRC.Options.s.SolicitedEvent = 0; + wrq->Req.SendRC.Options.s.SignaledCompletion = 0; + wrq->Req.SendRC.Options.s.ImmediateData = 0; + wrq->Req.SendRC.Options.s.Fence = 0; + + wrq->Req.SendRC.RemoteDS.Address = dstfrag->rf_addr; + wrq->Req.SendRC.RemoteDS.Rkey = dstrd->rd_key; + + resid -= wrknob; + if (wrknob < srcfrag->rf_nob) { + srcfrag->rf_addr += wrknob; + srcfrag->rf_nob -= wrknob; + } else { + srcfrag++; + srcidx++; + } + + if (wrknob < dstfrag->rf_nob) { + dstfrag->rf_addr += wrknob; + dstfrag->rf_nob -= wrknob; + } else { + dstfrag++; + dstidx++; + } + + tx->tx_nwrq++; + } + + if (rc < 0) /* no RDMA if completing with failure */ + tx->tx_nwrq = 0; +#endif + + ibmsg->ibm_u.completion.ibcm_status = rc; + ibmsg->ibm_u.completion.ibcm_cookie = dstcookie; + kibnal_init_tx_msg(tx, type, sizeof (kib_completion_msg_t)); + + return rc; +} + +void +kibnal_queue_tx (kib_tx_t *tx, kib_conn_t *conn) +{ + spin_lock(&conn->ibc_lock); + kibnal_queue_tx_locked (tx, conn); + spin_unlock(&conn->ibc_lock); + kibnal_check_sends(conn); } -static void -kibnal_launch_tx (kib_tx_t *tx, ptl_nid_t nid) +void +kibnal_schedule_active_connect_locked (kib_peer_t *peer, int proto_version) +{ + /* Called holding kib_global_lock exclusive with IRQs disabled */ + + peer->ibp_version = proto_version; /* proto version for new conn */ + 
peer->ibp_connecting++; /* I'm connecting */ + kibnal_peer_addref(peer); /* extra ref for connd */ + + spin_lock(&kibnal_data.kib_connd_lock); + + list_add_tail (&peer->ibp_connd_list, &kibnal_data.kib_connd_peers); + wake_up (&kibnal_data.kib_connd_waitq); + + spin_unlock(&kibnal_data.kib_connd_lock); +} + +void +kibnal_schedule_active_connect (kib_peer_t *peer, int proto_version) +{ + unsigned long flags; + + write_lock_irqsave(&kibnal_data.kib_global_lock, flags); + + kibnal_schedule_active_connect_locked(peer, proto_version); + + write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); +} + +void +kibnal_launch_tx (kib_tx_t *tx, lnet_nid_t nid) { - unsigned long flags; kib_peer_t *peer; kib_conn_t *conn; + unsigned long flags; rwlock_t *g_lock = &kibnal_data.kib_global_lock; + int retry; + int rc; /* If I get here, I've committed to send, so I complete the tx with * failure on any problems */ LASSERT (tx->tx_conn == NULL); /* only set when assigned a conn */ - LASSERT (tx->tx_nsp > 0); /* work items have been set up */ + LASSERT (tx->tx_nwrq > 0); /* work items have been set up */ - read_lock_irqsave(g_lock, flags); + for (retry = 0; ; retry = 1) { + read_lock_irqsave(g_lock, flags); - peer = kibnal_find_peer_locked (nid); - if (peer == NULL) { - read_unlock_irqrestore(g_lock, flags); - tx->tx_status = -EHOSTUNREACH; - kibnal_tx_done (tx); - return; - } - - conn = kibnal_find_conn_locked (peer); - if (conn != NULL) { - CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", - conn, conn->ibc_state, conn->ibc_peer->ibp_nid, - atomic_read (&conn->ibc_refcount)); - atomic_inc (&conn->ibc_refcount); /* 1 ref for the tx */ - read_unlock_irqrestore(g_lock, flags); + peer = kibnal_find_peer_locked (nid); + if (peer != NULL) { + conn = kibnal_find_conn_locked (peer); + if (conn != NULL) { + kibnal_conn_addref(conn); /* 1 ref for me... */ + read_unlock_irqrestore(g_lock, flags); + + kibnal_queue_tx (tx, conn); + kibnal_conn_decref(conn); /* ...to here */ + return; + } + } - kibnal_queue_tx (tx, conn); - return; - } - - /* Making one or more connections; I'll need a write lock... */ - read_unlock(g_lock); - write_lock(g_lock); + /* Making one or more connections; I'll need a write lock... */ + read_unlock(g_lock); + write_lock(g_lock); - peer = kibnal_find_peer_locked (nid); - if (peer == NULL) { - write_unlock_irqrestore (g_lock, flags); - tx->tx_status = -EHOSTUNREACH; - kibnal_tx_done (tx); - return; + peer = kibnal_find_peer_locked (nid); + if (peer != NULL) + break; + + write_unlock_irqrestore(g_lock, flags); + + if (retry) { + CERROR("Can't find peer %s\n", libcfs_nid2str(nid)); + + tx->tx_status = -EHOSTUNREACH; + tx->tx_waiting = 0; + kibnal_tx_done (tx); + return; + } + + rc = kibnal_add_persistent_peer(nid); + if (rc != 0) { + CERROR("Can't add peer %s: %d\n", + libcfs_nid2str(nid), rc); + + tx->tx_status = -EHOSTUNREACH; + tx->tx_waiting = 0; + kibnal_tx_done (tx); + return; + } } conn = kibnal_find_conn_locked (peer); if (conn != NULL) { /* Connection exists; queue message on it */ - CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", - conn, conn->ibc_state, conn->ibc_peer->ibp_nid, - atomic_read (&conn->ibc_refcount)); - atomic_inc (&conn->ibc_refcount); /* 1 ref for the tx */ - write_unlock_irqrestore (g_lock, flags); + kibnal_conn_addref(conn); /* 1 ref for me... 
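+                 *
+                 * (this ref pins 'conn' across the g_lock drop below:
+                 * kibnal_queue_tx() takes ibc_lock and then runs
+                 * kibnal_check_sends(), and without a ref of our own the
+                 * conn could be finalised under us meanwhile; dropped at
+                 * '...until here'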
*/ + write_unlock_irqrestore(g_lock, flags); kibnal_queue_tx (tx, conn); + kibnal_conn_decref(conn); /* ...until here */ return; } - if (peer->ibp_connecting == 0) { - if (!time_after_eq(jiffies, peer->ibp_reconnect_time)) { - write_unlock_irqrestore (g_lock, flags); + if (!kibnal_peer_connecting(peer)) { + if (!(peer->ibp_reconnect_interval == 0 || /* first attempt */ + time_after_eq(jiffies, peer->ibp_reconnect_time))) { + write_unlock_irqrestore(g_lock, flags); tx->tx_status = -EHOSTUNREACH; + tx->tx_waiting = 0; kibnal_tx_done (tx); return; } - - peer->ibp_connecting = 1; - kib_peer_addref(peer); /* extra ref for connd */ - - spin_lock (&kibnal_data.kib_connd_lock); - - list_add_tail (&peer->ibp_connd_list, - &kibnal_data.kib_connd_peers); - wake_up (&kibnal_data.kib_connd_waitq); - - spin_unlock (&kibnal_data.kib_connd_lock); + + kibnal_schedule_active_connect_locked(peer, IBNAL_MSG_VERSION); } /* A connection is being established; queue the message... */ list_add_tail (&tx->tx_list, &peer->ibp_tx_queue); - write_unlock_irqrestore (g_lock, flags); + write_unlock_irqrestore(g_lock, flags); } -static ptl_err_t -kibnal_start_passive_rdma (int type, ptl_nid_t nid, - lib_msg_t *libmsg, ptl_hdr_t *hdr) +void +kibnal_txlist_done (struct list_head *txlist, int status) { - int nob = libmsg->md->length; - kib_tx_t *tx; - kib_msg_t *ibmsg; - int rc; - IB_ACCESS_CONTROL access = {0,}; - - LASSERT (type == IBNAL_MSG_PUT_RDMA || type == IBNAL_MSG_GET_RDMA); - LASSERT (nob > 0); - LASSERT (!in_interrupt()); /* Mapping could block */ - - access.s.MWBindable = 1; - access.s.LocalWrite = 1; - access.s.RdmaRead = 1; - access.s.RdmaWrite = 1; + kib_tx_t *tx; - tx = kibnal_get_idle_tx (1); /* May block; caller is an app thread */ - LASSERT (tx != NULL); + while (!list_empty (txlist)) { + tx = list_entry (txlist->next, kib_tx_t, tx_list); - if ((libmsg->md->options & PTL_MD_KIOV) == 0) - rc = kibnal_map_iov (tx, access, - libmsg->md->md_niov, - libmsg->md->md_iov.iov, - 0, nob, 0); - else - rc = kibnal_map_kiov (tx, access, - libmsg->md->md_niov, - libmsg->md->md_iov.kiov, - 0, nob, 0); - - if (rc != 0) { - CERROR ("Can't map RDMA for "LPX64": %d\n", nid, rc); - goto failed; - } - - if (type == IBNAL_MSG_GET_RDMA) { - /* reply gets finalized when tx completes */ - tx->tx_libmsg[1] = lib_create_reply_msg(&kibnal_lib, - nid, libmsg); - if (tx->tx_libmsg[1] == NULL) { - CERROR ("Can't create reply for GET -> "LPX64"\n", - nid); - rc = -ENOMEM; - goto failed; - } + list_del (&tx->tx_list); + /* complete now */ + tx->tx_waiting = 0; + tx->tx_status = status; + kibnal_tx_done (tx); } - - tx->tx_passive_rdma = 1; +} - ibmsg = tx->tx_msg; +int +kibnal_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg) +{ + lnet_hdr_t *hdr = &lntmsg->msg_hdr; + int type = lntmsg->msg_type; + lnet_process_id_t target = lntmsg->msg_target; + int target_is_router = lntmsg->msg_target_is_router; + int routing = lntmsg->msg_routing; + unsigned int payload_niov = lntmsg->msg_niov; + struct iovec *payload_iov = lntmsg->msg_iov; + lnet_kiov_t *payload_kiov = lntmsg->msg_kiov; + unsigned int payload_offset = lntmsg->msg_offset; + unsigned int payload_nob = lntmsg->msg_len; + kib_msg_t *ibmsg; + kib_tx_t *tx; + int nob; + int rc; - ibmsg->ibm_u.rdma.ibrm_hdr = *hdr; - ibmsg->ibm_u.rdma.ibrm_cookie = tx->tx_passive_rdma_cookie; - /* map_kiov alrady filled the rdma descs for the whole_mem case */ - if (!kibnal_whole_mem()) { - ibmsg->ibm_u.rdma.rd_key = tx->tx_md.md_rkey; - ibmsg->ibm_u.rdma.ibrm_desc[0].rd_addr = tx->tx_md.md_addr; - 
ibmsg->ibm_u.rdma.ibrm_desc[0].rd_nob = nob;
- ibmsg->ibm_u.rdma.ibrm_num_descs = 1;
- }
+ /* NB 'private' is different depending on what we're sending.... */
 
- kibnal_init_tx_msg (tx, type,
- kib_rdma_msg_len(ibmsg->ibm_u.rdma.ibrm_num_descs));
+ CDEBUG(D_NET, "sending %d bytes in %d frags to %s\n",
+ payload_nob, payload_niov, libcfs_id2str(target));
 
- CDEBUG(D_NET, "Passive: %p cookie "LPX64", key %x, addr "
- LPX64", nob %d\n",
- tx, tx->tx_passive_rdma_cookie, tx->tx_md.md_rkey,
- tx->tx_md.md_addr, nob);
-
- /* libmsg gets finalized when tx completes. */
- tx->tx_libmsg[0] = libmsg;
+ LASSERT (payload_nob == 0 || payload_niov > 0);
+ LASSERT (payload_niov <= LNET_MAX_IOV);
 
- kibnal_launch_tx(tx, nid);
- return (PTL_OK);
+ /* Thread context */
+ LASSERT (!in_interrupt());
+ /* payload is either all vaddrs or all pages */
+ LASSERT (!(payload_kiov != NULL && payload_iov != NULL));
 
- failed:
- tx->tx_status = rc;
- kibnal_tx_done (tx);
- return (PTL_FAIL);
-}
+ switch (type) {
+ default:
+ LBUG();
+ return (-EIO);
+
+ case LNET_MSG_ACK:
+ LASSERT (payload_nob == 0);
+ break;
 
-void
-kibnal_start_active_rdma (int type, int status,
- kib_rx_t *rx, lib_msg_t *libmsg,
- unsigned int niov,
- struct iovec *iov, ptl_kiov_t *kiov,
- size_t offset, size_t nob)
-{
- kib_msg_t *rxmsg = rx->rx_msg;
- kib_msg_t *txmsg;
- kib_tx_t *tx;
- IB_ACCESS_CONTROL access = {0,};
- IB_WR_OP rdma_op;
- int rc;
- __u32 i;
+ case LNET_MSG_GET:
+ if (routing || target_is_router)
+ break; /* send IMMEDIATE */
+
+ /* is the REPLY message too small for RDMA? */
+ nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[lntmsg->msg_md->md_length]);
+ if (nob <= IBNAL_MSG_SIZE)
+ break; /* send IMMEDIATE */
+
+ tx = kibnal_get_idle_tx();
+ if (tx == NULL) {
+ CERROR("Can't allocate txd for GET to %s\n",
+ libcfs_nid2str(target.nid));
+ return -ENOMEM;
+ }
+
+ ibmsg = tx->tx_msg;
+ ibmsg->ibm_u.get.ibgm_hdr = *hdr;
+ ibmsg->ibm_u.get.ibgm_cookie = tx->tx_cookie;
+
+ if ((lntmsg->msg_md->md_options & LNET_MD_KIOV) == 0)
+ rc = kibnal_setup_rd_iov(tx, &ibmsg->ibm_u.get.ibgm_rd,
+ 0,
+ lntmsg->msg_md->md_niov,
+ lntmsg->msg_md->md_iov.iov,
+ 0, lntmsg->msg_md->md_length);
+ else
+ rc = kibnal_setup_rd_kiov(tx, &ibmsg->ibm_u.get.ibgm_rd,
+ 0,
+ lntmsg->msg_md->md_niov,
+ lntmsg->msg_md->md_iov.kiov,
+ 0, lntmsg->msg_md->md_length);
+ if (rc != 0) {
+ CERROR("Can't setup GET sink for %s: %d\n",
+ libcfs_nid2str(target.nid), rc);
+ kibnal_tx_done(tx);
+ return -EIO;
+ }
 
- CDEBUG(D_NET, "type %d, status %d, niov %d, offset %d, nob %d\n",
- type, status, niov, offset, nob);
+#if IBNAL_USE_FMR
+ nob = sizeof(kib_get_msg_t);
+#else
+ {
+ int n = ibmsg->ibm_u.get.ibgm_rd.rd_nfrag;
+
+ nob = offsetof(kib_get_msg_t, ibgm_rd.rd_frags[n]);
+ }
+#endif
+ kibnal_init_tx_msg(tx, IBNAL_MSG_GET_REQ, nob);
 
- /* Called by scheduler */
- LASSERT (!in_interrupt ());
+ tx->tx_lntmsg[1] = lnet_create_reply_msg(kibnal_data.kib_ni,
+ lntmsg);
+ if (tx->tx_lntmsg[1] == NULL) {
+ CERROR("Can't create reply for GET -> %s\n",
+ libcfs_nid2str(target.nid));
+ kibnal_tx_done(tx);
+ return -EIO;
+ }
 
- /* Either all pages or all vaddrs */
- LASSERT (!(kiov != NULL && iov != NULL));
+ tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg[0,1] on completion */
+ tx->tx_waiting = 1; /* waiting for GET_DONE */
+ kibnal_launch_tx(tx, target.nid);
+ return 0;
 
- /* No data if we're completing with failure */
- LASSERT (status == 0 || nob == 0);
+ case LNET_MSG_REPLY:
+ case LNET_MSG_PUT:
+ /* Is the payload small enough not to need RDMA?
*/ + nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]); + if (nob <= IBNAL_MSG_SIZE) + break; /* send IMMEDIATE */ + + tx = kibnal_get_idle_tx(); + if (tx == NULL) { + CERROR("Can't allocate %s txd for %s\n", + type == LNET_MSG_PUT ? "PUT" : "REPLY", + libcfs_nid2str(target.nid)); + return -ENOMEM; + } - LASSERT (type == IBNAL_MSG_GET_DONE || - type == IBNAL_MSG_PUT_DONE); + if (payload_kiov == NULL) + rc = kibnal_setup_rd_iov(tx, tx->tx_rd, 1, + payload_niov, payload_iov, + payload_offset, payload_nob); + else + rc = kibnal_setup_rd_kiov(tx, tx->tx_rd, 1, + payload_niov, payload_kiov, + payload_offset, payload_nob); + if (rc != 0) { + CERROR("Can't setup PUT src for %s: %d\n", + libcfs_nid2str(target.nid), rc); + kibnal_tx_done(tx); + return -EIO; + } - /* Flag I'm completing the RDMA. Even if I fail to send the - * completion message, I will have tried my best so further - * attempts shouldn't be tried. */ - LASSERT (!rx->rx_rdma); - rx->rx_rdma = 1; + ibmsg = tx->tx_msg; + ibmsg->ibm_u.putreq.ibprm_hdr = *hdr; + ibmsg->ibm_u.putreq.ibprm_cookie = tx->tx_cookie; + kibnal_init_tx_msg(tx, IBNAL_MSG_PUT_REQ, sizeof(kib_putreq_msg_t)); - if (type == IBNAL_MSG_GET_DONE) { - rdma_op = WROpRdmaWrite; - LASSERT (rxmsg->ibm_type == IBNAL_MSG_GET_RDMA); - } else { - access.s.LocalWrite = 1; - rdma_op = WROpRdmaRead; - LASSERT (rxmsg->ibm_type == IBNAL_MSG_PUT_RDMA); + tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg on completion */ + tx->tx_waiting = 1; /* waiting for PUT_{ACK,NAK} */ + kibnal_launch_tx(tx, target.nid); + return 0; } - tx = kibnal_get_idle_tx (0); /* Mustn't block */ + /* send IMMEDIATE */ + + LASSERT (offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]) + <= IBNAL_MSG_SIZE); + + tx = kibnal_get_idle_tx(); if (tx == NULL) { - CERROR ("tx descs exhausted on RDMA from "LPX64 - " completing locally with failure\n", - rx->rx_conn->ibc_peer->ibp_nid); - lib_finalize (&kibnal_lib, NULL, libmsg, PTL_NO_SPACE); - return; + CERROR ("Can't send %d to %s: tx descs exhausted\n", + type, libcfs_nid2str(target.nid)); + return -ENOMEM; } - LASSERT (tx->tx_nsp == 0); - - if (nob == 0) - GOTO(init_tx, 0); - - /* We actually need to transfer some data (the transfer - * size could get truncated to zero when the incoming - * message is matched) */ - if (kiov != NULL) - rc = kibnal_map_kiov (tx, access, niov, kiov, offset, nob, 1); + + ibmsg = tx->tx_msg; + ibmsg->ibm_u.immediate.ibim_hdr = *hdr; + + if (payload_kiov != NULL) + lnet_copy_kiov2flat(IBNAL_MSG_SIZE, ibmsg, + offsetof(kib_msg_t, ibm_u.immediate.ibim_payload), + payload_niov, payload_kiov, + payload_offset, payload_nob); else - rc = kibnal_map_iov (tx, access, niov, iov, offset, nob, 1); - - if (rc != 0) { - CERROR ("Can't map RDMA -> "LPX64": %d\n", - rx->rx_conn->ibc_peer->ibp_nid, rc); - /* We'll skip the RDMA and complete with failure. */ - status = rc; - nob = 0; - GOTO(init_tx, rc); - } - - if (!kibnal_whole_mem()) { - tx->tx_msg->ibm_u.rdma.rd_key = tx->tx_md.md_lkey; - tx->tx_msg->ibm_u.rdma.ibrm_desc[0].rd_addr = tx->tx_md.md_addr; - tx->tx_msg->ibm_u.rdma.ibrm_desc[0].rd_nob = nob; - tx->tx_msg->ibm_u.rdma.ibrm_num_descs = 1; - } - - /* XXX ugh. different page-sized hosts. */ - if (tx->tx_msg->ibm_u.rdma.ibrm_num_descs != - rxmsg->ibm_u.rdma.ibrm_num_descs) { - CERROR("tx descs (%u) != rx descs (%u)\n", - tx->tx_msg->ibm_u.rdma.ibrm_num_descs, - rxmsg->ibm_u.rdma.ibrm_num_descs); - /* We'll skip the RDMA and complete with failure. 
*/ - status = rc; - nob = 0; - GOTO(init_tx, rc); - } - - /* map_kiov filled in the rdma descs which describe our side of the - * rdma transfer. */ - /* ibrm_num_descs was verified in rx_callback */ - for(i = 0; i < rxmsg->ibm_u.rdma.ibrm_num_descs; i++) { - kib_rdma_desc_t *ldesc, *rdesc; /* local, remote */ - IB_LOCAL_DATASEGMENT *ds = &tx->tx_gl[i]; - IB_WORK_REQ *wrq = &tx->tx_wrq[i]; - - ldesc = &tx->tx_msg->ibm_u.rdma.ibrm_desc[i]; - rdesc = &rxmsg->ibm_u.rdma.ibrm_desc[i]; - - ds->Address = ldesc->rd_addr; - ds->Length = ldesc->rd_nob; - ds->Lkey = tx->tx_msg->ibm_u.rdma.rd_key; - - memset(wrq, 0, sizeof(*wrq)); - wrq->WorkReqId = kibnal_ptr2wreqid(tx, 0); - wrq->Operation = rdma_op; - wrq->DSList = ds; - wrq->DSListDepth = 1; - wrq->MessageLen = ds->Length; - wrq->Req.SendRC.ImmediateData = 0; - wrq->Req.SendRC.Options.s.SolicitedEvent = 0; - wrq->Req.SendRC.Options.s.SignaledCompletion = 0; - wrq->Req.SendRC.Options.s.ImmediateData = 0; - wrq->Req.SendRC.Options.s.Fence = 0; - wrq->Req.SendRC.RemoteDS.Address = rdesc->rd_addr; - wrq->Req.SendRC.RemoteDS.Rkey = rxmsg->ibm_u.rdma.rd_key; + lnet_copy_iov2flat(IBNAL_MSG_SIZE, ibmsg, + offsetof(kib_msg_t, ibm_u.immediate.ibim_payload), + payload_niov, payload_iov, + payload_offset, payload_nob); - /* only the last rdma post triggers tx completion */ - if (i == rxmsg->ibm_u.rdma.ibrm_num_descs - 1) - wrq->Req.SendRC.Options.s.SignaledCompletion = 1; + nob = offsetof(kib_immediate_msg_t, ibim_payload[payload_nob]); + kibnal_init_tx_msg (tx, IBNAL_MSG_IMMEDIATE, nob); - tx->tx_nsp++; + tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg on completion */ + kibnal_launch_tx(tx, target.nid); + return 0; +} + +void +kibnal_reply(lnet_ni_t *ni, kib_rx_t *rx, lnet_msg_t *lntmsg) +{ + lnet_process_id_t target = lntmsg->msg_target; + unsigned int niov = lntmsg->msg_niov; + struct iovec *iov = lntmsg->msg_iov; + lnet_kiov_t *kiov = lntmsg->msg_kiov; + unsigned int offset = lntmsg->msg_offset; + unsigned int nob = lntmsg->msg_len; + kib_tx_t *tx; + int rc; + + tx = kibnal_get_idle_tx(); + if (tx == NULL) { + CERROR("Can't get tx for REPLY to %s\n", + libcfs_nid2str(target.nid)); + goto failed_0; } -init_tx: - txmsg = tx->tx_msg; + if (nob == 0) + rc = 0; + else if (kiov == NULL) + rc = kibnal_setup_rd_iov(tx, tx->tx_rd, 1, + niov, iov, offset, nob); + else + rc = kibnal_setup_rd_kiov(tx, tx->tx_rd, 1, + niov, kiov, offset, nob); - txmsg->ibm_u.completion.ibcm_cookie = rxmsg->ibm_u.rdma.ibrm_cookie; - txmsg->ibm_u.completion.ibcm_status = status; + if (rc != 0) { + CERROR("Can't setup GET src for %s: %d\n", + libcfs_nid2str(target.nid), rc); + goto failed_1; + } - kibnal_init_tx_msg(tx, type, sizeof (kib_completion_msg_t)); - - if (status == 0 && nob != 0) { - LASSERT (tx->tx_nsp > 1); - /* RDMA: libmsg gets finalized when the tx completes. This - * is after the completion message has been sent, which in - * turn is after the RDMA has finished. */ - tx->tx_libmsg[0] = libmsg; + rc = kibnal_init_rdma(tx, IBNAL_MSG_GET_DONE, nob, + &rx->rx_msg->ibm_u.get.ibgm_rd, + rx->rx_msg->ibm_u.get.ibgm_cookie); + if (rc < 0) { + CERROR("Can't setup rdma for GET from %s: %d\n", + libcfs_nid2str(target.nid), rc); + goto failed_1; + } + + if (rc == 0) { + /* No RDMA: local completion may happen now! */ + lnet_finalize(ni, lntmsg, 0); } else { - LASSERT (tx->tx_nsp == 1); - /* No RDMA: local completion happens now! */ - CWARN("No data: immediate completion\n"); - lib_finalize (&kibnal_lib, NULL, libmsg, - status == 0 ? PTL_OK : PTL_FAIL); - } - - /* +1 ref for this tx... 
*/
- CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
- rx->rx_conn, rx->rx_conn->ibc_state,
- rx->rx_conn->ibc_peer->ibp_nid,
- atomic_read (&rx->rx_conn->ibc_refcount));
- atomic_inc (&rx->rx_conn->ibc_refcount);
- /* ...and queue it up */
+ /* RDMA: lnet_finalize(lntmsg) when it
+ * completes */
+ tx->tx_lntmsg[0] = lntmsg;
+ }
 kibnal_queue_tx(tx, rx->rx_conn);
+ return;
+
+ failed_1:
+ kibnal_tx_done(tx);
+ failed_0:
+ lnet_finalize(ni, lntmsg, -EIO);
 }
 
-static ptl_err_t
-kibnal_sendmsg(lib_nal_t *nal,
- void *private,
- lib_msg_t *libmsg,
- ptl_hdr_t *hdr,
- int type,
- ptl_nid_t nid,
- ptl_pid_t pid,
- unsigned int payload_niov,
- struct iovec *payload_iov,
- ptl_kiov_t *payload_kiov,
- size_t payload_offset,
- size_t payload_nob)
-{
- kib_msg_t *ibmsg;
- kib_tx_t *tx;
- int nob;
-
- /* NB 'private' is different depending on what we're sending.... */
-
- CDEBUG(D_NET, "sending "LPSZ" bytes in %d frags to nid:"LPX64
- " pid %d\n", payload_nob, payload_niov, nid , pid);
+int
+kibnal_eager_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg,
+ void **new_private)
+{
+ kib_rx_t *rx = private;
+ kib_conn_t *conn = rx->rx_conn;
 
- LASSERT (payload_nob == 0 || payload_niov > 0);
- LASSERT (payload_niov <= PTL_MD_MAX_IOV);
+ if (conn->ibc_version == IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD) {
+ /* Can't block if RDMA completions need normal credits */
+ LCONSOLE_ERROR("Dropping message from %s: no buffers free. "
+ "(%s is running an old version of LNET that may "
+ "deadlock if messages wait for buffers)\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid),
+ libcfs_nid2str(conn->ibc_peer->ibp_nid));
+ return -EDEADLK;
+ }
+
+ *new_private = private;
+ return 0;
+}
 
- /* Thread context if we're sending payload */
- LASSERT (!in_interrupt() || payload_niov == 0);
- /* payload is either all vaddrs or all pages */
- LASSERT (!(payload_kiov != NULL && payload_iov != NULL));
+int
+kibnal_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, int delayed,
+ unsigned int niov, struct iovec *iov, lnet_kiov_t *kiov,
+ unsigned int offset, unsigned int mlen, unsigned int rlen)
+{
+ kib_rx_t *rx = private;
+ kib_msg_t *rxmsg = rx->rx_msg;
+ kib_conn_t *conn = rx->rx_conn;
+ kib_tx_t *tx;
+ kib_msg_t *txmsg;
+ int nob;
+ int post_cred = 1;
+ int rc = 0;
+
+ LASSERT (mlen <= rlen);
+ LASSERT (!in_interrupt());
+ /* Either all pages or all vaddrs */
+ LASSERT (!(kiov != NULL && iov != NULL));
 
- switch (type) {
+ switch (rxmsg->ibm_type) {
 default:
 LBUG();
- return (PTL_FAIL);
- case PTL_MSG_REPLY: {
- /* reply's 'private' is the incoming receive */
- kib_rx_t *rx = private;
-
- /* RDMA reply expected?
*/ - if (rx->rx_msg->ibm_type == IBNAL_MSG_GET_RDMA) { - kibnal_start_active_rdma(IBNAL_MSG_GET_DONE, 0, - rx, libmsg, payload_niov, - payload_iov, payload_kiov, - payload_offset, payload_nob); - return (PTL_OK); + case IBNAL_MSG_IMMEDIATE: + nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[rlen]); + if (nob > rx->rx_nob) { + CERROR ("Immediate message from %s too big: %d(%d)\n", + libcfs_nid2str(rxmsg->ibm_u.immediate.ibim_hdr.src_nid), + nob, rx->rx_nob); + rc = -EPROTO; + break; + } + + if (kiov != NULL) + lnet_copy_flat2kiov(niov, kiov, offset, + IBNAL_MSG_SIZE, rxmsg, + offsetof(kib_msg_t, ibm_u.immediate.ibim_payload), + mlen); + else + lnet_copy_flat2iov(niov, iov, offset, + IBNAL_MSG_SIZE, rxmsg, + offsetof(kib_msg_t, ibm_u.immediate.ibim_payload), + mlen); + lnet_finalize (ni, lntmsg, 0); + break; + + case IBNAL_MSG_PUT_REQ: + if (mlen == 0) { + lnet_finalize(ni, lntmsg, 0); + kibnal_send_completion(rx->rx_conn, IBNAL_MSG_PUT_NAK, 0, + rxmsg->ibm_u.putreq.ibprm_cookie); + break; } - /* Incoming message consistent with immediate reply? */ - if (rx->rx_msg->ibm_type != IBNAL_MSG_IMMEDIATE) { - CERROR ("REPLY to "LPX64" bad opbm type %d!!!\n", - nid, rx->rx_msg->ibm_type); - return (PTL_FAIL); + tx = kibnal_get_idle_tx(); + if (tx == NULL) { + CERROR("Can't allocate tx for %s\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + /* Not replying will break the connection */ + rc = -ENOMEM; + break; } - /* Will it fit in a message? */ - nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]); - if (nob >= IBNAL_MSG_SIZE) { - CERROR("REPLY for "LPX64" too big (RDMA not requested): %d\n", - nid, payload_nob); - return (PTL_FAIL); + txmsg = tx->tx_msg; + if (kiov == NULL) + rc = kibnal_setup_rd_iov(tx, + &txmsg->ibm_u.putack.ibpam_rd, + 0, + niov, iov, offset, mlen); + else + rc = kibnal_setup_rd_kiov(tx, + &txmsg->ibm_u.putack.ibpam_rd, + 0, + niov, kiov, offset, mlen); + if (rc != 0) { + CERROR("Can't setup PUT sink for %s: %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), rc); + kibnal_tx_done(tx); + /* tell peer it's over */ + kibnal_send_completion(rx->rx_conn, IBNAL_MSG_PUT_NAK, rc, + rxmsg->ibm_u.putreq.ibprm_cookie); + break; } - break; - } - case PTL_MSG_GET: - /* might the REPLY message be big enough to need RDMA? */ - nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[libmsg->md->length]); - if (nob > IBNAL_MSG_SIZE) - return (kibnal_start_passive_rdma(IBNAL_MSG_GET_RDMA, - nid, libmsg, hdr)); - break; + txmsg->ibm_u.putack.ibpam_src_cookie = rxmsg->ibm_u.putreq.ibprm_cookie; + txmsg->ibm_u.putack.ibpam_dst_cookie = tx->tx_cookie; +#if IBNAL_USE_FMR + nob = sizeof(kib_putack_msg_t); +#else + { + int n = tx->tx_msg->ibm_u.putack.ibpam_rd.rd_nfrag; - case PTL_MSG_ACK: - LASSERT (payload_nob == 0); + nob = offsetof(kib_putack_msg_t, ibpam_rd.rd_frags[n]); + } +#endif + kibnal_init_tx_msg(tx, IBNAL_MSG_PUT_ACK, nob); + + tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg on completion */ + tx->tx_waiting = 1; /* waiting for PUT_DONE */ + kibnal_queue_tx(tx, conn); + + if (conn->ibc_version != IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD) + post_cred = 0; /* peer still owns 'rx' for sending PUT_DONE */ break; - case PTL_MSG_PUT: - /* Is the payload big enough to need RDMA? 
*/ - nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]); - if (nob > IBNAL_MSG_SIZE) - return (kibnal_start_passive_rdma(IBNAL_MSG_PUT_RDMA, - nid, libmsg, hdr)); - + case IBNAL_MSG_GET_REQ: + if (lntmsg != NULL) { + /* Optimized GET; RDMA lntmsg's payload */ + kibnal_reply(ni, rx, lntmsg); + } else { + /* GET didn't match anything */ + kibnal_send_completion(rx->rx_conn, IBNAL_MSG_GET_DONE, + -ENODATA, + rxmsg->ibm_u.get.ibgm_cookie); + } break; } - tx = kibnal_get_idle_tx(!(type == PTL_MSG_ACK || - type == PTL_MSG_REPLY || - in_interrupt())); - if (tx == NULL) { - CERROR ("Can't send %d to "LPX64": tx descs exhausted%s\n", - type, nid, in_interrupt() ? " (intr)" : ""); - return (PTL_NO_SPACE); + kibnal_post_rx(rx, post_cred, 0); + return rc; +} + +int +kibnal_thread_start (int (*fn)(void *arg), void *arg) +{ + long pid = kernel_thread (fn, arg, 0); + + if (pid < 0) + return ((int)pid); + + atomic_inc (&kibnal_data.kib_nthreads); + return (0); +} + +void +kibnal_thread_fini (void) +{ + atomic_dec (&kibnal_data.kib_nthreads); +} + +void +kibnal_peer_alive (kib_peer_t *peer) +{ + /* This is racy, but everyone's only writing cfs_time_current() */ + peer->ibp_last_alive = cfs_time_current(); + mb(); +} + +void +kibnal_peer_notify (kib_peer_t *peer) +{ + time_t last_alive = 0; + int error = 0; + unsigned long flags; + + read_lock_irqsave(&kibnal_data.kib_global_lock, flags); + + if (list_empty(&peer->ibp_conns) && + peer->ibp_accepting == 0 && + peer->ibp_connecting == 0 && + peer->ibp_error != 0) { + error = peer->ibp_error; + peer->ibp_error = 0; + last_alive = cfs_time_current_sec() - + cfs_duration_sec(cfs_time_current() - + peer->ibp_last_alive); + } + + read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); + + if (error != 0) + lnet_notify(kibnal_data.kib_ni, peer->ibp_nid, 0, last_alive); +} + +void +kibnal_schedule_conn (kib_conn_t *conn) +{ + unsigned long flags; + + kibnal_conn_addref(conn); /* ++ref for connd */ + + spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags); + + list_add_tail (&conn->ibc_list, &kibnal_data.kib_connd_conns); + wake_up (&kibnal_data.kib_connd_waitq); + + spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags); +} + +void +kibnal_close_conn_locked (kib_conn_t *conn, int error) +{ + /* This just does the immediate housekeeping to start shutdown of an + * established connection. 'error' is zero for a normal shutdown. + * Caller holds kib_global_lock exclusively in irq context */ + kib_peer_t *peer = conn->ibc_peer; + + LASSERT (conn->ibc_state >= IBNAL_CONN_ESTABLISHED); + + if (conn->ibc_state != IBNAL_CONN_ESTABLISHED) + return; /* already being handled */ + + /* NB Can't take ibc_lock here (could be in IRQ context), without + * risking deadlock, so access to ibc_{tx_queue,active_txs} is racey */ + + if (error == 0 && + list_empty(&conn->ibc_tx_queue) && + list_empty(&conn->ibc_tx_queue_rsrvd) && + list_empty(&conn->ibc_tx_queue_nocred) && + list_empty(&conn->ibc_active_txs)) { + CDEBUG(D_NET, "closing conn to %s" + " rx# "LPD64" tx# "LPD64"\n", + libcfs_nid2str(peer->ibp_nid), + conn->ibc_txseq, conn->ibc_rxseq); + } else { + CDEBUG(D_NETERROR, "Closing conn to %s: error %d%s%s%s%s" + " rx# "LPD64" tx# "LPD64"\n", + libcfs_nid2str(peer->ibp_nid), error, + list_empty(&conn->ibc_tx_queue) ? "" : "(sending)", + list_empty(&conn->ibc_tx_queue_rsrvd) ? "" : "(sending_rsrvd)", + list_empty(&conn->ibc_tx_queue_nocred) ? "" : "(sending_nocred)", + list_empty(&conn->ibc_active_txs) ? 
"" : "(waiting)", + conn->ibc_txseq, conn->ibc_rxseq); +#if 0 + /* can't skip down the queue without holding ibc_lock (see above) */ + list_for_each(tmp, &conn->ibc_tx_queue) { + kib_tx_t *tx = list_entry(tmp, kib_tx_t, tx_list); + + CERROR(" queued tx type %x cookie "LPX64 + " sending %d waiting %d ticks %ld/%d\n", + tx->tx_msg->ibm_type, tx->tx_cookie, + tx->tx_sending, tx->tx_waiting, + (long)(tx->tx_deadline - jiffies), HZ); + } + + list_for_each(tmp, &conn->ibc_active_txs) { + kib_tx_t *tx = list_entry(tmp, kib_tx_t, tx_list); + + CERROR(" active tx type %x cookie "LPX64 + " sending %d waiting %d ticks %ld/%d\n", + tx->tx_msg->ibm_type, tx->tx_cookie, + tx->tx_sending, tx->tx_waiting, + (long)(tx->tx_deadline - jiffies), HZ); + } +#endif } - ibmsg = tx->tx_msg; - ibmsg->ibm_u.immediate.ibim_hdr = *hdr; + list_del (&conn->ibc_list); - if (payload_nob > 0) { - if (payload_kiov != NULL) - lib_copy_kiov2buf(ibmsg->ibm_u.immediate.ibim_payload, - payload_niov, payload_kiov, - payload_offset, payload_nob); - else - lib_copy_iov2buf(ibmsg->ibm_u.immediate.ibim_payload, - payload_niov, payload_iov, - payload_offset, payload_nob); - } + if (list_empty (&peer->ibp_conns)) { /* no more conns */ + if (peer->ibp_persistence == 0 && /* non-persistent peer */ + kibnal_peer_active(peer)) /* still in peer table */ + kibnal_unlink_peer_locked (peer); - kibnal_init_tx_msg (tx, IBNAL_MSG_IMMEDIATE, - offsetof(kib_immediate_msg_t, - ibim_payload[payload_nob])); + peer->ibp_error = error; /* set/clear error on last conn */ + } - /* libmsg gets finalized when tx completes */ - tx->tx_libmsg[0] = libmsg; + kibnal_set_conn_state(conn, IBNAL_CONN_DISCONNECTING); - kibnal_launch_tx(tx, nid); - return (PTL_OK); + kibnal_schedule_conn(conn); + kibnal_conn_decref(conn); /* lose ibc_list's ref */ } -static ptl_err_t -kibnal_send (lib_nal_t *nal, void *private, lib_msg_t *cookie, - ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, - unsigned int payload_niov, struct iovec *payload_iov, - size_t payload_offset, size_t payload_len) +void +kibnal_close_conn (kib_conn_t *conn, int error) { - return (kibnal_sendmsg(nal, private, cookie, - hdr, type, nid, pid, - payload_niov, payload_iov, NULL, - payload_offset, payload_len)); -} + unsigned long flags; + + write_lock_irqsave(&kibnal_data.kib_global_lock, flags); -static ptl_err_t -kibnal_send_pages (lib_nal_t *nal, void *private, lib_msg_t *cookie, - ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, - unsigned int payload_niov, ptl_kiov_t *payload_kiov, - size_t payload_offset, size_t payload_len) -{ - return (kibnal_sendmsg(nal, private, cookie, - hdr, type, nid, pid, - payload_niov, NULL, payload_kiov, - payload_offset, payload_len)); + kibnal_close_conn_locked (conn, error); + + write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); } -static ptl_err_t -kibnal_recvmsg (lib_nal_t *nal, void *private, lib_msg_t *libmsg, - unsigned int niov, struct iovec *iov, ptl_kiov_t *kiov, - size_t offset, size_t mlen, size_t rlen) +void +kibnal_handle_early_rxs(kib_conn_t *conn) { - kib_rx_t *rx = private; - kib_msg_t *rxmsg = rx->rx_msg; - int msg_nob; - - LASSERT (mlen <= rlen); - LASSERT (!in_interrupt ()); - /* Either all pages or all vaddrs */ - LASSERT (!(kiov != NULL && iov != NULL)); + unsigned long flags; + kib_rx_t *rx; - switch (rxmsg->ibm_type) { - default: - LBUG(); - return (PTL_FAIL); + LASSERT (!in_interrupt()); + LASSERT (conn->ibc_state >= IBNAL_CONN_ESTABLISHED); + + write_lock_irqsave(&kibnal_data.kib_global_lock, flags); + while 
(!list_empty(&conn->ibc_early_rxs)) { + rx = list_entry(conn->ibc_early_rxs.next, + kib_rx_t, rx_list); + list_del(&rx->rx_list); + write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); - case IBNAL_MSG_IMMEDIATE: - msg_nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[rlen]); - if (msg_nob > IBNAL_MSG_SIZE) { - CERROR ("Immediate message from "LPX64" too big: %d\n", - rxmsg->ibm_u.immediate.ibim_hdr.src_nid, rlen); - return (PTL_FAIL); - } + kibnal_handle_rx(rx); + + write_lock_irqsave(&kibnal_data.kib_global_lock, flags); + } + write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); +} - if (kiov != NULL) - lib_copy_buf2kiov(niov, kiov, offset, - rxmsg->ibm_u.immediate.ibim_payload, - mlen); - else - lib_copy_buf2iov(niov, iov, offset, - rxmsg->ibm_u.immediate.ibim_payload, - mlen); +void +kibnal_abort_txs(kib_conn_t *conn, struct list_head *txs) +{ + LIST_HEAD (zombies); + struct list_head *tmp; + struct list_head *nxt; + kib_tx_t *tx; - lib_finalize (nal, NULL, libmsg, PTL_OK); - return (PTL_OK); + spin_lock(&conn->ibc_lock); - case IBNAL_MSG_GET_RDMA: - /* We get called here just to discard any junk after the - * GET hdr. */ - LASSERT (libmsg == NULL); - lib_finalize (nal, NULL, libmsg, PTL_OK); - return (PTL_OK); + list_for_each_safe (tmp, nxt, txs) { + tx = list_entry (tmp, kib_tx_t, tx_list); - case IBNAL_MSG_PUT_RDMA: - kibnal_start_active_rdma (IBNAL_MSG_PUT_DONE, 0, - rx, libmsg, - niov, iov, kiov, offset, mlen); - return (PTL_OK); + if (txs == &conn->ibc_active_txs) { + LASSERT (!tx->tx_queued); + LASSERT (tx->tx_waiting || tx->tx_sending != 0); + } else { + LASSERT (tx->tx_queued); + } + + tx->tx_status = -ECONNABORTED; + tx->tx_queued = 0; + tx->tx_waiting = 0; + + if (tx->tx_sending == 0) { + list_del (&tx->tx_list); + list_add (&tx->tx_list, &zombies); + } } -} -static ptl_err_t -kibnal_recv (lib_nal_t *nal, void *private, lib_msg_t *msg, - unsigned int niov, struct iovec *iov, - size_t offset, size_t mlen, size_t rlen) -{ - return (kibnal_recvmsg (nal, private, msg, niov, iov, NULL, - offset, mlen, rlen)); + spin_unlock(&conn->ibc_lock); + + kibnal_txlist_done(&zombies, -ECONNABORTED); } -static ptl_err_t -kibnal_recv_pages (lib_nal_t *nal, void *private, lib_msg_t *msg, - unsigned int niov, ptl_kiov_t *kiov, - size_t offset, size_t mlen, size_t rlen) +void +kibnal_conn_disconnected(kib_conn_t *conn) { - return (kibnal_recvmsg (nal, private, msg, niov, NULL, kiov, - offset, mlen, rlen)); -} + static IB_QP_ATTRIBUTES_MODIFY qpam = {.RequestState = QPStateError}; -/***************************************************************************** - * the rest of this file concerns connection management. active connetions - * start with connect_peer, passive connections start with passive_callback. - * active disconnects start with conn_close, cm_callback starts passive - * disconnects and contains the guts of how the disconnect state machine - * progresses. 
- *****************************************************************************/
+ FSTATUS frc;
 
-int
-kibnal_thread_start (int (*fn)(void *arg), void *arg)
-{
- long pid = kernel_thread (fn, arg, 0);
+ LASSERT (conn->ibc_state >= IBNAL_CONN_INIT_QP);
 
- if (pid < 0)
- return ((int)pid);
+ kibnal_set_conn_state(conn, IBNAL_CONN_DISCONNECTED);
 
- atomic_inc (&kibnal_data.kib_nthreads);
- return (0);
-}
+ /* move QP to error state to make posted work items complete */
+ frc = iba_modify_qp(conn->ibc_qp, &qpam, NULL);
+ if (frc != FSUCCESS)
+ CERROR("can't move qp state to error: %d\n", frc);
 
-static void
-kibnal_thread_fini (void)
-{
- atomic_dec (&kibnal_data.kib_nthreads);
+ /* Complete all tx descs not waiting for sends to complete.
+ * NB we should be safe from RDMA now that the QP has changed state */
+
+ kibnal_abort_txs(conn, &conn->ibc_tx_queue);
+ kibnal_abort_txs(conn, &conn->ibc_tx_queue_rsrvd);
+ kibnal_abort_txs(conn, &conn->ibc_tx_queue_nocred);
+ kibnal_abort_txs(conn, &conn->ibc_active_txs);
+
+ kibnal_handle_early_rxs(conn);
 }
 
-/* this can be called by anyone at any time to close a connection. if
- * the connection is still established it heads to the connd to start
- * the disconnection in a safe context. It has no effect if called
- * on a connection that is already disconnecting */
 void
-kibnal_close_conn_locked (kib_conn_t *conn, int error)
+kibnal_peer_connect_failed (kib_peer_t *peer, int type, int error)
 {
- /* This just does the immmediate housekeeping, and schedules the
- * connection for the connd to finish off.
- * Caller holds kib_global_lock exclusively in irq context */
- kib_peer_t *peer = conn->ibc_peer;
+ LIST_HEAD (zombies);
+ unsigned long flags;
 
- KIB_ASSERT_CONN_STATE_RANGE(conn, IBNAL_CONN_CONNECTING,
- IBNAL_CONN_DISCONNECTED);
+ LASSERT (error != 0);
+ LASSERT (!in_interrupt());
 
- if (conn->ibc_state > IBNAL_CONN_ESTABLISHED)
- return; /* already disconnecting */
+ write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
 
- CDEBUG (error == 0 ?
D_NET : D_ERROR, - "closing conn to "LPX64": error %d\n", peer->ibp_nid, error); + LASSERT (kibnal_peer_connecting(peer)); - if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) { - /* kib_connd_conns takes ibc_list's ref */ - list_del (&conn->ibc_list); - } else { - /* new ref for kib_connd_conns */ - CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", - conn, conn->ibc_state, conn->ibc_peer->ibp_nid, - atomic_read (&conn->ibc_refcount)); - atomic_inc (&conn->ibc_refcount); + switch (type) { + case IBNAL_CONN_ACTIVE: + LASSERT (peer->ibp_connecting > 0); + peer->ibp_connecting--; + break; + + case IBNAL_CONN_PASSIVE: + LASSERT (peer->ibp_accepting > 0); + peer->ibp_accepting--; + break; + + case IBNAL_CONN_WAITING: + /* Can't assert; I might be racing with a successful connection + * which clears passivewait */ + peer->ibp_passivewait = 0; + break; + default: + LBUG(); + } + + if (kibnal_peer_connecting(peer) || /* another attempt underway */ + !list_empty(&peer->ibp_conns)) { /* got connected */ + write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); + return; } + + /* Say when active connection can be re-attempted */ + peer->ibp_reconnect_interval *= 2; + peer->ibp_reconnect_interval = + MAX(peer->ibp_reconnect_interval, + *kibnal_tunables.kib_min_reconnect_interval); + peer->ibp_reconnect_interval = + MIN(peer->ibp_reconnect_interval, + *kibnal_tunables.kib_max_reconnect_interval); - if (list_empty (&peer->ibp_conns) && /* no more conns */ - peer->ibp_persistence == 0 && /* non-persistent peer */ - kibnal_peer_active(peer)) { /* still in peer table */ + peer->ibp_reconnect_time = jiffies + peer->ibp_reconnect_interval * HZ; + + /* Take peer's blocked transmits to complete with error */ + list_add(&zombies, &peer->ibp_tx_queue); + list_del_init(&peer->ibp_tx_queue); + + if (kibnal_peer_active(peer) && + peer->ibp_persistence == 0) { + /* failed connection attempt on non-persistent peer */ kibnal_unlink_peer_locked (peer); } - conn->ibc_state = IBNAL_CONN_SEND_DREQ; + peer->ibp_error = error; + + write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); - spin_lock (&kibnal_data.kib_connd_lock); + kibnal_peer_notify(peer); - list_add_tail (&conn->ibc_list, &kibnal_data.kib_connd_conns); - wake_up (&kibnal_data.kib_connd_waitq); - - spin_unlock (&kibnal_data.kib_connd_lock); + if (list_empty (&zombies)) + return; + + CDEBUG (D_NETERROR, "Deleting messages for %s: connection failed\n", + libcfs_nid2str(peer->ibp_nid)); + + kibnal_txlist_done (&zombies, -EHOSTUNREACH); } void -kibnal_close_conn (kib_conn_t *conn, int error) +kibnal_connreq_done (kib_conn_t *conn, int type, int status) { + kib_peer_t *peer = conn->ibc_peer; + struct list_head txs; + kib_tx_t *tx; unsigned long flags; - write_lock_irqsave (&kibnal_data.kib_global_lock, flags); + LASSERT (!in_interrupt()); + LASSERT (type == IBNAL_CONN_ACTIVE || type == IBNAL_CONN_PASSIVE); + LASSERT (conn->ibc_state >= IBNAL_CONN_INIT_QP); + LASSERT (conn->ibc_state < IBNAL_CONN_ESTABLISHED); + LASSERT (kibnal_peer_connecting(peer)); - kibnal_close_conn_locked (conn, error); - - write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); -} + LIBCFS_FREE(conn->ibc_cvars, sizeof(*conn->ibc_cvars)); + conn->ibc_cvars = NULL; -static void -kibnal_peer_connect_failed (kib_peer_t *peer, int active, int rc) -{ - LIST_HEAD (zombies); - kib_tx_t *tx; - unsigned long flags; + if (status != 0) { + /* failed to establish connection */ + kibnal_peer_connect_failed(conn->ibc_peer, type, status); + kibnal_conn_disconnected(conn); + 
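+ /* NB: kibnal_conn_disconnected() has already flushed the QP and
+ * completed the queued txs, so dropping the CM's ref below may
+ * free the conn */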
kibnal_conn_decref(conn); /* Lose CM's ref */ + return; + } - LASSERT (rc != 0); - LASSERT (peer->ibp_reconnect_interval >= IBNAL_MIN_RECONNECT_INTERVAL); + /* connection established */ + LASSERT(conn->ibc_state == IBNAL_CONN_CONNECTING); - write_lock_irqsave (&kibnal_data.kib_global_lock, flags); + conn->ibc_last_send = jiffies; + kibnal_set_conn_state(conn, IBNAL_CONN_ESTABLISHED); + kibnal_peer_alive(peer); - LASSERT (peer->ibp_connecting != 0); - peer->ibp_connecting--; + CDEBUG(D_NET, "Connection %s ESTABLISHED\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + + write_lock_irqsave(&kibnal_data.kib_global_lock, flags); - if (peer->ibp_connecting != 0) { - /* another connection attempt under way (loopback?)... */ + peer->ibp_passivewait = 0; /* not waiting (got conn now) */ + kibnal_conn_addref(conn); /* +1 ref for ibc_list */ + list_add_tail(&conn->ibc_list, &peer->ibp_conns); + + if (!kibnal_peer_active(peer)) { + /* peer has been deleted */ + kibnal_close_conn_locked(conn, -ECONNABORTED); write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); + + kibnal_peer_connect_failed(conn->ibc_peer, type, -ECONNABORTED); + kibnal_conn_decref(conn); /* lose CM's ref */ return; } - - if (list_empty(&peer->ibp_conns)) { - /* Say when active connection can be re-attempted */ - peer->ibp_reconnect_time = jiffies + peer->ibp_reconnect_interval; - /* Increase reconnection interval */ - peer->ibp_reconnect_interval = MIN (peer->ibp_reconnect_interval * 2, - IBNAL_MAX_RECONNECT_INTERVAL); - /* Take peer's blocked blocked transmits; I'll complete - * them with error */ - while (!list_empty (&peer->ibp_tx_queue)) { - tx = list_entry (peer->ibp_tx_queue.next, - kib_tx_t, tx_list); - - list_del (&tx->tx_list); - list_add_tail (&tx->tx_list, &zombies); - } - - if (kibnal_peer_active(peer) && - (peer->ibp_persistence == 0)) { - /* failed connection attempt on non-persistent peer */ - kibnal_unlink_peer_locked (peer); - } - } else { - /* Can't have blocked transmits if there are connections */ - LASSERT (list_empty(&peer->ibp_tx_queue)); + switch (type) { + case IBNAL_CONN_ACTIVE: + LASSERT (peer->ibp_connecting > 0); + peer->ibp_connecting--; + break; + + case IBNAL_CONN_PASSIVE: + LASSERT (peer->ibp_accepting > 0); + peer->ibp_accepting--; + break; + default: + LBUG(); } - write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); + peer->ibp_reconnect_interval = 0; /* OK to reconnect at any time */ - if (!list_empty (&zombies)) - CERROR ("Deleting messages for "LPX64": connection failed\n", - peer->ibp_nid); + /* Nuke any dangling conns from a different peer instance... 
*/ + kibnal_close_stale_conns_locked(peer, conn->ibc_incarnation); - while (!list_empty (&zombies)) { - tx = list_entry (zombies.next, kib_tx_t, tx_list); + /* grab txs blocking for a conn */ + list_add(&txs, &peer->ibp_tx_queue); + list_del_init(&peer->ibp_tx_queue); + write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); + + /* Schedule blocked txs */ + spin_lock (&conn->ibc_lock); + while (!list_empty (&txs)) { + tx = list_entry (txs.next, kib_tx_t, tx_list); list_del (&tx->tx_list); - /* complete now */ - tx->tx_status = -EHOSTUNREACH; - kibnal_tx_done (tx); + + kibnal_queue_tx_locked (tx, conn); } + spin_unlock (&conn->ibc_lock); + kibnal_check_sends (conn); } -static void -kibnal_connreq_done (kib_conn_t *conn, int active, int status) +void +kibnal_reject (lnet_nid_t nid, IB_HANDLE cep, int why) { - int state = conn->ibc_state; - kib_peer_t *peer = conn->ibc_peer; - kib_tx_t *tx; - unsigned long flags; - int i; - - /* passive connection has no connreq & vice versa */ - LASSERTF(!active == !(conn->ibc_connreq != NULL), - "%d %p\n", active, conn->ibc_connreq); - if (active) { - PORTAL_FREE (conn->ibc_connreq, sizeof (*conn->ibc_connreq)); - conn->ibc_connreq = NULL; - } - - write_lock_irqsave (&kibnal_data.kib_global_lock, flags); + static CM_REJECT_INFO msgs[3]; + CM_REJECT_INFO *msg = &msgs[why]; + FSTATUS frc; + + LASSERT (why >= 0 && why < sizeof(msgs)/sizeof(msgs[0])); + + /* If I wasn't so lazy, I'd initialise this only once; it's effectively + * read-only... */ + msg->Reason = RC_USER_REJ; + msg->PrivateData[0] = (IBNAL_MSG_MAGIC) & 0xff; + msg->PrivateData[1] = (IBNAL_MSG_MAGIC >> 8) & 0xff; + msg->PrivateData[2] = (IBNAL_MSG_MAGIC >> 16) & 0xff; + msg->PrivateData[3] = (IBNAL_MSG_MAGIC >> 24) & 0xff; + msg->PrivateData[4] = (IBNAL_MSG_VERSION) & 0xff; + msg->PrivateData[5] = (IBNAL_MSG_VERSION >> 8) & 0xff; + msg->PrivateData[6] = why; + + frc = iba_cm_reject(cep, msg); + if (frc != FSUCCESS) + CERROR("Error %d rejecting %s\n", frc, libcfs_nid2str(nid)); +} - LASSERT (peer->ibp_connecting != 0); - - if (status == 0) { - /* connection established... */ - KIB_ASSERT_CONN_STATE(conn, IBNAL_CONN_CONNECTING); - conn->ibc_state = IBNAL_CONN_ESTABLISHED; +void +kibnal_check_connreject(kib_conn_t *conn, int type, CM_REJECT_INFO *rej) +{ + kib_peer_t *peer = conn->ibc_peer; + unsigned long flags; + int magic; + int version; + int why; + + LASSERT (type == IBNAL_CONN_ACTIVE || + type == IBNAL_CONN_PASSIVE); + + CDEBUG(D_NET, "%s connection with %s rejected: %d\n", + (type == IBNAL_CONN_ACTIVE) ? "Active" : "Passive", + libcfs_nid2str(peer->ibp_nid), rej->Reason); + + switch (rej->Reason) { + case RC_STALE_CONN: + if (type == IBNAL_CONN_PASSIVE) { + CERROR("Connection to %s rejected (stale QP)\n", + libcfs_nid2str(peer->ibp_nid)); + } else { + CWARN("Connection from %s rejected (stale QP): " + "retrying...\n", libcfs_nid2str(peer->ibp_nid)); - if (!kibnal_peer_active(peer)) { - /* ...but peer deleted meantime */ - status = -ECONNABORTED; + /* retry from scratch to allocate a new conn + * which will use a different QP */ + kibnal_schedule_active_connect(peer, peer->ibp_version); } - } else { - KIB_ASSERT_CONN_STATE_RANGE(conn, IBNAL_CONN_INIT_QP, - IBNAL_CONN_CONNECTING); - } - if (status == 0) { - /* Everything worked! 
*/ - - peer->ibp_connecting--; + /* An FCM_DISCONNECTED callback is still outstanding: give it a + * ref since kibnal_connreq_done() drops the CM's ref on conn + * on failure */ + kibnal_conn_addref(conn); + break; - /* +1 ref for ibc_list; caller(== CM)'s ref remains until - * the IB_CM_IDLE callback */ - CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", - conn, conn->ibc_state, conn->ibc_peer->ibp_nid, - atomic_read (&conn->ibc_refcount)); - atomic_inc (&conn->ibc_refcount); - list_add (&conn->ibc_list, &peer->ibp_conns); - - /* reset reconnect interval for next attempt */ - peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL; + case RC_USER_REJ: + magic = (rej->PrivateData[0]) | + (rej->PrivateData[1] << 8) | + (rej->PrivateData[2] << 16) | + (rej->PrivateData[3] << 24); + version = (rej->PrivateData[4]) | + (rej->PrivateData[5] << 8); + why = (rej->PrivateData[6]); + + /* retry with old proto version */ + if (magic == IBNAL_MSG_MAGIC && + version == IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD && + conn->ibc_version == IBNAL_MSG_VERSION && + type != IBNAL_CONN_PASSIVE) { + /* retry with a new conn */ + CWARN ("Connection to %s refused: " + "retrying with old protocol version 0x%x\n", + libcfs_nid2str(peer->ibp_nid), version); + kibnal_schedule_active_connect(peer, version); + break; + } - /* post blocked sends to the new connection */ - spin_lock (&conn->ibc_lock); - - while (!list_empty (&peer->ibp_tx_queue)) { - tx = list_entry (peer->ibp_tx_queue.next, - kib_tx_t, tx_list); - - list_del (&tx->tx_list); + if (magic != IBNAL_MSG_MAGIC || + version != IBNAL_MSG_VERSION) { + CERROR("%s connection with %s rejected " + "(magic/ver %08x/%d why %d): " + "incompatible protocol\n", + (type == IBNAL_CONN_ACTIVE) ? + "Active" : "Passive", + libcfs_nid2str(peer->ibp_nid), + magic, version, why); + break; + } - /* +1 ref for each tx */ - CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", - conn, conn->ibc_state, conn->ibc_peer->ibp_nid, - atomic_read (&conn->ibc_refcount)); - atomic_inc (&conn->ibc_refcount); - kibnal_queue_tx_locked (tx, conn); + if (type == IBNAL_CONN_ACTIVE && + why == IBNAL_REJECT_CONN_RACE) { + /* lost connection race */ + CWARN("Connection to %s rejected: " + "lost connection race\n", + libcfs_nid2str(peer->ibp_nid)); + + write_lock_irqsave(&kibnal_data.kib_global_lock, + flags); + + if (list_empty(&peer->ibp_conns)) { + peer->ibp_passivewait = 1; + peer->ibp_passivewait_deadline = + jiffies + + (*kibnal_tunables.kib_timeout * HZ); + } + write_unlock_irqrestore(&kibnal_data.kib_global_lock, + flags); + break; } - - spin_unlock (&conn->ibc_lock); - /* Nuke any dangling conns from a different peer instance... */ - kibnal_close_stale_conns_locked (conn->ibc_peer, - conn->ibc_incarnation); + CERROR("%s connection with %s rejected: %d\n", + (type == IBNAL_CONN_ACTIVE) ? "Active" : "Passive", + libcfs_nid2str(peer->ibp_nid), why); + break; - write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); + default: + CERROR("%s connection with %s rejected: %d\n", + (type == IBNAL_CONN_ACTIVE) ? 
"Active" : "Passive", + libcfs_nid2str(peer->ibp_nid), rej->Reason); + } + + kibnal_connreq_done(conn, type, -ECONNREFUSED); +} - /* queue up all the receives */ - for (i = 0; i < IBNAL_RX_MSGS; i++) { - /* +1 ref for rx desc */ - CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", - conn, conn->ibc_state, conn->ibc_peer->ibp_nid, - atomic_read (&conn->ibc_refcount)); - atomic_inc (&conn->ibc_refcount); +void +kibnal_cm_disconnect_callback(kib_conn_t *conn, CM_CONN_INFO *info) +{ + CDEBUG(D_NET, "%s: state %d, status 0x%x\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), + conn->ibc_state, info->Status); + + LASSERT (conn->ibc_state >= IBNAL_CONN_ESTABLISHED); - CDEBUG(D_NET, "RX[%d] %p->%p - "LPX64"\n", - i, &conn->ibc_rxs[i], conn->ibc_rxs[i].rx_msg, - conn->ibc_rxs[i].rx_vaddr); + switch (info->Status) { + default: + LBUG(); + break; - kibnal_post_rx (&conn->ibc_rxs[i], 0); - } + case FCM_DISCONNECT_REQUEST: + /* Schedule conn to iba_cm_disconnect() if it wasn't already */ + kibnal_close_conn (conn, 0); + break; - kibnal_check_sends (conn); - return; + case FCM_DISCONNECT_REPLY: /* peer acks my disconnect req */ + case FCM_DISCONNECTED: /* end of TIME_WAIT */ + CDEBUG(D_NET, "Connection %s disconnected.\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + kibnal_conn_decref(conn); /* Lose CM's ref */ + break; } +} - /* connection failed */ - if (state == IBNAL_CONN_CONNECTING) { - /* schedule for connd to close */ - kibnal_close_conn_locked (conn, status); - } else { - /* Don't have a CM comm_id; just wait for refs to drain */ - conn->ibc_state = IBNAL_CONN_DISCONNECTED; - } +void +kibnal_cm_passive_callback(IB_HANDLE cep, CM_CONN_INFO *info, void *arg) +{ + kib_conn_t *conn = arg; - write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); + CDEBUG(D_NET, "status 0x%x\n", info->Status); + + /* Established Connection Notifier */ + switch (info->Status) { + default: + CERROR("Unexpected status %d on Connection %s\n", + info->Status, libcfs_nid2str(conn->ibc_peer->ibp_nid)); + LBUG(); + break; + + case FCM_CONNECT_TIMEOUT: + kibnal_connreq_done(conn, IBNAL_CONN_PASSIVE, -ETIMEDOUT); + break; + + case FCM_CONNECT_REJECT: + kibnal_check_connreject(conn, IBNAL_CONN_PASSIVE, + &info->Info.Reject); + break; - kibnal_peer_connect_failed (conn->ibc_peer, active, status); + case FCM_CONNECT_ESTABLISHED: + kibnal_connreq_done(conn, IBNAL_CONN_PASSIVE, 0); + break; - /* If we didn't establish the connection we don't have to pass - * through the disconnect protocol before dropping the CM ref */ - if (state < IBNAL_CONN_CONNECTING) - kibnal_put_conn (conn); + case FCM_DISCONNECT_REQUEST: + case FCM_DISCONNECT_REPLY: + case FCM_DISCONNECTED: + kibnal_cm_disconnect_callback(conn, info); + break; + } } -static int -kibnal_accept (kib_conn_t **connp, IB_HANDLE *cep, - ptl_nid_t nid, __u64 incarnation, int queue_depth) +int +kibnal_accept (kib_conn_t **connp, IB_HANDLE cep, kib_msg_t *msg, int nob) { - kib_conn_t *conn = kibnal_create_conn(); + lnet_nid_t nid; + kib_conn_t *conn; kib_peer_t *peer; kib_peer_t *peer2; unsigned long flags; + int rc; + + rc = kibnal_unpack_msg(msg, 0, nob); + if (rc != 0) { + /* SILENT! 
kibnal_unpack_msg() complains if required */ + kibnal_reject(LNET_NID_ANY, cep, IBNAL_REJECT_FATAL); + return -EPROTO; + } + + nid = msg->ibm_srcnid; - if (conn == NULL) - return (-ENOMEM); + if (msg->ibm_version != IBNAL_MSG_VERSION) + CWARN("Connection from %s: old protocol version 0x%x\n", + libcfs_nid2str(nid), msg->ibm_version); - if (queue_depth != IBNAL_MSG_QUEUE_SIZE) { - CERROR("Can't accept "LPX64": bad queue depth %d (%d expected)\n", - nid, queue_depth, IBNAL_MSG_QUEUE_SIZE); - atomic_dec (&conn->ibc_refcount); - kibnal_destroy_conn(conn); - return (-EPROTO); + if (msg->ibm_type != IBNAL_MSG_CONNREQ) { + CERROR("Can't accept %s: bad request type %d (%d expected)\n", + libcfs_nid2str(nid), msg->ibm_type, IBNAL_MSG_CONNREQ); + kibnal_reject(nid, cep, IBNAL_REJECT_FATAL); + return -EPROTO; + } + + if (msg->ibm_dstnid != kibnal_data.kib_ni->ni_nid) { + CERROR("Can't accept %s: bad dst NID %s (%s expected)\n", + libcfs_nid2str(nid), + libcfs_nid2str(msg->ibm_dstnid), + libcfs_nid2str(kibnal_data.kib_ni->ni_nid)); + kibnal_reject(nid, cep, IBNAL_REJECT_FATAL); + return -EPROTO; + } + + if (msg->ibm_u.connparams.ibcp_queue_depth != IBNAL_MSG_QUEUE_SIZE || + msg->ibm_u.connparams.ibcp_max_msg_size > IBNAL_MSG_SIZE || + msg->ibm_u.connparams.ibcp_max_frags > IBNAL_MAX_RDMA_FRAGS) { + CERROR("Reject %s: q %d sz %d frag %d, (%d %d %d expected)\n", + libcfs_nid2str(nid), + msg->ibm_u.connparams.ibcp_queue_depth, + msg->ibm_u.connparams.ibcp_max_msg_size, + msg->ibm_u.connparams.ibcp_max_frags, + IBNAL_MSG_QUEUE_SIZE, + IBNAL_MSG_SIZE, + IBNAL_MAX_RDMA_FRAGS); + kibnal_reject(nid, cep, IBNAL_REJECT_FATAL); + return -EPROTO; + } + + conn = kibnal_create_conn(nid, msg->ibm_version); + if (conn == NULL) { + kibnal_reject(nid, cep, IBNAL_REJECT_NO_RESOURCES); + return -ENOMEM; } /* assume 'nid' is a new peer */ - peer = kibnal_create_peer (nid); - if (peer == NULL) { - CDEBUG(D_NET, "--conn[%p] state %d -> "LPX64" (%d)\n", - conn, conn->ibc_state, conn->ibc_peer->ibp_nid, - atomic_read (&conn->ibc_refcount)); - atomic_dec (&conn->ibc_refcount); - kibnal_destroy_conn(conn); - return (-ENOMEM); + rc = kibnal_create_peer(&peer, nid); + if (rc != 0) { + kibnal_conn_decref(conn); + kibnal_reject(nid, cep, IBNAL_REJECT_NO_RESOURCES); + return -ENOMEM; } write_lock_irqsave (&kibnal_data.kib_global_lock, flags); @@ -2020,456 +2510,253 @@ kibnal_accept (kib_conn_t **connp, IB_HANDLE *cep, if (peer2 == NULL) { /* peer table takes my ref on peer */ list_add_tail (&peer->ibp_list, kibnal_nid2peerlist(nid)); + LASSERT (peer->ibp_connecting == 0); } else { - kib_peer_decref (peer); + kibnal_peer_decref(peer); peer = peer2; - } - kib_peer_addref(peer); /* +1 ref for conn */ - peer->ibp_connecting++; + if (peer->ibp_connecting != 0 && + peer->ibp_nid < kibnal_data.kib_ni->ni_nid) { + /* Resolve concurrent connection attempts in favour of + * the higher NID */ + write_unlock_irqrestore(&kibnal_data.kib_global_lock, + flags); + kibnal_conn_decref(conn); + kibnal_reject(nid, cep, IBNAL_REJECT_CONN_RACE); + return -EALREADY; + } + } - write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); + kibnal_peer_addref(peer); /* +1 ref for conn */ + peer->ibp_accepting++; + kibnal_set_conn_state(conn, IBNAL_CONN_CONNECTING); conn->ibc_peer = peer; - conn->ibc_state = IBNAL_CONN_CONNECTING; - /* conn->ibc_cep is set when cm_accept is called */ - conn->ibc_incarnation = incarnation; + conn->ibc_incarnation = msg->ibm_srcstamp; conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE; + conn->ibc_reserved_credits = 
IBNAL_MSG_QUEUE_SIZE; + LASSERT (conn->ibc_credits + conn->ibc_reserved_credits + <= IBNAL_RX_MSGS); - *connp = conn; - return (0); -} - -static void kibnal_set_qp_state(IB_HANDLE *qp, IB_QP_STATE state) -{ - IB_QP_ATTRIBUTES_MODIFY modify_attr = {0,}; - FSTATUS frc; - - modify_attr.RequestState = state; + write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); - frc = iibt_qp_modify(qp, &modify_attr, NULL); - if (frc != FSUCCESS) - CERROR("couldn't set qp state to %d, error %d\n", state, frc); + *connp = conn; + return 0; } -static void kibnal_flush_pending(kib_conn_t *conn) +void +kibnal_listen_callback(IB_HANDLE cep, CM_CONN_INFO *info, void *arg) { - LIST_HEAD (zombies); - struct list_head *tmp; - struct list_head *nxt; - kib_tx_t *tx; - unsigned long flags; - int done; - - /* NB we wait until the connection has closed before completing - * outstanding passive RDMAs so we can be sure the network can't - * touch the mapped memory any more. */ - KIB_ASSERT_CONN_STATE(conn, IBNAL_CONN_DISCONNECTED); - - /* set the QP to the error state so that we get flush callbacks - * on our posted receives which can then drop their conn refs */ - kibnal_set_qp_state(conn->ibc_qp, QPStateError); - - spin_lock_irqsave (&conn->ibc_lock, flags); - - /* grab passive RDMAs not waiting for the tx callback */ - list_for_each_safe (tmp, nxt, &conn->ibc_active_txs) { - tx = list_entry (tmp, kib_tx_t, tx_list); - - LASSERT (tx->tx_passive_rdma || - !tx->tx_passive_rdma_wait); - - LASSERT (tx->tx_passive_rdma_wait || - tx->tx_sending != 0); - /* still waiting for tx callback? */ - if (!tx->tx_passive_rdma_wait) - continue; - - tx->tx_status = -ECONNABORTED; - tx->tx_passive_rdma_wait = 0; - done = (tx->tx_sending == 0); - - if (!done) - continue; - - list_del (&tx->tx_list); - list_add (&tx->tx_list, &zombies); - } + CM_REQUEST_INFO *req = &info->Info.Request; + CM_REPLY_INFO *rep; + kib_conn_t *conn; + FSTATUS frc; + int rc; + + LASSERT(arg == NULL); /* no conn yet for passive */ - /* grab all blocked transmits */ - list_for_each_safe (tmp, nxt, &conn->ibc_tx_queue) { - tx = list_entry (tmp, kib_tx_t, tx_list); - - list_del (&tx->tx_list); - list_add (&tx->tx_list, &zombies); + CDEBUG(D_NET, "%x\n", info->Status); + + if (info->Status == FCM_CONNECT_CANCEL) { + up(&kibnal_data.kib_listener_signal); + return; } - spin_unlock_irqrestore (&conn->ibc_lock, flags); + LASSERT (info->Status == FCM_CONNECT_REQUEST); - while (!list_empty(&zombies)) { - tx = list_entry (zombies.next, kib_tx_t, tx_list); + rc = kibnal_accept(&conn, cep, (kib_msg_t *)req->PrivateData, + CM_REQUEST_INFO_USER_LEN); + if (rc != 0) /* kibnal_accept has rejected */ + return; - list_del(&tx->tx_list); - kibnal_tx_done (tx); + conn->ibc_cvars->cv_path = req->PathInfo.Path; + + rc = kibnal_conn_rts(conn, + req->CEPInfo.QPN, + req->CEPInfo.OfferedInitiatorDepth, + req->CEPInfo.OfferedResponderResources, + req->CEPInfo.StartingPSN); + if (rc != 0) { + kibnal_reject(conn->ibc_peer->ibp_nid, cep, + IBNAL_REJECT_NO_RESOURCES); + kibnal_connreq_done(conn, IBNAL_CONN_PASSIVE, -ECONNABORTED); + return; } -} -static void -kibnal_reject (IB_HANDLE cep, uint16_t reason) -{ - CM_REJECT_INFO *rej; + memset(&conn->ibc_cvars->cv_cmci, 0, sizeof(conn->ibc_cvars->cv_cmci)); + rep = &conn->ibc_cvars->cv_cmci.Info.Reply; - PORTAL_ALLOC(rej, sizeof(*rej)); - if (rej == NULL) /* PORTAL_ALLOC() will CERROR on failure */ - return; + rep->QPN = conn->ibc_cvars->cv_qpattrs.QPNumber; + rep->QKey = conn->ibc_cvars->cv_qpattrs.Qkey; + rep->StartingPSN = 
conn->ibc_cvars->cv_qpattrs.RecvPSN; + rep->EndToEndFlowControl = conn->ibc_cvars->cv_qpattrs.FlowControl; + rep->ArbInitiatorDepth = conn->ibc_cvars->cv_qpattrs.InitiatorDepth; + rep->ArbResponderResources = conn->ibc_cvars->cv_qpattrs.ResponderResources; + rep->TargetAckDelay = kibnal_data.kib_hca_attrs.LocalCaAckDelay; + rep->FailoverAccepted = IBNAL_FAILOVER_ACCEPTED; + rep->RnRRetryCount = req->CEPInfo.RnrRetryCount; + + CLASSERT (CM_REPLY_INFO_USER_LEN >= + offsetof(kib_msg_t, ibm_u) + sizeof(kib_connparams_t)); - rej->Reason = reason; - iibt_cm_reject(cep, rej); - PORTAL_FREE(rej, sizeof(*rej)); -} + kibnal_pack_connmsg((kib_msg_t *)rep->PrivateData, + conn->ibc_version, + CM_REPLY_INFO_USER_LEN, + IBNAL_MSG_CONNACK, + conn->ibc_peer->ibp_nid, conn->ibc_incarnation); -static FSTATUS -kibnal_qp_rts(IB_HANDLE qp_handle, __u32 qpn, __u8 resp_res, - IB_PATH_RECORD *path, __u8 init_depth, __u32 send_psn) -{ - IB_QP_ATTRIBUTES_MODIFY modify_attr; - FSTATUS frc; - ENTRY; + LASSERT (conn->ibc_cep == NULL); + kibnal_set_conn_state(conn, IBNAL_CONN_CONNECTING); - modify_attr = (IB_QP_ATTRIBUTES_MODIFY) { - .RequestState = QPStateReadyToRecv, - .RecvPSN = IBNAL_STARTING_PSN, - .DestQPNumber = qpn, - .ResponderResources = resp_res, - .MinRnrTimer = UsecToRnrNakTimer(2000), /* 20 ms */ - .Attrs = (IB_QP_ATTR_RECVPSN | - IB_QP_ATTR_DESTQPNUMBER | - IB_QP_ATTR_RESPONDERRESOURCES | - IB_QP_ATTR_DESTAV | - IB_QP_ATTR_PATHMTU | - IB_QP_ATTR_MINRNRTIMER), - }; - GetAVFromPath(0, path, &modify_attr.PathMTU, NULL, - &modify_attr.DestAV); - - frc = iibt_qp_modify(qp_handle, &modify_attr, NULL); - if (frc != FSUCCESS) - RETURN(frc); - - modify_attr = (IB_QP_ATTRIBUTES_MODIFY) { - .RequestState = QPStateReadyToSend, - .FlowControl = TRUE, - .InitiatorDepth = init_depth, - .SendPSN = send_psn, - .LocalAckTimeout = path->PktLifeTime + 2, /* 2 or 1? 
*/
- .RetryCount = IBNAL_RETRY,
- .RnrRetryCount = IBNAL_RNR_RETRY,
- .Attrs = (IB_QP_ATTR_FLOWCONTROL |
- IB_QP_ATTR_INITIATORDEPTH |
- IB_QP_ATTR_SENDPSN |
- IB_QP_ATTR_LOCALACKTIMEOUT |
- IB_QP_ATTR_RETRYCOUNT |
- IB_QP_ATTR_RNRRETRYCOUNT),
- };
+ frc = iba_cm_accept(cep,
+ &conn->ibc_cvars->cv_cmci,
+ NULL,
+ kibnal_cm_passive_callback, conn,
+ &conn->ibc_cep);
 
- frc = iibt_qp_modify(qp_handle, &modify_attr, NULL);
- RETURN(frc);
+ if (frc == FSUCCESS || frc == FPENDING)
+ return;
+
+ CERROR("iba_cm_accept(%s) failed: %d\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid), frc);
+ kibnal_connreq_done(conn, IBNAL_CONN_PASSIVE, -ECONNABORTED);
 }
 
-static void
-kibnal_connect_reply (IB_HANDLE cep, CM_CONN_INFO *info, void *arg)
+void
+kibnal_check_connreply(kib_conn_t *conn, CM_REPLY_INFO *rep)
 {
- IB_CA_ATTRIBUTES *ca_attr = &kibnal_data.kib_hca_attrs;
- kib_conn_t *conn = arg;
- kib_wire_connreq_t *wcr;
- CM_REPLY_INFO *rep = &info->Info.Reply;
- uint16_t reason;
- FSTATUS frc;
+ kib_msg_t *msg = (kib_msg_t *)rep->PrivateData;
+ lnet_nid_t nid = conn->ibc_peer->ibp_nid;
+ FSTATUS frc;
+ int rc;
 
- wcr = (kib_wire_connreq_t *)info->Info.Reply.PrivateData;
+ rc = kibnal_unpack_msg(msg, conn->ibc_version, CM_REPLY_INFO_USER_LEN);
+ if (rc != 0) {
+ CERROR ("Error %d unpacking connack from %s\n",
+ rc, libcfs_nid2str(nid));
+ kibnal_reject(nid, conn->ibc_cep, IBNAL_REJECT_FATAL);
+ kibnal_connreq_done(conn, IBNAL_CONN_ACTIVE, -EPROTO);
+ return;
+ }
+
+ if (msg->ibm_type != IBNAL_MSG_CONNACK) {
+ CERROR("Bad connack type %d (%d expected) from %s\n",
+ msg->ibm_type, IBNAL_MSG_CONNACK,
+ libcfs_nid2str(msg->ibm_srcnid));
+ kibnal_reject(nid, conn->ibc_cep, IBNAL_REJECT_FATAL);
+ kibnal_connreq_done(conn, IBNAL_CONN_ACTIVE, -EPROTO);
+ return;
+ }
 
- if (wcr->wcr_magic != cpu_to_le32(IBNAL_MSG_MAGIC)) {
- CERROR ("Can't connect "LPX64": bad magic %08x\n",
- conn->ibc_peer->ibp_nid, le32_to_cpu(wcr->wcr_magic));
- GOTO(reject, reason = RC_USER_REJ);
+ if (msg->ibm_srcnid != conn->ibc_peer->ibp_nid ||
+ msg->ibm_dstnid != kibnal_data.kib_ni->ni_nid ||
+ msg->ibm_dststamp != kibnal_data.kib_incarnation) {
+ CERROR("Stale connack from %s(%s): %s(%s), "LPX64"("LPX64")\n",
+ libcfs_nid2str(msg->ibm_srcnid),
+ libcfs_nid2str(conn->ibc_peer->ibp_nid),
+ libcfs_nid2str(msg->ibm_dstnid),
+ libcfs_nid2str(kibnal_data.kib_ni->ni_nid),
+ msg->ibm_dststamp, kibnal_data.kib_incarnation);
+ kibnal_reject(nid, conn->ibc_cep, IBNAL_REJECT_FATAL);
+ kibnal_connreq_done(conn, IBNAL_CONN_ACTIVE, -ESTALE);
+ return;
 }
 
- if (wcr->wcr_version != cpu_to_le16(IBNAL_MSG_VERSION)) {
- CERROR ("Can't connect "LPX64": bad version %d\n",
- conn->ibc_peer->ibp_nid, le16_to_cpu(wcr->wcr_magic));
- GOTO(reject, reason = RC_USER_REJ);
- }
- 
- if (wcr->wcr_queue_depth != cpu_to_le16(IBNAL_MSG_QUEUE_SIZE)) {
- CERROR ("Can't connect "LPX64": bad queue depth %d\n",
- conn->ibc_peer->ibp_nid,
- le16_to_cpu(wcr->wcr_queue_depth));
- GOTO(reject, reason = RC_USER_REJ);
+ if (msg->ibm_u.connparams.ibcp_queue_depth != IBNAL_MSG_QUEUE_SIZE ||
+ msg->ibm_u.connparams.ibcp_max_msg_size > IBNAL_MSG_SIZE ||
+ msg->ibm_u.connparams.ibcp_max_frags > IBNAL_MAX_RDMA_FRAGS) {
+ CERROR("Reject %s: q %d sz %d frag %d, (%d %d %d expected)\n",
+ libcfs_nid2str(msg->ibm_srcnid),
+ msg->ibm_u.connparams.ibcp_queue_depth,
+ msg->ibm_u.connparams.ibcp_max_msg_size,
+ msg->ibm_u.connparams.ibcp_max_frags,
+ IBNAL_MSG_QUEUE_SIZE,
+ IBNAL_MSG_SIZE,
+ IBNAL_MAX_RDMA_FRAGS);
+ kibnal_reject(nid, conn->ibc_cep, IBNAL_REJECT_FATAL);
+ kibnal_connreq_done(conn,
IBNAL_CONN_ACTIVE, -EPROTO); + return; } - if (le64_to_cpu(wcr->wcr_nid) != conn->ibc_peer->ibp_nid) { - CERROR ("Unexpected NID "LPX64" from "LPX64"\n", - le64_to_cpu(wcr->wcr_nid), conn->ibc_peer->ibp_nid); - GOTO(reject, reason = RC_USER_REJ); - } - - CDEBUG(D_NET, "Connection %p -> "LPX64" REP_RECEIVED.\n", - conn, conn->ibc_peer->ibp_nid); + CDEBUG(D_NET, "Connection %s REP_RECEIVED.\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); - conn->ibc_incarnation = le64_to_cpu(wcr->wcr_incarnation); + conn->ibc_incarnation = msg->ibm_srcstamp; conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE; - - frc = kibnal_qp_rts(conn->ibc_qp, rep->QPN, - min_t(__u8, rep->ArbInitiatorDepth, - ca_attr->MaxQPResponderResources), - &conn->ibc_connreq->cr_path, - min_t(__u8, rep->ArbResponderResources, - ca_attr->MaxQPInitiatorDepth), - rep->StartingPSN); - if (frc != FSUCCESS) { - CERROR("Connection %p -> "LPX64" QP RTS/RTR failed: %d\n", - conn, conn->ibc_peer->ibp_nid, frc); - GOTO(reject, reason = RC_NO_QP); - } - - /* the callback arguments are ignored for an active accept */ - conn->ibc_connreq->cr_discarded.Status = FSUCCESS; - frc = iibt_cm_accept(cep, &conn->ibc_connreq->cr_discarded, - NULL, NULL, NULL, NULL); - if (frc != FCM_CONNECT_ESTABLISHED) { - CERROR("Connection %p -> "LPX64" CMAccept failed: %d\n", - conn, conn->ibc_peer->ibp_nid, frc); - kibnal_connreq_done (conn, 1, -ECONNABORTED); - /* XXX don't call reject after accept fails? */ + conn->ibc_reserved_credits = IBNAL_MSG_QUEUE_SIZE; + LASSERT (conn->ibc_credits + conn->ibc_reserved_credits + <= IBNAL_RX_MSGS); + + rc = kibnal_conn_rts(conn, + rep->QPN, + rep->ArbInitiatorDepth, + rep->ArbResponderResources, + rep->StartingPSN); + if (rc != 0) { + kibnal_reject(nid, conn->ibc_cep, IBNAL_REJECT_NO_RESOURCES); + kibnal_connreq_done(conn, IBNAL_CONN_ACTIVE, -EIO); return; } - CDEBUG(D_NET, "Connection %p -> "LPX64" Established\n", - conn, conn->ibc_peer->ibp_nid); - - kibnal_connreq_done (conn, 1, 0); - return; + memset(&conn->ibc_cvars->cv_cmci, 0, sizeof(conn->ibc_cvars->cv_cmci)); + + frc = iba_cm_accept(conn->ibc_cep, + &conn->ibc_cvars->cv_cmci, + NULL, NULL, NULL, NULL); -reject: - kibnal_reject(cep, reason); - kibnal_connreq_done (conn, 1, -EPROTO); + if (frc == FCM_CONNECT_ESTABLISHED) { + kibnal_connreq_done(conn, IBNAL_CONN_ACTIVE, 0); + return; + } + + CERROR("Connection %s CMAccept failed: %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), frc); + kibnal_connreq_done(conn, IBNAL_CONN_ACTIVE, -ECONNABORTED); } -/* ib_cm.h has a wealth of information on the CM procedures */ -static void -kibnal_cm_callback(IB_HANDLE cep, CM_CONN_INFO *info, void *arg) +void +kibnal_cm_active_callback(IB_HANDLE cep, CM_CONN_INFO *info, void *arg) { kib_conn_t *conn = arg; CDEBUG(D_NET, "status 0x%x\n", info->Status); - /* Established Connection Notifier */ switch (info->Status) { default: - CERROR("unknown status %d on Connection %p -> "LPX64"\n", - info->Status, conn, conn->ibc_peer->ibp_nid); + CERROR("unknown status %d on Connection %s\n", + info->Status, libcfs_nid2str(conn->ibc_peer->ibp_nid)); LBUG(); break; - case FCM_CONNECT_REPLY: - kibnal_connect_reply(cep, info, arg); + case FCM_CONNECT_TIMEOUT: + kibnal_connreq_done(conn, IBNAL_CONN_ACTIVE, -ETIMEDOUT); + break; + + case FCM_CONNECT_REJECT: + kibnal_check_connreject(conn, IBNAL_CONN_ACTIVE, + &info->Info.Reject); break; - case FCM_DISCONNECT_REQUEST: - /* XXX lock around these state management bits? 
*/ - if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) - kibnal_close_conn (conn, 0); - conn->ibc_state = IBNAL_CONN_DREP; - iibt_cm_disconnect(conn->ibc_cep, NULL, NULL); + case FCM_CONNECT_REPLY: + kibnal_check_connreply(conn, &info->Info.Reply); break; - /* these both guarantee that no more cm callbacks will occur */ - case FCM_DISCONNECTED: /* aka FCM_DISCONNECT_TIMEOUT */ + case FCM_DISCONNECT_REQUEST: case FCM_DISCONNECT_REPLY: - CDEBUG(D_NET, "Connection %p -> "LPX64" disconnect done.\n", - conn, conn->ibc_peer->ibp_nid); - - conn->ibc_state = IBNAL_CONN_DISCONNECTED; - kibnal_flush_pending(conn); - kibnal_put_conn(conn); /* Lose CM's ref */ + case FCM_DISCONNECTED: + kibnal_cm_disconnect_callback(conn, info); break; } - - return; -} - -static int -kibnal_set_cm_flags(IB_HANDLE cep) -{ - FSTATUS frc; - uint32 value = 1; - - frc = iibt_cm_modify_cep(cep, CM_FLAG_TIMEWAIT_CALLBACK, - (char *)&value, sizeof(value), 0); - if (frc != FSUCCESS) { - CERROR("error setting timeout callback: %d\n", frc); - return -1; - } - -#if 0 - frc = iibt_cm_modify_cep(cep, CM_FLAG_ASYNC_ACCEPT, (char *)&value, - sizeof(value), 0); - if (frc != FSUCCESS) { - CERROR("error setting async accept: %d\n", frc); - return -1; - } -#endif - - return 0; } void -kibnal_listen_callback(IB_HANDLE cep, CM_CONN_INFO *info, void *arg) -{ - IB_CA_ATTRIBUTES *ca_attr = &kibnal_data.kib_hca_attrs; - IB_QP_ATTRIBUTES_QUERY *query; - CM_REQUEST_INFO *req; - CM_CONN_INFO *rep = NULL, *rcv = NULL; - kib_wire_connreq_t *wcr; - kib_conn_t *conn = NULL; - uint16_t reason = 0; - FSTATUS frc; - int rc = 0; - - LASSERT(cep); - LASSERT(info); - LASSERT(arg == NULL); /* no conn yet for passive */ - - CDEBUG(D_NET, "status 0x%x\n", info->Status); - - req = &info->Info.Request; - wcr = (kib_wire_connreq_t *)req->PrivateData; - - CDEBUG(D_NET, "%d from "LPX64"\n", info->Status, - le64_to_cpu(wcr->wcr_nid)); - - if (info->Status == FCM_CONNECT_CANCEL) - return; - - LASSERT (info->Status == FCM_CONNECT_REQUEST); - - if (wcr->wcr_magic != cpu_to_le32(IBNAL_MSG_MAGIC)) { - CERROR ("Can't accept: bad magic %08x\n", - le32_to_cpu(wcr->wcr_magic)); - GOTO(out, reason = RC_USER_REJ); - } - - if (wcr->wcr_version != cpu_to_le16(IBNAL_MSG_VERSION)) { - CERROR ("Can't accept: bad version %d\n", - le16_to_cpu(wcr->wcr_magic)); - GOTO(out, reason = RC_USER_REJ); - } - - rc = kibnal_accept(&conn, cep, - le64_to_cpu(wcr->wcr_nid), - le64_to_cpu(wcr->wcr_incarnation), - le16_to_cpu(wcr->wcr_queue_depth)); - if (rc != 0) { - CERROR ("Can't accept "LPX64": %d\n", - le64_to_cpu(wcr->wcr_nid), rc); - GOTO(out, reason = RC_NO_RESOURCES); - } - - frc = kibnal_qp_rts(conn->ibc_qp, req->CEPInfo.QPN, - min_t(__u8, req->CEPInfo.OfferedInitiatorDepth, - ca_attr->MaxQPResponderResources), - &req->PathInfo.Path, - min_t(__u8, req->CEPInfo.OfferedResponderResources, - ca_attr->MaxQPInitiatorDepth), - req->CEPInfo.StartingPSN); - - if (frc != FSUCCESS) { - CERROR ("Can't mark QP RTS/RTR "LPX64": %d\n", - le64_to_cpu(wcr->wcr_nid), frc); - GOTO(out, reason = RC_NO_QP); - } - - frc = iibt_qp_query(conn->ibc_qp, &conn->ibc_qp_attrs, NULL); - if (frc != FSUCCESS) { - CERROR ("Couldn't query qp attributes "LPX64": %d\n", - le64_to_cpu(wcr->wcr_nid), frc); - GOTO(out, reason = RC_NO_QP); - } - query = &conn->ibc_qp_attrs; - - PORTAL_ALLOC(rep, sizeof(*rep)); - PORTAL_ALLOC(rcv, sizeof(*rcv)); - if (rep == NULL || rcv == NULL) { - if (rep) PORTAL_FREE(rep, sizeof(*rep)); - if (rcv) PORTAL_FREE(rcv, sizeof(*rcv)); - CERROR ("can't allocate reply and receive buffers\n"); - 
GOTO(out, reason = RC_INSUFFICIENT_RESP_RES); - } - - /* don't try to deref this into the incoming wcr :) */ - wcr = (kib_wire_connreq_t *)rep->Info.Reply.PrivateData; - - rep->Info.Reply = (CM_REPLY_INFO) { - .QPN = query->QPNumber, - .QKey = query->Qkey, - .StartingPSN = query->RecvPSN, - .EndToEndFlowControl = query->FlowControl, - /* XXX Hmm. */ - .ArbInitiatorDepth = query->InitiatorDepth, - .ArbResponderResources = query->ResponderResources, - .TargetAckDelay = 0, - .FailoverAccepted = 0, - .RnRRetryCount = req->CEPInfo.RnrRetryCount, - }; - - *wcr = (kib_wire_connreq_t) { - .wcr_magic = cpu_to_le32(IBNAL_MSG_MAGIC), - .wcr_version = cpu_to_le16(IBNAL_MSG_VERSION), - .wcr_queue_depth = cpu_to_le32(IBNAL_MSG_QUEUE_SIZE), - .wcr_nid = cpu_to_le64(kibnal_data.kib_nid), - .wcr_incarnation = cpu_to_le64(kibnal_data.kib_incarnation), - }; - - frc = iibt_cm_accept(cep, rep, rcv, kibnal_cm_callback, conn, - &conn->ibc_cep); - - PORTAL_FREE(rep, sizeof(*rep)); - PORTAL_FREE(rcv, sizeof(*rcv)); - - if (frc != FCM_CONNECT_ESTABLISHED) { - /* XXX it seems we don't call reject after this point? */ - CERROR("iibt_cm_accept() failed: %d, aborting\n", frc); - rc = -ECONNABORTED; - goto out; - } - - if (kibnal_set_cm_flags(conn->ibc_cep)) { - rc = -ECONNABORTED; - goto out; - } - - CWARN("Connection %p -> "LPX64" ESTABLISHED.\n", - conn, conn->ibc_peer->ibp_nid); - -out: - if (reason) { - kibnal_reject(cep, reason); - rc = -ECONNABORTED; - } - if (conn != NULL) - kibnal_connreq_done(conn, 0, rc); - - return; -} - -static void dump_path_records(PATH_RESULTS *results) { IB_PATH_RECORD *path; int i; - for(i = 0; i < results->NumPathRecords; i++) { + for (i = 0; i < results->NumPathRecords; i++) { path = &results->PathRecords[i]; CDEBUG(D_NET, "%d: sgid "LPX64":"LPX64" dgid " LPX64":"LPX64" pkey %x\n", @@ -2482,110 +2769,104 @@ dump_path_records(PATH_RESULTS *results) } } -static void -kibnal_pathreq_callback (void *arg, QUERY *query, - QUERY_RESULT_VALUES *query_res) +void +kibnal_pathreq_callback (void *arg, QUERY *qry, + QUERY_RESULT_VALUES *qrslt) { - IB_CA_ATTRIBUTES *ca_attr = &kibnal_data.kib_hca_attrs; - kib_conn_t *conn = arg; - PATH_RESULTS *path; - FSTATUS frc; + IB_CA_ATTRIBUTES *ca_attr = &kibnal_data.kib_hca_attrs; + kib_conn_t *conn = arg; + CM_REQUEST_INFO *req = &conn->ibc_cvars->cv_cmci.Info.Request; + PATH_RESULTS *path = (PATH_RESULTS *)qrslt->QueryResult; + FSTATUS frc; - if (query_res->Status != FSUCCESS || query_res->ResultDataSize == 0) { - CERROR ("status %d data size %d\n", query_res->Status, - query_res->ResultDataSize); - kibnal_connreq_done (conn, 1, -EINVAL); + if (qrslt->Status != FSUCCESS || + qrslt->ResultDataSize < sizeof(*path)) { + CDEBUG (D_NETERROR, "pathreq %s failed: status %d data size %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), + qrslt->Status, qrslt->ResultDataSize); + kibnal_connreq_done(conn, IBNAL_CONN_ACTIVE, -EHOSTUNREACH); return; } - path = (PATH_RESULTS *)query_res->QueryResult; - if (path->NumPathRecords < 1) { - CERROR ("expected path records: %d\n", path->NumPathRecords); - kibnal_connreq_done (conn, 1, -EINVAL); + CDEBUG (D_NETERROR, "pathreq %s failed: no path records\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + kibnal_connreq_done(conn, IBNAL_CONN_ACTIVE, -EHOSTUNREACH); return; } - dump_path_records(path); + //dump_path_records(path); + conn->ibc_cvars->cv_path = path->PathRecords[0]; - /* just using the first. this is probably a horrible idea. 
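
/* A minimal sketch of choosing among the returned path records instead
 * of blindly taking the first, as the removed comment here concedes is
 * fragile. The selection predicate, the pick_path_record() name and the
 * Pkey field are illustrative assumptions; the real IB_PATH_RECORD
 * layout may differ. */
static IB_PATH_RECORD *
pick_path_record (PATH_RESULTS *path, uint16 want_pkey)
{
        int i;

        for (i = 0; i < path->NumPathRecords; i++) {
                /* prefer a record on the partition we expect */
                if (path->PathRecords[i].Pkey == want_pkey)
                        return &path->PathRecords[i];
        }

        /* nothing matched: fall back to the first record */
        return &path->PathRecords[0];
}
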
*/ - conn->ibc_connreq->cr_path = path->PathRecords[0]; + LASSERT (conn->ibc_cep == NULL); - conn->ibc_cep = iibt_cm_create_cep(CM_RC_TYPE); + conn->ibc_cep = kibnal_create_cep(conn->ibc_peer->ibp_nid); if (conn->ibc_cep == NULL) { - CERROR ("Can't create CEP\n"); - kibnal_connreq_done (conn, 1, -EINVAL); + kibnal_connreq_done(conn, IBNAL_CONN_ACTIVE, -ENOMEM); return; } - if (kibnal_set_cm_flags(conn->ibc_cep)) { - kibnal_connreq_done (conn, 1, -EINVAL); - return; + memset(req, 0, sizeof(*req)); + req->SID = conn->ibc_cvars->cv_svcrec.RID.ServiceID; + req->CEPInfo.CaGUID = kibnal_data.kib_hca_guids[kibnal_data.kib_hca_idx]; + req->CEPInfo.EndToEndFlowControl = IBNAL_EE_FLOW; + req->CEPInfo.PortGUID = conn->ibc_cvars->cv_path.SGID.Type.Global.InterfaceID; + req->CEPInfo.RetryCount = IBNAL_RETRY; + req->CEPInfo.RnrRetryCount = IBNAL_RNR_RETRY; + req->CEPInfo.AckTimeout = IBNAL_ACK_TIMEOUT; + req->CEPInfo.StartingPSN = IBNAL_STARTING_PSN; + req->CEPInfo.QPN = conn->ibc_cvars->cv_qpattrs.QPNumber; + req->CEPInfo.QKey = conn->ibc_cvars->cv_qpattrs.Qkey; + req->CEPInfo.OfferedResponderResources = ca_attr->MaxQPResponderResources; + req->CEPInfo.OfferedInitiatorDepth = ca_attr->MaxQPInitiatorDepth; + req->PathInfo.bSubnetLocal = IBNAL_LOCAL_SUB; + req->PathInfo.Path = conn->ibc_cvars->cv_path; + + CLASSERT (CM_REQUEST_INFO_USER_LEN >= + offsetof(kib_msg_t, ibm_u) + sizeof(kib_connparams_t)); + + kibnal_pack_connmsg((kib_msg_t *)req->PrivateData, + conn->ibc_version, + CM_REQUEST_INFO_USER_LEN, + IBNAL_MSG_CONNREQ, + conn->ibc_peer->ibp_nid, 0); + + if (the_lnet.ln_testprotocompat != 0) { + /* single-shot proto test */ + LNET_LOCK(); + if ((the_lnet.ln_testprotocompat & 1) != 0) { + ((kib_msg_t *)req->PrivateData)->ibm_version++; + the_lnet.ln_testprotocompat &= ~1; + } + if ((the_lnet.ln_testprotocompat & 2) != 0) { + ((kib_msg_t *)req->PrivateData)->ibm_magic = + LNET_PROTO_MAGIC; + the_lnet.ln_testprotocompat &= ~2; + } + LNET_UNLOCK(); } - conn->ibc_connreq->cr_wcr = (kib_wire_connreq_t) { - .wcr_magic = cpu_to_le32(IBNAL_MSG_MAGIC), - .wcr_version = cpu_to_le16(IBNAL_MSG_VERSION), - .wcr_queue_depth = cpu_to_le16(IBNAL_MSG_QUEUE_SIZE), - .wcr_nid = cpu_to_le64(kibnal_data.kib_nid), - .wcr_incarnation = cpu_to_le64(kibnal_data.kib_incarnation), - }; - - conn->ibc_connreq->cr_cmreq = (CM_REQUEST_INFO) { - .SID = conn->ibc_connreq->cr_service.RID.ServiceID, - .CEPInfo = (CM_CEP_INFO) { - .CaGUID = kibnal_data.kib_hca_guids[0], - .EndToEndFlowControl = FALSE, - .PortGUID = conn->ibc_connreq->cr_path.SGID.Type.Global.InterfaceID, - .RetryCount = IBNAL_RETRY, - .RnrRetryCount = IBNAL_RNR_RETRY, - .AckTimeout = IBNAL_ACK_TIMEOUT, - .StartingPSN = IBNAL_STARTING_PSN, - .QPN = conn->ibc_qp_attrs.QPNumber, - .QKey = conn->ibc_qp_attrs.Qkey, - .OfferedResponderResources = ca_attr->MaxQPResponderResources, - .OfferedInitiatorDepth = ca_attr->MaxQPInitiatorDepth, - }, - .PathInfo = (CM_CEP_PATHINFO) { - .bSubnetLocal = TRUE, - .Path = conn->ibc_connreq->cr_path, - }, - }; - -#if 0 - /* XXX set timeout just like SDP!!!*/ - conn->ibc_connreq->cr_path.packet_life = 13; -#endif /* Flag I'm getting involved with the CM... 
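
/* The CLASSERT() above refuses to build if a connection message cannot
 * fit in the CM private data. A standalone sketch of the same
 * compile-time check, using the classic negative-array-size trick; the
 * names and sizes here are illustrative, not the Lustre macro: */
#include <stddef.h>

struct sketch_msg {
        int hdr;
        union { char connparams[16]; } u;
};

#define SKETCH_PRIVATE_DATA_LEN 64

/* the array size becomes -1, and compilation fails, if the bound does
 * not hold */
typedef char sketch_connmsg_fits[
        (SKETCH_PRIVATE_DATA_LEN >=
         offsetof(struct sketch_msg, u) + sizeof(((struct sketch_msg *)0)->u))
        ? 1 : -1];
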
*/ - conn->ibc_state = IBNAL_CONN_CONNECTING; - - CDEBUG(D_NET, "Connecting to, service id "LPX64", on "LPX64"\n", - conn->ibc_connreq->cr_service.RID.ServiceID, - *kibnal_service_nid_field(&conn->ibc_connreq->cr_service)); - - memset(conn->ibc_connreq->cr_cmreq.PrivateData, 0, - CM_REQUEST_INFO_USER_LEN); - memcpy(conn->ibc_connreq->cr_cmreq.PrivateData, - &conn->ibc_connreq->cr_wcr, sizeof(conn->ibc_connreq->cr_wcr)); + kibnal_set_conn_state(conn, IBNAL_CONN_CONNECTING); - /* kibnal_cm_callback gets my conn ref */ - frc = iibt_cm_connect(conn->ibc_cep, &conn->ibc_connreq->cr_cmreq, - kibnal_cm_callback, conn); - if (frc != FPENDING && frc != FSUCCESS) { - CERROR ("Connect: %d\n", frc); - /* Back out state change as connect failed */ - conn->ibc_state = IBNAL_CONN_INIT_QP; - kibnal_connreq_done (conn, 1, -EINVAL); - } + /* cm callback gets my conn ref */ + frc = iba_cm_connect(conn->ibc_cep, req, + kibnal_cm_active_callback, conn); + if (frc == FPENDING || frc == FSUCCESS) + return; + + CERROR ("Connect %s failed: %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), frc); + kibnal_connreq_done(conn, IBNAL_CONN_ACTIVE, -EHOSTUNREACH); } -static void -dump_service_records(SERVICE_RECORD_RESULTS *results) +void +kibnal_dump_service_records(SERVICE_RECORD_RESULTS *results) { IB_SERVICE_RECORD *svc; int i; - for(i = 0; i < results->NumServiceRecords; i++) { + for (i = 0; i < results->NumServiceRecords; i++) { svc = &results->ServiceRecords[i]; CDEBUG(D_NET, "%d: sid "LPX64" gid "LPX64":"LPX64" pkey %x\n", i, @@ -2596,161 +2877,147 @@ dump_service_records(SERVICE_RECORD_RESULTS *results) } } - -static void -kibnal_service_get_callback (void *arg, QUERY *query, - QUERY_RESULT_VALUES *query_res) +void +kibnal_service_get_callback (void *arg, QUERY *qry, + QUERY_RESULT_VALUES *qrslt) { - kib_conn_t *conn = arg; - SERVICE_RECORD_RESULTS *svc; - COMMAND_CONTROL_PARAMETERS sd_params; - QUERY path_query; - FSTATUS frc; - - if (query_res->Status != FSUCCESS || query_res->ResultDataSize == 0) { - CERROR ("status %d data size %d\n", query_res->Status, - query_res->ResultDataSize); - kibnal_connreq_done (conn, 1, -EINVAL); + kib_conn_t *conn = arg; + SERVICE_RECORD_RESULTS *svc; + FSTATUS frc; + + if (qrslt->Status != FSUCCESS || + qrslt->ResultDataSize < sizeof(*svc)) { + CDEBUG (D_NETERROR, "Lookup %s failed: status %d data size %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), + qrslt->Status, qrslt->ResultDataSize); + kibnal_connreq_done(conn, IBNAL_CONN_ACTIVE, -EHOSTUNREACH); return; } - svc = (SERVICE_RECORD_RESULTS *)query_res->QueryResult; - + svc = (SERVICE_RECORD_RESULTS *)qrslt->QueryResult; if (svc->NumServiceRecords < 1) { - CERROR ("%d service records\n", svc->NumServiceRecords); - kibnal_connreq_done (conn, 1, -EINVAL); + CDEBUG (D_NETERROR, "lookup %s failed: no service records\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + kibnal_connreq_done(conn, IBNAL_CONN_ACTIVE, -EHOSTUNREACH); return; } - dump_service_records(svc); + //kibnal_dump_service_records(svc); + conn->ibc_cvars->cv_svcrec = svc->ServiceRecords[0]; - conn->ibc_connreq->cr_service = svc->ServiceRecords[0]; + qry = &conn->ibc_cvars->cv_query; + memset(qry, 0, sizeof(*qry)); - CDEBUG(D_NET, "Got status %d, service id "LPX64", on "LPX64"\n", - query_res->Status , conn->ibc_connreq->cr_service.RID.ServiceID, - *kibnal_service_nid_field(&conn->ibc_connreq->cr_service)); + qry->OutputType = OutputTypePathRecord; + qry->InputType = InputTypePortGuidPair; - memset(&path_query, 0, sizeof(path_query)); - path_query.InputType = 
InputTypePortGuidPair; - path_query.OutputType = OutputTypePathRecord; - path_query.InputValue.PortGuidPair.SourcePortGuid = kibnal_data.kib_port_guid; - path_query.InputValue.PortGuidPair.DestPortGuid = conn->ibc_connreq->cr_service.RID.ServiceGID.Type.Global.InterfaceID; + qry->InputValue.PortGuidPair.SourcePortGuid = + kibnal_data.kib_port_guid; + qry->InputValue.PortGuidPair.DestPortGuid = + conn->ibc_cvars->cv_svcrec.RID.ServiceGID.Type.Global.InterfaceID; - memset(&sd_params, 0, sizeof(sd_params)); - sd_params.RetryCount = IBNAL_RETRY; - sd_params.Timeout = 10 * 1000; /* wait 10 seconds */ - - /* kibnal_service_get_callback gets my conn ref */ - - frc = iibt_sd_query_port_fabric_information(kibnal_data.kib_sd, - kibnal_data.kib_port_guid, - &path_query, - kibnal_pathreq_callback, - &sd_params, conn); + /* kibnal_pathreq_callback gets my conn ref */ + frc = iba_sd_query_port_fabric_info(kibnal_data.kib_sd, + kibnal_data.kib_port_guid, + qry, + kibnal_pathreq_callback, + &kibnal_data.kib_sdretry, + conn); if (frc == FPENDING) return; - CERROR ("Path record request failed: %d\n", frc); - kibnal_connreq_done (conn, 1, -EINVAL); + CERROR ("pathreq %s failed: %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), frc); + kibnal_connreq_done(conn, IBNAL_CONN_ACTIVE, -EHOSTUNREACH); } -static void +void kibnal_connect_peer (kib_peer_t *peer) { - COMMAND_CONTROL_PARAMETERS sd_params; - QUERY query; - FSTATUS frc; - kib_conn_t *conn = kibnal_create_conn(); + QUERY *qry; + FSTATUS frc; + kib_conn_t *conn; LASSERT (peer->ibp_connecting != 0); + conn = kibnal_create_conn(peer->ibp_nid, peer->ibp_version); if (conn == NULL) { CERROR ("Can't allocate conn\n"); - kibnal_peer_connect_failed (peer, 1, -ENOMEM); + kibnal_peer_connect_failed(peer, IBNAL_CONN_ACTIVE, -ENOMEM); return; } conn->ibc_peer = peer; - kib_peer_addref(peer); - - PORTAL_ALLOC (conn->ibc_connreq, sizeof (*conn->ibc_connreq)); - if (conn->ibc_connreq == NULL) { - CERROR ("Can't allocate connreq\n"); - kibnal_connreq_done (conn, 1, -ENOMEM); - return; - } - - memset(conn->ibc_connreq, 0, sizeof (*conn->ibc_connreq)); + kibnal_peer_addref(peer); - kibnal_set_service_keys(&conn->ibc_connreq->cr_service, peer->ibp_nid); + qry = &conn->ibc_cvars->cv_query; + memset(qry, 0, sizeof(*qry)); - memset(&query, 0, sizeof(query)); - query.InputType = InputTypeServiceRecord; - query.OutputType = OutputTypeServiceRecord; - query.InputValue.ServiceRecordValue.ServiceRecord = conn->ibc_connreq->cr_service; - query.InputValue.ServiceRecordValue.ComponentMask = KIBNAL_SERVICE_KEY_MASK; + qry->OutputType = OutputTypeServiceRecord; + qry->InputType = InputTypeServiceRecord; - memset(&sd_params, 0, sizeof(sd_params)); - sd_params.RetryCount = IBNAL_RETRY; - sd_params.Timeout = 10 * 1000; /* wait 10 seconds */ + qry->InputValue.ServiceRecordValue.ComponentMask = + KIBNAL_SERVICE_KEY_MASK; + kibnal_set_service_keys( + &qry->InputValue.ServiceRecordValue.ServiceRecord, + peer->ibp_nid); /* kibnal_service_get_callback gets my conn ref */ - frc = iibt_sd_query_port_fabric_information(kibnal_data.kib_sd, - kibnal_data.kib_port_guid, - &query, - kibnal_service_get_callback, - &sd_params, conn); + frc = iba_sd_query_port_fabric_info(kibnal_data.kib_sd, + kibnal_data.kib_port_guid, + qry, + kibnal_service_get_callback, + &kibnal_data.kib_sdretry, + conn); if (frc == FPENDING) return; - CERROR ("iibt_sd_query_port_fabric_information(): %d\n", frc); - kibnal_connreq_done (conn, 1, frc); + CERROR("Lookup %s failed: %d\n", libcfs_nid2str(peer->ibp_nid), frc); + 
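        /* Note the ownership rule used on this path: on FPENDING the SD
         * callback inherits this function's conn ref and must complete
         * the connection attempt itself; any other return means the
         * callback will never run, so fail the request right here. */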
kibnal_connreq_done(conn, IBNAL_CONN_ACTIVE, -EHOSTUNREACH); } -static int -kibnal_conn_timed_out (kib_conn_t *conn) +int +kibnal_check_txs (kib_conn_t *conn, struct list_head *txs) { kib_tx_t *tx; struct list_head *ttmp; - unsigned long flags; + int timed_out = 0; - spin_lock_irqsave (&conn->ibc_lock, flags); + spin_lock(&conn->ibc_lock); - list_for_each (ttmp, &conn->ibc_tx_queue) { + list_for_each (ttmp, txs) { tx = list_entry (ttmp, kib_tx_t, tx_list); - LASSERT (!tx->tx_passive_rdma_wait); - LASSERT (tx->tx_sending == 0); - - if (time_after_eq (jiffies, tx->tx_deadline)) { - spin_unlock_irqrestore (&conn->ibc_lock, flags); - return 1; + if (txs == &conn->ibc_active_txs) { + LASSERT (!tx->tx_queued); + LASSERT (tx->tx_waiting || tx->tx_sending != 0); + } else { + LASSERT (tx->tx_queued); } - } - - list_for_each (ttmp, &conn->ibc_active_txs) { - tx = list_entry (ttmp, kib_tx_t, tx_list); - - LASSERT (tx->tx_passive_rdma || - !tx->tx_passive_rdma_wait); - - LASSERT (tx->tx_passive_rdma_wait || - tx->tx_sending != 0); if (time_after_eq (jiffies, tx->tx_deadline)) { - spin_unlock_irqrestore (&conn->ibc_lock, flags); - return 1; + timed_out = 1; + break; } } - spin_unlock_irqrestore (&conn->ibc_lock, flags); + spin_unlock(&conn->ibc_lock); + return timed_out; +} - return 0; +int +kibnal_conn_timed_out (kib_conn_t *conn) +{ + return kibnal_check_txs(conn, &conn->ibc_tx_queue) || + kibnal_check_txs(conn, &conn->ibc_tx_queue_rsrvd) || + kibnal_check_txs(conn, &conn->ibc_tx_queue_nocred) || + kibnal_check_txs(conn, &conn->ibc_active_txs); } -static void -kibnal_check_conns (int idx) +void +kibnal_check_peers (int idx) { + rwlock_t *rwlock = &kibnal_data.kib_global_lock; struct list_head *peers = &kibnal_data.kib_peers[idx]; struct list_head *ptmp; kib_peer_t *peer; @@ -2762,15 +3029,33 @@ kibnal_check_conns (int idx) /* NB. We expect to have a look at all the peers and not find any * rdmas to time out, so we just use a shared lock while we * take a look... */ - read_lock_irqsave(&kibnal_data.kib_global_lock, flags); + read_lock_irqsave(rwlock, flags); list_for_each (ptmp, peers) { peer = list_entry (ptmp, kib_peer_t, ibp_list); + if (peer->ibp_passivewait) { + LASSERT (list_empty(&peer->ibp_conns)); + + if (!time_after_eq(jiffies, + peer->ibp_passivewait_deadline)) + continue; + + kibnal_peer_addref(peer); /* ++ ref for me... */ + read_unlock_irqrestore(rwlock, flags); + + kibnal_peer_connect_failed(peer, IBNAL_CONN_WAITING, + -ETIMEDOUT); + kibnal_peer_decref(peer); /* ...until here */ + + /* start again now I've dropped the lock */ + goto again; + } + list_for_each (ctmp, &peer->ibp_conns) { conn = list_entry (ctmp, kib_conn_t, ibc_list); - KIB_ASSERT_CONN_STATE(conn, IBNAL_CONN_ESTABLISHED); + LASSERT (conn->ibc_state == IBNAL_CONN_ESTABLISHED); /* In case we have enough credits to return via a * NOOP, but there were no non-blocking tx descs @@ -2779,60 +3064,57 @@ kibnal_check_conns (int idx) if (!kibnal_conn_timed_out(conn)) continue; + + /* Handle timeout by closing the whole connection. We + * can only be sure RDMA activity has ceased once the + * QP has been modified. */ - CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", - conn, conn->ibc_state, peer->ibp_nid, - atomic_read (&conn->ibc_refcount)); + kibnal_conn_addref(conn); /* 1 ref for me... 
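
/* The tx_deadline checks above rely on time_after_eq() so that jiffies
 * wrap-around is harmless. A user-space sketch of the same idiom the
 * kernel macro expands to, assuming a 32-bit counter: */
#include <stdint.h>

static int
deadline_passed (uint32_t now, uint32_t deadline)
{
        /* the signed difference is wrap-safe while the two stamps are
         * less than half the counter range apart */
        return (int32_t)(now - deadline) >= 0;
}
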
*/ - atomic_inc (&conn->ibc_refcount); - read_unlock_irqrestore(&kibnal_data.kib_global_lock, - flags); + read_unlock_irqrestore(rwlock, flags); - CERROR("Timed out RDMA with "LPX64"\n", - peer->ibp_nid); + CERROR("Timed out RDMA with %s\n", + libcfs_nid2str(peer->ibp_nid)); kibnal_close_conn (conn, -ETIMEDOUT); - kibnal_put_conn (conn); + kibnal_conn_decref(conn); /* ...until here */ /* start again now I've dropped the lock */ goto again; } } - read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); + read_unlock_irqrestore(rwlock, flags); } -static void -kib_connd_handle_state(kib_conn_t *conn) +void +kibnal_disconnect_conn (kib_conn_t *conn) { - FSTATUS frc; - - switch (conn->ibc_state) { - /* all refs have gone, free and be done with it */ - case IBNAL_CONN_DISCONNECTED: - kibnal_destroy_conn (conn); - return; /* avoid put_conn */ + FSTATUS frc; - case IBNAL_CONN_SEND_DREQ: - frc = iibt_cm_disconnect(conn->ibc_cep, NULL, NULL); - if (frc != FSUCCESS) /* XXX do real things */ - CERROR("disconnect failed: %d\n", frc); - conn->ibc_state = IBNAL_CONN_DREQ; - break; + LASSERT (conn->ibc_state == IBNAL_CONN_DISCONNECTING); - /* a callback got to the conn before we did */ - case IBNAL_CONN_DREP: - break; - - default: - CERROR ("Bad conn %p state: %d\n", conn, - conn->ibc_state); - LBUG(); - break; + kibnal_conn_disconnected(conn); + + frc = iba_cm_disconnect(conn->ibc_cep, NULL, NULL); + switch (frc) { + case FSUCCESS: + break; + + case FINSUFFICIENT_RESOURCES: + CERROR("ENOMEM disconnecting %s\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + /* This might cause the module to become unloadable since the + * FCM_DISCONNECTED callback is still outstanding */ + break; + + default: + CERROR("Unexpected error disconnecting %s: %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), frc); + LBUG(); } - /* drop ref from close_conn */ - kibnal_put_conn(conn); + kibnal_peer_notify(conn->ibc_peer); } int @@ -2844,27 +3126,43 @@ kibnal_connd (void *arg) kib_peer_t *peer; int timeout; int i; + int did_something; int peer_index = 0; unsigned long deadline = jiffies; - kportal_daemonize ("kibnal_connd"); - kportal_blockallsigs (); + cfs_daemonize ("kibnal_connd"); + cfs_block_allsigs (); init_waitqueue_entry (&wait, current); - spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags); + spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags); + + while (!kibnal_data.kib_shutdown) { + did_something = 0; + + if (!list_empty (&kibnal_data.kib_connd_zombies)) { + conn = list_entry (kibnal_data.kib_connd_zombies.next, + kib_conn_t, ibc_list); + list_del (&conn->ibc_list); + spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags); + did_something = 1; + + kibnal_destroy_conn(conn); + + spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags); + } - for (;;) { if (!list_empty (&kibnal_data.kib_connd_conns)) { conn = list_entry (kibnal_data.kib_connd_conns.next, kib_conn_t, ibc_list); list_del (&conn->ibc_list); - spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags); - kib_connd_handle_state(conn); + did_something = 1; + kibnal_disconnect_conn(conn); + kibnal_conn_decref(conn); + spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags); - continue; } if (!list_empty (&kibnal_data.kib_connd_peers)) { @@ -2873,26 +3171,22 @@ kibnal_connd (void *arg) list_del_init (&peer->ibp_connd_list); spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags); + did_something = 1; kibnal_connect_peer (peer); - kib_peer_decref (peer); + kibnal_peer_decref (peer); spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags); } - /* 
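
/* kibnal_connd above drains each work list one item at a time: detach
 * the item under kib_connd_lock, drop the lock to do the (possibly
 * sleeping) work, then retake the lock. A user-space sketch of that
 * drain pattern, with hypothetical item and process_item() names: */
#include <pthread.h>

struct item {
        struct item *next;
};

static struct item     *queue;
static pthread_mutex_t  queue_lock = PTHREAD_MUTEX_INITIALIZER;

extern void process_item (struct item *it);     /* the real work */

static void
drain_queue (void)
{
        struct item *it;

        pthread_mutex_lock(&queue_lock);
        while (queue != NULL) {
                it = queue;
                queue = it->next;

                /* never hold the list lock across the work: it may
                 * sleep or take other locks */
                pthread_mutex_unlock(&queue_lock);
                process_item(it);
                pthread_mutex_lock(&queue_lock);
        }
        pthread_mutex_unlock(&queue_lock);
}
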
shut down and nobody left to reap... */ - if (kibnal_data.kib_shutdown && - atomic_read(&kibnal_data.kib_nconns) == 0) - break; - - spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags); - /* careful with the jiffy wrap... */ while ((timeout = (int)(deadline - jiffies)) <= 0) { const int n = 4; const int p = 1; int chunk = kibnal_data.kib_peer_hash_size; + spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags); + /* Time to check for RDMA timeouts on a few more * peers: I do checks every 'p' seconds on a * proportion of the peer table and I need to check @@ -2901,22 +3195,27 @@ kibnal_connd (void *arg) * connection within (n+1)/n times the timeout * interval. */ - if (kibnal_tunables.kib_io_timeout > n * p) + if (*kibnal_tunables.kib_timeout > n * p) chunk = (chunk * n * p) / - kibnal_tunables.kib_io_timeout; + *kibnal_tunables.kib_timeout; if (chunk == 0) chunk = 1; for (i = 0; i < chunk; i++) { - kibnal_check_conns (peer_index); + kibnal_check_peers (peer_index); peer_index = (peer_index + 1) % kibnal_data.kib_peer_hash_size; } deadline += p * HZ; + spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags); + did_something = 1; } - kibnal_data.kib_connd_waketime = jiffies + timeout; + if (did_something) + continue; + + spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags); set_current_state (TASK_INTERRUPTIBLE); add_wait_queue (&kibnal_data.kib_connd_waitq, &wait); @@ -2938,78 +3237,149 @@ kibnal_connd (void *arg) return (0); } + +void +kibnal_hca_async_callback (void *hca_arg, IB_EVENT_RECORD *ev) +{ + /* XXX flesh out. this seems largely for async errors */ + CERROR("type: %d code: %u\n", ev->EventType, ev->EventCode); +} + +void +kibnal_hca_callback (void *hca_arg, void *cq_arg) +{ + unsigned long flags; + + spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags); + kibnal_data.kib_ready = 1; + wake_up(&kibnal_data.kib_sched_waitq); + spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags); +} + int kibnal_scheduler(void *arg) { - long id = (long)arg; - char name[16]; - kib_rx_t *rx; - kib_tx_t *tx; - unsigned long flags; - int rc; - int counter = 0; - int did_something; + long id = (long)arg; + wait_queue_t wait; + char name[16]; + FSTATUS frc; + FSTATUS frc2; + IB_WORK_COMPLETION wc; + kib_rx_t *rx; + unsigned long flags; + __u64 rxseq = 0; + int busy_loops = 0; snprintf(name, sizeof(name), "kibnal_sd_%02ld", id); - kportal_daemonize(name); - kportal_blockallsigs(); + cfs_daemonize(name); + cfs_block_allsigs(); - spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags); + init_waitqueue_entry(&wait, current); - for (;;) { - did_something = 0; + spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags); - while (!list_empty(&kibnal_data.kib_sched_txq)) { - tx = list_entry(kibnal_data.kib_sched_txq.next, - kib_tx_t, tx_list); - list_del(&tx->tx_list); + while (!kibnal_data.kib_shutdown) { + if (busy_loops++ >= IBNAL_RESCHED) { spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags); - kibnal_tx_done(tx); - spin_lock_irqsave(&kibnal_data.kib_sched_lock, - flags); + our_cond_resched(); + busy_loops = 0; + + spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags); } - if (!list_empty(&kibnal_data.kib_sched_rxq)) { - rx = list_entry(kibnal_data.kib_sched_rxq.next, - kib_rx_t, rx_list); - list_del(&rx->rx_list); + if (kibnal_data.kib_ready && + !kibnal_data.kib_checking_cq) { + /* take ownership of completion polling */ + kibnal_data.kib_checking_cq = 1; + /* Assume I'll exhaust the CQ */ + kibnal_data.kib_ready = 0; spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags); + + 
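                        /* Exactly one scheduler polls the CQ at a time:
                         * kib_checking_cq was claimed above under
                         * kib_sched_lock, and kib_ready was cleared on
                         * the optimistic assumption that this poll will
                         * drain the CQ; the HCA completion callback sets
                         * it again if more work arrives meanwhile. */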
frc = iba_poll_cq(kibnal_data.kib_cq, &wc); + if (frc == FNOT_DONE) { + /* CQ empty */ + frc2 = iba_rearm_cq(kibnal_data.kib_cq, + CQEventSelNextWC); + LASSERT (frc2 == FSUCCESS); + } + + if (frc == FSUCCESS && + kibnal_wreqid2type(wc.WorkReqId) == IBNAL_WID_RX) { + rx = (kib_rx_t *)kibnal_wreqid2ptr(wc.WorkReqId); + + /* Grab the RX sequence number NOW before + * anyone else can get an RX completion */ + rxseq = rx->rx_conn->ibc_rxseq++; + } + + spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags); + /* give up ownership of completion polling */ + kibnal_data.kib_checking_cq = 0; - kibnal_rx(rx); + if (frc == FNOT_DONE) + continue; - did_something = 1; - spin_lock_irqsave(&kibnal_data.kib_sched_lock, - flags); - } + LASSERT (frc == FSUCCESS); + /* Assume there's more: get another scheduler to check + * while I handle this completion... */ - /* shut down and no receives to complete... */ - if (kibnal_data.kib_shutdown && - atomic_read(&kibnal_data.kib_nconns) == 0) - break; + kibnal_data.kib_ready = 1; + wake_up(&kibnal_data.kib_sched_waitq); - /* nothing to do or hogging CPU */ - if (!did_something || counter++ == IBNAL_RESCHED) { spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags); - counter = 0; - - if (!did_something) { - rc = wait_event_interruptible( - kibnal_data.kib_sched_waitq, - !list_empty(&kibnal_data.kib_sched_txq) || - !list_empty(&kibnal_data.kib_sched_rxq) || - (kibnal_data.kib_shutdown && - atomic_read (&kibnal_data.kib_nconns) == 0)); - } else { - our_cond_resched(); - } - spin_lock_irqsave(&kibnal_data.kib_sched_lock, - flags); + switch (kibnal_wreqid2type(wc.WorkReqId)) { + case IBNAL_WID_RX: + kibnal_rx_complete(&wc, rxseq); + break; + + case IBNAL_WID_TX: + kibnal_tx_complete(&wc); + break; + + case IBNAL_WID_RDMA: + /* We only get RDMA completion notification if + * it fails. So we just ignore them completely + * because... + * + * 1) If an RDMA fails, all subsequent work + * items, including the final SEND will fail + * too, so I'm still guaranteed to notice that + * this connection is hosed. + * + * 2) It's positively dangerous to look inside + * the tx descriptor obtained from an RDMA work + * item. As soon as I drop the kib_sched_lock, + * I give a scheduler on another CPU a chance + * to get the final SEND completion, so the tx + * descriptor can get freed as I inspect it. */ + CERROR ("RDMA failed: %d\n", wc.Status); + break; + + default: + LBUG(); + } + + spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags); + continue; } + + /* Nothing to do; sleep... */ + + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue_exclusive(&kibnal_data.kib_sched_waitq, &wait); + spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, + flags); + + schedule(); + + remove_wait_queue(&kibnal_data.kib_sched_waitq, &wait); + set_current_state(TASK_RUNNING); + spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags); } spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags); @@ -3017,13 +3387,3 @@ kibnal_scheduler(void *arg) kibnal_thread_fini(); return (0); } - - -lib_nal_t kibnal_lib = { - libnal_data: &kibnal_data, /* NAL private data */ - libnal_send: kibnal_send, - libnal_send_pages: kibnal_send_pages, - libnal_recv: kibnal_recv, - libnal_recv_pages: kibnal_recv_pages, - libnal_dist: kibnal_dist -};
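
/* The scheduler above demultiplexes a single CQ by tagging each 64-bit
 * work request id with the descriptor type in its low bits
 * (IBNAL_WID_RX/TX/RDMA) and recovering the pointer and type on
 * completion. A standalone sketch of that encoding, assuming
 * descriptors are at least 4-byte aligned; the real kibnal_ptr2wreqid(),
 * kibnal_wreqid2type() and kibnal_wreqid2ptr() helpers may differ in
 * detail: */
#include <assert.h>
#include <stdint.h>

#define WID_RX   0
#define WID_TX   1
#define WID_RDMA 2
#define WID_MASK 3ULL

static uint64_t
ptr2wreqid (void *ptr, int type)
{
        uint64_t id = (uint64_t)(uintptr_t)ptr;

        assert((id & WID_MASK) == 0);   /* alignment keeps low bits free */
        return id | (uint64_t)type;
}

static int
wreqid2type (uint64_t wreqid)
{
        return (int)(wreqid & WID_MASK);
}

static void *
wreqid2ptr (uint64_t wreqid)
{
        return (void *)(uintptr_t)(wreqid & ~WID_MASK);
}
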