X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=blobdiff_plain;f=lnet%2Fklnds%2Fo2iblnd%2Fo2iblnd_cb.c;h=e22f9464c8355f47bec5f9c293e354f48a27391f;hp=674f66c04b5861a3e23fb5ad302833414a4ea87c;hb=f9d837b479232bfc4f271f23cd3729ca67cb6c1d;hpb=5c883ea2748ae9e430a9cd863a9b630b2a74440a diff --git a/lnet/klnds/o2iblnd/o2iblnd_cb.c b/lnet/klnds/o2iblnd/o2iblnd_cb.c index 674f66c0..e22f946 100644 --- a/lnet/klnds/o2iblnd/o2iblnd_cb.c +++ b/lnet/klnds/o2iblnd/o2iblnd_cb.c @@ -588,6 +588,7 @@ kiblnd_fmr_map_tx(struct kib_net *net, struct kib_tx *tx, return -EPROTONOSUPPORT; } +#ifdef HAVE_FMR_POOL_API /* * FMR does not support gaps but the tx has gaps then * we should make sure that the number of fragments we'll be sending @@ -606,11 +607,13 @@ kiblnd_fmr_map_tx(struct kib_net *net, struct kib_tx *tx, return -EFBIG; } } +#endif fps = net->ibn_fmr_ps[cpt]; rc = kiblnd_fmr_pool_map(fps, tx, rd, nob, 0, &tx->tx_fmr); if (rc != 0) { - CERROR("Can't map %u pages: %d\n", nob, rc); + CERROR("Can't map %u bytes (%u/%u)s: %d\n", nob, + tx->tx_nfrags, rd->rd_nfrags, rc); return rc; } @@ -623,11 +626,17 @@ kiblnd_fmr_map_tx(struct kib_net *net, struct kib_tx *tx, * for FastReg or FMR with no gaps we can accumulate all * the fragments in one FastReg or FMR fragment. */ - if (((dev->ibd_dev_caps & IBLND_DEV_CAPS_FMR_ENABLED) && !tx->tx_gaps) || + if ( +#ifdef HAVE_FMR_POOL_API + ((dev->ibd_dev_caps & IBLND_DEV_CAPS_FMR_ENABLED) + && !tx->tx_gaps) || +#endif (dev->ibd_dev_caps & IBLND_DEV_CAPS_FASTREG_ENABLED)) { /* FMR requires zero based address */ +#ifdef HAVE_FMR_POOL_API if (dev->ibd_dev_caps & IBLND_DEV_CAPS_FMR_ENABLED) rd->rd_frags[0].rf_addr &= ~hdev->ibh_page_mask; +#endif rd->rd_frags[0].rf_nob = nob; rd->rd_nfrags = 1; } else { @@ -648,7 +657,11 @@ kiblnd_fmr_map_tx(struct kib_net *net, struct kib_tx *tx, static void kiblnd_unmap_tx(struct kib_tx *tx) { - if (tx->tx_fmr.fmr_pfmr || tx->tx_fmr.fmr_frd) + if ( +#ifdef HAVE_FMR_POOL_API + tx->tx_fmr.fmr_pfmr || +#endif + tx->tx_fmr.fmr_frd) kiblnd_fmr_pool_unmap(&tx->tx_fmr, tx->tx_status); if (tx->tx_nfrags != 0) { @@ -675,8 +688,11 @@ kiblnd_find_rd_dma_mr(struct lnet_ni *ni, struct kib_rdma_desc *rd) * dead in the water and fail the operation. 
*/ if (tunables->lnd_map_on_demand && - (net->ibn_dev->ibd_dev_caps & IBLND_DEV_CAPS_FASTREG_ENABLED || - net->ibn_dev->ibd_dev_caps & IBLND_DEV_CAPS_FMR_ENABLED)) + (net->ibn_dev->ibd_dev_caps & IBLND_DEV_CAPS_FASTREG_ENABLED +#ifdef HAVE_FMR_POOL_API + || net->ibn_dev->ibd_dev_caps & IBLND_DEV_CAPS_FMR_ENABLED +#endif + )) return NULL; /* @@ -730,83 +746,9 @@ static int kiblnd_map_tx(struct lnet_ni *ni, struct kib_tx *tx, return -EINVAL; } -static int kiblnd_setup_rd_iov(struct lnet_ni *ni, struct kib_tx *tx, - struct kib_rdma_desc *rd, unsigned int niov, - struct kvec *iov, int offset, int nob) -{ - struct kib_net *net = ni->ni_data; - struct page *page; - struct scatterlist *sg; - unsigned long vaddr; - int fragnob; - int page_offset; - unsigned int max_niov; - - LASSERT (nob > 0); - LASSERT (niov > 0); - LASSERT (net != NULL); - - while (offset >= iov->iov_len) { - offset -= iov->iov_len; - niov--; - iov++; - LASSERT (niov > 0); - } - - max_niov = niov; - - sg = tx->tx_frags; - do { - LASSERT(niov > 0); - - vaddr = ((unsigned long)iov->iov_base) + offset; - page_offset = vaddr & (PAGE_SIZE - 1); - page = lnet_kvaddr_to_page(vaddr); - if (page == NULL) { - CERROR("Can't find page\n"); - return -EFAULT; - } - - fragnob = min((int)(iov->iov_len - offset), nob); - fragnob = min(fragnob, (int)PAGE_SIZE - page_offset); - - /* - * We're allowed to start at a non-aligned page offset in - * the first fragment and end at a non-aligned page offset - * in the last fragment. - */ - if ((fragnob < (int)PAGE_SIZE - page_offset) && - (niov < max_niov) && nob > fragnob) { - CDEBUG(D_NET, "fragnob %d < available page %d: with" - " remaining %d iovs with %d nob left\n", - fragnob, (int)PAGE_SIZE - page_offset, niov, - nob); - tx->tx_gaps = true; - } - - sg_set_page(sg, page, fragnob, page_offset); - sg = sg_next(sg); - if (!sg) { - CERROR("lacking enough sg entries to map tx\n"); - return -EFAULT; - } - - if (offset + fragnob < iov->iov_len) { - offset += fragnob; - } else { - offset = 0; - iov++; - niov--; - } - nob -= fragnob; - } while (nob > 0); - - return kiblnd_map_tx(ni, tx, rd, sg - tx->tx_frags); -} - static int kiblnd_setup_rd_kiov(struct lnet_ni *ni, struct kib_tx *tx, struct kib_rdma_desc *rd, int nkiov, - lnet_kiov_t *kiov, int offset, int nob) + struct bio_vec *kiov, int offset, int nob) { struct kib_net *net = ni->ni_data; struct scatterlist *sg; @@ -819,8 +761,8 @@ static int kiblnd_setup_rd_kiov(struct lnet_ni *ni, struct kib_tx *tx, LASSERT(nkiov > 0); LASSERT(net != NULL); - while (offset >= kiov->kiov_len) { - offset -= kiov->kiov_len; + while (offset >= kiov->bv_len) { + offset -= kiov->bv_len; nkiov--; kiov++; LASSERT(nkiov > 0); @@ -832,24 +774,24 @@ static int kiblnd_setup_rd_kiov(struct lnet_ni *ni, struct kib_tx *tx, do { LASSERT(nkiov > 0); - fragnob = min((int)(kiov->kiov_len - offset), nob); + fragnob = min((int)(kiov->bv_len - offset), nob); /* * We're allowed to start at a non-aligned page offset in * the first fragment and end at a non-aligned page offset * in the last fragment. 
*/ - if ((fragnob < (int)(kiov->kiov_len - offset)) && + if ((fragnob < (int)(kiov->bv_len - offset)) && nkiov < max_nkiov && nob > fragnob) { CDEBUG(D_NET, "fragnob %d < available page %d: with" " remaining %d kiovs with %d nob left\n", - fragnob, (int)(kiov->kiov_len - offset), + fragnob, (int)(kiov->bv_len - offset), nkiov, nob); tx->tx_gaps = true; } - sg_set_page(sg, kiov->kiov_page, fragnob, - kiov->kiov_offset + offset); + sg_set_page(sg, kiov->bv_page, fragnob, + kiov->bv_offset + offset); sg = sg_next(sg); if (!sg) { CERROR("lacking enough sg entries to map tx\n"); @@ -1094,24 +1036,28 @@ kiblnd_check_sends_locked(struct kib_conn *conn) static void kiblnd_tx_complete(struct kib_tx *tx, int status) { - int failed = (status != IB_WC_SUCCESS); + int failed = (status != IB_WC_SUCCESS); struct kib_conn *conn = tx->tx_conn; - int idle; + int idle; - LASSERT (tx->tx_sending > 0); + if (tx->tx_sending <= 0) { + CERROR("Received an event on a freed tx: %p status %d\n", + tx, tx->tx_status); + return; + } - if (failed) { - if (conn->ibc_state == IBLND_CONN_ESTABLISHED) + if (failed) { + if (conn->ibc_state == IBLND_CONN_ESTABLISHED) CNETERR("Tx -> %s cookie %#llx" - " sending %d waiting %d: failed %d\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid), - tx->tx_cookie, tx->tx_sending, tx->tx_waiting, - status); + " sending %d waiting %d: failed %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), + tx->tx_cookie, tx->tx_sending, tx->tx_waiting, + status); - kiblnd_close_conn(conn, -EIO); - } else { - kiblnd_peer_alive(conn->ibc_peer); - } + kiblnd_close_conn(conn, -EIO); + } else { + kiblnd_peer_alive(conn->ibc_peer); + } spin_lock(&conn->ibc_lock); @@ -1308,7 +1254,7 @@ kiblnd_queue_tx_locked(struct kib_tx *tx, struct kib_conn *conn) return; } - timeout_ns = lnet_get_lnd_timeout() * NSEC_PER_SEC; + timeout_ns = kiblnd_timeout() * NSEC_PER_SEC; tx->tx_queued = 1; tx->tx_deadline = ktime_add_ns(ktime_get(), timeout_ns); @@ -1362,10 +1308,11 @@ kiblnd_queue_tx(struct kib_tx *tx, struct kib_conn *conn) spin_unlock(&conn->ibc_lock); } -static int kiblnd_resolve_addr(struct rdma_cm_id *cmid, - struct sockaddr_in *srcaddr, - struct sockaddr_in *dstaddr, - int timeout_ms) +static int +kiblnd_resolve_addr_cap(struct rdma_cm_id *cmid, + struct sockaddr_in *srcaddr, + struct sockaddr_in *dstaddr, + int timeout_ms) { unsigned short port; int rc; @@ -1395,8 +1342,36 @@ static int kiblnd_resolve_addr(struct rdma_cm_id *cmid, } } - CERROR("Failed to bind to a free privileged port\n"); - return rc; + CERROR("cannot bind to a free privileged port: rc = %d\n", rc); + + return rc; +} + +static int +kiblnd_resolve_addr(struct rdma_cm_id *cmid, + struct sockaddr_in *srcaddr, + struct sockaddr_in *dstaddr, + int timeout_ms) +{ + const struct cred *old_creds = NULL; + struct cred *new_creds; + int rc; + + if (!capable(CAP_NET_BIND_SERVICE)) { + new_creds = prepare_kernel_cred(NULL); + if (!new_creds) + return -ENOMEM; + + cap_raise(new_creds->cap_effective, CAP_NET_BIND_SERVICE); + old_creds = override_creds(new_creds); + } + + rc = kiblnd_resolve_addr_cap(cmid, srcaddr, dstaddr, timeout_ms); + + if (old_creds) + revert_creds(old_creds); + + return rc; } static void @@ -1437,12 +1412,12 @@ kiblnd_connect_peer(struct kib_peer_ni *peer_ni) if (*kiblnd_tunables.kib_use_priv_port) { rc = kiblnd_resolve_addr(cmid, &srcaddr, &dstaddr, - lnet_get_lnd_timeout() * 1000); + kiblnd_timeout() * 1000); } else { rc = rdma_resolve_addr(cmid, (struct sockaddr *)&srcaddr, (struct sockaddr *)&dstaddr, - lnet_get_lnd_timeout() * 1000); + 
kiblnd_timeout() * 1000); } if (rc != 0) { /* Can't initiate address resolution: */ @@ -1519,47 +1494,49 @@ kiblnd_launch_tx(struct lnet_ni *ni, struct kib_tx *tx, lnet_nid_t nid) struct kib_peer_ni *peer2; struct kib_conn *conn; rwlock_t *g_lock = &kiblnd_data.kib_global_lock; - unsigned long flags; - int rc; - int i; + unsigned long flags; + int rc; + int i; struct lnet_ioctl_config_o2iblnd_tunables *tunables; - /* If I get here, I've committed to send, so I complete the tx with - * failure on any problems */ + /* If I get here, I've committed to send, so I complete the tx with + * failure on any problems + */ - LASSERT (tx == NULL || tx->tx_conn == NULL); /* only set when assigned a conn */ - LASSERT (tx == NULL || tx->tx_nwrq > 0); /* work items have been set up */ + LASSERT(!tx || !tx->tx_conn); /* only set when assigned a conn */ + LASSERT(!tx || tx->tx_nwrq > 0); /* work items have been set up */ - /* First time, just use a read lock since I expect to find my peer_ni - * connected */ + /* First time, just use a read lock since I expect to find my peer_ni + * connected + */ read_lock_irqsave(g_lock, flags); - peer_ni = kiblnd_find_peer_locked(ni, nid); + peer_ni = kiblnd_find_peer_locked(ni, nid); if (peer_ni != NULL && !list_empty(&peer_ni->ibp_conns)) { - /* Found a peer_ni with an established connection */ - conn = kiblnd_get_conn_locked(peer_ni); - kiblnd_conn_addref(conn); /* 1 ref for me... */ + /* Found a peer_ni with an established connection */ + conn = kiblnd_get_conn_locked(peer_ni); + kiblnd_conn_addref(conn); /* 1 ref for me... */ read_unlock_irqrestore(g_lock, flags); - if (tx != NULL) - kiblnd_queue_tx(tx, conn); - kiblnd_conn_decref(conn); /* ...to here */ - return; - } + if (tx != NULL) + kiblnd_queue_tx(tx, conn); + kiblnd_conn_decref(conn); /* ...to here */ + return; + } read_unlock(g_lock); /* Re-try with a write lock */ write_lock(g_lock); - peer_ni = kiblnd_find_peer_locked(ni, nid); - if (peer_ni != NULL) { + peer_ni = kiblnd_find_peer_locked(ni, nid); + if (peer_ni != NULL) { if (list_empty(&peer_ni->ibp_conns)) { - /* found a peer_ni, but it's still connecting... */ + /* found a peer_ni, but it's still connecting... */ LASSERT(kiblnd_peer_connecting(peer_ni)); - if (tx != NULL) + if (tx != NULL) list_add_tail(&tx->tx_list, - &peer_ni->ibp_tx_queue); + &peer_ni->ibp_tx_queue); write_unlock_irqrestore(g_lock, flags); } else { conn = kiblnd_get_conn_locked(peer_ni); @@ -1567,12 +1544,12 @@ kiblnd_launch_tx(struct lnet_ni *ni, struct kib_tx *tx, lnet_nid_t nid) write_unlock_irqrestore(g_lock, flags); - if (tx != NULL) - kiblnd_queue_tx(tx, conn); - kiblnd_conn_decref(conn); /* ...to here */ - } - return; - } + if (tx != NULL) + kiblnd_queue_tx(tx, conn); + kiblnd_conn_decref(conn); /* ...to here */ + } + return; + } write_unlock_irqrestore(g_lock, flags); @@ -1591,14 +1568,14 @@ kiblnd_launch_tx(struct lnet_ni *ni, struct kib_tx *tx, lnet_nid_t nid) write_lock_irqsave(g_lock, flags); - peer2 = kiblnd_find_peer_locked(ni, nid); - if (peer2 != NULL) { + peer2 = kiblnd_find_peer_locked(ni, nid); + if (peer2 != NULL) { if (list_empty(&peer2->ibp_conns)) { - /* found a peer_ni, but it's still connecting... */ + /* found a peer_ni, but it's still connecting... 
*/ LASSERT(kiblnd_peer_connecting(peer2)); - if (tx != NULL) + if (tx != NULL) list_add_tail(&tx->tx_list, - &peer2->ibp_tx_queue); + &peer2->ibp_tx_queue); write_unlock_irqrestore(g_lock, flags); } else { conn = kiblnd_get_conn_locked(peer2); @@ -1606,14 +1583,14 @@ kiblnd_launch_tx(struct lnet_ni *ni, struct kib_tx *tx, lnet_nid_t nid) write_unlock_irqrestore(g_lock, flags); - if (tx != NULL) - kiblnd_queue_tx(tx, conn); - kiblnd_conn_decref(conn); /* ...to here */ - } + if (tx != NULL) + kiblnd_queue_tx(tx, conn); + kiblnd_conn_decref(conn); /* ...to here */ + } - kiblnd_peer_decref(peer_ni); - return; - } + kiblnd_peer_decref(peer_ni); + return; + } /* Brand new peer_ni */ LASSERT(peer_ni->ibp_connecting == 0); @@ -1626,14 +1603,14 @@ kiblnd_launch_tx(struct lnet_ni *ni, struct kib_tx *tx, lnet_nid_t nid) if (tx != NULL) list_add_tail(&tx->tx_list, &peer_ni->ibp_tx_queue); - kiblnd_peer_addref(peer_ni); - list_add_tail(&peer_ni->ibp_list, kiblnd_nid2peerlist(nid)); + kiblnd_peer_addref(peer_ni); + hash_add(kiblnd_data.kib_peers, &peer_ni->ibp_list, nid); write_unlock_irqrestore(g_lock, flags); for (i = 0; i < tunables->lnd_conns_per_peer; i++) kiblnd_connect_peer(peer_ni); - kiblnd_peer_decref(peer_ni); + kiblnd_peer_decref(peer_ni); } int @@ -1645,8 +1622,7 @@ kiblnd_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg) int target_is_router = lntmsg->msg_target_is_router; int routing = lntmsg->msg_routing; unsigned int payload_niov = lntmsg->msg_niov; - struct kvec *payload_iov = lntmsg->msg_iov; - lnet_kiov_t *payload_kiov = lntmsg->msg_kiov; + struct bio_vec *payload_kiov = lntmsg->msg_kiov; unsigned int payload_offset = lntmsg->msg_offset; unsigned int payload_nob = lntmsg->msg_len; struct kib_msg *ibmsg; @@ -1665,8 +1641,6 @@ kiblnd_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg) /* Thread context */ LASSERT (!in_interrupt()); - /* payload is either all vaddrs or all pages */ - LASSERT (!(payload_kiov != NULL && payload_iov != NULL)); switch (type) { default: @@ -1695,16 +1669,10 @@ kiblnd_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg) ibmsg = tx->tx_msg; rd = &ibmsg->ibm_u.get.ibgm_rd; - if ((lntmsg->msg_md->md_options & LNET_MD_KIOV) == 0) - rc = kiblnd_setup_rd_iov(ni, tx, rd, - lntmsg->msg_md->md_niov, - lntmsg->msg_md->md_iov.iov, - 0, lntmsg->msg_md->md_length); - else - rc = kiblnd_setup_rd_kiov(ni, tx, rd, - lntmsg->msg_md->md_niov, - lntmsg->msg_md->md_iov.kiov, - 0, lntmsg->msg_md->md_length); + rc = kiblnd_setup_rd_kiov(ni, tx, rd, + lntmsg->msg_md->md_niov, + lntmsg->msg_md->md_kiov, + 0, lntmsg->msg_md->md_length); if (rc != 0) { CERROR("Can't setup GET sink for %s: %d\n", libcfs_nid2str(target.nid), rc); @@ -1747,14 +1715,9 @@ kiblnd_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg) return -ENOMEM; } - if (payload_kiov == NULL) - rc = kiblnd_setup_rd_iov(ni, tx, tx->tx_rd, - payload_niov, payload_iov, - payload_offset, payload_nob); - else - rc = kiblnd_setup_rd_kiov(ni, tx, tx->tx_rd, - payload_niov, payload_kiov, - payload_offset, payload_nob); + rc = kiblnd_setup_rd_kiov(ni, tx, tx->tx_rd, + payload_niov, payload_kiov, + payload_offset, payload_nob); if (rc != 0) { CERROR("Can't setup PUT src for %s: %d\n", libcfs_nid2str(target.nid), rc); @@ -1788,16 +1751,11 @@ kiblnd_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg) ibmsg = tx->tx_msg; ibmsg->ibm_u.immediate.ibim_hdr = *hdr; - if (payload_kiov != NULL) - lnet_copy_kiov2flat(IBLND_MSG_SIZE, ibmsg, - offsetof(struct kib_msg, 
ibm_u.immediate.ibim_payload), - payload_niov, payload_kiov, - payload_offset, payload_nob); - else - lnet_copy_iov2flat(IBLND_MSG_SIZE, ibmsg, - offsetof(struct kib_msg, ibm_u.immediate.ibim_payload), - payload_niov, payload_iov, - payload_offset, payload_nob); + lnet_copy_kiov2flat(IBLND_MSG_SIZE, ibmsg, + offsetof(struct kib_msg, + ibm_u.immediate.ibim_payload), + payload_niov, payload_kiov, + payload_offset, payload_nob); nob = offsetof(struct kib_immediate_msg, ibim_payload[payload_nob]); kiblnd_init_tx_msg(ni, tx, IBLND_MSG_IMMEDIATE, nob); @@ -1811,13 +1769,12 @@ static void kiblnd_reply(struct lnet_ni *ni, struct kib_rx *rx, struct lnet_msg *lntmsg) { struct lnet_process_id target = lntmsg->msg_target; - unsigned int niov = lntmsg->msg_niov; - struct kvec *iov = lntmsg->msg_iov; - lnet_kiov_t *kiov = lntmsg->msg_kiov; - unsigned int offset = lntmsg->msg_offset; - unsigned int nob = lntmsg->msg_len; + unsigned int niov = lntmsg->msg_niov; + struct bio_vec *kiov = lntmsg->msg_kiov; + unsigned int offset = lntmsg->msg_offset; + unsigned int nob = lntmsg->msg_len; struct kib_tx *tx; - int rc; + int rc; tx = kiblnd_get_idle_tx(ni, rx->rx_conn->ibc_peer->ibp_nid); if (tx == NULL) { @@ -1828,9 +1785,6 @@ kiblnd_reply(struct lnet_ni *ni, struct kib_rx *rx, struct lnet_msg *lntmsg) if (nob == 0) rc = 0; - else if (kiov == NULL) - rc = kiblnd_setup_rd_iov(ni, tx, tx->tx_rd, - niov, iov, offset, nob); else rc = kiblnd_setup_rd_kiov(ni, tx, tx->tx_rd, niov, kiov, offset, nob); @@ -1873,7 +1827,7 @@ failed_0: int kiblnd_recv(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg, - int delayed, unsigned int niov, struct kvec *iov, lnet_kiov_t *kiov, + int delayed, unsigned int niov, struct bio_vec *kiov, unsigned int offset, unsigned int mlen, unsigned int rlen) { struct kib_rx *rx = private; @@ -1887,8 +1841,6 @@ kiblnd_recv(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg, LASSERT (mlen <= rlen); LASSERT (!in_interrupt()); - /* Either all pages or all vaddrs */ - LASSERT (!(kiov != NULL && iov != NULL)); switch (rxmsg->ibm_type) { default: @@ -1904,16 +1856,11 @@ kiblnd_recv(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg, break; } - if (kiov != NULL) - lnet_copy_flat2kiov(niov, kiov, offset, - IBLND_MSG_SIZE, rxmsg, - offsetof(struct kib_msg, ibm_u.immediate.ibim_payload), - mlen); - else - lnet_copy_flat2iov(niov, iov, offset, - IBLND_MSG_SIZE, rxmsg, - offsetof(struct kib_msg, ibm_u.immediate.ibim_payload), - mlen); + lnet_copy_flat2kiov(niov, kiov, offset, + IBLND_MSG_SIZE, rxmsg, + offsetof(struct kib_msg, + ibm_u.immediate.ibim_payload), + mlen); lnet_finalize(lntmsg, 0); break; @@ -1940,12 +1887,8 @@ kiblnd_recv(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg, txmsg = tx->tx_msg; rd = &txmsg->ibm_u.putack.ibpam_rd; - if (kiov == NULL) - rc = kiblnd_setup_rd_iov(ni, tx, rd, - niov, iov, offset, mlen); - else - rc = kiblnd_setup_rd_kiov(ni, tx, rd, - niov, kiov, offset, mlen); + rc = kiblnd_setup_rd_kiov(ni, tx, rd, + niov, kiov, offset, mlen); if (rc != 0) { CERROR("Can't setup PUT sink for %s: %d\n", libcfs_nid2str(conn->ibc_peer->ibp_nid), rc); @@ -1992,7 +1935,7 @@ kiblnd_recv(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg, int kiblnd_thread_start(int (*fn)(void *arg), void *arg, char *name) { - struct task_struct *task = kthread_run(fn, arg, name); + struct task_struct *task = kthread_run(fn, arg, "%s", name); if (IS_ERR(task)) return PTR_ERR(task); @@ -2150,15 +2093,12 @@ void kiblnd_abort_txs(struct kib_conn *conn, struct list_head 
*txs) { LIST_HEAD(zombies); - struct list_head *tmp; - struct list_head *nxt; + struct kib_tx *nxt; struct kib_tx *tx; spin_lock(&conn->ibc_lock); - list_for_each_safe(tmp, nxt, txs) { - tx = list_entry(tmp, struct kib_tx, tx_list); - + list_for_each_entry_safe(tx, nxt, txs, tx_list) { if (txs == &conn->ibc_active_txs) { LASSERT(!tx->tx_queued); LASSERT(tx->tx_waiting || @@ -2195,6 +2135,10 @@ kiblnd_abort_txs(struct kib_conn *conn, struct list_head *txs) if (tx->tx_sending == 0) { tx->tx_queued = 0; list_move(&tx->tx_list, &zombies); + } else { + /* keep tx until cq destroy */ + list_move(&tx->tx_list, &conn->ibc_zombie_txs); + conn->ibc_waits ++; } } @@ -2209,6 +2153,31 @@ kiblnd_abort_txs(struct kib_conn *conn, struct list_head *txs) kiblnd_txlist_done(&zombies, -ECONNABORTED, LNET_MSG_STATUS_OK); } +static bool +kiblnd_tx_may_discard(struct kib_conn *conn) +{ + bool rc = false; + struct kib_tx *nxt; + struct kib_tx *tx; + + spin_lock(&conn->ibc_lock); + + list_for_each_entry_safe(tx, nxt, &conn->ibc_zombie_txs, tx_list) { + if (tx->tx_sending > 0 && tx->tx_lntmsg[0] && + lnet_md_discarded(tx->tx_lntmsg[0]->msg_md)) { + tx->tx_sending --; + if (tx->tx_sending == 0) { + kiblnd_conn_decref(tx->tx_conn); + tx->tx_conn = NULL; + rc = true; + } + } + } + + spin_unlock(&conn->ibc_lock); + return rc; +} + static void kiblnd_finalise_conn(struct kib_conn *conn) { @@ -2239,10 +2208,11 @@ kiblnd_peer_connect_failed(struct kib_peer_ni *peer_ni, int active, int error) { LIST_HEAD(zombies); - unsigned long flags; + unsigned long flags; + enum lnet_msg_hstatus hstatus; - LASSERT (error != 0); - LASSERT (!in_interrupt()); + LASSERT(error != 0); + LASSERT(!in_interrupt()); write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); @@ -2285,8 +2255,20 @@ kiblnd_peer_connect_failed(struct kib_peer_ni *peer_ni, int active, CNETERR("Deleting messages for %s: connection failed\n", libcfs_nid2str(peer_ni->ibp_nid)); - kiblnd_txlist_done(&zombies, error, - LNET_MSG_STATUS_LOCAL_DROPPED); + switch (error) { + case -EHOSTUNREACH: + case -ETIMEDOUT: + hstatus = LNET_MSG_STATUS_NETWORK_TIMEOUT; + break; + case -ECONNREFUSED: + hstatus = LNET_MSG_STATUS_REMOTE_DROPPED; + break; + default: + hstatus = LNET_MSG_STATUS_LOCAL_DROPPED; + break; + } + + kiblnd_txlist_done(&zombies, error, hstatus); } static void @@ -2310,22 +2292,25 @@ kiblnd_connreq_done(struct kib_conn *conn, int status) (conn->ibc_state == IBLND_CONN_PASSIVE_WAIT && peer_ni->ibp_accepting > 0)); - LIBCFS_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars)); - conn->ibc_connvars = NULL; + LIBCFS_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars)); + conn->ibc_connvars = NULL; - if (status != 0) { - /* failed to establish connection */ - kiblnd_peer_connect_failed(peer_ni, active, status); - kiblnd_finalise_conn(conn); - return; - } + if (status != 0) { + /* failed to establish connection */ + kiblnd_peer_connect_failed(peer_ni, active, status); + kiblnd_finalise_conn(conn); + return; + } - /* connection established */ + /* connection established */ write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + /* reset retry count */ + peer_ni->ibp_retries = 0; + conn->ibc_last_send = ktime_get(); - kiblnd_set_conn_state(conn, IBLND_CONN_ESTABLISHED); - kiblnd_peer_alive(peer_ni); + kiblnd_set_conn_state(conn, IBLND_CONN_ESTABLISHED); + kiblnd_peer_alive(peer_ni); /* Add conn to peer_ni's list and nuke any dangling conns from a different * peer_ni instance... 
*/ @@ -2399,7 +2384,11 @@ kiblnd_reject(struct rdma_cm_id *cmid, struct kib_rej *rej) { int rc; +#ifdef HAVE_RDMA_REJECT_4ARGS + rc = rdma_reject(cmid, rej, sizeof(*rej), IB_CM_REJ_CONSUMER_DEFINED); +#else rc = rdma_reject(cmid, rej, sizeof(*rej)); +#endif if (rc != 0) CWARN("Error %d sending reject\n", rc); @@ -2408,7 +2397,7 @@ kiblnd_reject(struct rdma_cm_id *cmid, struct kib_rej *rej) static int kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob) { - rwlock_t *g_lock = &kiblnd_data.kib_global_lock; + rwlock_t *g_lock = &kiblnd_data.kib_global_lock; struct kib_msg *reqmsg = priv; struct kib_msg *ackmsg; struct kib_dev *ibdev; @@ -2417,27 +2406,27 @@ kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob) struct kib_conn *conn; struct lnet_ni *ni = NULL; struct kib_net *net = NULL; - lnet_nid_t nid; - struct rdma_conn_param cp; + lnet_nid_t nid; + struct rdma_conn_param cp; struct kib_rej rej; - int version = IBLND_MSG_VERSION; - unsigned long flags; - int rc; - struct sockaddr_in *peer_addr; - LASSERT (!in_interrupt()); + int version = IBLND_MSG_VERSION; + unsigned long flags; + int rc; + struct sockaddr_in *peer_addr; + LASSERT(!in_interrupt()); /* cmid inherits 'context' from the corresponding listener id */ ibdev = cmid->context; LASSERT(ibdev); - memset(&rej, 0, sizeof(rej)); - rej.ibr_magic = IBLND_MSG_MAGIC; - rej.ibr_why = IBLND_REJECT_FATAL; - rej.ibr_cp.ibcp_max_msg_size = IBLND_MSG_SIZE; + memset(&rej, 0, sizeof(rej)); + rej.ibr_magic = IBLND_MSG_MAGIC; + rej.ibr_why = IBLND_REJECT_FATAL; + rej.ibr_cp.ibcp_max_msg_size = IBLND_MSG_SIZE; - peer_addr = (struct sockaddr_in *)&(cmid->route.addr.dst_addr); - if (*kiblnd_tunables.kib_require_priv_port && - ntohs(peer_addr->sin_port) >= PROT_SOCK) { + peer_addr = (struct sockaddr_in *)&(cmid->route.addr.dst_addr); + if (*kiblnd_tunables.kib_require_priv_port && + ntohs(peer_addr->sin_port) >= PROT_SOCK) { __u32 ip = ntohl(peer_addr->sin_addr.s_addr); CERROR("peer_ni's port (%pI4h:%hu) is not privileged\n", &ip, ntohs(peer_addr->sin_port)); @@ -2484,17 +2473,16 @@ kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob) if (ni == NULL || /* no matching net */ ni->ni_nid != reqmsg->ibm_dstnid || /* right NET, wrong NID! */ net->ibn_dev != ibdev) { /* wrong device */ - CERROR("Can't accept conn from %s on %s (%s:%d:%pI4h): " - "bad dst nid %s\n", libcfs_nid2str(nid), - ni == NULL ? "NA" : libcfs_nid2str(ni->ni_nid), + CERROR("Can't accept conn from %s on %s (%s:%d:%pI4h): bad dst nid %s\n", libcfs_nid2str(nid), + ni ? 
libcfs_nid2str(ni->ni_nid) : "NA", ibdev->ibd_ifname, ibdev->ibd_nnets, - &ibdev->ibd_ifip, + &ibdev->ibd_ifip, libcfs_nid2str(reqmsg->ibm_dstnid)); goto failed; } - /* check time stamp as soon as possible */ + /* check time stamp as soon as possible */ if (reqmsg->ibm_dststamp != 0 && reqmsg->ibm_dststamp != net->ibn_incarnation) { CWARN("Stale connection request\n"); @@ -2513,8 +2501,7 @@ kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob) if (reqmsg->ibm_u.connparams.ibcp_queue_depth > kiblnd_msg_queue_size(version, ni)) { - CERROR("Can't accept conn from %s, queue depth too large: " - " %d (<=%d wanted)\n", + CERROR("Can't accept conn from %s, queue depth too large: %d (<=%d wanted)\n", libcfs_nid2str(nid), reqmsg->ibm_u.connparams.ibcp_queue_depth, kiblnd_msg_queue_size(version, ni)); @@ -2527,8 +2514,7 @@ kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob) if (reqmsg->ibm_u.connparams.ibcp_max_frags > IBLND_MAX_RDMA_FRAGS) { - CWARN("Can't accept conn from %s (version %x): " - "max_frags %d too large (%d wanted)\n", + CWARN("Can't accept conn from %s (version %x): max_frags %d too large (%d wanted)\n", libcfs_nid2str(nid), version, reqmsg->ibm_u.connparams.ibcp_max_frags, IBLND_MAX_RDMA_FRAGS); @@ -2540,9 +2526,7 @@ kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob) } else if (reqmsg->ibm_u.connparams.ibcp_max_frags < IBLND_MAX_RDMA_FRAGS && net->ibn_fmr_ps == NULL) { - CWARN("Can't accept conn from %s (version %x): " - "max_frags %d incompatible without FMR pool " - "(%d wanted)\n", + CWARN("Can't accept conn from %s (version %x): max_frags %d incompatible without FMR pool (%d wanted)\n", libcfs_nid2str(nid), version, reqmsg->ibm_u.connparams.ibcp_max_frags, IBLND_MAX_RDMA_FRAGS); @@ -2553,13 +2537,13 @@ kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob) goto failed; } - if (reqmsg->ibm_u.connparams.ibcp_max_msg_size > IBLND_MSG_SIZE) { - CERROR("Can't accept %s: message size %d too big (%d max)\n", - libcfs_nid2str(nid), - reqmsg->ibm_u.connparams.ibcp_max_msg_size, - IBLND_MSG_SIZE); - goto failed; - } + if (reqmsg->ibm_u.connparams.ibcp_max_msg_size > IBLND_MSG_SIZE) { + CERROR("Can't accept %s: message size %d too big (%d max)\n", + libcfs_nid2str(nid), + reqmsg->ibm_u.connparams.ibcp_max_msg_size, + IBLND_MSG_SIZE); + goto failed; + } /* assume 'nid' is a new peer_ni; create */ rc = kiblnd_create_peer(ni, &peer_ni, nid); @@ -2575,16 +2559,16 @@ kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob) write_lock_irqsave(g_lock, flags); - peer2 = kiblnd_find_peer_locked(ni, nid); - if (peer2 != NULL) { - if (peer2->ibp_version == 0) { - peer2->ibp_version = version; - peer2->ibp_incarnation = reqmsg->ibm_srcstamp; - } + peer2 = kiblnd_find_peer_locked(ni, nid); + if (peer2 != NULL) { + if (peer2->ibp_version == 0) { + peer2->ibp_version = version; + peer2->ibp_incarnation = reqmsg->ibm_srcstamp; + } - /* not the guy I've talked with */ - if (peer2->ibp_incarnation != reqmsg->ibm_srcstamp || - peer2->ibp_version != version) { + /* not the guy I've talked with */ + if (peer2->ibp_incarnation != reqmsg->ibm_srcstamp || + peer2->ibp_version != version) { kiblnd_close_peer_conns_locked(peer2, -ESTALE); if (kiblnd_peer_active(peer2)) { @@ -2597,10 +2581,10 @@ kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob) libcfs_nid2str(nid), peer2->ibp_version, version, peer2->ibp_incarnation, reqmsg->ibm_srcstamp); - kiblnd_peer_decref(peer_ni); - rej.ibr_why = 
IBLND_REJECT_CONN_STALE; - goto failed; - } + kiblnd_peer_decref(peer_ni); + rej.ibr_why = IBLND_REJECT_CONN_STALE; + goto failed; + } /* Tie-break connection race in favour of the higher NID. * If we keep running into a race condition multiple times, @@ -2642,78 +2626,80 @@ kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob) peer2->ibp_queue_depth = peer_ni->ibp_queue_depth; write_unlock_irqrestore(g_lock, flags); - kiblnd_peer_decref(peer_ni); - peer_ni = peer2; - } else { - /* Brand new peer_ni */ - LASSERT (peer_ni->ibp_accepting == 0); - LASSERT (peer_ni->ibp_version == 0 && - peer_ni->ibp_incarnation == 0); + kiblnd_peer_decref(peer_ni); + peer_ni = peer2; + } else { + /* Brand new peer_ni */ + LASSERT(peer_ni->ibp_accepting == 0); + LASSERT(peer_ni->ibp_version == 0 && + peer_ni->ibp_incarnation == 0); - peer_ni->ibp_accepting = 1; - peer_ni->ibp_version = version; - peer_ni->ibp_incarnation = reqmsg->ibm_srcstamp; + peer_ni->ibp_accepting = 1; + peer_ni->ibp_version = version; + peer_ni->ibp_incarnation = reqmsg->ibm_srcstamp; - /* I have a ref on ni that prevents it being shutdown */ - LASSERT (net->ibn_shutdown == 0); + /* I have a ref on ni that prevents it being shutdown */ + LASSERT(net->ibn_shutdown == 0); - kiblnd_peer_addref(peer_ni); - list_add_tail(&peer_ni->ibp_list, kiblnd_nid2peerlist(nid)); + kiblnd_peer_addref(peer_ni); + hash_add(kiblnd_data.kib_peers, &peer_ni->ibp_list, nid); write_unlock_irqrestore(g_lock, flags); - } + } - conn = kiblnd_create_conn(peer_ni, cmid, IBLND_CONN_PASSIVE_WAIT, version); - if (conn == NULL) { - kiblnd_peer_connect_failed(peer_ni, 0, -ENOMEM); - kiblnd_peer_decref(peer_ni); - rej.ibr_why = IBLND_REJECT_NO_RESOURCES; - goto failed; - } + conn = kiblnd_create_conn(peer_ni, cmid, IBLND_CONN_PASSIVE_WAIT, + version); + if (!conn) { + kiblnd_peer_connect_failed(peer_ni, 0, -ENOMEM); + kiblnd_peer_decref(peer_ni); + rej.ibr_why = IBLND_REJECT_NO_RESOURCES; + goto failed; + } - /* conn now "owns" cmid, so I return success from here on to ensure the - * CM callback doesn't destroy cmid. */ + /* conn now "owns" cmid, so I return success from here on to ensure the + * CM callback doesn't destroy cmid. 
+ */ conn->ibc_incarnation = reqmsg->ibm_srcstamp; conn->ibc_credits = conn->ibc_queue_depth; conn->ibc_reserved_credits = conn->ibc_queue_depth; LASSERT(conn->ibc_credits + conn->ibc_reserved_credits + IBLND_OOB_MSGS(version) <= IBLND_RX_MSGS(conn)); - ackmsg = &conn->ibc_connvars->cv_msg; - memset(ackmsg, 0, sizeof(*ackmsg)); + ackmsg = &conn->ibc_connvars->cv_msg; + memset(ackmsg, 0, sizeof(*ackmsg)); - kiblnd_init_msg(ackmsg, IBLND_MSG_CONNACK, - sizeof(ackmsg->ibm_u.connparams)); + kiblnd_init_msg(ackmsg, IBLND_MSG_CONNACK, + sizeof(ackmsg->ibm_u.connparams)); ackmsg->ibm_u.connparams.ibcp_queue_depth = conn->ibc_queue_depth; ackmsg->ibm_u.connparams.ibcp_max_frags = conn->ibc_max_frags; ackmsg->ibm_u.connparams.ibcp_max_msg_size = IBLND_MSG_SIZE; - kiblnd_pack_msg(ni, ackmsg, version, 0, nid, reqmsg->ibm_srcstamp); + kiblnd_pack_msg(ni, ackmsg, version, 0, nid, reqmsg->ibm_srcstamp); - memset(&cp, 0, sizeof(cp)); - cp.private_data = ackmsg; - cp.private_data_len = ackmsg->ibm_nob; - cp.responder_resources = 0; /* No atomic ops or RDMA reads */ - cp.initiator_depth = 0; - cp.flow_control = 1; - cp.retry_count = *kiblnd_tunables.kib_retry_count; - cp.rnr_retry_count = *kiblnd_tunables.kib_rnr_retry_count; + memset(&cp, 0, sizeof(cp)); + cp.private_data = ackmsg; + cp.private_data_len = ackmsg->ibm_nob; + cp.responder_resources = 0; /* No atomic ops or RDMA reads */ + cp.initiator_depth = 0; + cp.flow_control = 1; + cp.retry_count = *kiblnd_tunables.kib_retry_count; + cp.rnr_retry_count = *kiblnd_tunables.kib_rnr_retry_count; - CDEBUG(D_NET, "Accept %s\n", libcfs_nid2str(nid)); + CDEBUG(D_NET, "Accept %s\n", libcfs_nid2str(nid)); - rc = rdma_accept(cmid, &cp); - if (rc != 0) { - CERROR("Can't accept %s: %d\n", libcfs_nid2str(nid), rc); - rej.ibr_version = version; - rej.ibr_why = IBLND_REJECT_FATAL; + rc = rdma_accept(cmid, &cp); + if (rc != 0) { + CERROR("Can't accept %s: %d\n", libcfs_nid2str(nid), rc); + rej.ibr_version = version; + rej.ibr_why = IBLND_REJECT_FATAL; - kiblnd_reject(cmid, &rej); - kiblnd_connreq_done(conn, rc); - kiblnd_conn_decref(conn); - } + kiblnd_reject(cmid, &rej); + kiblnd_connreq_done(conn, rc); + kiblnd_conn_decref(conn); + } - lnet_ni_decref(ni); - return 0; + lnet_ni_decref(ni); + return 0; failed: if (ni != NULL) { @@ -2767,10 +2753,15 @@ kiblnd_check_reconnect(struct kib_conn *conn, int version, goto out; } - switch (why) { - default: - reason = "Unknown"; - break; + if (peer_ni->ibp_retries > *kiblnd_tunables.kib_retry_count) { + reason = "retry count exceeded due to no listener"; + goto out; + } + + switch (why) { + default: + reason = "Unknown"; + break; case IBLND_REJECT_RDMA_FRAGS: { struct lnet_ioctl_config_o2iblnd_tunables *tunables; @@ -2864,117 +2855,121 @@ kiblnd_rejected(struct kib_conn *conn, int reason, void *priv, int priv_nob) IBLND_REJECT_CONN_STALE, NULL); break; - case IB_CM_REJ_INVALID_SERVICE_ID: + case IB_CM_REJ_INVALID_SERVICE_ID: + peer_ni->ibp_retries++; kiblnd_check_reconnect(conn, IBLND_MSG_VERSION, 0, IBLND_REJECT_INVALID_SRV_ID, NULL); - CNETERR("%s rejected: no listener at %d\n", - libcfs_nid2str(peer_ni->ibp_nid), - *kiblnd_tunables.kib_service); - break; + CNETERR("%s rejected: no listener at %d\n", + libcfs_nid2str(peer_ni->ibp_nid), + *kiblnd_tunables.kib_service); + break; - case IB_CM_REJ_CONSUMER_DEFINED: + case IB_CM_REJ_CONSUMER_DEFINED: if (priv_nob >= offsetof(struct kib_rej, ibr_padding)) { struct kib_rej *rej = priv; struct kib_connparams *cp = NULL; - int flip = 0; - __u64 incarnation = -1; - - /* NB. 
default incarnation is -1 because: - * a) V1 will ignore dst incarnation in connreq. - * b) V2 will provide incarnation while rejecting me, - * -1 will be overwrote. - * - * if I try to connect to a V1 peer_ni with V2 protocol, - * it rejected me then upgrade to V2, I have no idea - * about the upgrading and try to reconnect with V1, - * in this case upgraded V2 can find out I'm trying to - * talk to the old guy and reject me(incarnation is -1). - */ - - if (rej->ibr_magic == __swab32(IBLND_MSG_MAGIC) || - rej->ibr_magic == __swab32(LNET_PROTO_MAGIC)) { - __swab32s(&rej->ibr_magic); - __swab16s(&rej->ibr_version); - flip = 1; - } + bool flip = false; + __u64 incarnation = -1; + + /* NB. default incarnation is -1 because: + * a) V1 will ignore dst incarnation in connreq. + * b) V2 will provide incarnation while rejecting me, + * -1 will be overwrote. + * + * if I try to connect to a V1 peer_ni with V2 protocol, + * it rejected me then upgrade to V2, I have no idea + * about the upgrading and try to reconnect with V1, + * in this case upgraded V2 can find out I'm trying to + * talk to the old guy and reject me(incarnation is -1). + */ + + if (rej->ibr_magic == __swab32(IBLND_MSG_MAGIC) || + rej->ibr_magic == __swab32(LNET_PROTO_MAGIC)) { + __swab32s(&rej->ibr_magic); + __swab16s(&rej->ibr_version); + flip = true; + } if (priv_nob >= sizeof(struct kib_rej) && - rej->ibr_version > IBLND_MSG_VERSION_1) { - /* priv_nob is always 148 in current version - * of OFED, so we still need to check version. - * (define of IB_CM_REJ_PRIVATE_DATA_SIZE) */ - cp = &rej->ibr_cp; - - if (flip) { - __swab64s(&rej->ibr_incarnation); - __swab16s(&cp->ibcp_queue_depth); - __swab16s(&cp->ibcp_max_frags); - __swab32s(&cp->ibcp_max_msg_size); - } - - incarnation = rej->ibr_incarnation; - } - - if (rej->ibr_magic != IBLND_MSG_MAGIC && - rej->ibr_magic != LNET_PROTO_MAGIC) { - CERROR("%s rejected: consumer defined fatal error\n", - libcfs_nid2str(peer_ni->ibp_nid)); - break; - } - - if (rej->ibr_version != IBLND_MSG_VERSION && - rej->ibr_version != IBLND_MSG_VERSION_1) { - CERROR("%s rejected: o2iblnd version %x error\n", - libcfs_nid2str(peer_ni->ibp_nid), - rej->ibr_version); - break; - } - - if (rej->ibr_why == IBLND_REJECT_FATAL && - rej->ibr_version == IBLND_MSG_VERSION_1) { - CDEBUG(D_NET, "rejected by old version peer_ni %s: %x\n", - libcfs_nid2str(peer_ni->ibp_nid), rej->ibr_version); - - if (conn->ibc_version != IBLND_MSG_VERSION_1) - rej->ibr_why = IBLND_REJECT_CONN_UNCOMPAT; - } - - switch (rej->ibr_why) { - case IBLND_REJECT_CONN_RACE: - case IBLND_REJECT_CONN_STALE: - case IBLND_REJECT_CONN_UNCOMPAT: + rej->ibr_version > IBLND_MSG_VERSION_1) { + /* priv_nob is always 148 in current version + * of OFED, so we still need to check version. 
+ * (define of IB_CM_REJ_PRIVATE_DATA_SIZE) + */ + cp = &rej->ibr_cp; + + if (flip) { + __swab64s(&rej->ibr_incarnation); + __swab16s(&cp->ibcp_queue_depth); + __swab16s(&cp->ibcp_max_frags); + __swab32s(&cp->ibcp_max_msg_size); + } + + incarnation = rej->ibr_incarnation; + } + + if (rej->ibr_magic != IBLND_MSG_MAGIC && + rej->ibr_magic != LNET_PROTO_MAGIC) { + CERROR("%s rejected: consumer defined fatal error\n", + libcfs_nid2str(peer_ni->ibp_nid)); + break; + } + + if (rej->ibr_version != IBLND_MSG_VERSION && + rej->ibr_version != IBLND_MSG_VERSION_1) { + CERROR("%s rejected: o2iblnd version %x error\n", + libcfs_nid2str(peer_ni->ibp_nid), + rej->ibr_version); + break; + } + + if (rej->ibr_why == IBLND_REJECT_FATAL && + rej->ibr_version == IBLND_MSG_VERSION_1) { + CDEBUG(D_NET, "rejected by old version peer_ni %s: %x\n", + libcfs_nid2str(peer_ni->ibp_nid), + rej->ibr_version); + + if (conn->ibc_version != IBLND_MSG_VERSION_1) + rej->ibr_why = IBLND_REJECT_CONN_UNCOMPAT; + } + + switch (rej->ibr_why) { + case IBLND_REJECT_CONN_RACE: + case IBLND_REJECT_CONN_STALE: + case IBLND_REJECT_CONN_UNCOMPAT: case IBLND_REJECT_MSG_QUEUE_SIZE: case IBLND_REJECT_RDMA_FRAGS: kiblnd_check_reconnect(conn, rej->ibr_version, - incarnation, rej->ibr_why, cp); - break; - - case IBLND_REJECT_NO_RESOURCES: - CERROR("%s rejected: o2iblnd no resources\n", - libcfs_nid2str(peer_ni->ibp_nid)); - break; - - case IBLND_REJECT_FATAL: - CERROR("%s rejected: o2iblnd fatal error\n", - libcfs_nid2str(peer_ni->ibp_nid)); - break; - - default: - CERROR("%s rejected: o2iblnd reason %d\n", - libcfs_nid2str(peer_ni->ibp_nid), - rej->ibr_why); - break; - } - break; - } - /* fall through */ - default: - CNETERR("%s rejected: reason %d, size %d\n", - libcfs_nid2str(peer_ni->ibp_nid), reason, priv_nob); - break; - } + incarnation, + rej->ibr_why, cp); + break; + + case IBLND_REJECT_NO_RESOURCES: + CERROR("%s rejected: o2iblnd no resources\n", + libcfs_nid2str(peer_ni->ibp_nid)); + break; - kiblnd_connreq_done(conn, -ECONNREFUSED); + case IBLND_REJECT_FATAL: + CERROR("%s rejected: o2iblnd fatal error\n", + libcfs_nid2str(peer_ni->ibp_nid)); + break; + + default: + CERROR("%s rejected: o2iblnd reason %d\n", + libcfs_nid2str(peer_ni->ibp_nid), + rej->ibr_why); + break; + } + break; + } + /* fall through */ + default: + CNETERR("%s rejected: reason %d, size %d\n", + libcfs_nid2str(peer_ni->ibp_nid), reason, priv_nob); + break; + } + + kiblnd_connreq_done(conn, -ECONNREFUSED); } static void @@ -3132,8 +3127,7 @@ kiblnd_active_connect(struct rdma_cm_id *cmid) LASSERT(cmid->context == (void *)conn); LASSERT(conn->ibc_cmid == cmid); - - rc = rdma_connect(cmid, &cp); + rc = rdma_connect_locked(cmid, &cp); if (rc != 0) { CERROR("Can't connect to %s: %d\n", libcfs_nid2str(peer_ni->ibp_nid), rc); @@ -3185,7 +3179,7 @@ kiblnd_cm_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event) rc = event->status; } else { rc = rdma_resolve_route( - cmid, lnet_get_lnd_timeout() * 1000); + cmid, kiblnd_timeout() * 1000); if (rc == 0) { struct kib_net *net = peer_ni->ibp_ni->ni_data; struct kib_dev *dev = net->ibn_dev; @@ -3342,8 +3336,10 @@ kiblnd_check_txs_locked(struct kib_conn *conn, struct list_head *txs) } if (ktime_compare(ktime_get(), tx->tx_deadline) >= 0) { - CERROR("Timed out tx: %s, %lld seconds\n", + CERROR("Timed out tx: %s(WSQ:%d%d%d), %lld seconds\n", kiblnd_queue2str(conn, txs), + tx->tx_waiting, tx->tx_sending, tx->tx_queued, + kiblnd_timeout() + ktime_ms_delta(ktime_get(), tx->tx_deadline) / MSEC_PER_SEC); return 1; @@ -3369,22 
+3365,20 @@ kiblnd_check_conns (int idx) LIST_HEAD(closes); LIST_HEAD(checksends); LIST_HEAD(timedout_txs); - struct list_head *peers = &kiblnd_data.kib_peers[idx]; - struct list_head *ptmp; + struct hlist_head *peers = &kiblnd_data.kib_peers[idx]; struct kib_peer_ni *peer_ni; - struct kib_conn *conn; + struct kib_conn *conn; struct kib_tx *tx, *tx_tmp; struct list_head *ctmp; - unsigned long flags; + unsigned long flags; /* NB. We expect to have a look at all the peers and not find any * RDMAs to time out, so we just use a shared lock while we - * take a look... */ + * take a look... + */ write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); - list_for_each(ptmp, peers) { - peer_ni = list_entry(ptmp, struct kib_peer_ni, ibp_list); - + hlist_for_each_entry(peer_ni, peers, ibp_list) { /* Check tx_deadline */ list_for_each_entry_safe(tx, tx_tmp, &peer_ni->ibp_tx_queue, tx_list) { if (ktime_compare(ktime_get(), tx->tx_deadline) >= 0) { @@ -3414,10 +3408,10 @@ kiblnd_check_conns (int idx) } if (timedout) { - CERROR("Timed out RDMA with %s (%lld): " - "c: %u, oc: %u, rc: %u\n", + CERROR("Timed out RDMA with %s (%lld): c: %u, oc: %u, rc: %u\n", libcfs_nid2str(peer_ni->ibp_nid), - ktime_get_seconds() - peer_ni->ibp_last_alive, + ktime_get_seconds() + - peer_ni->ibp_last_alive, conn->ibc_credits, conn->ibc_outstanding_credits, conn->ibc_reserved_credits); @@ -3436,11 +3430,12 @@ kiblnd_check_conns (int idx) if (!list_empty(&timedout_txs)) kiblnd_txlist_done(&timedout_txs, -ETIMEDOUT, - LNET_MSG_STATUS_LOCAL_TIMEOUT); + LNET_MSG_STATUS_NETWORK_TIMEOUT); /* Handle timeout by closing the whole * connection. We can only be sure RDMA activity - * has ceased once the QP has been modified. */ + * has ceased once the QP has been modified. + */ while (!list_empty(&closes)) { conn = list_entry(closes.next, struct kib_conn, ibc_connd_list); @@ -3451,7 +3446,8 @@ kiblnd_check_conns (int idx) /* In case we have enough credits to return via a * NOOP, but there were no non-blocking tx descs - * free to do it last time... */ + * free to do it last time... 
+ */ while (!list_empty(&checksends)) { conn = list_entry(checksends.next, struct kib_conn, ibc_connd_list); @@ -3492,19 +3488,17 @@ kiblnd_disconnect_conn(struct kib_conn *conn) int kiblnd_connd (void *arg) { - spinlock_t *lock= &kiblnd_data.kib_connd_lock; + spinlock_t *lock = &kiblnd_data.kib_connd_lock; wait_queue_entry_t wait; - unsigned long flags; + unsigned long flags; struct kib_conn *conn; - int timeout; - int i; - int dropped_lock; - int peer_index = 0; - unsigned long deadline = jiffies; - - cfs_block_allsigs(); + int timeout; + int i; + bool dropped_lock; + int peer_index = 0; + unsigned long deadline = jiffies; - init_waitqueue_entry(&wait, current); + init_wait(&wait); kiblnd_data.kib_connd = current; spin_lock_irqsave(lock, flags); @@ -3512,7 +3506,7 @@ kiblnd_connd (void *arg) while (!kiblnd_data.kib_shutdown) { int reconn = 0; - dropped_lock = 0; + dropped_lock = false; if (!list_empty(&kiblnd_data.kib_connd_zombies)) { struct kib_peer_ni *peer_ni = NULL; @@ -3526,7 +3520,7 @@ kiblnd_connd (void *arg) } spin_unlock_irqrestore(lock, flags); - dropped_lock = 1; + dropped_lock = true; kiblnd_destroy_conn(conn); @@ -3546,18 +3540,25 @@ kiblnd_connd (void *arg) } if (!list_empty(&kiblnd_data.kib_connd_conns)) { + int wait; conn = list_entry(kiblnd_data.kib_connd_conns.next, struct kib_conn, ibc_list); list_del(&conn->ibc_list); spin_unlock_irqrestore(lock, flags); - dropped_lock = 1; + dropped_lock = true; kiblnd_disconnect_conn(conn); - kiblnd_conn_decref(conn); + wait = conn->ibc_waits; + if (wait == 0) /* keep ref for connd_wait, see below */ + kiblnd_conn_decref(conn); spin_lock_irqsave(lock, flags); - } + + if (wait) + list_add_tail(&conn->ibc_list, + &kiblnd_data.kib_connd_waits); + } while (reconn < KIB_RECONN_BREAK) { if (kiblnd_data.kib_reconn_sec != @@ -3575,7 +3576,7 @@ kiblnd_connd (void *arg) list_del(&conn->ibc_list); spin_unlock_irqrestore(lock, flags); - dropped_lock = 1; + dropped_lock = true; reconn += kiblnd_reconnect_peer(conn->ibc_peer); kiblnd_peer_decref(conn->ibc_peer); @@ -3584,26 +3585,43 @@ kiblnd_connd (void *arg) spin_lock_irqsave(lock, flags); } - /* careful with the jiffy wrap... */ - timeout = (int)(deadline - jiffies); - if (timeout <= 0) { - const int n = 4; - const int p = 1; - int chunk = kiblnd_data.kib_peer_hash_size; - unsigned int lnd_timeout; - + if (!list_empty(&kiblnd_data.kib_connd_waits)) { + conn = list_entry(kiblnd_data.kib_connd_waits.next, + struct kib_conn, ibc_list); + list_del(&conn->ibc_list); spin_unlock_irqrestore(lock, flags); - dropped_lock = 1; - /* Time to check for RDMA timeouts on a few more - * peers: I do checks every 'p' seconds on a - * proportion of the peer_ni table and I need to check - * every connection 'n' times within a timeout - * interval, to ensure I detect a timeout on any - * connection within (n+1)/n times the timeout - * interval. */ + dropped_lock = kiblnd_tx_may_discard(conn); + if (dropped_lock) + kiblnd_conn_decref(conn); - lnd_timeout = lnet_get_lnd_timeout(); + spin_lock_irqsave(lock, flags); + if (!dropped_lock) + list_add_tail(&conn->ibc_list, + &kiblnd_data.kib_connd_waits); + } + + /* careful with the jiffy wrap... 
*/ + timeout = (int)(deadline - jiffies); + if (timeout <= 0) { + const int n = 4; + const int p = 1; + int chunk = HASH_SIZE(kiblnd_data.kib_peers); + unsigned int lnd_timeout; + + spin_unlock_irqrestore(lock, flags); + dropped_lock = true; + + /* Time to check for RDMA timeouts on a few more + * peers: I do checks every 'p' seconds on a + * proportion of the peer_ni table and I need to check + * every connection 'n' times within a timeout + * interval, to ensure I detect a timeout on any + * connection within (n+1)/n times the timeout + * interval. + */ + + lnd_timeout = kiblnd_timeout(); if (lnd_timeout > n * p) chunk = (chunk * n * p) / lnd_timeout; if (chunk == 0) @@ -3612,7 +3630,7 @@ kiblnd_connd (void *arg) for (i = 0; i < chunk; i++) { kiblnd_check_conns(peer_index); peer_index = (peer_index + 1) % - kiblnd_data.kib_peer_hash_size; + HASH_SIZE(kiblnd_data.kib_peers); } deadline += cfs_time_seconds(p); @@ -3753,43 +3771,36 @@ kiblnd_cq_event(struct ib_event *event, void *arg) int kiblnd_scheduler(void *arg) { - long id = (long)arg; - struct kib_sched_info *sched; + long id = (long)arg; + struct kib_sched_info *sched; struct kib_conn *conn; - wait_queue_entry_t wait; - unsigned long flags; - struct ib_wc wc; - int did_something; - int busy_loops = 0; - int rc; - - cfs_block_allsigs(); + wait_queue_entry_t wait; + unsigned long flags; + struct ib_wc wc; + bool did_something; + int rc; - init_waitqueue_entry(&wait, current); + init_wait(&wait); sched = kiblnd_data.kib_scheds[KIB_THREAD_CPT(id)]; rc = cfs_cpt_bind(lnet_cpt_table(), sched->ibs_cpt); if (rc != 0) { - CWARN("Unable to bind on CPU partition %d, please verify " - "whether all CPUs are healthy and reload modules if " - "necessary, otherwise your system might under risk of " - "low performance\n", sched->ibs_cpt); + CWARN("Unable to bind on CPU partition %d, please verify whether all CPUs are healthy and reload modules if necessary, otherwise your system might under risk of low performance\n", sched->ibs_cpt); } spin_lock_irqsave(&sched->ibs_lock, flags); while (!kiblnd_data.kib_shutdown) { - if (busy_loops++ >= IBLND_RESCHED) { + if (need_resched()) { spin_unlock_irqrestore(&sched->ibs_lock, flags); cond_resched(); - busy_loops = 0; spin_lock_irqsave(&sched->ibs_lock, flags); } - did_something = 0; + did_something = false; if (!list_empty(&sched->ibs_conns)) { conn = list_entry(sched->ibs_conns.next, @@ -3803,18 +3814,17 @@ kiblnd_scheduler(void *arg) wc.wr_id = IBLND_WID_INVAL; - rc = ib_poll_cq(conn->ibc_cq, 1, &wc); - if (rc == 0) { - rc = ib_req_notify_cq(conn->ibc_cq, - IB_CQ_NEXT_COMP); - if (rc < 0) { - CWARN("%s: ib_req_notify_cq failed: %d, " - "closing connection\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid), rc); - kiblnd_close_conn(conn, -EIO); - kiblnd_conn_decref(conn); + rc = ib_poll_cq(conn->ibc_cq, 1, &wc); + if (rc == 0) { + rc = ib_req_notify_cq(conn->ibc_cq, + IB_CQ_NEXT_COMP); + if (rc < 0) { + CWARN("%s: ib_req_notify_cq failed: %d, closing connection\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), rc); + kiblnd_close_conn(conn, -EIO); + kiblnd_conn_decref(conn); spin_lock_irqsave(&sched->ibs_lock, - flags); + flags); continue; } @@ -3835,8 +3845,7 @@ kiblnd_scheduler(void *arg) } if (rc < 0) { - CWARN("%s: ib_poll_cq failed: %d, " - "closing connection\n", + CWARN("%s: ib_poll_cq failed: %d, closing connection\n", libcfs_nid2str(conn->ibc_peer->ibp_nid), rc); kiblnd_close_conn(conn, -EIO); @@ -3854,7 +3863,7 @@ kiblnd_scheduler(void *arg) /* +1 ref for sched_conns */ kiblnd_conn_addref(conn); 
list_add_tail(&conn->ibc_sched_list, - &sched->ibs_conns); + &sched->ibs_conns); if (waitqueue_active(&sched->ibs_waitq)) wake_up(&sched->ibs_waitq); } else { @@ -3866,21 +3875,20 @@ kiblnd_scheduler(void *arg) kiblnd_complete(&wc); spin_lock_irqsave(&sched->ibs_lock, flags); - } + } - kiblnd_conn_decref(conn); /* ...drop my ref from above */ - did_something = 1; - } + kiblnd_conn_decref(conn); /* ..drop my ref from above */ + did_something = true; + } - if (did_something) - continue; + if (did_something) + continue; set_current_state(TASK_INTERRUPTIBLE); add_wait_queue_exclusive(&sched->ibs_waitq, &wait); spin_unlock_irqrestore(&sched->ibs_lock, flags); schedule(); - busy_loops = 0; remove_wait_queue(&sched->ibs_waitq, &wait); set_current_state(TASK_RUNNING); @@ -3896,60 +3904,58 @@ kiblnd_scheduler(void *arg) int kiblnd_failover_thread(void *arg) { - rwlock_t *glock = &kiblnd_data.kib_global_lock; + rwlock_t *glock = &kiblnd_data.kib_global_lock; struct kib_dev *dev; struct net *ns = arg; wait_queue_entry_t wait; - unsigned long flags; - int rc; + unsigned long flags; + int rc; LASSERT(*kiblnd_tunables.kib_dev_failover != 0); - cfs_block_allsigs(); - - init_waitqueue_entry(&wait, current); + init_wait(&wait); write_lock_irqsave(glock, flags); - while (!kiblnd_data.kib_shutdown) { - int do_failover = 0; - int long_sleep; + while (!kiblnd_data.kib_shutdown) { + bool do_failover = false; + int long_sleep; list_for_each_entry(dev, &kiblnd_data.kib_failed_devs, - ibd_fail_list) { + ibd_fail_list) { if (ktime_get_seconds() < dev->ibd_next_failover) - continue; - do_failover = 1; - break; - } + continue; + do_failover = true; + break; + } - if (do_failover) { + if (do_failover) { list_del_init(&dev->ibd_fail_list); - dev->ibd_failover = 1; + dev->ibd_failover = 1; write_unlock_irqrestore(glock, flags); rc = kiblnd_dev_failover(dev, ns); write_lock_irqsave(glock, flags); - LASSERT (dev->ibd_failover); - dev->ibd_failover = 0; - if (rc >= 0) { /* Device is OK or failover succeed */ + LASSERT(dev->ibd_failover); + dev->ibd_failover = 0; + if (rc >= 0) { /* Device is OK or failover succeed */ dev->ibd_next_failover = ktime_get_seconds() + 3; - continue; - } + continue; + } - /* failed to failover, retry later */ + /* failed to failover, retry later */ dev->ibd_next_failover = ktime_get_seconds() + - min(dev->ibd_failed_failover, 10); - if (kiblnd_dev_can_failover(dev)) { + min(dev->ibd_failed_failover, 10); + if (kiblnd_dev_can_failover(dev)) { list_add_tail(&dev->ibd_fail_list, - &kiblnd_data.kib_failed_devs); - } + &kiblnd_data.kib_failed_devs); + } - continue; - } + continue; + } - /* long sleep if no more pending failover */ + /* long sleep if no more pending failover */ long_sleep = list_empty(&kiblnd_data.kib_failed_devs); set_current_state(TASK_INTERRUPTIBLE); @@ -3957,28 +3963,29 @@ kiblnd_failover_thread(void *arg) write_unlock_irqrestore(glock, flags); rc = schedule_timeout(long_sleep ? 
cfs_time_seconds(10) : - cfs_time_seconds(1)); + cfs_time_seconds(1)); set_current_state(TASK_RUNNING); remove_wait_queue(&kiblnd_data.kib_failover_waitq, &wait); write_lock_irqsave(glock, flags); - if (!long_sleep || rc != 0) - continue; + if (!long_sleep || rc != 0) + continue; - /* have a long sleep, routine check all active devices, - * we need checking like this because if there is not active - * connection on the dev and no SEND from local, we may listen - * on wrong HCA for ever while there is a bonding failover */ + /* have a long sleep, routine check all active devices, + * we need checking like this because if there is not active + * connection on the dev and no SEND from local, we may listen + * on wrong HCA for ever while there is a bonding failover + */ list_for_each_entry(dev, &kiblnd_data.kib_devs, ibd_list) { - if (kiblnd_dev_can_failover(dev)) { + if (kiblnd_dev_can_failover(dev)) { list_add_tail(&dev->ibd_fail_list, - &kiblnd_data.kib_failed_devs); - } - } - } + &kiblnd_data.kib_failed_devs); + } + } + } write_unlock_irqrestore(glock, flags); - kiblnd_thread_fini(); - return 0; + kiblnd_thread_fini(); + return 0; }
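
Notes on selected hunks (editor's annotations; the sketches below are
illustrative, not part of the patch):

(1) Privileged-port binding. kiblnd_resolve_addr() is split: the loop that
walks the privileged ports moves into kiblnd_resolve_addr_cap(), and the new
wrapper temporarily grants CAP_NET_BIND_SERVICE when the calling task lacks
it. A minimal self-contained sketch of that credential-override pattern, with
bind_fn standing in (hypothetically) for kiblnd_resolve_addr_cap():

	#include <linux/cred.h>
	#include <linux/capability.h>

	static int run_with_net_bind_service(int (*bind_fn)(void *), void *arg)
	{
		const struct cred *old_creds = NULL;
		struct cred *new_creds = NULL;
		int rc;

		if (!capable(CAP_NET_BIND_SERVICE)) {
			/* start from a copy of the kernel's init credentials */
			new_creds = prepare_kernel_cred(NULL);
			if (!new_creds)
				return -ENOMEM;
			cap_raise(new_creds->cap_effective,
				  CAP_NET_BIND_SERVICE);
			old_creds = override_creds(new_creds);
		}

		rc = bind_fn(arg);

		if (old_creds) {
			revert_creds(old_creds);
			/* drop the reference prepare_kernel_cred() gave us;
			 * note the hunk above does not include this put_cred()
			 */
			put_cred(new_creds);
		}
		return rc;
	}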
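
(2) rdma_reject() compatibility. Newer kernels (v5.8+) added a fourth
argument, the CM reject reason, to rdma_reject(); the patch selects the call
form with the configure-generated HAVE_RDMA_REJECT_4ARGS. The same switch
written as a small shim (kib_rdma_reject() is a hypothetical helper; the patch
open-codes the #ifdef inside kiblnd_reject()):

	#include <rdma/rdma_cm.h>
	#include <rdma/ib_cm.h>

	static int kib_rdma_reject(struct rdma_cm_id *cmid,
				   const void *priv, u8 priv_len)
	{
	#ifdef HAVE_RDMA_REJECT_4ARGS
		return rdma_reject(cmid, priv, priv_len,
				   IB_CM_REJ_CONSUMER_DEFINED);
	#else
		return rdma_reject(cmid, priv, priv_len);
	#endif
	}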
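
(3) rdma_connect_locked(). kiblnd_active_connect() runs from the RDMA CM event
handler, where recent kernels (v5.10+) hold the cm_id handler mutex and
require rdma_connect_locked() instead of rdma_connect(). If older kernels must
still build, the call would need a configure switch analogous to the
rdma_reject() one; HAVE_RDMA_CONNECT_LOCKED below is an assumed macro name,
not taken from this patch:

	#ifdef HAVE_RDMA_CONNECT_LOCKED
		rc = rdma_connect_locked(cmid, &cp);
	#else
		rc = rdma_connect(cmid, &cp);
	#endif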
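
(4) kvec payloads are gone. kiblnd_setup_rd_iov() and the msg_iov/md_iov.iov
branches are deleted because LNet now hands every payload to the LND as
page-based fragments, and lnet_kiov_t is replaced by the kernel's struct
bio_vec; the field mapping is mechanical (kiov_page -> bv_page, kiov_len ->
bv_len, kiov_offset -> bv_offset). A sketch of the one operation in
kiblnd_setup_rd_kiov() that actually touches those fields, under that mapping:

	#include <linux/bvec.h>
	#include <linux/scatterlist.h>

	/* map a fragment of *bv (fragnob bytes at offset) onto one sg entry */
	static void kiov_frag_to_sg(struct scatterlist *sg,
				    const struct bio_vec *bv,
				    unsigned int offset, unsigned int fragnob)
	{
		sg_set_page(sg, bv->bv_page, fragnob, bv->bv_offset + offset);
	}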
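
(5) Peer table becomes a hashtable. kiblnd_nid2peerlist() and
kib_peer_hash_size give way to the generic <linux/hashtable.h> API: peers are
inserted with hash_add() keyed by NID, buckets are walked with
hlist_for_each_entry(), and sizing uses HASH_SIZE() (see the
kiblnd_check_conns() and kiblnd_connd() hunks). Minimal usage sketch;
demo_peer and PEER_HASH_BITS are illustrative names:

	#include <linux/hashtable.h>
	#include <linux/types.h>

	#define PEER_HASH_BITS 9	/* 512 buckets */
	static DEFINE_HASHTABLE(peer_table, PEER_HASH_BITS);

	struct demo_peer {
		struct hlist_node link;	/* was a struct list_head */
		u64 nid;
	};

	static void peer_add(struct demo_peer *p)
	{
		hash_add(peer_table, &p->link, p->nid);
	}

	static struct demo_peer *peer_find(u64 nid)
	{
		struct demo_peer *p;

		hash_for_each_possible(peer_table, p, link, nid)
			if (p->nid == nid)
				return p;
		return NULL;
	}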
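
(6) kthread_run() format string. kiblnd_thread_start() now passes the thread
name through a "%s" format rather than as the format itself; kthread_run()'s
third argument is printf-style, so a name containing '%' could previously be
misparsed as conversion specifiers. The fix in isolation:

	struct task_struct *task = kthread_run(fn, arg, "%s", name);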