From: Jeremy Filizetti Date: Wed, 13 May 2015 21:19:04 +0000 (-0400) Subject: LU-3322 ko2iblnd: Support different configs between systems X-Git-Tag: 2.7.62~35 X-Git-Url: https://git.whamcloud.com/?a=commitdiff_plain;h=7f5c9753872cfa8ad47821be3fa924c74c4c8b0d;p=fs%2Flustre-release.git LU-3322 ko2iblnd: Support different configs between systems This patch adds suppoort for ko2iblnd to have different values for peer_credits and map_on_demand between systems. Signed-off-by: Jeremy Filizetti Change-Id: Idfe5acdfdde5c2185488b92c96d7a83f1705a556 Reviewed-on: http://review.whamcloud.com/11794 Tested-by: Jenkins Tested-by: Maloo Reviewed-by: Amir Shehata Reviewed-by: James Simmons Reviewed-by: Oleg Drokin --- diff --git a/lnet/klnds/o2iblnd/o2iblnd.c b/lnet/klnds/o2iblnd/o2iblnd.c index a067bce..6ca9367 100644 --- a/lnet/klnds/o2iblnd/o2iblnd.c +++ b/lnet/klnds/o2iblnd/o2iblnd.c @@ -635,7 +635,7 @@ kiblnd_debug_conn (kib_conn_t *conn) kiblnd_debug_tx(list_entry(tmp, kib_tx_t, tx_list)); CDEBUG(D_CONSOLE, " rxs:\n"); - for (i = 0; i < IBLND_RX_MSGS(conn->ibc_version); i++) + for (i = 0; i < IBLND_RX_MSGS(conn); i++) kiblnd_debug_rx(&conn->ibc_rxs[i]); spin_unlock(&conn->ibc_lock); @@ -704,7 +704,7 @@ kiblnd_get_completion_vector(kib_conn_t *conn, int cpt) kib_conn_t * kiblnd_create_conn(kib_peer_t *peer, struct rdma_cm_id *cmid, - int state, int version) + int state, int version, kib_connparams_t *cp) { /* CAVEAT EMPTOR: * If the new conn is created successfully it takes over the caller's @@ -759,6 +759,14 @@ kiblnd_create_conn(kib_peer_t *peer, struct rdma_cm_id *cmid, cmid->context = conn; /* for future CM callbacks */ conn->ibc_cmid = cmid; + if (cp == NULL) { + conn->ibc_max_frags = IBLND_CFG_RDMA_FRAGS; + conn->ibc_queue_depth = *kiblnd_tunables.kib_peertxcredits; + } else { + conn->ibc_max_frags = cp->ibcp_max_frags; + conn->ibc_queue_depth = cp->ibcp_queue_depth; + } + INIT_LIST_HEAD(&conn->ibc_early_rxs); INIT_LIST_HEAD(&conn->ibc_tx_noops); INIT_LIST_HEAD(&conn->ibc_tx_queue); @@ -803,14 +811,14 @@ kiblnd_create_conn(kib_peer_t *peer, struct rdma_cm_id *cmid, write_unlock_irqrestore(glock, flags); LIBCFS_CPT_ALLOC(conn->ibc_rxs, lnet_cpt_table(), cpt, - IBLND_RX_MSGS(version) * sizeof(kib_rx_t)); + IBLND_RX_MSGS(conn) * sizeof(kib_rx_t)); if (conn->ibc_rxs == NULL) { CERROR("Cannot allocate RX buffers\n"); goto failed_2; } rc = kiblnd_alloc_pages(&conn->ibc_rx_pages, cpt, - IBLND_RX_MSG_PAGES(version)); + IBLND_RX_MSG_PAGES(conn)); if (rc != 0) goto failed_2; @@ -825,14 +833,14 @@ kiblnd_create_conn(kib_peer_t *peer, struct rdma_cm_id *cmid, #else cq = ib_create_cq(cmid->device, kiblnd_cq_completion, kiblnd_cq_event, conn, - IBLND_CQ_ENTRIES(version), + IBLND_CQ_ENTRIES(conn), kiblnd_get_completion_vector(conn, cpt)); #endif - if (IS_ERR(cq)) { - CERROR("Can't create CQ: %ld, cqe: %d\n", - PTR_ERR(cq), IBLND_CQ_ENTRIES(version)); - goto failed_2; - } + if (IS_ERR(cq)) { + CERROR("Failed to create CQ with %d CQEs: %ld\n", + IBLND_CQ_ENTRIES(conn), PTR_ERR(cq)); + goto failed_2; + } conn->ibc_cq = cq; @@ -844,8 +852,8 @@ kiblnd_create_conn(kib_peer_t *peer, struct rdma_cm_id *cmid, init_qp_attr->event_handler = kiblnd_qp_event; init_qp_attr->qp_context = conn; - init_qp_attr->cap.max_send_wr = IBLND_SEND_WRS(version); - init_qp_attr->cap.max_recv_wr = IBLND_RECV_WRS(version); + init_qp_attr->cap.max_send_wr = IBLND_SEND_WRS(conn); + init_qp_attr->cap.max_recv_wr = IBLND_RECV_WRS(conn); init_qp_attr->cap.max_send_sge = 1; init_qp_attr->cap.max_recv_sge = 1; init_qp_attr->sq_sig_type = IB_SIGNAL_REQ_WR; @@ -866,23 +874,23 @@ kiblnd_create_conn(kib_peer_t *peer, struct rdma_cm_id *cmid, LIBCFS_FREE(init_qp_attr, sizeof(*init_qp_attr)); /* 1 ref for caller and each rxmsg */ - atomic_set(&conn->ibc_refcount, 1 + IBLND_RX_MSGS(version)); - conn->ibc_nrx = IBLND_RX_MSGS(version); + atomic_set(&conn->ibc_refcount, 1 + IBLND_RX_MSGS(conn)); + conn->ibc_nrx = IBLND_RX_MSGS(conn); /* post receives */ - for (i = 0; i < IBLND_RX_MSGS(version); i++) { - rc = kiblnd_post_rx(&conn->ibc_rxs[i], - IBLND_POSTRX_NO_CREDIT); - if (rc != 0) { - CERROR("Can't post rxmsg: %d\n", rc); + for (i = 0; i < IBLND_RX_MSGS(conn); i++) { + rc = kiblnd_post_rx(&conn->ibc_rxs[i], + IBLND_POSTRX_NO_CREDIT); + if (rc != 0) { + CERROR("Can't post rxmsg: %d\n", rc); - /* Make posted receives complete */ - kiblnd_abort_receives(conn); + /* Make posted receives complete */ + kiblnd_abort_receives(conn); - /* correct # of posted buffers - * NB locking needed now I'm racing with completion */ + /* correct # of posted buffers + * NB locking needed now I'm racing with completion */ spin_lock_irqsave(&sched->ibs_lock, flags); - conn->ibc_nrx -= IBLND_RX_MSGS(version) - i; + conn->ibc_nrx -= IBLND_RX_MSGS(conn) - i; spin_unlock_irqrestore(&sched->ibs_lock, flags); /* cmid will be destroyed by CM(ofed) after cm_callback @@ -891,9 +899,9 @@ kiblnd_create_conn(kib_peer_t *peer, struct rdma_cm_id *cmid, rdma_destroy_qp(conn->ibc_cmid); conn->ibc_cmid = NULL; - /* Drop my own and unused rxbuffer refcounts */ - while (i++ <= IBLND_RX_MSGS(version)) - kiblnd_conn_decref(conn); + /* Drop my own and unused rxbuffer refcounts */ + while (i++ <= IBLND_RX_MSGS(conn)) + kiblnd_conn_decref(conn); return NULL; } @@ -963,7 +971,7 @@ kiblnd_destroy_conn (kib_conn_t *conn) if (conn->ibc_rxs != NULL) { LIBCFS_FREE(conn->ibc_rxs, - IBLND_RX_MSGS(conn->ibc_version) * sizeof(kib_rx_t)); + IBLND_RX_MSGS(conn) * sizeof(kib_rx_t)); } if (conn->ibc_connvars != NULL) @@ -1239,16 +1247,16 @@ kiblnd_unmap_rx_descs(kib_conn_t *conn) LASSERT (conn->ibc_rxs != NULL); LASSERT (conn->ibc_hdev != NULL); - for (i = 0; i < IBLND_RX_MSGS(conn->ibc_version); i++) { - rx = &conn->ibc_rxs[i]; + for (i = 0; i < IBLND_RX_MSGS(conn); i++) { + rx = &conn->ibc_rxs[i]; - LASSERT (rx->rx_nob >= 0); /* not posted */ + LASSERT(rx->rx_nob >= 0); /* not posted */ - kiblnd_dma_unmap_single(conn->ibc_hdev->ibh_ibdev, - KIBLND_UNMAP_ADDR(rx, rx_msgunmap, - rx->rx_msgaddr), - IBLND_MSG_SIZE, DMA_FROM_DEVICE); - } + kiblnd_dma_unmap_single(conn->ibc_hdev->ibh_ibdev, + KIBLND_UNMAP_ADDR(rx, rx_msgunmap, + rx->rx_msgaddr), + IBLND_MSG_SIZE, DMA_FROM_DEVICE); + } kiblnd_free_pages(conn->ibc_rx_pages); @@ -1264,34 +1272,34 @@ kiblnd_map_rx_descs(kib_conn_t *conn) int ipg; int i; - for (pg_off = ipg = i = 0; - i < IBLND_RX_MSGS(conn->ibc_version); i++) { - pg = conn->ibc_rx_pages->ibp_pages[ipg]; - rx = &conn->ibc_rxs[i]; + for (pg_off = ipg = i = 0; i < IBLND_RX_MSGS(conn); i++) { + pg = conn->ibc_rx_pages->ibp_pages[ipg]; + rx = &conn->ibc_rxs[i]; - rx->rx_conn = conn; - rx->rx_msg = (kib_msg_t *)(((char *)page_address(pg)) + pg_off); + rx->rx_conn = conn; + rx->rx_msg = (kib_msg_t *)(((char *)page_address(pg)) + pg_off); - rx->rx_msgaddr = kiblnd_dma_map_single(conn->ibc_hdev->ibh_ibdev, - rx->rx_msg, IBLND_MSG_SIZE, - DMA_FROM_DEVICE); - LASSERT (!kiblnd_dma_mapping_error(conn->ibc_hdev->ibh_ibdev, - rx->rx_msgaddr)); - KIBLND_UNMAP_ADDR_SET(rx, rx_msgunmap, rx->rx_msgaddr); + rx->rx_msgaddr = + kiblnd_dma_map_single(conn->ibc_hdev->ibh_ibdev, + rx->rx_msg, IBLND_MSG_SIZE, + DMA_FROM_DEVICE); + LASSERT(!kiblnd_dma_mapping_error(conn->ibc_hdev->ibh_ibdev, + rx->rx_msgaddr)); + KIBLND_UNMAP_ADDR_SET(rx, rx_msgunmap, rx->rx_msgaddr); CDEBUG(D_NET, "rx %d: %p "LPX64"("LPX64")\n", i, rx->rx_msg, rx->rx_msgaddr, (__u64)(page_to_phys(pg) + pg_off)); - pg_off += IBLND_MSG_SIZE; - LASSERT (pg_off <= PAGE_SIZE); + pg_off += IBLND_MSG_SIZE; + LASSERT(pg_off <= PAGE_SIZE); - if (pg_off == PAGE_SIZE) { - pg_off = 0; - ipg++; - LASSERT (ipg <= IBLND_RX_MSG_PAGES(conn->ibc_version)); - } - } + if (pg_off == PAGE_SIZE) { + pg_off = 0; + ipg++; + LASSERT(ipg <= IBLND_RX_MSG_PAGES(conn)); + } + } } static void @@ -1398,12 +1406,16 @@ kiblnd_map_tx_pool(kib_tx_pool_t *tpo) } struct ib_mr * -kiblnd_find_rd_dma_mr(kib_hca_dev_t *hdev, kib_rdma_desc_t *rd) +kiblnd_find_rd_dma_mr(kib_hca_dev_t *hdev, kib_rdma_desc_t *rd, + int negotiated_nfrags) { + __u16 nfrags = (negotiated_nfrags != -1) ? + negotiated_nfrags : *kiblnd_tunables.kib_map_on_demand; + LASSERT(hdev->ibh_mrs != NULL); if (*kiblnd_tunables.kib_map_on_demand > 0 && - *kiblnd_tunables.kib_map_on_demand <= rd->rd_nfrags) + nfrags <= rd->rd_nfrags) return NULL; return hdev->ibh_mrs; diff --git a/lnet/klnds/o2iblnd/o2iblnd.h b/lnet/klnds/o2iblnd/o2iblnd.h index 9183c53..28541f0 100644 --- a/lnet/klnds/o2iblnd/o2iblnd.h +++ b/lnet/klnds/o2iblnd/o2iblnd.h @@ -179,18 +179,18 @@ kiblnd_concurrent_sends_v1(void) #define IBLND_FMR_POOL 256 #define IBLND_FMR_POOL_FLUSH 192 -/* TX messages (shared by all connections) */ -#define IBLND_TX_MSGS() (*kiblnd_tunables.kib_ntx) - /* RX messages (per connection) */ -#define IBLND_RX_MSGS(v) (IBLND_MSG_QUEUE_SIZE(v) * 2 + IBLND_OOB_MSGS(v)) -#define IBLND_RX_MSG_BYTES(v) (IBLND_RX_MSGS(v) * IBLND_MSG_SIZE) -#define IBLND_RX_MSG_PAGES(v) ((IBLND_RX_MSG_BYTES(v) + PAGE_SIZE - 1) / PAGE_SIZE) +#define IBLND_RX_MSGS(c) \ + ((c->ibc_queue_depth) * 2 + IBLND_OOB_MSGS(c->ibc_version)) +#define IBLND_RX_MSG_BYTES(c) (IBLND_RX_MSGS(c) * IBLND_MSG_SIZE) +#define IBLND_RX_MSG_PAGES(c) \ + ((IBLND_RX_MSG_BYTES(c) + PAGE_SIZE - 1) / PAGE_SIZE) /* WRs and CQEs (per connection) */ -#define IBLND_RECV_WRS(v) IBLND_RX_MSGS(v) -#define IBLND_SEND_WRS(v) ((IBLND_RDMA_FRAGS(v) + 1) * IBLND_CONCURRENT_SENDS(v)) -#define IBLND_CQ_ENTRIES(v) (IBLND_RECV_WRS(v) + IBLND_SEND_WRS(v)) +#define IBLND_RECV_WRS(c) IBLND_RX_MSGS(c) +#define IBLND_SEND_WRS(c) \ + ((c->ibc_max_frags + 1) * IBLND_CONCURRENT_SENDS(c->ibc_version)) +#define IBLND_CQ_ENTRIES(c) (IBLND_RECV_WRS(c) + IBLND_SEND_WRS(c)) struct kib_hca_dev; @@ -539,8 +539,10 @@ typedef struct { #define IBLND_REJECT_CONN_UNCOMPAT 4 /* incompatible version peer */ #define IBLND_REJECT_CONN_STALE 5 /* stale peer */ -#define IBLND_REJECT_RDMA_FRAGS 6 /* Fatal: peer's rdma frags can't match mine */ -#define IBLND_REJECT_MSG_QUEUE_SIZE 7 /* Fatal: peer's msg queue size can't match mine */ +/* peer's rdma frags doesn't match mine */ +#define IBLND_REJECT_RDMA_FRAGS 6 +/* peer's msg queue size doesn't match mine */ +#define IBLND_REJECT_MSG_QUEUE_SIZE 7 /***********************************************************************/ @@ -657,6 +659,10 @@ typedef struct kib_conn int ibc_reserved_credits; /* set on comms error */ int ibc_comms_error; + /* connections queue depth */ + __u16 ibc_queue_depth; + /* connections max frags */ + __u16 ibc_max_frags; /* receive buffers owned */ unsigned short ibc_nrx; /** rejected by connection race */ @@ -1054,7 +1060,8 @@ static inline unsigned int kiblnd_sg_dma_len(struct ib_device *dev, #define KIBLND_CONN_PARAM_LEN(e) ((e)->param.conn.private_data_len) struct ib_mr *kiblnd_find_rd_dma_mr(kib_hca_dev_t *hdev, - kib_rdma_desc_t *rd); + kib_rdma_desc_t *rd, + int negotiated_nfrags); void kiblnd_map_rx_descs(kib_conn_t *conn); void kiblnd_unmap_rx_descs(kib_conn_t *conn); void kiblnd_pool_free_node(kib_pool_t *pool, struct list_head *node); @@ -1079,7 +1086,7 @@ int kiblnd_cm_callback(struct rdma_cm_id *cmid, int kiblnd_translate_mtu(int value); int kiblnd_dev_failover(kib_dev_t *dev); -int kiblnd_create_peer (lnet_ni_t *ni, kib_peer_t **peerp, lnet_nid_t nid); +int kiblnd_create_peer(lnet_ni_t *ni, kib_peer_t **peerp, lnet_nid_t nid); void kiblnd_destroy_peer (kib_peer_t *peer); void kiblnd_connect_peer(kib_peer_t *peer); void kiblnd_destroy_dev (kib_dev_t *dev); @@ -1090,7 +1097,7 @@ int kiblnd_close_stale_conns_locked (kib_peer_t *peer, int kiblnd_close_peer_conns_locked (kib_peer_t *peer, int why); kib_conn_t *kiblnd_create_conn (kib_peer_t *peer, struct rdma_cm_id *cmid, - int state, int version); + int state, int version, kib_connparams_t *cp); void kiblnd_destroy_conn (kib_conn_t *conn); void kiblnd_close_conn (kib_conn_t *conn, int error); void kiblnd_close_conn_locked (kib_conn_t *conn, int error); diff --git a/lnet/klnds/o2iblnd/o2iblnd_cb.c b/lnet/klnds/o2iblnd/o2iblnd_cb.c index 4a1916e..c4f6d8c 100644 --- a/lnet/klnds/o2iblnd/o2iblnd_cb.c +++ b/lnet/klnds/o2iblnd/o2iblnd_cb.c @@ -329,19 +329,19 @@ kiblnd_handle_rx (kib_rx_t *rx) spin_lock(&conn->ibc_lock); if (conn->ibc_credits + credits > - IBLND_MSG_QUEUE_SIZE(conn->ibc_version)) { + conn->ibc_queue_depth) { rc2 = conn->ibc_credits; spin_unlock(&conn->ibc_lock); - CERROR("Bad credits from %s: %d + %d > %d\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid), - rc2, credits, - IBLND_MSG_QUEUE_SIZE(conn->ibc_version)); + CERROR("Bad credits from %s: %d + %d > %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), + rc2, credits, + conn->ibc_queue_depth); - kiblnd_close_conn(conn, -EPROTO); - kiblnd_post_rx(rx, IBLND_POSTRX_NO_CREDIT); - return; - } + kiblnd_close_conn(conn, -EPROTO); + kiblnd_post_rx(rx, IBLND_POSTRX_NO_CREDIT); + return; + } conn->ibc_credits += credits; @@ -650,13 +650,14 @@ kiblnd_map_tx(lnet_ni_t *ni, kib_tx_t *tx, kib_rdma_desc_t *rd, int nfrags) nob += rd->rd_frags[i].rf_nob; } - /* looking for pre-mapping MR */ - mr = kiblnd_find_rd_dma_mr(hdev, rd); - if (mr != NULL) { - /* found pre-mapping MR */ - rd->rd_key = (rd != tx->tx_rd) ? mr->rkey : mr->lkey; - return 0; - } + mr = kiblnd_find_rd_dma_mr(hdev, rd, + (tx->tx_conn != NULL) ? + tx->tx_conn->ibc_max_frags : -1); + if (mr != NULL) { + /* found pre-mapping MR */ + rd->rd_key = (rd != tx->tx_rd) ? mr->rkey : mr->lkey; + return 0; + } if (net->ibn_fmr_ps != NULL) return kiblnd_fmr_map_tx(net, tx, rd, nob); @@ -769,16 +770,16 @@ __must_hold(&conn->ibc_lock) int done; struct ib_send_wr *bad_wrq; - LASSERT (tx->tx_queued); - /* We rely on this for QP sizing */ - LASSERT (tx->tx_nwrq > 0); - LASSERT (tx->tx_nwrq <= 1 + IBLND_RDMA_FRAGS(ver)); + LASSERT(tx->tx_queued); + /* We rely on this for QP sizing */ + LASSERT(tx->tx_nwrq > 0); + LASSERT(tx->tx_nwrq <= 1 + conn->ibc_max_frags); - LASSERT (credit == 0 || credit == 1); - LASSERT (conn->ibc_outstanding_credits >= 0); - LASSERT (conn->ibc_outstanding_credits <= IBLND_MSG_QUEUE_SIZE(ver)); - LASSERT (conn->ibc_credits >= 0); - LASSERT (conn->ibc_credits <= IBLND_MSG_QUEUE_SIZE(ver)); + LASSERT(credit == 0 || credit == 1); + LASSERT(conn->ibc_outstanding_credits >= 0); + LASSERT(conn->ibc_outstanding_credits <= conn->ibc_queue_depth); + LASSERT(conn->ibc_credits >= 0); + LASSERT(conn->ibc_credits <= conn->ibc_queue_depth); if (conn->ibc_nsends_posted == IBLND_CONCURRENT_SENDS(ver)) { /* tx completions outstanding... */ @@ -1027,9 +1028,9 @@ kiblnd_init_tx_msg (lnet_ni_t *ni, kib_tx_t *tx, int type, int body_nob) int nob = offsetof (kib_msg_t, ibm_u) + body_nob; struct ib_mr *mr = hdev->ibh_mrs; - LASSERT (tx->tx_nwrq >= 0); - LASSERT (tx->tx_nwrq < IBLND_MAX_RDMA_FRAGS + 1); - LASSERT (nob <= IBLND_MSG_SIZE); + LASSERT(tx->tx_nwrq >= 0); + LASSERT(tx->tx_nwrq < IBLND_MAX_RDMA_FRAGS + 1); + LASSERT(nob <= IBLND_MSG_SIZE); LASSERT(mr != NULL); kiblnd_init_msg(tx->tx_msg, type, body_nob); @@ -1083,16 +1084,16 @@ kiblnd_init_rdma(kib_conn_t *conn, kib_tx_t *tx, int type, break; } - if (tx->tx_nwrq == IBLND_RDMA_FRAGS(conn->ibc_version)) { - CERROR("RDMA too fragmented for %s (%d): " - "%d/%d src %d/%d dst frags\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid), - IBLND_RDMA_FRAGS(conn->ibc_version), - srcidx, srcrd->rd_nfrags, - dstidx, dstrd->rd_nfrags); - rc = -EMSGSIZE; - break; - } + if (tx->tx_nwrq >= conn->ibc_max_frags) { + CERROR("RDMA has too many fragments for peer %s (%d), " + "src idx/frags: %d/%d dst idx/frags: %d/%d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), + conn->ibc_max_frags, + srcidx, srcrd->rd_nfrags, + dstidx, dstrd->rd_nfrags); + rc = -EMSGSIZE; + break; + } wrknob = MIN(MIN(kiblnd_rd_frag_size(srcrd, srcidx), kiblnd_rd_frag_size(dstrd, dstidx)), resid); @@ -1365,17 +1366,17 @@ kiblnd_launch_tx (lnet_ni_t *ni, kib_tx_t *tx, lnet_nid_t nid) write_unlock_irqrestore(g_lock, flags); - /* Allocate a peer ready to add to the peer table and retry */ - rc = kiblnd_create_peer(ni, &peer, nid); - if (rc != 0) { - CERROR("Can't create peer %s\n", libcfs_nid2str(nid)); - if (tx != NULL) { - tx->tx_status = -EHOSTUNREACH; - tx->tx_waiting = 0; - kiblnd_tx_done(ni, tx); - } - return; - } + /* Allocate a peer ready to add to the peer table and retry */ + rc = kiblnd_create_peer(ni, &peer, nid); + if (rc != 0) { + CERROR("Can't create peer %s\n", libcfs_nid2str(nid)); + if (tx != NULL) { + tx->tx_status = -EHOSTUNREACH; + tx->tx_waiting = 0; + kiblnd_tx_done(ni, tx); + } + return; + } write_lock_irqsave(g_lock, flags); @@ -2227,7 +2228,7 @@ kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob) if (ni == NULL || /* no matching net */ ni->ni_nid != reqmsg->ibm_dstnid || /* right NET, wrong NID! */ net->ibn_dev != ibdev) { /* wrong device */ - CERROR("Can't accept %s on %s (%s:%d:%pI4h): " + CERROR("Can't accept conn from %s on %s (%s:%d:%pI4h): " "bad dst nid %s\n", libcfs_nid2str(nid), ni == NULL ? "NA" : libcfs_nid2str(ni->ni_nid), ibdev->ibd_ifname, ibdev->ibd_nnets, @@ -2254,32 +2255,46 @@ kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob) goto failed; } - if (reqmsg->ibm_u.connparams.ibcp_queue_depth != - IBLND_MSG_QUEUE_SIZE(version)) { - CERROR("Can't accept %s: incompatible queue depth %d (%d wanted)\n", - libcfs_nid2str(nid), reqmsg->ibm_u.connparams.ibcp_queue_depth, - IBLND_MSG_QUEUE_SIZE(version)); - - if (version == IBLND_MSG_VERSION) - rej.ibr_why = IBLND_REJECT_MSG_QUEUE_SIZE; - - goto failed; - } - - if (reqmsg->ibm_u.connparams.ibcp_max_frags != - IBLND_RDMA_FRAGS(version)) { - CERROR("Can't accept %s(version %x): " - "incompatible max_frags %d (%d wanted)\n", - libcfs_nid2str(nid), version, - reqmsg->ibm_u.connparams.ibcp_max_frags, - IBLND_RDMA_FRAGS(version)); + if (reqmsg->ibm_u.connparams.ibcp_queue_depth > + IBLND_MSG_QUEUE_SIZE(version)) { + CERROR("Can't accept conn from %s, queue depth too large: " + " %d (<=%d wanted)\n", + libcfs_nid2str(nid), + reqmsg->ibm_u.connparams.ibcp_queue_depth, + IBLND_MSG_QUEUE_SIZE(version)); - if (version == IBLND_MSG_VERSION) - rej.ibr_why = IBLND_REJECT_RDMA_FRAGS; + if (version == IBLND_MSG_VERSION) + rej.ibr_why = IBLND_REJECT_MSG_QUEUE_SIZE; - goto failed; + goto failed; + } - } + if (reqmsg->ibm_u.connparams.ibcp_max_frags > + IBLND_RDMA_FRAGS(version)) { + CWARN("Can't accept conn from %s (version %x): " + "max_frags %d too large (%d wanted)\n", + libcfs_nid2str(nid), version, + reqmsg->ibm_u.connparams.ibcp_max_frags, + IBLND_RDMA_FRAGS(version)); + + if (version >= IBLND_MSG_VERSION) + rej.ibr_why = IBLND_REJECT_RDMA_FRAGS; + + goto failed; + } else if (reqmsg->ibm_u.connparams.ibcp_max_frags < + IBLND_RDMA_FRAGS(version) && net->ibn_fmr_ps == NULL) { + CWARN("Can't accept conn from %s (version %x): " + "max_frags %d incompatible without FMR pool " + "(%d wanted)\n", + libcfs_nid2str(nid), version, + reqmsg->ibm_u.connparams.ibcp_max_frags, + IBLND_RDMA_FRAGS(version)); + + if (version >= IBLND_MSG_VERSION) + rej.ibr_why = IBLND_REJECT_RDMA_FRAGS; + + goto failed; + } if (reqmsg->ibm_u.connparams.ibcp_max_msg_size > IBLND_MSG_SIZE) { CERROR("Can't accept %s: message size %d too big (%d max)\n", @@ -2289,13 +2304,13 @@ kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob) goto failed; } - /* assume 'nid' is a new peer; create */ - rc = kiblnd_create_peer(ni, &peer, nid); - if (rc != 0) { - CERROR("Can't create peer for %s\n", libcfs_nid2str(nid)); - rej.ibr_why = IBLND_REJECT_NO_RESOURCES; - goto failed; - } + /* assume 'nid' is a new peer; create */ + rc = kiblnd_create_peer(ni, &peer, nid); + if (rc != 0) { + CERROR("Can't create peer for %s\n", libcfs_nid2str(nid)); + rej.ibr_why = IBLND_REJECT_NO_RESOURCES; + goto failed; + } write_lock_irqsave(g_lock, flags); @@ -2357,7 +2372,8 @@ kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob) write_unlock_irqrestore(g_lock, flags); } - conn = kiblnd_create_conn(peer, cmid, IBLND_CONN_PASSIVE_WAIT, version); + conn = kiblnd_create_conn(peer, cmid, IBLND_CONN_PASSIVE_WAIT, version, + &reqmsg->ibm_u.connparams); if (conn == NULL) { kiblnd_peer_connect_failed(peer, 0, -ENOMEM); kiblnd_peer_decref(peer); @@ -2368,20 +2384,22 @@ kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob) /* conn now "owns" cmid, so I return success from here on to ensure the * CM callback doesn't destroy cmid. */ - conn->ibc_incarnation = reqmsg->ibm_srcstamp; - conn->ibc_credits = IBLND_MSG_QUEUE_SIZE(version); - conn->ibc_reserved_credits = IBLND_MSG_QUEUE_SIZE(version); - LASSERT (conn->ibc_credits + conn->ibc_reserved_credits + IBLND_OOB_MSGS(version) - <= IBLND_RX_MSGS(version)); + conn->ibc_incarnation = reqmsg->ibm_srcstamp; + conn->ibc_credits = reqmsg->ibm_u.connparams.ibcp_queue_depth; + conn->ibc_reserved_credits = reqmsg->ibm_u.connparams.ibcp_queue_depth; + LASSERT(conn->ibc_credits + conn->ibc_reserved_credits + + IBLND_OOB_MSGS(version) <= IBLND_RX_MSGS(conn)); ackmsg = &conn->ibc_connvars->cv_msg; memset(ackmsg, 0, sizeof(*ackmsg)); kiblnd_init_msg(ackmsg, IBLND_MSG_CONNACK, sizeof(ackmsg->ibm_u.connparams)); - ackmsg->ibm_u.connparams.ibcp_queue_depth = IBLND_MSG_QUEUE_SIZE(version); - ackmsg->ibm_u.connparams.ibcp_max_msg_size = IBLND_MSG_SIZE; - ackmsg->ibm_u.connparams.ibcp_max_frags = IBLND_RDMA_FRAGS(version); + ackmsg->ibm_u.connparams.ibcp_queue_depth = + reqmsg->ibm_u.connparams.ibcp_queue_depth; + ackmsg->ibm_u.connparams.ibcp_max_frags = + reqmsg->ibm_u.connparams.ibcp_max_frags; + ackmsg->ibm_u.connparams.ibcp_max_msg_size = IBLND_MSG_SIZE; kiblnd_pack_msg(ni, ackmsg, version, 0, nid, reqmsg->ibm_srcstamp); @@ -2454,10 +2472,9 @@ kiblnd_reconnect (kib_conn_t *conn, int version, } else { retry_now = 1; } - peer->ibp_connecting++; - - peer->ibp_version = version; - peer->ibp_incarnation = incarnation; + peer->ibp_connecting++; + peer->ibp_version = version; + peer->ibp_incarnation = incarnation; } write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); @@ -2470,6 +2487,32 @@ kiblnd_reconnect (kib_conn_t *conn, int version, reason = "Unknown"; break; + case IBLND_REJECT_RDMA_FRAGS: + if (conn->ibc_max_frags <= cp->ibcp_max_frags) { + CNETERR("Unsupported max frags, peer supports %d\n", + cp->ibcp_max_frags); + goto failed; + } else if (*kiblnd_tunables.kib_map_on_demand == 0) { + CNETERR("map_on_demand must be enabled to support " + "map_on_demand peers\n"); + goto failed; + } + + conn->ibc_max_frags = cp->ibcp_max_frags; + reason = "rdma fragments"; + break; + + case IBLND_REJECT_MSG_QUEUE_SIZE: + if (conn->ibc_queue_depth <= cp->ibcp_queue_depth) { + CNETERR("Unsupported queue depth, peer supports %d\n", + cp->ibcp_queue_depth); + goto failed; + } + + conn->ibc_queue_depth = cp->ibcp_queue_depth; + reason = "queue depth"; + break; + case IBLND_REJECT_CONN_STALE: reason = "stale"; break; @@ -2479,15 +2522,22 @@ kiblnd_reconnect (kib_conn_t *conn, int version, break; } - CNETERR("%s: retrying (%s), %x, %x, " - "queue_dep: %d, max_frag: %d, msg_size: %d\n", - libcfs_nid2str(peer->ibp_nid), - reason, IBLND_MSG_VERSION, version, - cp != NULL? cp->ibcp_queue_depth :IBLND_MSG_QUEUE_SIZE(version), - cp != NULL? cp->ibcp_max_frags : IBLND_RDMA_FRAGS(version), - cp != NULL? cp->ibcp_max_msg_size: IBLND_MSG_SIZE); + CNETERR("%s: retrying (%s), %x, %x, " + "queue_depth: %d, max_frags: %d, msg_size: %d\n", + libcfs_nid2str(peer->ibp_nid), + reason, IBLND_MSG_VERSION, version, + conn->ibc_queue_depth, conn->ibc_max_frags, + cp != NULL ? cp->ibcp_max_msg_size : IBLND_MSG_SIZE); kiblnd_connect_peer(peer); + return; + + failed: + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + peer->ibp_connecting--; + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + return; } static void @@ -2581,26 +2631,12 @@ kiblnd_rejected (kib_conn_t *conn, int reason, void *priv, int priv_nob) case IBLND_REJECT_CONN_RACE: case IBLND_REJECT_CONN_STALE: case IBLND_REJECT_CONN_UNCOMPAT: + case IBLND_REJECT_MSG_QUEUE_SIZE: + case IBLND_REJECT_RDMA_FRAGS: kiblnd_reconnect(conn, rej->ibr_version, incarnation, rej->ibr_why, cp); break; - case IBLND_REJECT_MSG_QUEUE_SIZE: - CERROR("%s rejected: incompatible message queue depth %d, %d\n", - libcfs_nid2str(peer->ibp_nid), - cp != NULL ? cp->ibcp_queue_depth : - IBLND_MSG_QUEUE_SIZE(rej->ibr_version), - IBLND_MSG_QUEUE_SIZE(conn->ibc_version)); - break; - - case IBLND_REJECT_RDMA_FRAGS: - CERROR("%s rejected: incompatible # of RDMA fragments %d, %d\n", - libcfs_nid2str(peer->ibp_nid), - cp != NULL ? cp->ibcp_max_frags : - IBLND_RDMA_FRAGS(rej->ibr_version), - IBLND_RDMA_FRAGS(conn->ibc_version)); - break; - case IBLND_REJECT_NO_RESOURCES: CERROR("%s rejected: o2iblnd no resources\n", libcfs_nid2str(peer->ibp_nid)); @@ -2663,25 +2699,25 @@ kiblnd_check_connreply (kib_conn_t *conn, void *priv, int priv_nob) goto failed; } - if (msg->ibm_u.connparams.ibcp_queue_depth != - IBLND_MSG_QUEUE_SIZE(ver)) { - CERROR("%s has incompatible queue depth %d(%d wanted)\n", - libcfs_nid2str(peer->ibp_nid), - msg->ibm_u.connparams.ibcp_queue_depth, - IBLND_MSG_QUEUE_SIZE(ver)); - rc = -EPROTO; - goto failed; - } + if (msg->ibm_u.connparams.ibcp_queue_depth > + conn->ibc_queue_depth) { + CERROR("%s has incompatible queue depth %d (<=%d wanted)\n", + libcfs_nid2str(peer->ibp_nid), + msg->ibm_u.connparams.ibcp_queue_depth, + conn->ibc_queue_depth); + rc = -EPROTO; + goto failed; + } - if (msg->ibm_u.connparams.ibcp_max_frags != - IBLND_RDMA_FRAGS(ver)) { - CERROR("%s has incompatible max_frags %d (%d wanted)\n", - libcfs_nid2str(peer->ibp_nid), - msg->ibm_u.connparams.ibcp_max_frags, - IBLND_RDMA_FRAGS(ver)); - rc = -EPROTO; - goto failed; - } + if (msg->ibm_u.connparams.ibcp_max_frags > + conn->ibc_max_frags) { + CERROR("%s has incompatible max_frags %d (<=%d wanted)\n", + libcfs_nid2str(peer->ibp_nid), + msg->ibm_u.connparams.ibcp_max_frags, + conn->ibc_max_frags); + rc = -EPROTO; + goto failed; + } if (msg->ibm_u.connparams.ibcp_max_msg_size > IBLND_MSG_SIZE) { CERROR("%s max message size %d too big (%d max)\n", @@ -2708,11 +2744,13 @@ kiblnd_check_connreply (kib_conn_t *conn, void *priv, int priv_nob) goto failed; } - conn->ibc_incarnation = msg->ibm_srcstamp; - conn->ibc_credits = - conn->ibc_reserved_credits = IBLND_MSG_QUEUE_SIZE(ver); - LASSERT (conn->ibc_credits + conn->ibc_reserved_credits + IBLND_OOB_MSGS(ver) - <= IBLND_RX_MSGS(ver)); + conn->ibc_incarnation = msg->ibm_srcstamp; + conn->ibc_credits = msg->ibm_u.connparams.ibcp_queue_depth; + conn->ibc_reserved_credits = msg->ibm_u.connparams.ibcp_queue_depth; + conn->ibc_queue_depth = msg->ibm_u.connparams.ibcp_queue_depth; + conn->ibc_max_frags = msg->ibm_u.connparams.ibcp_max_frags; + LASSERT(conn->ibc_credits + conn->ibc_reserved_credits + + IBLND_OOB_MSGS(ver) <= IBLND_RX_MSGS(conn)); kiblnd_connreq_done(conn, 0); return; @@ -2748,7 +2786,8 @@ kiblnd_active_connect (struct rdma_cm_id *cmid) read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); - conn = kiblnd_create_conn(peer, cmid, IBLND_CONN_ACTIVE_CONNECT, version); + conn = kiblnd_create_conn(peer, cmid, IBLND_CONN_ACTIVE_CONNECT, + version, NULL); if (conn == NULL) { kiblnd_peer_connect_failed(peer, 1, -ENOMEM); kiblnd_peer_decref(peer); /* lose cmid's ref */ @@ -2761,11 +2800,11 @@ kiblnd_active_connect (struct rdma_cm_id *cmid) msg = &conn->ibc_connvars->cv_msg; - memset(msg, 0, sizeof(*msg)); - kiblnd_init_msg(msg, IBLND_MSG_CONNREQ, sizeof(msg->ibm_u.connparams)); - msg->ibm_u.connparams.ibcp_queue_depth = IBLND_MSG_QUEUE_SIZE(version); - msg->ibm_u.connparams.ibcp_max_frags = IBLND_RDMA_FRAGS(version); - msg->ibm_u.connparams.ibcp_max_msg_size = IBLND_MSG_SIZE; + memset(msg, 0, sizeof(*msg)); + kiblnd_init_msg(msg, IBLND_MSG_CONNREQ, sizeof(msg->ibm_u.connparams)); + msg->ibm_u.connparams.ibcp_queue_depth = conn->ibc_queue_depth; + msg->ibm_u.connparams.ibcp_max_frags = conn->ibc_max_frags; + msg->ibm_u.connparams.ibcp_max_msg_size = IBLND_MSG_SIZE; kiblnd_pack_msg(peer->ibp_ni, msg, version, 0, peer->ibp_nid, incarnation);