Whamcloud - gitweb
LU-3322 ko2iblnd: Support different configs between systems 94/11794/11
author Jeremy Filizetti <jeremy.filizetti@gmail.com>
Wed, 13 May 2015 21:19:04 +0000 (17:19 -0400)
committer Oleg Drokin <oleg.drokin@intel.com>
Wed, 7 Oct 2015 17:39:25 +0000 (17:39 +0000)
This patch adds support for ko2iblnd to have different values for
peer_credits and map_on_demand between systems.

Signed-off-by: Jeremy Filizetti <jeremy.filizetti@gmail.com>
Change-Id: Idfe5acdfdde5c2185488b92c96d7a83f1705a556
Reviewed-on: http://review.whamcloud.com/11794
Tested-by: Jenkins
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: Amir Shehata <amir.shehata@intel.com>
Reviewed-by: James Simmons <uja.ornl@yahoo.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
lnet/klnds/o2iblnd/o2iblnd.c
lnet/klnds/o2iblnd/o2iblnd.h
lnet/klnds/o2iblnd/o2iblnd_cb.c

index a067bce..6ca9367 100644 (file)
@@ -635,7 +635,7 @@ kiblnd_debug_conn (kib_conn_t *conn)
                kiblnd_debug_tx(list_entry(tmp, kib_tx_t, tx_list));
 
        CDEBUG(D_CONSOLE, "   rxs:\n");
-       for (i = 0; i < IBLND_RX_MSGS(conn->ibc_version); i++)
+       for (i = 0; i < IBLND_RX_MSGS(conn); i++)
                kiblnd_debug_rx(&conn->ibc_rxs[i]);
 
        spin_unlock(&conn->ibc_lock);
@@ -704,7 +704,7 @@ kiblnd_get_completion_vector(kib_conn_t *conn, int cpt)
 
 kib_conn_t *
 kiblnd_create_conn(kib_peer_t *peer, struct rdma_cm_id *cmid,
-                   int state, int version)
+                  int state, int version, kib_connparams_t *cp)
 {
        /* CAVEAT EMPTOR:
         * If the new conn is created successfully it takes over the caller's
@@ -759,6 +759,14 @@ kiblnd_create_conn(kib_peer_t *peer, struct rdma_cm_id *cmid,
        cmid->context = conn;                   /* for future CM callbacks */
        conn->ibc_cmid = cmid;
 
+       if (cp == NULL) {
+               conn->ibc_max_frags = IBLND_CFG_RDMA_FRAGS;
+               conn->ibc_queue_depth = *kiblnd_tunables.kib_peertxcredits;
+       } else {
+               conn->ibc_max_frags = cp->ibcp_max_frags;
+               conn->ibc_queue_depth = cp->ibcp_queue_depth;
+       }
+
        INIT_LIST_HEAD(&conn->ibc_early_rxs);
        INIT_LIST_HEAD(&conn->ibc_tx_noops);
        INIT_LIST_HEAD(&conn->ibc_tx_queue);
@@ -803,14 +811,14 @@ kiblnd_create_conn(kib_peer_t *peer, struct rdma_cm_id *cmid,
        write_unlock_irqrestore(glock, flags);
 
        LIBCFS_CPT_ALLOC(conn->ibc_rxs, lnet_cpt_table(), cpt,
-                        IBLND_RX_MSGS(version) * sizeof(kib_rx_t));
+                        IBLND_RX_MSGS(conn) * sizeof(kib_rx_t));
        if (conn->ibc_rxs == NULL) {
                CERROR("Cannot allocate RX buffers\n");
                goto failed_2;
        }
 
        rc = kiblnd_alloc_pages(&conn->ibc_rx_pages, cpt,
-                               IBLND_RX_MSG_PAGES(version));
+                               IBLND_RX_MSG_PAGES(conn));
        if (rc != 0)
                goto failed_2;
 
@@ -825,14 +833,14 @@ kiblnd_create_conn(kib_peer_t *peer, struct rdma_cm_id *cmid,
 #else
        cq = ib_create_cq(cmid->device,
                          kiblnd_cq_completion, kiblnd_cq_event, conn,
-                         IBLND_CQ_ENTRIES(version),
+                         IBLND_CQ_ENTRIES(conn),
                          kiblnd_get_completion_vector(conn, cpt));
 #endif
-        if (IS_ERR(cq)) {
-                CERROR("Can't create CQ: %ld, cqe: %d\n",
-                       PTR_ERR(cq), IBLND_CQ_ENTRIES(version));
-                goto failed_2;
-        }
+       if (IS_ERR(cq)) {
+               CERROR("Failed to create CQ with %d CQEs: %ld\n",
+                       IBLND_CQ_ENTRIES(conn), PTR_ERR(cq));
+               goto failed_2;
+       }
 
         conn->ibc_cq = cq;
 
@@ -844,8 +852,8 @@ kiblnd_create_conn(kib_peer_t *peer, struct rdma_cm_id *cmid,
 
         init_qp_attr->event_handler = kiblnd_qp_event;
         init_qp_attr->qp_context = conn;
-        init_qp_attr->cap.max_send_wr = IBLND_SEND_WRS(version);
-        init_qp_attr->cap.max_recv_wr = IBLND_RECV_WRS(version);
+       init_qp_attr->cap.max_send_wr = IBLND_SEND_WRS(conn);
+       init_qp_attr->cap.max_recv_wr = IBLND_RECV_WRS(conn);
         init_qp_attr->cap.max_send_sge = 1;
         init_qp_attr->cap.max_recv_sge = 1;
         init_qp_attr->sq_sig_type = IB_SIGNAL_REQ_WR;
@@ -866,23 +874,23 @@ kiblnd_create_conn(kib_peer_t *peer, struct rdma_cm_id *cmid,
         LIBCFS_FREE(init_qp_attr, sizeof(*init_qp_attr));
 
         /* 1 ref for caller and each rxmsg */
-       atomic_set(&conn->ibc_refcount, 1 + IBLND_RX_MSGS(version));
-        conn->ibc_nrx = IBLND_RX_MSGS(version);
+       atomic_set(&conn->ibc_refcount, 1 + IBLND_RX_MSGS(conn));
+       conn->ibc_nrx = IBLND_RX_MSGS(conn);
 
         /* post receives */
-        for (i = 0; i < IBLND_RX_MSGS(version); i++) {
-                rc = kiblnd_post_rx(&conn->ibc_rxs[i],
-                                    IBLND_POSTRX_NO_CREDIT);
-                if (rc != 0) {
-                        CERROR("Can't post rxmsg: %d\n", rc);
+       for (i = 0; i < IBLND_RX_MSGS(conn); i++) {
+               rc = kiblnd_post_rx(&conn->ibc_rxs[i],
+                                   IBLND_POSTRX_NO_CREDIT);
+               if (rc != 0) {
+                       CERROR("Can't post rxmsg: %d\n", rc);
 
-                        /* Make posted receives complete */
-                        kiblnd_abort_receives(conn);
+                       /* Make posted receives complete */
+                       kiblnd_abort_receives(conn);
 
-                        /* correct # of posted buffers
-                         * NB locking needed now I'm racing with completion */
+                       /* correct # of posted buffers
+                        * NB locking needed now I'm racing with completion */
                        spin_lock_irqsave(&sched->ibs_lock, flags);
-                       conn->ibc_nrx -= IBLND_RX_MSGS(version) - i;
+                       conn->ibc_nrx -= IBLND_RX_MSGS(conn) - i;
                        spin_unlock_irqrestore(&sched->ibs_lock, flags);
 
                         /* cmid will be destroyed by CM(ofed) after cm_callback
@@ -891,9 +899,9 @@ kiblnd_create_conn(kib_peer_t *peer, struct rdma_cm_id *cmid,
                         rdma_destroy_qp(conn->ibc_cmid);
                         conn->ibc_cmid = NULL;
 
-                        /* Drop my own and unused rxbuffer refcounts */
-                        while (i++ <= IBLND_RX_MSGS(version))
-                                kiblnd_conn_decref(conn);
+                       /* Drop my own and unused rxbuffer refcounts */
+                       while (i++ <= IBLND_RX_MSGS(conn))
+                               kiblnd_conn_decref(conn);
 
                         return NULL;
                 }
@@ -963,7 +971,7 @@ kiblnd_destroy_conn (kib_conn_t *conn)
 
        if (conn->ibc_rxs != NULL) {
                LIBCFS_FREE(conn->ibc_rxs,
-                           IBLND_RX_MSGS(conn->ibc_version) * sizeof(kib_rx_t));
+                           IBLND_RX_MSGS(conn) * sizeof(kib_rx_t));
        }
 
        if (conn->ibc_connvars != NULL)
@@ -1239,16 +1247,16 @@ kiblnd_unmap_rx_descs(kib_conn_t *conn)
         LASSERT (conn->ibc_rxs != NULL);
         LASSERT (conn->ibc_hdev != NULL);
 
-        for (i = 0; i < IBLND_RX_MSGS(conn->ibc_version); i++) {
-                rx = &conn->ibc_rxs[i];
+       for (i = 0; i < IBLND_RX_MSGS(conn); i++) {
+               rx = &conn->ibc_rxs[i];
 
-                LASSERT (rx->rx_nob >= 0); /* not posted */
+               LASSERT(rx->rx_nob >= 0); /* not posted */
 
-                kiblnd_dma_unmap_single(conn->ibc_hdev->ibh_ibdev,
-                                        KIBLND_UNMAP_ADDR(rx, rx_msgunmap,
-                                                          rx->rx_msgaddr),
-                                        IBLND_MSG_SIZE, DMA_FROM_DEVICE);
-        }
+               kiblnd_dma_unmap_single(conn->ibc_hdev->ibh_ibdev,
+                                       KIBLND_UNMAP_ADDR(rx, rx_msgunmap,
+                                                         rx->rx_msgaddr),
+                                       IBLND_MSG_SIZE, DMA_FROM_DEVICE);
+       }
 
         kiblnd_free_pages(conn->ibc_rx_pages);
 
@@ -1264,34 +1272,34 @@ kiblnd_map_rx_descs(kib_conn_t *conn)
         int             ipg;
         int             i;
 
-        for (pg_off = ipg = i = 0;
-             i < IBLND_RX_MSGS(conn->ibc_version); i++) {
-                pg = conn->ibc_rx_pages->ibp_pages[ipg];
-                rx = &conn->ibc_rxs[i];
+       for (pg_off = ipg = i = 0; i < IBLND_RX_MSGS(conn); i++) {
+               pg = conn->ibc_rx_pages->ibp_pages[ipg];
+               rx = &conn->ibc_rxs[i];
 
-                rx->rx_conn = conn;
-                rx->rx_msg = (kib_msg_t *)(((char *)page_address(pg)) + pg_off);
+               rx->rx_conn = conn;
+               rx->rx_msg = (kib_msg_t *)(((char *)page_address(pg)) + pg_off);
 
-                rx->rx_msgaddr = kiblnd_dma_map_single(conn->ibc_hdev->ibh_ibdev,
-                                                       rx->rx_msg, IBLND_MSG_SIZE,
-                                                       DMA_FROM_DEVICE);
-                LASSERT (!kiblnd_dma_mapping_error(conn->ibc_hdev->ibh_ibdev,
-                                                   rx->rx_msgaddr));
-                KIBLND_UNMAP_ADDR_SET(rx, rx_msgunmap, rx->rx_msgaddr);
+               rx->rx_msgaddr =
+                       kiblnd_dma_map_single(conn->ibc_hdev->ibh_ibdev,
+                                             rx->rx_msg, IBLND_MSG_SIZE,
+                                             DMA_FROM_DEVICE);
+               LASSERT(!kiblnd_dma_mapping_error(conn->ibc_hdev->ibh_ibdev,
+                                                 rx->rx_msgaddr));
+               KIBLND_UNMAP_ADDR_SET(rx, rx_msgunmap, rx->rx_msgaddr);
 
                CDEBUG(D_NET, "rx %d: %p "LPX64"("LPX64")\n",
                       i, rx->rx_msg, rx->rx_msgaddr,
                       (__u64)(page_to_phys(pg) + pg_off));
 
-                pg_off += IBLND_MSG_SIZE;
-                LASSERT (pg_off <= PAGE_SIZE);
+               pg_off += IBLND_MSG_SIZE;
+               LASSERT(pg_off <= PAGE_SIZE);
 
-                if (pg_off == PAGE_SIZE) {
-                        pg_off = 0;
-                        ipg++;
-                        LASSERT (ipg <= IBLND_RX_MSG_PAGES(conn->ibc_version));
-                }
-        }
+               if (pg_off == PAGE_SIZE) {
+                       pg_off = 0;
+                       ipg++;
+                       LASSERT(ipg <= IBLND_RX_MSG_PAGES(conn));
+               }
+       }
 }
 
 static void
@@ -1398,12 +1406,16 @@ kiblnd_map_tx_pool(kib_tx_pool_t *tpo)
 }
 
 struct ib_mr *
-kiblnd_find_rd_dma_mr(kib_hca_dev_t *hdev, kib_rdma_desc_t *rd)
+kiblnd_find_rd_dma_mr(kib_hca_dev_t *hdev, kib_rdma_desc_t *rd,
+                     int negotiated_nfrags)
 {
+       __u16   nfrags = (negotiated_nfrags != -1) ?
+         negotiated_nfrags : *kiblnd_tunables.kib_map_on_demand;
+
        LASSERT(hdev->ibh_mrs != NULL);
 
        if (*kiblnd_tunables.kib_map_on_demand > 0 &&
-           *kiblnd_tunables.kib_map_on_demand <= rd->rd_nfrags)
+           nfrags <= rd->rd_nfrags)
                return NULL;
 
        return hdev->ibh_mrs;
index 9183c53..28541f0 100644 (file)
@@ -179,18 +179,18 @@ kiblnd_concurrent_sends_v1(void)
 #define IBLND_FMR_POOL                 256
 #define IBLND_FMR_POOL_FLUSH           192
 
-/* TX messages (shared by all connections) */
-#define IBLND_TX_MSGS()            (*kiblnd_tunables.kib_ntx)
-
 /* RX messages (per connection) */
-#define IBLND_RX_MSGS(v)            (IBLND_MSG_QUEUE_SIZE(v) * 2 + IBLND_OOB_MSGS(v))
-#define IBLND_RX_MSG_BYTES(v)       (IBLND_RX_MSGS(v) * IBLND_MSG_SIZE)
-#define IBLND_RX_MSG_PAGES(v)      ((IBLND_RX_MSG_BYTES(v) + PAGE_SIZE - 1) / PAGE_SIZE)
+#define IBLND_RX_MSGS(c)       \
+       ((c->ibc_queue_depth) * 2 + IBLND_OOB_MSGS(c->ibc_version))
+#define IBLND_RX_MSG_BYTES(c)       (IBLND_RX_MSGS(c) * IBLND_MSG_SIZE)
+#define IBLND_RX_MSG_PAGES(c)  \
+       ((IBLND_RX_MSG_BYTES(c) + PAGE_SIZE - 1) / PAGE_SIZE)
 
 /* WRs and CQEs (per connection) */
-#define IBLND_RECV_WRS(v)            IBLND_RX_MSGS(v)
-#define IBLND_SEND_WRS(v)          ((IBLND_RDMA_FRAGS(v) + 1) * IBLND_CONCURRENT_SENDS(v))
-#define IBLND_CQ_ENTRIES(v)         (IBLND_RECV_WRS(v) + IBLND_SEND_WRS(v))
+#define IBLND_RECV_WRS(c)            IBLND_RX_MSGS(c)
+#define IBLND_SEND_WRS(c)      \
+       ((c->ibc_max_frags + 1) * IBLND_CONCURRENT_SENDS(c->ibc_version))
+#define IBLND_CQ_ENTRIES(c)         (IBLND_RECV_WRS(c) + IBLND_SEND_WRS(c))
 
 struct kib_hca_dev;
 
@@ -539,8 +539,10 @@ typedef struct {
 #define IBLND_REJECT_CONN_UNCOMPAT   4          /* incompatible version peer */
 #define IBLND_REJECT_CONN_STALE      5          /* stale peer */
 
-#define IBLND_REJECT_RDMA_FRAGS      6          /* Fatal: peer's rdma frags can't match mine */
-#define IBLND_REJECT_MSG_QUEUE_SIZE  7          /* Fatal: peer's msg queue size can't match mine */
+/* peer's rdma frags doesn't match mine */
+#define IBLND_REJECT_RDMA_FRAGS      6
+/* peer's msg queue size doesn't match mine */
+#define IBLND_REJECT_MSG_QUEUE_SIZE  7
 
 /***********************************************************************/
 
@@ -657,6 +659,10 @@ typedef struct kib_conn
        int                     ibc_reserved_credits;
        /* set on comms error */
        int                     ibc_comms_error;
+       /* connections queue depth */
+       __u16                   ibc_queue_depth;
+       /* connections max frags */
+       __u16                   ibc_max_frags;
        /* receive buffers owned */
        unsigned short          ibc_nrx;
        /** rejected by connection race */
@@ -1054,7 +1060,8 @@ static inline unsigned int kiblnd_sg_dma_len(struct ib_device *dev,
 #define KIBLND_CONN_PARAM_LEN(e)        ((e)->param.conn.private_data_len)
 
 struct ib_mr *kiblnd_find_rd_dma_mr(kib_hca_dev_t *hdev,
-                                    kib_rdma_desc_t *rd);
+                                   kib_rdma_desc_t *rd,
+                                   int negotiated_nfrags);
 void kiblnd_map_rx_descs(kib_conn_t *conn);
 void kiblnd_unmap_rx_descs(kib_conn_t *conn);
 void kiblnd_pool_free_node(kib_pool_t *pool, struct list_head *node);
@@ -1079,7 +1086,7 @@ int  kiblnd_cm_callback(struct rdma_cm_id *cmid,
 int  kiblnd_translate_mtu(int value);
 
 int  kiblnd_dev_failover(kib_dev_t *dev);
-int  kiblnd_create_peer (lnet_ni_t *ni, kib_peer_t **peerp, lnet_nid_t nid);
+int  kiblnd_create_peer(lnet_ni_t *ni, kib_peer_t **peerp, lnet_nid_t nid);
 void kiblnd_destroy_peer (kib_peer_t *peer);
 void kiblnd_connect_peer(kib_peer_t *peer);
 void kiblnd_destroy_dev (kib_dev_t *dev);
@@ -1090,7 +1097,7 @@ int  kiblnd_close_stale_conns_locked (kib_peer_t *peer,
 int  kiblnd_close_peer_conns_locked (kib_peer_t *peer, int why);
 
 kib_conn_t *kiblnd_create_conn (kib_peer_t *peer, struct rdma_cm_id *cmid,
-                                int state, int version);
+                               int state, int version, kib_connparams_t *cp);
 void kiblnd_destroy_conn (kib_conn_t *conn);
 void kiblnd_close_conn (kib_conn_t *conn, int error);
 void kiblnd_close_conn_locked (kib_conn_t *conn, int error);
index 4a1916e..c4f6d8c 100644 (file)
@@ -329,19 +329,19 @@ kiblnd_handle_rx (kib_rx_t *rx)
                spin_lock(&conn->ibc_lock);
 
                if (conn->ibc_credits + credits >
-                   IBLND_MSG_QUEUE_SIZE(conn->ibc_version)) {
+                   conn->ibc_queue_depth) {
                        rc2 = conn->ibc_credits;
                        spin_unlock(&conn->ibc_lock);
 
-                        CERROR("Bad credits from %s: %d + %d > %d\n",
-                               libcfs_nid2str(conn->ibc_peer->ibp_nid),
-                               rc2, credits,
-                               IBLND_MSG_QUEUE_SIZE(conn->ibc_version));
+                       CERROR("Bad credits from %s: %d + %d > %d\n",
+                              libcfs_nid2str(conn->ibc_peer->ibp_nid),
+                              rc2, credits,
+                              conn->ibc_queue_depth);
 
-                        kiblnd_close_conn(conn, -EPROTO);
-                        kiblnd_post_rx(rx, IBLND_POSTRX_NO_CREDIT);
-                        return;
-                }
+                       kiblnd_close_conn(conn, -EPROTO);
+                       kiblnd_post_rx(rx, IBLND_POSTRX_NO_CREDIT);
+                       return;
+               }
 
                 conn->ibc_credits += credits;
 
@@ -650,13 +650,14 @@ kiblnd_map_tx(lnet_ni_t *ni, kib_tx_t *tx, kib_rdma_desc_t *rd, int nfrags)
                 nob += rd->rd_frags[i].rf_nob;
         }
 
-        /* looking for pre-mapping MR */
-        mr = kiblnd_find_rd_dma_mr(hdev, rd);
-        if (mr != NULL) {
-                /* found pre-mapping MR */
-                rd->rd_key = (rd != tx->tx_rd) ? mr->rkey : mr->lkey;
-                return 0;
-        }
+       mr = kiblnd_find_rd_dma_mr(hdev, rd,
+                                  (tx->tx_conn != NULL) ?
+                                  tx->tx_conn->ibc_max_frags : -1);
+       if (mr != NULL) {
+               /* found pre-mapping MR */
+               rd->rd_key = (rd != tx->tx_rd) ? mr->rkey : mr->lkey;
+               return 0;
+       }
 
        if (net->ibn_fmr_ps != NULL)
                return kiblnd_fmr_map_tx(net, tx, rd, nob);
@@ -769,16 +770,16 @@ __must_hold(&conn->ibc_lock)
         int                done;
         struct ib_send_wr *bad_wrq;
 
-        LASSERT (tx->tx_queued);
-        /* We rely on this for QP sizing */
-        LASSERT (tx->tx_nwrq > 0);
-        LASSERT (tx->tx_nwrq <= 1 + IBLND_RDMA_FRAGS(ver));
+       LASSERT(tx->tx_queued);
+       /* We rely on this for QP sizing */
+       LASSERT(tx->tx_nwrq > 0);
+       LASSERT(tx->tx_nwrq <= 1 + conn->ibc_max_frags);
 
-        LASSERT (credit == 0 || credit == 1);
-        LASSERT (conn->ibc_outstanding_credits >= 0);
-        LASSERT (conn->ibc_outstanding_credits <= IBLND_MSG_QUEUE_SIZE(ver));
-        LASSERT (conn->ibc_credits >= 0);
-        LASSERT (conn->ibc_credits <= IBLND_MSG_QUEUE_SIZE(ver));
+       LASSERT(credit == 0 || credit == 1);
+       LASSERT(conn->ibc_outstanding_credits >= 0);
+       LASSERT(conn->ibc_outstanding_credits <= conn->ibc_queue_depth);
+       LASSERT(conn->ibc_credits >= 0);
+       LASSERT(conn->ibc_credits <= conn->ibc_queue_depth);
 
         if (conn->ibc_nsends_posted == IBLND_CONCURRENT_SENDS(ver)) {
                 /* tx completions outstanding... */
@@ -1027,9 +1028,9 @@ kiblnd_init_tx_msg (lnet_ni_t *ni, kib_tx_t *tx, int type, int body_nob)
         int                nob = offsetof (kib_msg_t, ibm_u) + body_nob;
        struct ib_mr      *mr = hdev->ibh_mrs;
 
-        LASSERT (tx->tx_nwrq >= 0);
-        LASSERT (tx->tx_nwrq < IBLND_MAX_RDMA_FRAGS + 1);
-        LASSERT (nob <= IBLND_MSG_SIZE);
+       LASSERT(tx->tx_nwrq >= 0);
+       LASSERT(tx->tx_nwrq < IBLND_MAX_RDMA_FRAGS + 1);
+       LASSERT(nob <= IBLND_MSG_SIZE);
        LASSERT(mr != NULL);
 
         kiblnd_init_msg(tx->tx_msg, type, body_nob);
@@ -1083,16 +1084,16 @@ kiblnd_init_rdma(kib_conn_t *conn, kib_tx_t *tx, int type,
                         break;
                 }
 
-                if (tx->tx_nwrq == IBLND_RDMA_FRAGS(conn->ibc_version)) {
-                        CERROR("RDMA too fragmented for %s (%d): "
-                               "%d/%d src %d/%d dst frags\n",
-                               libcfs_nid2str(conn->ibc_peer->ibp_nid),
-                               IBLND_RDMA_FRAGS(conn->ibc_version),
-                               srcidx, srcrd->rd_nfrags,
-                               dstidx, dstrd->rd_nfrags);
-                        rc = -EMSGSIZE;
-                        break;
-                }
+               if (tx->tx_nwrq >= conn->ibc_max_frags) {
+                       CERROR("RDMA has too many fragments for peer %s (%d), "
+                              "src idx/frags: %d/%d dst idx/frags: %d/%d\n",
+                              libcfs_nid2str(conn->ibc_peer->ibp_nid),
+                              conn->ibc_max_frags,
+                              srcidx, srcrd->rd_nfrags,
+                              dstidx, dstrd->rd_nfrags);
+                       rc = -EMSGSIZE;
+                       break;
+               }
 
                 wrknob = MIN(MIN(kiblnd_rd_frag_size(srcrd, srcidx),
                                  kiblnd_rd_frag_size(dstrd, dstidx)), resid);
@@ -1365,17 +1366,17 @@ kiblnd_launch_tx (lnet_ni_t *ni, kib_tx_t *tx, lnet_nid_t nid)
 
        write_unlock_irqrestore(g_lock, flags);
 
-        /* Allocate a peer ready to add to the peer table and retry */
-        rc = kiblnd_create_peer(ni, &peer, nid);
-        if (rc != 0) {
-                CERROR("Can't create peer %s\n", libcfs_nid2str(nid));
-                if (tx != NULL) {
-                        tx->tx_status = -EHOSTUNREACH;
-                        tx->tx_waiting = 0;
-                        kiblnd_tx_done(ni, tx);
-                }
-                return;
-        }
+       /* Allocate a peer ready to add to the peer table and retry */
+       rc = kiblnd_create_peer(ni, &peer, nid);
+       if (rc != 0) {
+               CERROR("Can't create peer %s\n", libcfs_nid2str(nid));
+               if (tx != NULL) {
+                       tx->tx_status = -EHOSTUNREACH;
+                       tx->tx_waiting = 0;
+                       kiblnd_tx_done(ni, tx);
+               }
+               return;
+       }
 
        write_lock_irqsave(g_lock, flags);
 
@@ -2227,7 +2228,7 @@ kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob)
         if (ni == NULL ||                         /* no matching net */
             ni->ni_nid != reqmsg->ibm_dstnid ||   /* right NET, wrong NID! */
             net->ibn_dev != ibdev) {              /* wrong device */
-               CERROR("Can't accept %s on %s (%s:%d:%pI4h): "
+               CERROR("Can't accept conn from %s on %s (%s:%d:%pI4h): "
                        "bad dst nid %s\n", libcfs_nid2str(nid),
                        ni == NULL ? "NA" : libcfs_nid2str(ni->ni_nid),
                        ibdev->ibd_ifname, ibdev->ibd_nnets,
@@ -2254,32 +2255,46 @@ kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob)
                 goto failed;
         }
 
-        if (reqmsg->ibm_u.connparams.ibcp_queue_depth !=
-            IBLND_MSG_QUEUE_SIZE(version)) {
-                CERROR("Can't accept %s: incompatible queue depth %d (%d wanted)\n",
-                       libcfs_nid2str(nid), reqmsg->ibm_u.connparams.ibcp_queue_depth,
-                       IBLND_MSG_QUEUE_SIZE(version));
-
-                if (version == IBLND_MSG_VERSION)
-                        rej.ibr_why = IBLND_REJECT_MSG_QUEUE_SIZE;
-
-                goto failed;
-        }
-
-        if (reqmsg->ibm_u.connparams.ibcp_max_frags !=
-            IBLND_RDMA_FRAGS(version)) {
-                CERROR("Can't accept %s(version %x): "
-                       "incompatible max_frags %d (%d wanted)\n",
-                       libcfs_nid2str(nid), version,
-                       reqmsg->ibm_u.connparams.ibcp_max_frags,
-                       IBLND_RDMA_FRAGS(version));
+       if (reqmsg->ibm_u.connparams.ibcp_queue_depth >
+           IBLND_MSG_QUEUE_SIZE(version)) {
+               CERROR("Can't accept conn from %s, queue depth too large: "
+                      " %d (<=%d wanted)\n",
+                      libcfs_nid2str(nid),
+                      reqmsg->ibm_u.connparams.ibcp_queue_depth,
+                      IBLND_MSG_QUEUE_SIZE(version));
 
-                if (version == IBLND_MSG_VERSION)
-                        rej.ibr_why = IBLND_REJECT_RDMA_FRAGS;
+               if (version == IBLND_MSG_VERSION)
+                       rej.ibr_why = IBLND_REJECT_MSG_QUEUE_SIZE;
 
-                goto failed;
+               goto failed;
+       }
 
-        }
+       if (reqmsg->ibm_u.connparams.ibcp_max_frags >
+           IBLND_RDMA_FRAGS(version)) {
+               CWARN("Can't accept conn from %s (version %x): "
+                     "max_frags %d too large (%d wanted)\n",
+                      libcfs_nid2str(nid), version,
+                      reqmsg->ibm_u.connparams.ibcp_max_frags,
+                      IBLND_RDMA_FRAGS(version));
+
+               if (version >= IBLND_MSG_VERSION)
+                       rej.ibr_why = IBLND_REJECT_RDMA_FRAGS;
+
+               goto failed;
+       } else if (reqmsg->ibm_u.connparams.ibcp_max_frags <
+                  IBLND_RDMA_FRAGS(version) && net->ibn_fmr_ps == NULL) {
+               CWARN("Can't accept conn from %s (version %x): "
+                     "max_frags %d incompatible without FMR pool "
+                     "(%d wanted)\n",
+                     libcfs_nid2str(nid), version,
+                     reqmsg->ibm_u.connparams.ibcp_max_frags,
+                     IBLND_RDMA_FRAGS(version));
+
+               if (version >= IBLND_MSG_VERSION)
+                       rej.ibr_why = IBLND_REJECT_RDMA_FRAGS;
+
+               goto failed;
+       }
 
         if (reqmsg->ibm_u.connparams.ibcp_max_msg_size > IBLND_MSG_SIZE) {
                 CERROR("Can't accept %s: message size %d too big (%d max)\n",
@@ -2289,13 +2304,13 @@ kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob)
                 goto failed;
         }
 
-        /* assume 'nid' is a new peer; create  */
-        rc = kiblnd_create_peer(ni, &peer, nid);
-        if (rc != 0) {
-                CERROR("Can't create peer for %s\n", libcfs_nid2str(nid));
-                rej.ibr_why = IBLND_REJECT_NO_RESOURCES;
-                goto failed;
-        }
+       /* assume 'nid' is a new peer; create  */
+       rc = kiblnd_create_peer(ni, &peer, nid);
+       if (rc != 0) {
+               CERROR("Can't create peer for %s\n", libcfs_nid2str(nid));
+               rej.ibr_why = IBLND_REJECT_NO_RESOURCES;
+               goto failed;
+       }
 
        write_lock_irqsave(g_lock, flags);
 
@@ -2357,7 +2372,8 @@ kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob)
                write_unlock_irqrestore(g_lock, flags);
         }
 
-        conn = kiblnd_create_conn(peer, cmid, IBLND_CONN_PASSIVE_WAIT, version);
+       conn = kiblnd_create_conn(peer, cmid, IBLND_CONN_PASSIVE_WAIT, version,
+                                 &reqmsg->ibm_u.connparams);
         if (conn == NULL) {
                 kiblnd_peer_connect_failed(peer, 0, -ENOMEM);
                 kiblnd_peer_decref(peer);
@@ -2368,20 +2384,22 @@ kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob)
         /* conn now "owns" cmid, so I return success from here on to ensure the
          * CM callback doesn't destroy cmid. */
 
-        conn->ibc_incarnation      = reqmsg->ibm_srcstamp;
-        conn->ibc_credits          = IBLND_MSG_QUEUE_SIZE(version);
-        conn->ibc_reserved_credits = IBLND_MSG_QUEUE_SIZE(version);
-        LASSERT (conn->ibc_credits + conn->ibc_reserved_credits + IBLND_OOB_MSGS(version)
-                 <= IBLND_RX_MSGS(version));
+       conn->ibc_incarnation      = reqmsg->ibm_srcstamp;
+       conn->ibc_credits          = reqmsg->ibm_u.connparams.ibcp_queue_depth;
+       conn->ibc_reserved_credits = reqmsg->ibm_u.connparams.ibcp_queue_depth;
+       LASSERT(conn->ibc_credits + conn->ibc_reserved_credits +
+               IBLND_OOB_MSGS(version) <= IBLND_RX_MSGS(conn));
 
         ackmsg = &conn->ibc_connvars->cv_msg;
         memset(ackmsg, 0, sizeof(*ackmsg));
 
         kiblnd_init_msg(ackmsg, IBLND_MSG_CONNACK,
                         sizeof(ackmsg->ibm_u.connparams));
-        ackmsg->ibm_u.connparams.ibcp_queue_depth  = IBLND_MSG_QUEUE_SIZE(version);
-        ackmsg->ibm_u.connparams.ibcp_max_msg_size = IBLND_MSG_SIZE;
-        ackmsg->ibm_u.connparams.ibcp_max_frags    = IBLND_RDMA_FRAGS(version);
+       ackmsg->ibm_u.connparams.ibcp_queue_depth  =
+               reqmsg->ibm_u.connparams.ibcp_queue_depth;
+       ackmsg->ibm_u.connparams.ibcp_max_frags    =
+               reqmsg->ibm_u.connparams.ibcp_max_frags;
+       ackmsg->ibm_u.connparams.ibcp_max_msg_size = IBLND_MSG_SIZE;
 
         kiblnd_pack_msg(ni, ackmsg, version, 0, nid, reqmsg->ibm_srcstamp);
 
@@ -2454,10 +2472,9 @@ kiblnd_reconnect (kib_conn_t *conn, int version,
                } else {
                        retry_now = 1;
                }
-                peer->ibp_connecting++;
-
-                peer->ibp_version     = version;
-                peer->ibp_incarnation = incarnation;
+               peer->ibp_connecting++;
+               peer->ibp_version     = version;
+               peer->ibp_incarnation = incarnation;
         }
 
        write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
@@ -2470,6 +2487,32 @@ kiblnd_reconnect (kib_conn_t *conn, int version,
                 reason = "Unknown";
                 break;
 
+       case IBLND_REJECT_RDMA_FRAGS:
+               if (conn->ibc_max_frags <= cp->ibcp_max_frags) {
+                       CNETERR("Unsupported max frags, peer supports %d\n",
+                               cp->ibcp_max_frags);
+                       goto failed;
+               } else if (*kiblnd_tunables.kib_map_on_demand == 0) {
+                       CNETERR("map_on_demand must be enabled to support "
+                               "map_on_demand peers\n");
+                       goto failed;
+               }
+
+               conn->ibc_max_frags = cp->ibcp_max_frags;
+               reason = "rdma fragments";
+               break;
+
+       case IBLND_REJECT_MSG_QUEUE_SIZE:
+               if (conn->ibc_queue_depth <= cp->ibcp_queue_depth) {
+                       CNETERR("Unsupported queue depth, peer supports %d\n",
+                               cp->ibcp_queue_depth);
+                       goto failed;
+               }
+
+               conn->ibc_queue_depth = cp->ibcp_queue_depth;
+               reason = "queue depth";
+               break;
+
         case IBLND_REJECT_CONN_STALE:
                 reason = "stale";
                 break;
@@ -2479,15 +2522,22 @@ kiblnd_reconnect (kib_conn_t *conn, int version,
                 break;
         }
 
-        CNETERR("%s: retrying (%s), %x, %x, "
-                "queue_dep: %d, max_frag: %d, msg_size: %d\n",
-                libcfs_nid2str(peer->ibp_nid),
-                reason, IBLND_MSG_VERSION, version,
-                cp != NULL? cp->ibcp_queue_depth :IBLND_MSG_QUEUE_SIZE(version),
-                cp != NULL? cp->ibcp_max_frags   : IBLND_RDMA_FRAGS(version),
-                cp != NULL? cp->ibcp_max_msg_size: IBLND_MSG_SIZE);
+       CNETERR("%s: retrying (%s), %x, %x, "
+               "queue_depth: %d, max_frags: %d, msg_size: %d\n",
+               libcfs_nid2str(peer->ibp_nid),
+               reason, IBLND_MSG_VERSION, version,
+               conn->ibc_queue_depth, conn->ibc_max_frags,
+               cp != NULL ? cp->ibcp_max_msg_size : IBLND_MSG_SIZE);
 
         kiblnd_connect_peer(peer);
+       return;
+
+ failed:
+       write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+       peer->ibp_connecting--;
+       write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+       return;
 }
 
 static void
@@ -2581,26 +2631,12 @@ kiblnd_rejected (kib_conn_t *conn, int reason, void *priv, int priv_nob)
                         case IBLND_REJECT_CONN_RACE:
                         case IBLND_REJECT_CONN_STALE:
                         case IBLND_REJECT_CONN_UNCOMPAT:
+                       case IBLND_REJECT_MSG_QUEUE_SIZE:
+                       case IBLND_REJECT_RDMA_FRAGS:
                                 kiblnd_reconnect(conn, rej->ibr_version,
                                                  incarnation, rej->ibr_why, cp);
                                 break;
 
-                        case IBLND_REJECT_MSG_QUEUE_SIZE:
-                                CERROR("%s rejected: incompatible message queue depth %d, %d\n",
-                                      libcfs_nid2str(peer->ibp_nid),
-                                      cp != NULL ? cp->ibcp_queue_depth :
-                                      IBLND_MSG_QUEUE_SIZE(rej->ibr_version),
-                                      IBLND_MSG_QUEUE_SIZE(conn->ibc_version));
-                                break;
-
-                        case IBLND_REJECT_RDMA_FRAGS:
-                                CERROR("%s rejected: incompatible # of RDMA fragments %d, %d\n",
-                                      libcfs_nid2str(peer->ibp_nid),
-                                      cp != NULL ? cp->ibcp_max_frags :
-                                      IBLND_RDMA_FRAGS(rej->ibr_version),
-                                      IBLND_RDMA_FRAGS(conn->ibc_version));
-                                break;
-
                         case IBLND_REJECT_NO_RESOURCES:
                                 CERROR("%s rejected: o2iblnd no resources\n",
                                        libcfs_nid2str(peer->ibp_nid));
@@ -2663,25 +2699,25 @@ kiblnd_check_connreply (kib_conn_t *conn, void *priv, int priv_nob)
                 goto failed;
         }
 
-        if (msg->ibm_u.connparams.ibcp_queue_depth !=
-            IBLND_MSG_QUEUE_SIZE(ver)) {
-                CERROR("%s has incompatible queue depth %d(%d wanted)\n",
-                       libcfs_nid2str(peer->ibp_nid),
-                       msg->ibm_u.connparams.ibcp_queue_depth,
-                       IBLND_MSG_QUEUE_SIZE(ver));
-                rc = -EPROTO;
-                goto failed;
-        }
+       if (msg->ibm_u.connparams.ibcp_queue_depth >
+           conn->ibc_queue_depth) {
+               CERROR("%s has incompatible queue depth %d (<=%d wanted)\n",
+                      libcfs_nid2str(peer->ibp_nid),
+                      msg->ibm_u.connparams.ibcp_queue_depth,
+                      conn->ibc_queue_depth);
+               rc = -EPROTO;
+               goto failed;
+       }
 
-        if (msg->ibm_u.connparams.ibcp_max_frags !=
-            IBLND_RDMA_FRAGS(ver)) {
-                CERROR("%s has incompatible max_frags %d (%d wanted)\n",
-                       libcfs_nid2str(peer->ibp_nid),
-                       msg->ibm_u.connparams.ibcp_max_frags,
-                       IBLND_RDMA_FRAGS(ver));
-                rc = -EPROTO;
-                goto failed;
-        }
+       if (msg->ibm_u.connparams.ibcp_max_frags >
+           conn->ibc_max_frags) {
+               CERROR("%s has incompatible max_frags %d (<=%d wanted)\n",
+                      libcfs_nid2str(peer->ibp_nid),
+                      msg->ibm_u.connparams.ibcp_max_frags,
+                      conn->ibc_max_frags);
+               rc = -EPROTO;
+               goto failed;
+       }
 
         if (msg->ibm_u.connparams.ibcp_max_msg_size > IBLND_MSG_SIZE) {
                 CERROR("%s max message size %d too big (%d max)\n",
@@ -2708,11 +2744,13 @@ kiblnd_check_connreply (kib_conn_t *conn, void *priv, int priv_nob)
                 goto failed;
         }
 
-        conn->ibc_incarnation      = msg->ibm_srcstamp;
-        conn->ibc_credits          =
-        conn->ibc_reserved_credits = IBLND_MSG_QUEUE_SIZE(ver);
-        LASSERT (conn->ibc_credits + conn->ibc_reserved_credits + IBLND_OOB_MSGS(ver)
-                 <= IBLND_RX_MSGS(ver));
+       conn->ibc_incarnation      = msg->ibm_srcstamp;
+       conn->ibc_credits          = msg->ibm_u.connparams.ibcp_queue_depth;
+       conn->ibc_reserved_credits = msg->ibm_u.connparams.ibcp_queue_depth;
+       conn->ibc_queue_depth      = msg->ibm_u.connparams.ibcp_queue_depth;
+       conn->ibc_max_frags        = msg->ibm_u.connparams.ibcp_max_frags;
+       LASSERT(conn->ibc_credits + conn->ibc_reserved_credits +
+               IBLND_OOB_MSGS(ver) <= IBLND_RX_MSGS(conn));
 
         kiblnd_connreq_done(conn, 0);
         return;
@@ -2748,7 +2786,8 @@ kiblnd_active_connect (struct rdma_cm_id *cmid)
 
        read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
 
-        conn = kiblnd_create_conn(peer, cmid, IBLND_CONN_ACTIVE_CONNECT, version);
+       conn = kiblnd_create_conn(peer, cmid, IBLND_CONN_ACTIVE_CONNECT,
+                                 version, NULL);
         if (conn == NULL) {
                 kiblnd_peer_connect_failed(peer, 1, -ENOMEM);
                 kiblnd_peer_decref(peer); /* lose cmid's ref */
@@ -2761,11 +2800,11 @@ kiblnd_active_connect (struct rdma_cm_id *cmid)
 
         msg = &conn->ibc_connvars->cv_msg;
 
-        memset(msg, 0, sizeof(*msg));
-        kiblnd_init_msg(msg, IBLND_MSG_CONNREQ, sizeof(msg->ibm_u.connparams));
-        msg->ibm_u.connparams.ibcp_queue_depth  = IBLND_MSG_QUEUE_SIZE(version);
-        msg->ibm_u.connparams.ibcp_max_frags    = IBLND_RDMA_FRAGS(version);
-        msg->ibm_u.connparams.ibcp_max_msg_size = IBLND_MSG_SIZE;
+       memset(msg, 0, sizeof(*msg));
+       kiblnd_init_msg(msg, IBLND_MSG_CONNREQ, sizeof(msg->ibm_u.connparams));
+       msg->ibm_u.connparams.ibcp_queue_depth  = conn->ibc_queue_depth;
+       msg->ibm_u.connparams.ibcp_max_frags    = conn->ibc_max_frags;
+       msg->ibm_u.connparams.ibcp_max_msg_size = IBLND_MSG_SIZE;
 
         kiblnd_pack_msg(peer->ibp_ni, msg, version,
                         0, peer->ibp_nid, incarnation);