LU-5718 o2iblnd: multiple sges for work request
author    Liang Zhen <liang.zhen@intel.com>
          Tue, 28 Oct 2014 03:34:26 +0000 (11:34 +0800)
committer Oleg Drokin <oleg.drokin@intel.com>
          Wed, 26 Apr 2017 03:36:38 +0000 (03:36 +0000)
With the current protocol, an LNet router cannot align buffers for
RDMA, so o2iblnd may run into the "too fragmented RDMA" issue while
routing non-page-aligned I/O larger than 512K, because each page is
split into two fragments by kiblnd_init_rdma().
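
For example, assuming 4 KiB pages and the common 256-fragment limit
(IBLND_MAX_RDMA_FRAGS), a misaligned 512K transfer covers 128 source
pages; splitting each page in two already yields 256 work requests,
so any larger transfer overflows the limit.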

With this patch, o2iblnd can attach multiple SGEs to each work
request and combine multiple remote fragments of the same page into
one work request, resolving the "too fragmented RDMA" issue.
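
The packing rule, visible in the kiblnd_init_rdma() hunk below, is:
collect up to wrq_sge source SGEs into one RDMA WRITE, and close the
work request early whenever the destination fragment advances, since
a single RDMA WRITE can only target one contiguous remote region.
For illustration, a hypothetical, self-contained userspace model of
that rule (simplified names and types, not the kernel code):

    /* Model of the SGE-packing loop: one SGE per src/dst overlap;
     * close the current work request when it holds wrq_sge SGEs or
     * when the destination fragment advances. */
    #include <stdio.h>

    #define MIN(a, b) ((a) < (b) ? (a) : (b))

    int main(void)
    {
            /* a 1 KiB-misaligned transfer: 4 source fragments
             * against 3 page-sized (4 KiB) destination fragments */
            int src[] = { 1024, 4096, 4096, 3072 };
            int dst[] = { 4096, 4096, 4096 };
            int nsrc = 4, ndst = 3;
            int wrq_sge = 2;                /* models the tunable */
            int srcidx = 0, dstidx = 0;
            int srcleft = src[0], dstleft = dst[0];
            int nwrq = 0, nsge = 0, in_wrq = 0;

            while (srcidx < nsrc && dstidx < ndst) {
                    int prev = dstidx;
                    int nob = MIN(srcleft, dstleft); /* one SGE */

                    nsge++;
                    srcleft -= nob;
                    dstleft -= nob;
                    if (srcleft == 0 && ++srcidx < nsrc)
                            srcleft = src[srcidx];
                    if (dstleft == 0 && ++dstidx < ndst)
                            dstleft = dst[dstidx];

                    in_wrq++;
                    if (in_wrq == wrq_sge || dstidx != prev) {
                            nwrq++;         /* close this request */
                            in_wrq = 0;
                    }
            }
            if (in_wrq > 0)
                    nwrq++;                 /* close a trailing one */

            printf("%d SGEs packed into %d work requests\n",
                   nsge, nwrq);
            return 0;
    }

This prints "6 SGEs packed into 3 work requests"; the old
one-SGE-per-request code would have needed 6 work requests for the
same transfer.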

Signed-off-by: Liang Zhen <liang.zhen@intel.com>
Change-Id: Id57a74dc92801b012956ab785233aa87cac14263
Reviewed-on: https://review.whamcloud.com/12451
Tested-by: Jenkins
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: Amir Shehata <amir.shehata@intel.com>
Reviewed-by: Nathaniel Clark <nathaniel.l.clark@intel.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
lnet/klnds/o2iblnd/o2iblnd.c
lnet/klnds/o2iblnd/o2iblnd.h
lnet/klnds/o2iblnd/o2iblnd_cb.c
lnet/klnds/o2iblnd/o2iblnd_modparams.c

diff --git a/lnet/klnds/o2iblnd/o2iblnd.c b/lnet/klnds/o2iblnd/o2iblnd.c
index a5e7543..b17c5e3 100644
@@ -840,16 +840,16 @@ kiblnd_create_conn(kib_peer_ni_t *peer_ni, struct rdma_cm_id *cmid,
                goto failed_2;
        }
 
-        init_qp_attr->event_handler = kiblnd_qp_event;
-        init_qp_attr->qp_context = conn;
+       init_qp_attr->event_handler = kiblnd_qp_event;
+       init_qp_attr->qp_context = conn;
        init_qp_attr->cap.max_send_wr = IBLND_SEND_WRS(conn);
        init_qp_attr->cap.max_recv_wr = IBLND_RECV_WRS(conn);
-        init_qp_attr->cap.max_send_sge = 1;
-        init_qp_attr->cap.max_recv_sge = 1;
-        init_qp_attr->sq_sig_type = IB_SIGNAL_REQ_WR;
-        init_qp_attr->qp_type = IB_QPT_RC;
-        init_qp_attr->send_cq = cq;
-        init_qp_attr->recv_cq = cq;
+       init_qp_attr->cap.max_send_sge = *kiblnd_tunables.kib_wrq_sge;
+       init_qp_attr->cap.max_recv_sge = 1;
+       init_qp_attr->sq_sig_type = IB_SIGNAL_REQ_WR;
+       init_qp_attr->qp_type = IB_QPT_RC;
+       init_qp_attr->send_cq = cq;
+       init_qp_attr->recv_cq = cq;
 
        conn->ibc_sched = sched;
 
@@ -862,9 +862,12 @@ kiblnd_create_conn(kib_peer_ni_t *peer_ni, struct rdma_cm_id *cmid,
        } while (rc);
 
        if (rc) {
-               CERROR("Can't create QP: %d, send_wr: %d, recv_wr: %d\n",
-                       rc, init_qp_attr->cap.max_send_wr,
-                       init_qp_attr->cap.max_recv_wr);
+               CERROR("Can't create QP: %d, send_wr: %d, recv_wr: %d, "
+                      "send_sge: %d, recv_sge: %d\n",
+                      rc, init_qp_attr->cap.max_send_wr,
+                      init_qp_attr->cap.max_recv_wr,
+                      init_qp_attr->cap.max_send_sge,
+                      init_qp_attr->cap.max_recv_sge);
                goto failed_2;
        }
 
@@ -2230,7 +2233,8 @@ kiblnd_destroy_tx_pool(kib_pool_t *pool)
                 goto out;
 
         for (i = 0; i < pool->po_size; i++) {
-                kib_tx_t *tx = &tpo->tpo_tx_descs[i];
+               kib_tx_t *tx = &tpo->tpo_tx_descs[i];
+               int       wrq_sge = *kiblnd_tunables.kib_wrq_sge;
 
                list_del(&tx->tx_list);
                 if (tx->tx_pages != NULL)
@@ -2245,10 +2249,10 @@ kiblnd_destroy_tx_pool(kib_pool_t *pool)
                         LIBCFS_FREE(tx->tx_wrq,
                                     (1 + IBLND_MAX_RDMA_FRAGS) *
                                     sizeof(*tx->tx_wrq));
-                if (tx->tx_sge != NULL)
-                        LIBCFS_FREE(tx->tx_sge,
-                                    (1 + IBLND_MAX_RDMA_FRAGS) *
-                                    sizeof(*tx->tx_sge));
+               if (tx->tx_sge != NULL)
+                       LIBCFS_FREE(tx->tx_sge,
+                                   (1 + IBLND_MAX_RDMA_FRAGS) * wrq_sge *
+                                   sizeof(*tx->tx_sge));
                 if (tx->tx_rd != NULL)
                         LIBCFS_FREE(tx->tx_rd,
                                     offsetof(kib_rdma_desc_t,
@@ -2306,7 +2310,8 @@ kiblnd_create_tx_pool(kib_poolset_t *ps, int size, kib_pool_t **pp_po)
         memset(tpo->tpo_tx_descs, 0, size * sizeof(kib_tx_t));
 
         for (i = 0; i < size; i++) {
-                kib_tx_t *tx = &tpo->tpo_tx_descs[i];
+               kib_tx_t *tx = &tpo->tpo_tx_descs[i];
+               int       wrq_sge = *kiblnd_tunables.kib_wrq_sge;
 
                 tx->tx_pool = tpo;
                if (ps->ps_net->ibn_fmr_ps != NULL) {
@@ -2332,7 +2337,7 @@ kiblnd_create_tx_pool(kib_poolset_t *ps, int size, kib_pool_t **pp_po)
                        break;
 
                LIBCFS_CPT_ALLOC(tx->tx_sge, lnet_cpt_table(), ps->ps_cpt,
-                                (1 + IBLND_MAX_RDMA_FRAGS) *
+                                (1 + IBLND_MAX_RDMA_FRAGS) * wrq_sge *
                                 sizeof(*tx->tx_sge));
                if (tx->tx_sge == NULL)
                        break;
diff --git a/lnet/klnds/o2iblnd/o2iblnd.h b/lnet/klnds/o2iblnd/o2iblnd.h
index e0409b5..6e64fc5 100644
@@ -108,6 +108,7 @@ typedef struct
        int              *kib_use_priv_port;    /* use privileged port for active connect */
        /* # threads on each CPT */
        int              *kib_nscheds;
+       int              *kib_wrq_sge;          /* # sg elements per wrq */
 } kib_tunables_t;
 
 extern kib_tunables_t  kiblnd_tunables;
@@ -617,8 +618,12 @@ typedef struct kib_tx                           /* transmit message */
        __u64                   tx_msgaddr;
        /* for dma_unmap_single() */
        DECLARE_PCI_UNMAP_ADDR(tx_msgunmap);
+       /** sge for tx_msgaddr */
+       struct ib_sge           tx_msgsge;
        /* # send work items */
        int                     tx_nwrq;
+       /* # used scatter/gather elements */
+       int                     tx_nsge;
        /* send work items... */
        struct ib_rdma_wr       *tx_wrq;
        /* ...and their memory */
diff --git a/lnet/klnds/o2iblnd/o2iblnd_cb.c b/lnet/klnds/o2iblnd/o2iblnd_cb.c
index 603013d..aa14b1f 100644
@@ -75,7 +75,7 @@ kiblnd_tx_done(kib_tx_t *tx)
                tx->tx_conn = NULL;
        }
 
-       tx->tx_nwrq = 0;
+       tx->tx_nwrq = tx->tx_nsge = 0;
        tx->tx_status = 0;
 
        kiblnd_pool_free_node(&tx->tx_pool->tpo_pool, &tx->tx_list);
@@ -422,7 +422,7 @@ kiblnd_handle_rx (kib_rx_t *rx)
                  * (a) I can overwrite tx_msg since my peer_ni has received it!
                  * (b) tx_waiting set tells tx_complete() it's not done. */
 
-                tx->tx_nwrq = 0;                /* overwrite PUT_REQ */
+               tx->tx_nwrq = tx->tx_nsge = 0;  /* overwrite PUT_REQ */
 
                 rc2 = kiblnd_init_rdma(conn, tx, IBLND_MSG_PUT_DONE,
                                        kiblnd_rd_size(&msg->ibm_u.putack.ibpam_rd),
@@ -765,7 +765,7 @@ __must_hold(&conn->ibc_lock)
 
        LASSERT(tx->tx_queued);
        /* We rely on this for QP sizing */
-       LASSERT(tx->tx_nwrq > 0);
+       LASSERT(tx->tx_nwrq > 0 && tx->tx_nsge >= 0);
        LASSERT(tx->tx_nwrq <= 1 + conn->ibc_max_frags);
 
        LASSERT(credit == 0 || credit == 1);
@@ -1024,7 +1024,7 @@ static void
 kiblnd_init_tx_msg(struct lnet_ni *ni, kib_tx_t *tx, int type, int body_nob)
 {
        kib_hca_dev_t *hdev = tx->tx_pool->tpo_hdev;
-       struct ib_sge *sge = &tx->tx_sge[tx->tx_nwrq];
+       struct ib_sge *sge = &tx->tx_msgsge;
        struct ib_rdma_wr *wrq;
        int nob = offsetof(kib_msg_t, ibm_u) + body_nob;
 #ifdef HAVE_IB_GET_DMA_MR
@@ -1065,34 +1065,43 @@ static int
 kiblnd_init_rdma(kib_conn_t *conn, kib_tx_t *tx, int type,
                 int resid, kib_rdma_desc_t *dstrd, __u64 dstcookie)
 {
-       kib_msg_t         *ibmsg = tx->tx_msg;
+       kib_msg_t         *ibmsg = tx->tx_msg;
        kib_rdma_desc_t   *srcrd = tx->tx_rd;
-       struct ib_sge     *sge = &tx->tx_sge[0];
-       struct ib_rdma_wr *wrq;
-       int                rc  = resid;
-       int                srcidx;
-       int                dstidx;
-       int                wrknob;
+       struct ib_rdma_wr *wrq = NULL;
+       struct ib_sge     *sge;
+       int                rc  = resid;
+       int                srcidx;
+       int                dstidx;
+       int                sge_nob;
+       int                wrq_sge;
 
-       LASSERT (!in_interrupt());
-       LASSERT (tx->tx_nwrq == 0);
-       LASSERT (type == IBLND_MSG_GET_DONE ||
-                type == IBLND_MSG_PUT_DONE);
+       LASSERT(!in_interrupt());
+       LASSERT(tx->tx_nwrq == 0 && tx->tx_nsge == 0);
+       LASSERT(type == IBLND_MSG_GET_DONE || type == IBLND_MSG_PUT_DONE);
+
+       if (kiblnd_rd_size(srcrd) > conn->ibc_max_frags << PAGE_SHIFT) {
+               CERROR("RDMA is too large for peer %s (%d), src size: %d dst size: %d\n",
+                      libcfs_nid2str(conn->ibc_peer->ibp_nid),
+                      conn->ibc_max_frags << PAGE_SHIFT,
+                      kiblnd_rd_size(srcrd), kiblnd_rd_size(dstrd));
+               GOTO(too_big, rc = -EMSGSIZE);
+       }
 
-       srcidx = dstidx = 0;
+       for (srcidx = dstidx = wrq_sge = sge_nob = 0;
+            resid > 0; resid -= sge_nob) {
+               int     prev = dstidx;
 
-        while (resid > 0) {
-                if (srcidx >= srcrd->rd_nfrags) {
-                        CERROR("Src buffer exhausted: %d frags\n", srcidx);
-                        rc = -EPROTO;
-                        break;
-                }
+               if (srcidx >= srcrd->rd_nfrags) {
+                       CERROR("Src buffer exhausted: %d frags\n", srcidx);
+                       rc = -EPROTO;
+                       break;
+               }
 
-                if (dstidx == dstrd->rd_nfrags) {
-                        CERROR("Dst buffer exhausted: %d frags\n", dstidx);
-                        rc = -EPROTO;
-                        break;
-                }
+               if (dstidx >= dstrd->rd_nfrags) {
+                       CERROR("Dst buffer exhausted: %d frags\n", dstidx);
+                       rc = -EPROTO;
+                       break;
+               }
 
                if (tx->tx_nwrq >= conn->ibc_max_frags) {
                        CERROR("RDMA has too many fragments for peer_ni %s (%d), "
@@ -1105,43 +1114,51 @@ kiblnd_init_rdma(kib_conn_t *conn, kib_tx_t *tx, int type,
                        break;
                }
 
-                wrknob = MIN(MIN(kiblnd_rd_frag_size(srcrd, srcidx),
-                                 kiblnd_rd_frag_size(dstrd, dstidx)), resid);
+               sge_nob = MIN(MIN(kiblnd_rd_frag_size(srcrd, srcidx),
+                                 kiblnd_rd_frag_size(dstrd, dstidx)), resid);
 
-                sge = &tx->tx_sge[tx->tx_nwrq];
-                sge->addr   = kiblnd_rd_frag_addr(srcrd, srcidx);
-                sge->lkey   = kiblnd_rd_frag_key(srcrd, srcidx);
-                sge->length = wrknob;
+               sge = &tx->tx_sge[tx->tx_nsge];
+               sge->addr   = kiblnd_rd_frag_addr(srcrd, srcidx);
+               sge->lkey   = kiblnd_rd_frag_key(srcrd, srcidx);
+               sge->length = sge_nob;
 
-                wrq = &tx->tx_wrq[tx->tx_nwrq];
+               if (wrq_sge == 0) {
+                       wrq = &tx->tx_wrq[tx->tx_nwrq];
 
-               wrq->wr.next            = &(wrq + 1)->wr;
-               wrq->wr.wr_id           = kiblnd_ptr2wreqid(tx, IBLND_WID_RDMA);
-               wrq->wr.sg_list         = sge;
-               wrq->wr.num_sge         = 1;
-               wrq->wr.opcode          = IB_WR_RDMA_WRITE;
-               wrq->wr.send_flags      = 0;
+                       wrq->wr.next    = &(wrq + 1)->wr;
+                       wrq->wr.wr_id   = kiblnd_ptr2wreqid(tx, IBLND_WID_RDMA);
+                       wrq->wr.sg_list = sge;
+                       wrq->wr.opcode  = IB_WR_RDMA_WRITE;
+                       wrq->wr.send_flags = 0;
 
 #ifdef HAVE_IB_RDMA_WR
-               wrq->remote_addr        = kiblnd_rd_frag_addr(dstrd, dstidx);
-               wrq->rkey               = kiblnd_rd_frag_key(dstrd, dstidx);
+                       wrq->remote_addr        = kiblnd_rd_frag_addr(dstrd,
+                                                                     dstidx);
+                       wrq->rkey               = kiblnd_rd_frag_key(dstrd,
+                                                                    dstidx);
 #else
-               wrq->wr.wr.rdma.remote_addr = kiblnd_rd_frag_addr(dstrd, dstidx);
-               wrq->wr.wr.rdma.rkey    = kiblnd_rd_frag_key(dstrd, dstidx);
+                       wrq->wr.wr.rdma.remote_addr = kiblnd_rd_frag_addr(dstrd,
+                                                                       dstidx);
+                       wrq->wr.wr.rdma.rkey    = kiblnd_rd_frag_key(dstrd,
+                                                                    dstidx);
 #endif
+               }
 
-                srcidx = kiblnd_rd_consume_frag(srcrd, srcidx, wrknob);
-                dstidx = kiblnd_rd_consume_frag(dstrd, dstidx, wrknob);
-
-                resid -= wrknob;
+               srcidx = kiblnd_rd_consume_frag(srcrd, srcidx, sge_nob);
+               dstidx = kiblnd_rd_consume_frag(dstrd, dstidx, sge_nob);
 
-                tx->tx_nwrq++;
-                wrq++;
-                sge++;
-        }
+               wrq_sge++;
+               if (wrq_sge == *kiblnd_tunables.kib_wrq_sge || dstidx != prev) {
+                       tx->tx_nwrq++;
+                       wrq->wr.num_sge = wrq_sge;
+                       wrq_sge = 0;
+               }
+               tx->tx_nsge++;
+       }
 
-        if (rc < 0)                             /* no RDMA if completing with failure */
-                tx->tx_nwrq = 0;
+too_big:
+       if (rc < 0)     /* no RDMA if completing with failure */
+               tx->tx_nwrq = tx->tx_nsge = 0;
 
         ibmsg->ibm_u.completion.ibcm_status = rc;
         ibmsg->ibm_u.completion.ibcm_cookie = dstcookie;
@@ -3677,11 +3694,11 @@ kiblnd_scheduler(void *arg)
 int
 kiblnd_failover_thread(void *arg)
 {
-       rwlock_t                *glock = &kiblnd_data.kib_global_lock;
-       kib_dev_t               *dev;
-       wait_queue_t            wait;
-       unsigned long           flags;
-       int                     rc;
+       rwlock_t        *glock = &kiblnd_data.kib_global_lock;
+       kib_dev_t       *dev;
+       wait_queue_t     wait;
+       unsigned long    flags;
+       int              rc;
 
        LASSERT(*kiblnd_tunables.kib_dev_failover != 0);
 
diff --git a/lnet/klnds/o2iblnd/o2iblnd_modparams.c b/lnet/klnds/o2iblnd/o2iblnd_modparams.c
index 4911431..f7f90e1 100644
@@ -146,6 +146,10 @@ static int use_privileged_port = 1;
 module_param(use_privileged_port, int, 0644);
 MODULE_PARM_DESC(use_privileged_port, "use privileged port when initiating connection");
 
+static unsigned int wrq_sge = 1;
+module_param(wrq_sge, uint, 0444);
+MODULE_PARM_DESC(wrq_sge, "# scatter/gather element per work request");
+
 kib_tunables_t kiblnd_tunables = {
         .kib_dev_failover           = &dev_failover,
         .kib_service                = &service,
@@ -159,7 +163,8 @@ kib_tunables_t kiblnd_tunables = {
         .kib_ib_mtu                 = &ib_mtu,
         .kib_require_priv_port      = &require_privileged_port,
        .kib_use_priv_port          = &use_privileged_port,
-       .kib_nscheds                = &nscheds
+       .kib_nscheds                = &nscheds,
+       .kib_wrq_sge                = &wrq_sge,
 };
 
 static struct lnet_ioctl_config_o2iblnd_tunables default_tunables;
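
Note: wrq_sge is registered with mode 0444, so it is read-only at
runtime; to try a different value it would have to be set at module
load time, e.g. with a (hypothetical) modprobe configuration line:

    options ko2iblnd wrq_sge=2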