From: Liang Zhen
Date: Tue, 28 Oct 2014 03:34:26 +0000 (+0800)
Subject: LU-5718 o2iblnd: multiple sges for work request
X-Git-Tag: 2.9.57~69
X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=fda19c748016c9f57f71278b597fd8a651268f66

LU-5718 o2iblnd: multiple sges for work request

With the current protocol an LNet router cannot align buffers for RDMA,
so o2iblnd may hit the "too fragmented RDMA" error while routing
non-page-aligned I/O larger than 512K, because kiblnd_init_rdma() splits
each page into two fragments.

With this patch, o2iblnd can attach multiple sges to each work request
and combine multiple remote fragments of the same page into one work
request, which resolves the "too fragmented RDMA" issue.

Signed-off-by: Liang Zhen
Change-Id: Id57a74dc92801b012956ab785233aa87cac14263
Reviewed-on: https://review.whamcloud.com/12451
Tested-by: Jenkins
Tested-by: Maloo
Reviewed-by: Amir Shehata
Reviewed-by: Nathaniel Clark
Reviewed-by: Oleg Drokin
---

diff --git a/lnet/klnds/o2iblnd/o2iblnd.c b/lnet/klnds/o2iblnd/o2iblnd.c
index a5e7543..b17c5e3 100644
--- a/lnet/klnds/o2iblnd/o2iblnd.c
+++ b/lnet/klnds/o2iblnd/o2iblnd.c
@@ -840,16 +840,16 @@ kiblnd_create_conn(kib_peer_ni_t *peer_ni, struct rdma_cm_id *cmid,
 		goto failed_2;
 	}
 
-	init_qp_attr->event_handler = kiblnd_qp_event;
-	init_qp_attr->qp_context = conn;
+	init_qp_attr->event_handler = kiblnd_qp_event;
+	init_qp_attr->qp_context = conn;
 	init_qp_attr->cap.max_send_wr = IBLND_SEND_WRS(conn);
 	init_qp_attr->cap.max_recv_wr = IBLND_RECV_WRS(conn);
-	init_qp_attr->cap.max_send_sge = 1;
-	init_qp_attr->cap.max_recv_sge = 1;
-	init_qp_attr->sq_sig_type = IB_SIGNAL_REQ_WR;
-	init_qp_attr->qp_type = IB_QPT_RC;
-	init_qp_attr->send_cq = cq;
-	init_qp_attr->recv_cq = cq;
+	init_qp_attr->cap.max_send_sge = *kiblnd_tunables.kib_wrq_sge;
+	init_qp_attr->cap.max_recv_sge = 1;
+	init_qp_attr->sq_sig_type = IB_SIGNAL_REQ_WR;
+	init_qp_attr->qp_type = IB_QPT_RC;
+	init_qp_attr->send_cq = cq;
+	init_qp_attr->recv_cq = cq;
 
 	conn->ibc_sched = sched;
 
@@ -862,9 +862,12 @@ kiblnd_create_conn(kib_peer_ni_t *peer_ni, struct rdma_cm_id *cmid,
 	} while (rc);
 
 	if (rc) {
-		CERROR("Can't create QP: %d, send_wr: %d, recv_wr: %d\n",
-		       rc, init_qp_attr->cap.max_send_wr,
-		       init_qp_attr->cap.max_recv_wr);
+		CERROR("Can't create QP: %d, send_wr: %d, recv_wr: %d, "
+		       "send_sge: %d, recv_sge: %d\n",
+		       rc, init_qp_attr->cap.max_send_wr,
+		       init_qp_attr->cap.max_recv_wr,
+		       init_qp_attr->cap.max_send_sge,
+		       init_qp_attr->cap.max_recv_sge);
 		goto failed_2;
 	}
 
@@ -2230,7 +2233,8 @@ kiblnd_destroy_tx_pool(kib_pool_t *pool)
 		goto out;
 
 	for (i = 0; i < pool->po_size; i++) {
-		kib_tx_t *tx = &tpo->tpo_tx_descs[i];
+		kib_tx_t *tx = &tpo->tpo_tx_descs[i];
+		int       wrq_sge = *kiblnd_tunables.kib_wrq_sge;
 
 		list_del(&tx->tx_list);
 		if (tx->tx_pages != NULL)
@@ -2245,10 +2249,10 @@ kiblnd_destroy_tx_pool(kib_pool_t *pool)
 			LIBCFS_FREE(tx->tx_wrq,
 				    (1 + IBLND_MAX_RDMA_FRAGS) *
 				    sizeof(*tx->tx_wrq));
 
-		if (tx->tx_sge != NULL)
-			LIBCFS_FREE(tx->tx_sge,
-				    (1 + IBLND_MAX_RDMA_FRAGS) *
-				    sizeof(*tx->tx_sge));
+		if (tx->tx_sge != NULL)
+			LIBCFS_FREE(tx->tx_sge,
+				    (1 + IBLND_MAX_RDMA_FRAGS) * wrq_sge *
+				    sizeof(*tx->tx_sge));
 
 		if (tx->tx_rd != NULL)
 			LIBCFS_FREE(tx->tx_rd,
 				    offsetof(kib_rdma_desc_t,
@@ -2306,7 +2310,8 @@ kiblnd_create_tx_pool(kib_poolset_t *ps, int size, kib_pool_t **pp_po)
 	memset(tpo->tpo_tx_descs, 0, size * sizeof(kib_tx_t));
 
 	for (i = 0; i < size; i++) {
-		kib_tx_t *tx = &tpo->tpo_tx_descs[i];
+		kib_tx_t *tx = &tpo->tpo_tx_descs[i];
+		int       wrq_sge = *kiblnd_tunables.kib_wrq_sge;
 
 		tx->tx_pool = tpo;
 		if (ps->ps_net->ibn_fmr_ps != NULL) {
@@ -2332,7 +2337,7 @@ kiblnd_create_tx_pool(kib_poolset_t *ps, int size, kib_pool_t **pp_po)
 			break;
 
 		LIBCFS_CPT_ALLOC(tx->tx_sge, lnet_cpt_table(), ps->ps_cpt,
-				 (1 + IBLND_MAX_RDMA_FRAGS) *
+				 (1 + IBLND_MAX_RDMA_FRAGS) * wrq_sge *
 				 sizeof(*tx->tx_sge));
 		if (tx->tx_sge == NULL)
 			break;
diff --git a/lnet/klnds/o2iblnd/o2iblnd.h b/lnet/klnds/o2iblnd/o2iblnd.h
index e0409b5..6e64fc5 100644
--- a/lnet/klnds/o2iblnd/o2iblnd.h
+++ b/lnet/klnds/o2iblnd/o2iblnd.h
@@ -108,6 +108,7 @@ typedef struct
 	int		*kib_use_priv_port; /* use privileged port for active connect */
 	/* # threads on each CPT */
 	int		*kib_nscheds;
+	int		*kib_wrq_sge;	/* # sg elements per wrq */
 } kib_tunables_t;
 
 extern kib_tunables_t  kiblnd_tunables;
@@ -617,8 +618,12 @@ typedef struct kib_tx			/* transmit message */
 	__u64		tx_msgaddr;	/* for dma_unmap_single() */
 	DECLARE_PCI_UNMAP_ADDR(tx_msgunmap);
+	/** sge for tx_msgaddr */
+	struct ib_sge	tx_msgsge;
 	/* # send work items */
 	int		tx_nwrq;
+	/* # used scatter/gather elements */
+	int		tx_nsge;
 	/* send work items... */
 	struct ib_rdma_wr *tx_wrq;
 	/* ...and their memory */
diff --git a/lnet/klnds/o2iblnd/o2iblnd_cb.c b/lnet/klnds/o2iblnd/o2iblnd_cb.c
index 603013d..aa14b1f 100644
--- a/lnet/klnds/o2iblnd/o2iblnd_cb.c
+++ b/lnet/klnds/o2iblnd/o2iblnd_cb.c
@@ -75,7 +75,7 @@ kiblnd_tx_done(kib_tx_t *tx)
 		tx->tx_conn = NULL;
 	}
 
-	tx->tx_nwrq = 0;
+	tx->tx_nwrq = tx->tx_nsge = 0;
 	tx->tx_status = 0;
 
 	kiblnd_pool_free_node(&tx->tx_pool->tpo_pool, &tx->tx_list);
@@ -422,7 +422,7 @@ kiblnd_handle_rx (kib_rx_t *rx)
 		 * (a) I can overwrite tx_msg since my peer_ni has received it!
 		 * (b) tx_waiting set tells tx_complete() it's not done. */
 
-		tx->tx_nwrq = 0;	/* overwrite PUT_REQ */
+		tx->tx_nwrq = tx->tx_nsge = 0; /* overwrite PUT_REQ */
 		rc2 = kiblnd_init_rdma(conn, tx, IBLND_MSG_PUT_DONE,
 				       kiblnd_rd_size(&msg->ibm_u.putack.ibpam_rd),
@@ -765,7 +765,7 @@ __must_hold(&conn->ibc_lock)
 	LASSERT(tx->tx_queued);
 	/* We rely on this for QP sizing */
-	LASSERT(tx->tx_nwrq > 0);
+	LASSERT(tx->tx_nwrq > 0 && tx->tx_nsge >= 0);
 	LASSERT(tx->tx_nwrq <= 1 + conn->ibc_max_frags);
 
 	LASSERT(credit == 0 || credit == 1);
@@ -1024,7 +1024,7 @@ static void
 kiblnd_init_tx_msg(struct lnet_ni *ni, kib_tx_t *tx, int type, int body_nob)
 {
 	kib_hca_dev_t *hdev = tx->tx_pool->tpo_hdev;
-	struct ib_sge *sge = &tx->tx_sge[tx->tx_nwrq];
+	struct ib_sge *sge = &tx->tx_msgsge;
 	struct ib_rdma_wr *wrq;
 	int nob = offsetof(kib_msg_t, ibm_u) + body_nob;
 #ifdef HAVE_IB_GET_DMA_MR
@@ -1065,34 +1065,43 @@ static int
 kiblnd_init_rdma(kib_conn_t *conn, kib_tx_t *tx, int type,
 		 int resid, kib_rdma_desc_t *dstrd, __u64 dstcookie)
 {
-	kib_msg_t	  *ibmsg = tx->tx_msg;
+	kib_msg_t	*ibmsg = tx->tx_msg;
 	kib_rdma_desc_t *srcrd = tx->tx_rd;
-	struct ib_sge	  *sge = &tx->tx_sge[0];
-	struct ib_rdma_wr *wrq;
-	int		   rc  = resid;
-	int		   srcidx;
-	int		   dstidx;
-	int		   wrknob;
+	struct ib_rdma_wr *wrq = NULL;
+	struct ib_sge	*sge;
+	int		 rc  = resid;
+	int		 srcidx;
+	int		 dstidx;
+	int		 sge_nob;
+	int		 wrq_sge;
 
-	LASSERT (!in_interrupt());
-	LASSERT (tx->tx_nwrq == 0);
-	LASSERT (type == IBLND_MSG_GET_DONE ||
-		 type == IBLND_MSG_PUT_DONE);
+	LASSERT(!in_interrupt());
+	LASSERT(tx->tx_nwrq == 0 && tx->tx_nsge == 0);
+	LASSERT(type == IBLND_MSG_GET_DONE || type == IBLND_MSG_PUT_DONE);
+
+	if (kiblnd_rd_size(srcrd) > conn->ibc_max_frags << PAGE_SHIFT) {
+		CERROR("RDMA is too large for peer %s (%d), src size: %d dst size: %d\n",
+		       libcfs_nid2str(conn->ibc_peer->ibp_nid),
+		       conn->ibc_max_frags << PAGE_SHIFT,
+		       kiblnd_rd_size(srcrd), kiblnd_rd_size(dstrd));
+		GOTO(too_big, rc = -EMSGSIZE);
+	}
 
-	srcidx = dstidx = 0;
+	for (srcidx = dstidx = wrq_sge = sge_nob = 0;
+	     resid > 0; resid -= sge_nob) {
+		int	prev = dstidx;
 
-	while (resid > 0) {
-		if (srcidx >= srcrd->rd_nfrags) {
-			CERROR("Src buffer exhausted: %d frags\n", srcidx);
-			rc = -EPROTO;
-			break;
-		}
+		if (srcidx >= srcrd->rd_nfrags) {
+			CERROR("Src buffer exhausted: %d frags\n", srcidx);
+			rc = -EPROTO;
+			break;
+		}
 
-		if (dstidx == dstrd->rd_nfrags) {
-			CERROR("Dst buffer exhausted: %d frags\n", dstidx);
-			rc = -EPROTO;
-			break;
-		}
+		if (dstidx >= dstrd->rd_nfrags) {
+			CERROR("Dst buffer exhausted: %d frags\n", dstidx);
+			rc = -EPROTO;
+			break;
+		}
 
 		if (tx->tx_nwrq >= conn->ibc_max_frags) {
 			CERROR("RDMA has too many fragments for peer_ni %s (%d), "
@@ -1105,43 +1114,51 @@ kiblnd_init_rdma(kib_conn_t *conn, kib_tx_t *tx, int type,
 			break;
 		}
 
-		wrknob = MIN(MIN(kiblnd_rd_frag_size(srcrd, srcidx),
-				 kiblnd_rd_frag_size(dstrd, dstidx)), resid);
+		sge_nob = MIN(MIN(kiblnd_rd_frag_size(srcrd, srcidx),
+				  kiblnd_rd_frag_size(dstrd, dstidx)), resid);
 
-		sge = &tx->tx_sge[tx->tx_nwrq];
-		sge->addr   = kiblnd_rd_frag_addr(srcrd, srcidx);
-		sge->lkey   = kiblnd_rd_frag_key(srcrd, srcidx);
-		sge->length = wrknob;
+		sge = &tx->tx_sge[tx->tx_nsge];
+		sge->addr   = kiblnd_rd_frag_addr(srcrd, srcidx);
+		sge->lkey   = kiblnd_rd_frag_key(srcrd, srcidx);
+		sge->length = sge_nob;
 
-		wrq = &tx->tx_wrq[tx->tx_nwrq];
+		if (wrq_sge == 0) {
+			wrq = &tx->tx_wrq[tx->tx_nwrq];
 
-		wrq->wr.next		= &(wrq + 1)->wr;
-		wrq->wr.wr_id		= kiblnd_ptr2wreqid(tx, IBLND_WID_RDMA);
-		wrq->wr.sg_list		= sge;
-		wrq->wr.num_sge		= 1;
-		wrq->wr.opcode		= IB_WR_RDMA_WRITE;
-		wrq->wr.send_flags	= 0;
+			wrq->wr.next	= &(wrq + 1)->wr;
+			wrq->wr.wr_id	= kiblnd_ptr2wreqid(tx, IBLND_WID_RDMA);
+			wrq->wr.sg_list	= sge;
+			wrq->wr.opcode	= IB_WR_RDMA_WRITE;
+			wrq->wr.send_flags = 0;
 
 #ifdef HAVE_IB_RDMA_WR
-		wrq->remote_addr	= kiblnd_rd_frag_addr(dstrd, dstidx);
-		wrq->rkey		= kiblnd_rd_frag_key(dstrd, dstidx);
+			wrq->remote_addr	= kiblnd_rd_frag_addr(dstrd,
+								      dstidx);
+			wrq->rkey		= kiblnd_rd_frag_key(dstrd,
+								     dstidx);
 #else
-		wrq->wr.wr.rdma.remote_addr = kiblnd_rd_frag_addr(dstrd, dstidx);
-		wrq->wr.wr.rdma.rkey	    = kiblnd_rd_frag_key(dstrd, dstidx);
+			wrq->wr.wr.rdma.remote_addr = kiblnd_rd_frag_addr(dstrd,
+									  dstidx);
+			wrq->wr.wr.rdma.rkey	= kiblnd_rd_frag_key(dstrd,
+								     dstidx);
 #endif
+		}
 
-		srcidx = kiblnd_rd_consume_frag(srcrd, srcidx, wrknob);
-		dstidx = kiblnd_rd_consume_frag(dstrd, dstidx, wrknob);
-
-		resid -= wrknob;
+		srcidx = kiblnd_rd_consume_frag(srcrd, srcidx, sge_nob);
+		dstidx = kiblnd_rd_consume_frag(dstrd, dstidx, sge_nob);
 
-		tx->tx_nwrq++;
-		wrq++;
-		sge++;
-	}
+		wrq_sge++;
+		if (wrq_sge == *kiblnd_tunables.kib_wrq_sge || dstidx != prev) {
+			tx->tx_nwrq++;
+			wrq->wr.num_sge = wrq_sge;
+			wrq_sge = 0;
+		}
+		tx->tx_nsge++;
+	}
 
-	if (rc < 0)	/* no RDMA if completing with failure */
-		tx->tx_nwrq = 0;
+too_big:
+	if (rc < 0)	/* no RDMA if completing with failure */
+		tx->tx_nwrq = tx->tx_nsge = 0;
 
 	ibmsg->ibm_u.completion.ibcm_status = rc;
 	ibmsg->ibm_u.completion.ibcm_cookie = dstcookie;
@@ -3677,11 +3694,11 @@ kiblnd_scheduler(void *arg)
 int
 kiblnd_failover_thread(void *arg)
 {
-	rwlock_t	*glock = &kiblnd_data.kib_global_lock;
-	kib_dev_t	 *dev;
-	wait_queue_t     wait;
-	unsigned long    flags;
-	int              rc;
+	rwlock_t	*glock = &kiblnd_data.kib_global_lock;
+	kib_dev_t	*dev;
+	wait_queue_t	 wait;
+	unsigned long	 flags;
+	int		 rc;
 
 	LASSERT(*kiblnd_tunables.kib_dev_failover != 0);
diff --git a/lnet/klnds/o2iblnd/o2iblnd_modparams.c b/lnet/klnds/o2iblnd/o2iblnd_modparams.c
index 4911431..f7f90e1 100644
--- a/lnet/klnds/o2iblnd/o2iblnd_modparams.c
+++ b/lnet/klnds/o2iblnd/o2iblnd_modparams.c
@@ -146,6 +146,10 @@ static int use_privileged_port = 1;
 module_param(use_privileged_port, int, 0644);
 MODULE_PARM_DESC(use_privileged_port, "use privileged port when initiating connection");
 
+static unsigned int wrq_sge = 1;
+module_param(wrq_sge, uint, 0444);
+MODULE_PARM_DESC(wrq_sge, "# scatter/gather element per work request");
+
 kib_tunables_t kiblnd_tunables = {
 	.kib_dev_failover	= &dev_failover,
 	.kib_service		= &service,
@@ -159,7 +163,8 @@ kib_tunables_t kiblnd_tunables = {
 	.kib_ib_mtu		= &ib_mtu,
 	.kib_require_priv_port	= &require_privileged_port,
 	.kib_use_priv_port	= &use_privileged_port,
-	.kib_nscheds		= &nscheds
+	.kib_nscheds		= &nscheds,
+	.kib_wrq_sge		= &wrq_sge,
 };
 
 static struct lnet_ioctl_config_o2iblnd_tunables default_tunables;
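Note: the counting sketch below is not part of the patch. It is a standalone
userspace approximation of the fragment walk in kiblnd_init_rdma(), included
only to illustrate why allowing more than one scatter/gather entry per work
request helps. The page count, fragment sizes, and helper name are invented
for the example; only the two flush conditions (work request full, or the
destination fragment advanced) follow the patched loop.

/*
 * Standalone illustration: estimate how many RDMA work requests a router
 * needs to forward a misaligned 1MB transfer when each 4096-byte
 * destination page maps to two 2048-byte source fragments.  A work request
 * is flushed once it holds "wrq_sge" scatter/gather entries or when the
 * destination fragment (page) changes.
 */
#include <stdio.h>

static int count_work_requests(int npages, int src_frags_per_page, int wrq_sge)
{
	int nwrq = 0;		/* work requests posted */
	int sge_in_wrq = 0;	/* SGEs accumulated in the current WR */
	int page, frag;

	for (page = 0; page < npages; page++) {
		for (frag = 0; frag < src_frags_per_page; frag++) {
			sge_in_wrq++;
			/* flush when the work request is full */
			if (sge_in_wrq == wrq_sge) {
				nwrq++;
				sge_in_wrq = 0;
			}
		}
		/* flush when the destination fragment changes */
		if (sge_in_wrq != 0) {
			nwrq++;
			sge_in_wrq = 0;
		}
	}
	return nwrq;
}

int main(void)
{
	int npages = 256;	/* 1MB transfer with 4K pages */

	printf("wrq_sge=1: %d work requests\n",
	       count_work_requests(npages, 2, 1));	/* 512: old behaviour */
	printf("wrq_sge=2: %d work requests\n",
	       count_work_requests(npages, 2, 2));	/* 256 */
	return 0;
}

With the default wrq_sge=1 such a routed, misaligned 1MB transfer still
produces one work request per fragment; raising the new module parameter
(read once at load time, since it is created with 0444 permissions) lets the
two half-page source fragments that land in the same destination page share
a single work request.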