From: Liang Zhen
Date: Tue, 28 Oct 2014 03:34:26 +0000 (+0800)
Subject: LU-5718 o2iblnd: multiple sges for work request
X-Git-Tag: 2.9.57~69
X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=fda19c748016c9f57f71278b597fd8a651268f66

LU-5718 o2iblnd: multiple sges for work request

With the current protocol an LNet router cannot align buffers for RDMA,
so o2iblnd may hit the "too fragmented RDMA" error while routing
non-page-aligned I/O larger than 512K, because kiblnd_init_rdma() splits
each page into two fragments.

With this patch, o2iblnd can attach multiple sges to each work request
and combine multiple remote fragments of the same page into one work
request, which resolves the "too fragmented RDMA" issue.

Signed-off-by: Liang Zhen
Change-Id: Id57a74dc92801b012956ab785233aa87cac14263
Reviewed-on: https://review.whamcloud.com/12451
Tested-by: Jenkins
Tested-by: Maloo
Reviewed-by: Amir Shehata
Reviewed-by: Nathaniel Clark
Reviewed-by: Oleg Drokin
---

diff --git a/lnet/klnds/o2iblnd/o2iblnd.c b/lnet/klnds/o2iblnd/o2iblnd.c
index a5e7543..b17c5e3 100644
--- a/lnet/klnds/o2iblnd/o2iblnd.c
+++ b/lnet/klnds/o2iblnd/o2iblnd.c
@@ -840,16 +840,16 @@ kiblnd_create_conn(kib_peer_ni_t *peer_ni, struct rdma_cm_id *cmid,
 		goto failed_2;
 	}
 
-	init_qp_attr->event_handler = kiblnd_qp_event;
-	init_qp_attr->qp_context = conn;
+	init_qp_attr->event_handler = kiblnd_qp_event;
+	init_qp_attr->qp_context = conn;
 	init_qp_attr->cap.max_send_wr = IBLND_SEND_WRS(conn);
 	init_qp_attr->cap.max_recv_wr = IBLND_RECV_WRS(conn);
-	init_qp_attr->cap.max_send_sge = 1;
-	init_qp_attr->cap.max_recv_sge = 1;
-	init_qp_attr->sq_sig_type = IB_SIGNAL_REQ_WR;
-	init_qp_attr->qp_type = IB_QPT_RC;
-	init_qp_attr->send_cq = cq;
-	init_qp_attr->recv_cq = cq;
+	init_qp_attr->cap.max_send_sge = *kiblnd_tunables.kib_wrq_sge;
+	init_qp_attr->cap.max_recv_sge = 1;
+	init_qp_attr->sq_sig_type = IB_SIGNAL_REQ_WR;
+	init_qp_attr->qp_type = IB_QPT_RC;
+	init_qp_attr->send_cq = cq;
+	init_qp_attr->recv_cq = cq;
 
 	conn->ibc_sched = sched;
 
@@ -862,9 +862,12 @@ kiblnd_create_conn(kib_peer_ni_t *peer_ni, struct rdma_cm_id *cmid,
 	} while (rc);
 
 	if (rc) {
-		CERROR("Can't create QP: %d, send_wr: %d, recv_wr: %d\n",
-		       rc, init_qp_attr->cap.max_send_wr,
-		       init_qp_attr->cap.max_recv_wr);
+		CERROR("Can't create QP: %d, send_wr: %d, recv_wr: %d, "
+		       "send_sge: %d, recv_sge: %d\n",
+		       rc, init_qp_attr->cap.max_send_wr,
+		       init_qp_attr->cap.max_recv_wr,
+		       init_qp_attr->cap.max_send_sge,
+		       init_qp_attr->cap.max_recv_sge);
 		goto failed_2;
 	}
 
@@ -2230,7 +2233,8 @@ kiblnd_destroy_tx_pool(kib_pool_t *pool)
 		goto out;
 
 	for (i = 0; i < pool->po_size; i++) {
-		kib_tx_t *tx = &tpo->tpo_tx_descs[i];
+		kib_tx_t *tx = &tpo->tpo_tx_descs[i];
+		int       wrq_sge = *kiblnd_tunables.kib_wrq_sge;
 
 		list_del(&tx->tx_list);
 		if (tx->tx_pages != NULL)
@@ -2245,10 +2249,10 @@ kiblnd_destroy_tx_pool(kib_pool_t *pool)
 			LIBCFS_FREE(tx->tx_wrq,
 				    (1 + IBLND_MAX_RDMA_FRAGS) *
 				    sizeof(*tx->tx_wrq));
 
-		if (tx->tx_sge != NULL)
-			LIBCFS_FREE(tx->tx_sge,
-				    (1 + IBLND_MAX_RDMA_FRAGS) *
-				    sizeof(*tx->tx_sge));
+		if (tx->tx_sge != NULL)
+			LIBCFS_FREE(tx->tx_sge,
+				    (1 + IBLND_MAX_RDMA_FRAGS) * wrq_sge *
+				    sizeof(*tx->tx_sge));
 
 		if (tx->tx_rd != NULL)
 			LIBCFS_FREE(tx->tx_rd,
 				    offsetof(kib_rdma_desc_t,
@@ -2306,7 +2310,8 @@ kiblnd_create_tx_pool(kib_poolset_t *ps, int size, kib_pool_t **pp_po)
 	memset(tpo->tpo_tx_descs, 0, size * sizeof(kib_tx_t));
 
 	for (i = 0; i < size; i++) {
-		kib_tx_t *tx = &tpo->tpo_tx_descs[i];
+		kib_tx_t *tx = &tpo->tpo_tx_descs[i];
+		int       wrq_sge = *kiblnd_tunables.kib_wrq_sge;
 
 		tx->tx_pool = tpo;
 		if (ps->ps_net->ibn_fmr_ps != NULL) {
@@ -2332,7 +2337,7 @@ kiblnd_create_tx_pool(kib_poolset_t *ps, int size, kib_pool_t **pp_po)
 			break;
 
 		LIBCFS_CPT_ALLOC(tx->tx_sge, lnet_cpt_table(), ps->ps_cpt,
-				 (1 + IBLND_MAX_RDMA_FRAGS) *
+				 (1 + IBLND_MAX_RDMA_FRAGS) * wrq_sge *
 				 sizeof(*tx->tx_sge));
 		if (tx->tx_sge == NULL)
 			break;
diff --git a/lnet/klnds/o2iblnd/o2iblnd.h b/lnet/klnds/o2iblnd/o2iblnd.h
index e0409b5..6e64fc5 100644
--- a/lnet/klnds/o2iblnd/o2iblnd.h
+++ b/lnet/klnds/o2iblnd/o2iblnd.h
@@ -108,6 +108,7 @@ typedef struct
 	int		*kib_use_priv_port; /* use privileged port for active connect */
 	/* # threads on each CPT */
 	int		*kib_nscheds;
+	int		*kib_wrq_sge;	/* # sg elements per wrq */
 } kib_tunables_t;
 
 extern kib_tunables_t  kiblnd_tunables;
@@ -617,8 +618,12 @@ typedef struct kib_tx			/* transmit message */
 	__u64		tx_msgaddr;	/* for dma_unmap_single() */
 	DECLARE_PCI_UNMAP_ADDR(tx_msgunmap);
+	/** sge for tx_msgaddr */
+	struct ib_sge	tx_msgsge;
 	/* # send work items */
 	int		tx_nwrq;
+	/* # used scatter/gather elements */
+	int		tx_nsge;
 	/* send work items... */
 	struct ib_rdma_wr *tx_wrq;
 	/* ...and their memory */
diff --git a/lnet/klnds/o2iblnd/o2iblnd_cb.c b/lnet/klnds/o2iblnd/o2iblnd_cb.c
index 603013d..aa14b1f 100644
--- a/lnet/klnds/o2iblnd/o2iblnd_cb.c
+++ b/lnet/klnds/o2iblnd/o2iblnd_cb.c
@@ -75,7 +75,7 @@ kiblnd_tx_done(kib_tx_t *tx)
 		tx->tx_conn = NULL;
 	}
 
-	tx->tx_nwrq = 0;
+	tx->tx_nwrq = tx->tx_nsge = 0;
 	tx->tx_status = 0;
 
 	kiblnd_pool_free_node(&tx->tx_pool->tpo_pool, &tx->tx_list);
@@ -422,7 +422,7 @@ kiblnd_handle_rx (kib_rx_t *rx)
 		 * (a) I can overwrite tx_msg since my peer_ni has received it!
 		 * (b) tx_waiting set tells tx_complete() it's not done. */
 
-		tx->tx_nwrq = 0;	/* overwrite PUT_REQ */
+		tx->tx_nwrq = tx->tx_nsge = 0; /* overwrite PUT_REQ */
 		rc2 = kiblnd_init_rdma(conn, tx, IBLND_MSG_PUT_DONE,
 				       kiblnd_rd_size(&msg->ibm_u.putack.ibpam_rd),
@@ -765,7 +765,7 @@ __must_hold(&conn->ibc_lock)
 	LASSERT(tx->tx_queued);
 	/* We rely on this for QP sizing */
-	LASSERT(tx->tx_nwrq > 0);
+	LASSERT(tx->tx_nwrq > 0 && tx->tx_nsge >= 0);
 	LASSERT(tx->tx_nwrq <= 1 + conn->ibc_max_frags);
 
 	LASSERT(credit == 0 || credit == 1);
@@ -1024,7 +1024,7 @@ static void
 kiblnd_init_tx_msg(struct lnet_ni *ni, kib_tx_t *tx, int type, int body_nob)
 {
 	kib_hca_dev_t *hdev = tx->tx_pool->tpo_hdev;
-	struct ib_sge *sge = &tx->tx_sge[tx->tx_nwrq];
+	struct ib_sge *sge = &tx->tx_msgsge;
 	struct ib_rdma_wr *wrq;
 	int nob = offsetof(kib_msg_t, ibm_u) + body_nob;
 #ifdef HAVE_IB_GET_DMA_MR
@@ -1065,34 +1065,43 @@ static int
 kiblnd_init_rdma(kib_conn_t *conn, kib_tx_t *tx, int type,
 		 int resid, kib_rdma_desc_t *dstrd, __u64 dstcookie)
 {
-	kib_msg_t	  *ibmsg = tx->tx_msg;
+	kib_msg_t	*ibmsg = tx->tx_msg;
 	kib_rdma_desc_t *srcrd = tx->tx_rd;
-	struct ib_sge	  *sge = &tx->tx_sge[0];
-	struct ib_rdma_wr *wrq;
-	int		   rc  = resid;
-	int		   srcidx;
-	int		   dstidx;
-	int		   wrknob;
+	struct ib_rdma_wr *wrq = NULL;
+	struct ib_sge	*sge;
+	int		 rc  = resid;
+	int		 srcidx;
+	int		 dstidx;
+	int		 sge_nob;
+	int		 wrq_sge;
 
-	LASSERT (!in_interrupt());
-	LASSERT (tx->tx_nwrq == 0);
-	LASSERT (type == IBLND_MSG_GET_DONE ||
-		 type == IBLND_MSG_PUT_DONE);
+	LASSERT(!in_interrupt());
+	LASSERT(tx->tx_nwrq == 0 && tx->tx_nsge == 0);
+	LASSERT(type == IBLND_MSG_GET_DONE || type == IBLND_MSG_PUT_DONE);
+
+	if (kiblnd_rd_size(srcrd) > conn->ibc_max_frags << PAGE_SHIFT) {
+		CERROR("RDMA is too large for peer %s (%d), src size: %d dst size: %d\n",
+		       libcfs_nid2str(conn->ibc_peer->ibp_nid),
+		       conn->ibc_max_frags << PAGE_SHIFT,
+		       kiblnd_rd_size(srcrd), kiblnd_rd_size(dstrd));
+		GOTO(too_big, rc = -EMSGSIZE);
+	}
 
-	srcidx = dstidx = 0;
+	for (srcidx = dstidx = wrq_sge = sge_nob = 0;
+	     resid > 0; resid -= sge_nob) {
+		int	prev = dstidx;
 
-	while (resid > 0) {
-		if (srcidx >= srcrd->rd_nfrags) {
-			CERROR("Src buffer exhausted: %d frags\n", srcidx);
-			rc = -EPROTO;
-			break;
-		}
+		if (srcidx >= srcrd->rd_nfrags) {
+			CERROR("Src buffer exhausted: %d frags\n", srcidx);
+			rc = -EPROTO;
+			break;
+		}
 
-		if (dstidx == dstrd->rd_nfrags) {
-			CERROR("Dst buffer exhausted: %d frags\n", dstidx);
-			rc = -EPROTO;
-			break;
-		}
+		if (dstidx >= dstrd->rd_nfrags) {
+			CERROR("Dst buffer exhausted: %d frags\n", dstidx);
+			rc = -EPROTO;
+			break;
+		}
 
 		if (tx->tx_nwrq >= conn->ibc_max_frags) {
 			CERROR("RDMA has too many fragments for peer_ni %s (%d), "
@@ -1105,43 +1114,51 @@ kiblnd_init_rdma(kib_conn_t *conn, kib_tx_t *tx, int type,
 			break;
 		}
 
-		wrknob = MIN(MIN(kiblnd_rd_frag_size(srcrd, srcidx),
-				 kiblnd_rd_frag_size(dstrd, dstidx)), resid);
+		sge_nob = MIN(MIN(kiblnd_rd_frag_size(srcrd, srcidx),
+				  kiblnd_rd_frag_size(dstrd, dstidx)), resid);
 
-		sge = &tx->tx_sge[tx->tx_nwrq];
-		sge->addr   = kiblnd_rd_frag_addr(srcrd, srcidx);
-		sge->lkey   = kiblnd_rd_frag_key(srcrd, srcidx);
-		sge->length = wrknob;
+		sge = &tx->tx_sge[tx->tx_nsge];
+		sge->addr   = kiblnd_rd_frag_addr(srcrd, srcidx);
+		sge->lkey   = kiblnd_rd_frag_key(srcrd, srcidx);
+		sge->length = sge_nob;
 
-		wrq = &tx->tx_wrq[tx->tx_nwrq];
+		if (wrq_sge == 0) {
+			wrq = &tx->tx_wrq[tx->tx_nwrq];
 
-		wrq->wr.next		= &(wrq + 1)->wr;
-		wrq->wr.wr_id		= kiblnd_ptr2wreqid(tx, IBLND_WID_RDMA);
-		wrq->wr.sg_list		= sge;
-		wrq->wr.num_sge		= 1;
-		wrq->wr.opcode		= IB_WR_RDMA_WRITE;
-		wrq->wr.send_flags	= 0;
+			wrq->wr.next	= &(wrq + 1)->wr;
+			wrq->wr.wr_id	= kiblnd_ptr2wreqid(tx, IBLND_WID_RDMA);
+			wrq->wr.sg_list	= sge;
+			wrq->wr.opcode	= IB_WR_RDMA_WRITE;
+			wrq->wr.send_flags = 0;
 
 #ifdef HAVE_IB_RDMA_WR
-		wrq->remote_addr	= kiblnd_rd_frag_addr(dstrd, dstidx);
-		wrq->rkey		= kiblnd_rd_frag_key(dstrd, dstidx);
+			wrq->remote_addr	= kiblnd_rd_frag_addr(dstrd,
+								      dstidx);
+			wrq->rkey		= kiblnd_rd_frag_key(dstrd,
+								     dstidx);
 #else
-		wrq->wr.wr.rdma.remote_addr = kiblnd_rd_frag_addr(dstrd, dstidx);
-		wrq->wr.wr.rdma.rkey	    = kiblnd_rd_frag_key(dstrd, dstidx);
+			wrq->wr.wr.rdma.remote_addr = kiblnd_rd_frag_addr(dstrd,
+									  dstidx);
+			wrq->wr.wr.rdma.rkey	= kiblnd_rd_frag_key(dstrd,
+								     dstidx);
 #endif
+		}
 
-		srcidx = kiblnd_rd_consume_frag(srcrd, srcidx, wrknob);
-		dstidx = kiblnd_rd_consume_frag(dstrd, dstidx, wrknob);
-
-		resid -= wrknob;
+		srcidx = kiblnd_rd_consume_frag(srcrd, srcidx, sge_nob);
+		dstidx = kiblnd_rd_consume_frag(dstrd, dstidx, sge_nob);
 
-		tx->tx_nwrq++;
-		wrq++;
-		sge++;
-	}
+		wrq_sge++;
+		if (wrq_sge == *kiblnd_tunables.kib_wrq_sge || dstidx != prev) {
+			tx->tx_nwrq++;
+			wrq->wr.num_sge = wrq_sge;
+			wrq_sge = 0;
+		}
+		tx->tx_nsge++;
+	}
 
-	if (rc < 0)	/* no RDMA if completing with failure */
-		tx->tx_nwrq = 0;
+too_big:
+	if (rc < 0)	/* no RDMA if completing with failure */
+		tx->tx_nwrq = tx->tx_nsge = 0;
 
 	ibmsg->ibm_u.completion.ibcm_status = rc;
 	ibmsg->ibm_u.completion.ibcm_cookie = dstcookie;
@@ -3677,11 +3694,11 @@ kiblnd_scheduler(void *arg)
 int
 kiblnd_failover_thread(void *arg)
 {
-	rwlock_t	*glock = &kiblnd_data.kib_global_lock;
-	kib_dev_t	 *dev;
-	wait_queue_t     wait;
-	unsigned long    flags;
-	int              rc;
+	rwlock_t	*glock = &kiblnd_data.kib_global_lock;
+	kib_dev_t	*dev;
+	wait_queue_t	 wait;
+	unsigned long	 flags;
+	int		 rc;
 
 	LASSERT(*kiblnd_tunables.kib_dev_failover != 0);
diff --git a/lnet/klnds/o2iblnd/o2iblnd_modparams.c b/lnet/klnds/o2iblnd/o2iblnd_modparams.c
index 4911431..f7f90e1 100644
--- a/lnet/klnds/o2iblnd/o2iblnd_modparams.c
+++ b/lnet/klnds/o2iblnd/o2iblnd_modparams.c
@@ -146,6 +146,10 @@ static int use_privileged_port = 1;
 module_param(use_privileged_port, int, 0644);
 MODULE_PARM_DESC(use_privileged_port, "use privileged port when initiating connection");
 
+static unsigned int wrq_sge = 1;
+module_param(wrq_sge, uint, 0444);
+MODULE_PARM_DESC(wrq_sge, "# scatter/gather element per work request");
+
 kib_tunables_t kiblnd_tunables = {
 	.kib_dev_failover	= &dev_failover,
 	.kib_service		= &service,
@@ -159,7 +163,8 @@ kib_tunables_t kiblnd_tunables = {
 	.kib_ib_mtu		= &ib_mtu,
 	.kib_require_priv_port	= &require_privileged_port,
 	.kib_use_priv_port	= &use_privileged_port,
-	.kib_nscheds		= &nscheds
+	.kib_nscheds		= &nscheds,
+	.kib_wrq_sge		= &wrq_sge,
 };
 
 static struct lnet_ioctl_config_o2iblnd_tunables default_tunables;
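Note: the counting sketch below is not part of the patch. It is a standalone
userspace approximation of the fragment walk in kiblnd_init_rdma(), included
only to illustrate why allowing more than one scatter/gather entry per work
request helps. The page count, fragment sizes, and helper name are invented
for the example; only the two flush conditions (work request full, or the
destination fragment advanced) follow the patched loop.

/*
 * Standalone illustration: estimate how many RDMA work requests a router
 * needs to forward a misaligned 1MB transfer when each 4096-byte
 * destination page maps to two 2048-byte source fragments.  A work request
 * is flushed once it holds "wrq_sge" scatter/gather entries or when the
 * destination fragment (page) changes.
 */
#include <stdio.h>

static int count_work_requests(int npages, int src_frags_per_page, int wrq_sge)
{
	int nwrq = 0;		/* work requests posted */
	int sge_in_wrq = 0;	/* SGEs accumulated in the current WR */
	int page, frag;

	for (page = 0; page < npages; page++) {
		for (frag = 0; frag < src_frags_per_page; frag++) {
			sge_in_wrq++;
			/* flush when the work request is full */
			if (sge_in_wrq == wrq_sge) {
				nwrq++;
				sge_in_wrq = 0;
			}
		}
		/* flush when the destination fragment changes */
		if (sge_in_wrq != 0) {
			nwrq++;
			sge_in_wrq = 0;
		}
	}
	return nwrq;
}

int main(void)
{
	int npages = 256;	/* 1MB transfer with 4K pages */

	printf("wrq_sge=1: %d work requests\n",
	       count_work_requests(npages, 2, 1));	/* 512: old behaviour */
	printf("wrq_sge=2: %d work requests\n",
	       count_work_requests(npages, 2, 2));	/* 256 */
	return 0;
}

With the default wrq_sge=1 such a routed, misaligned 1MB transfer still
produces one work request per fragment; raising the new module parameter
(read once at load time, since it is created with 0444 permissions) lets the
two half-page source fragments that land in the same destination page share
a single work request.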