From 959304eac7ec5b156b4bfa57f47cbbf9ef3c8315 Mon Sep 17 00:00:00 2001
From: Alexey Lyashkov
Date: Mon, 7 Feb 2022 18:02:14 +0300
Subject: [PATCH] LU-15189 lnet: fix memory mapping.

Nvidia GDS has a bug that causes incorrect page type detection: it may
return the GPU flag for a kmalloc'd buffer (a ptlrpc message in my
case). To work around this, the Whamcloud tree issues both mapping
calls, but that is costly and causes extra RDMA operations because
ko2iblnd trusts the msg_rdma_force flag. Let's drop the extra Nvidia
calls and check only for real "user" pages or the GPU flag.

HPe-bug-id: LUS-10520
Signed-off-by: Alexey Lyashkov
Change-Id: I5d70c5e0630b0f16e130a7db0385de2443c11a63
Reviewed-on: https://review.whamcloud.com/45482
Reviewed-by: Andreas Dilger
Tested-by: jenkins
Tested-by: Maloo
Reviewed-by: Alexander Boyko
Reviewed-by: Oleg Drokin
---
 lnet/include/lnet/lib-types.h             |  3 +-
 lnet/include/uapi/linux/lnet/lnet-types.h |  2 +
 lnet/klnds/o2iblnd/o2iblnd.h              | 43 +++++++++----------
 lnet/klnds/o2iblnd/o2iblnd_cb.c           | 68 +++++++++++++++++--------------
 lnet/lnet/lib-md.c                        |  3 ++
 lnet/lnet/lib-move.c                      | 10 +++--
 lustre/include/lustre_net.h               |  4 +-
 lustre/osc/osc_request.c                  |  3 ++
 lustre/ptlrpc/pers.c                      |  3 ++
 9 files changed, 81 insertions(+), 58 deletions(-)

diff --git a/lnet/include/lnet/lib-types.h b/lnet/include/lnet/lib-types.h
index 0df6857..ff93f87 100644
--- a/lnet/include/lnet/lib-types.h
+++ b/lnet/include/lnet/lib-types.h
@@ -141,8 +141,6 @@ struct lnet_msg {
 	enum lnet_msg_hstatus	msg_health_status;
 	/* This is a recovery message */
 	bool			msg_recovery;
-	/* force an RDMA even if the message size is < 4K */
-	bool			msg_rdma_force;
 	/* the number of times a transmission has been retried */
 	int			msg_retry_count;
 	/* flag to indicate that we do not want to resend this message */
@@ -248,6 +246,7 @@ struct lnet_libmd {
  */
 #define LNET_MD_FLAG_HANDLING	 BIT(3)
 #define LNET_MD_FLAG_DISCARD	 BIT(4)
+#define LNET_MD_FLAG_GPU	 BIT(5) /**< Special mapping needs */
 
 struct lnet_test_peer {
 	/* info about peers we are trying to fail */
diff --git a/lnet/include/uapi/linux/lnet/lnet-types.h b/lnet/include/uapi/linux/lnet/lnet-types.h
index d7fce9d..5a2a2a0 100644
--- a/lnet/include/uapi/linux/lnet/lnet-types.h
+++ b/lnet/include/uapi/linux/lnet/lnet-types.h
@@ -467,6 +467,8 @@ struct lnet_md {
 #define LNET_MD_NO_TRACK_RESPONSE (1 << 11)
 /** See struct lnet_md::options. */
 #define LNET_MD_GNILND (1 << 12)
+/** Special page mapping handling */
+#define LNET_MD_GPU_ADDR (1 << 13)
 
 /** Infinite threshold on MD operations. See struct lnet_md::threshold */
 #define LNET_MD_THRESH_INF (-1)
diff --git a/lnet/klnds/o2iblnd/o2iblnd.h b/lnet/klnds/o2iblnd/o2iblnd.h
index d3f6512..4c8581d 100644
--- a/lnet/klnds/o2iblnd/o2iblnd.h
+++ b/lnet/klnds/o2iblnd/o2iblnd.h
@@ -513,9 +513,11 @@ struct kib_tx {			/* transmit message */
 	/* # tx callbacks outstanding */
 	short			tx_sending;
 	/* queued for sending */
-	short			tx_queued;
+	unsigned long		tx_queued:1,
 	/* waiting for peer_ni */
-	short			tx_waiting;
+				tx_waiting:1,
+	/* force RDMA */
+				tx_gpu:1;
 	/* LNET completion status */
 	int			tx_status;
 	/* health status of the transmit */
@@ -1038,33 +1040,32 @@ static inline void kiblnd_dma_unmap_single(struct ib_device *dev,
 #define KIBLND_UNMAP_ADDR_SET(p, m, a)	do {} while (0)
 #define KIBLND_UNMAP_ADDR(p, m, a)	(a)
 
-static inline int kiblnd_dma_map_sg(struct kib_hca_dev *hdev,
-				    struct scatterlist *sg, int nents,
-				    enum dma_data_direction direction)
+static inline
+int kiblnd_dma_map_sg(struct kib_hca_dev *hdev, struct kib_tx *tx)
 {
-	int count;
+	struct scatterlist *sg = tx->tx_frags;
+	int nents = tx->tx_nfrags;
+	enum dma_data_direction direction = tx->tx_dmadir;
 
-	count = lnet_rdma_map_sg_attrs(hdev->ibh_ibdev->dma_device,
-				       sg, nents, direction);
-
-	if (count != 0)
-		return count;
+	if (tx->tx_gpu)
+		return lnet_rdma_map_sg_attrs(hdev->ibh_ibdev->dma_device,
+					      sg, nents, direction);
 
 	return ib_dma_map_sg(hdev->ibh_ibdev, sg, nents, direction);
 }
 
-static inline void kiblnd_dma_unmap_sg(struct kib_hca_dev *hdev,
-				       struct scatterlist *sg, int nents,
-				       enum dma_data_direction direction)
+static inline
+void kiblnd_dma_unmap_sg(struct kib_hca_dev *hdev, struct kib_tx *tx)
 {
-	int count;
-
-	count = lnet_rdma_unmap_sg(hdev->ibh_ibdev->dma_device,
-				   sg, nents, direction);
-	if (count != 0)
-		return;
+	struct scatterlist *sg = tx->tx_frags;
+	int nents = tx->tx_nfrags;
+	enum dma_data_direction direction = tx->tx_dmadir;
 
-	ib_dma_unmap_sg(hdev->ibh_ibdev, sg, nents, direction);
+	if (tx->tx_gpu)
+		lnet_rdma_unmap_sg(hdev->ibh_ibdev->dma_device,
+				   sg, nents, direction);
+	else
+		ib_dma_unmap_sg(hdev->ibh_ibdev, sg, nents, direction);
 }
 
 #ifndef HAVE_IB_SG_DMA_ADDRESS
diff --git a/lnet/klnds/o2iblnd/o2iblnd_cb.c b/lnet/klnds/o2iblnd/o2iblnd_cb.c
index 2cbc2b8..ac69534 100644
--- a/lnet/klnds/o2iblnd/o2iblnd_cb.c
+++ b/lnet/klnds/o2iblnd/o2iblnd_cb.c
@@ -670,8 +670,7 @@ kiblnd_unmap_tx(struct kib_tx *tx)
 		kiblnd_fmr_pool_unmap(&tx->tx_fmr, tx->tx_status);
 
 	if (tx->tx_nfrags != 0) {
-		kiblnd_dma_unmap_sg(tx->tx_pool->tpo_hdev,
-				    tx->tx_frags, tx->tx_nfrags, tx->tx_dmadir);
+		kiblnd_dma_unmap_sg(tx->tx_pool->tpo_hdev, tx);
 		tx->tx_nfrags = 0;
 	}
 }
@@ -724,9 +723,7 @@ static int kiblnd_map_tx(struct lnet_ni *ni, struct kib_tx *tx,
 	tx->tx_dmadir = (rd != tx->tx_rd) ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
 	tx->tx_nfrags = nfrags;
 
-	rd->rd_nfrags = kiblnd_dma_map_sg(hdev, tx->tx_frags,
-					  tx->tx_nfrags, tx->tx_dmadir);
-
+	rd->rd_nfrags = kiblnd_dma_map_sg(hdev, tx);
 	for (i = 0, nob = 0; i < rd->rd_nfrags; i++) {
 		rd->rd_frags[i].rf_nob = kiblnd_sg_dma_len(
 			hdev->ibh_ibdev, &tx->tx_frags[i]);
@@ -1174,7 +1171,8 @@ kiblnd_init_rdma(struct kib_conn *conn, struct kib_tx *tx, int type,
 		int prev = dstidx;
 
 		if (srcidx >= srcrd->rd_nfrags) {
-			CERROR("Src buffer exhausted: %d frags\n", srcidx);
+			CERROR("Src buffer exhausted: %d frags %px\n",
+			       srcidx, tx);
 			rc = -EPROTO;
 			break;
 		}
@@ -1647,11 +1645,12 @@ kiblnd_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg)
 	struct bio_vec *payload_kiov = lntmsg->msg_kiov;
 	unsigned int payload_offset = lntmsg->msg_offset;
 	unsigned int payload_nob = lntmsg->msg_len;
+	bool gpu;
 	struct kib_msg *ibmsg;
 	struct kib_rdma_desc *rd;
-	struct kib_tx *tx;
-	int nob;
-	int rc;
+	struct kib_tx *tx;
+	int nob;
+	int rc;
 
 	/* NB 'private' is different depending on what we're sending.... */
 
@@ -1672,26 +1671,28 @@ kiblnd_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg)
 		return -ENOMEM;
 	}
 	ibmsg = tx->tx_msg;
+	gpu = (lntmsg->msg_md->md_flags & LNET_MD_FLAG_GPU);
 
 	switch (type) {
 	default:
 		LBUG();
 		return (-EIO);
 
-        case LNET_MSG_ACK:
-                LASSERT (payload_nob == 0);
-                break;
+	case LNET_MSG_ACK:
+		LASSERT(payload_nob == 0);
+		break;
 
-        case LNET_MSG_GET:
-                if (routing || target_is_router)
-                        break;                  /* send IMMEDIATE */
+	case LNET_MSG_GET:
+		if (routing || target_is_router)
+			break;			/* send IMMEDIATE */
 
-                /* is the REPLY message too small for RDMA? */
+		/* is the REPLY message too small for RDMA? */
 		nob = offsetof(struct kib_msg, ibm_u.immediate.ibim_payload[lntmsg->msg_md->md_length]);
-		if (nob <= IBLND_MSG_SIZE && !lntmsg->msg_rdma_force)
+		if (nob <= IBLND_MSG_SIZE && !gpu)
 			break;			/* send IMMEDIATE */
 
 		rd = &ibmsg->ibm_u.get.ibgm_rd;
+		tx->tx_gpu = !!gpu;
 		rc = kiblnd_setup_rd_kiov(ni, tx, rd,
 					  lntmsg->msg_md->md_niov,
 					  lntmsg->msg_md->md_kiov,
@@ -1708,9 +1709,9 @@ kiblnd_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg)
 		ibmsg->ibm_u.get.ibgm_cookie = tx->tx_cookie;
 		lnet_hdr_to_nid4(hdr, &ibmsg->ibm_u.get.ibgm_hdr);
 
-                kiblnd_init_tx_msg(ni, tx, IBLND_MSG_GET_REQ, nob);
+		kiblnd_init_tx_msg(ni, tx, IBLND_MSG_GET_REQ, nob);
 
-                tx->tx_lntmsg[1] = lnet_create_reply_msg(ni, lntmsg);
+		tx->tx_lntmsg[1] = lnet_create_reply_msg(ni, lntmsg);
 		if (tx->tx_lntmsg[1] == NULL) {
 			CERROR("Can't create reply for GET -> %s\n",
 			       libcfs_nidstr(&target->nid));
@@ -1727,9 +1728,12 @@ kiblnd_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg)
 	case LNET_MSG_REPLY:
 	case LNET_MSG_PUT:
 		/* Is the payload small enough not to need RDMA? */
-		nob = offsetof(struct kib_msg, ibm_u.immediate.ibim_payload[payload_nob]);
-		if (nob <= IBLND_MSG_SIZE && !lntmsg->msg_rdma_force)
-			break;                  /* send IMMEDIATE */
+		nob = offsetof(struct kib_msg,
+			       ibm_u.immediate.ibim_payload[payload_nob]);
+		if (nob <= IBLND_MSG_SIZE && !gpu)
+			break;			/* send IMMEDIATE */
+
+		tx->tx_gpu = gpu;
 
 		rc = kiblnd_setup_rd_kiov(ni, tx, tx->tx_rd,
 					  payload_niov, payload_kiov,
@@ -1813,6 +1817,7 @@ kiblnd_reply(struct lnet_ni *ni, struct kib_rx *rx, struct lnet_msg *lntmsg)
 	struct bio_vec *kiov = lntmsg->msg_kiov;
 	unsigned int offset = lntmsg->msg_offset;
 	unsigned int nob = lntmsg->msg_len;
+	struct lnet_libmd *payload_md = lntmsg->msg_md;
 	struct kib_tx *tx;
 	int rc;
 
@@ -1823,6 +1828,7 @@ kiblnd_reply(struct lnet_ni *ni, struct kib_rx *rx, struct lnet_msg *lntmsg)
 		goto failed_0;
 	}
 
+	tx->tx_gpu = !!(payload_md->md_flags & LNET_MD_FLAG_GPU);
 	if (nob == 0)
 		rc = 0;
 	else
@@ -1920,8 +1926,9 @@ kiblnd_recv(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg,
 	case IBLND_MSG_PUT_REQ: {
 		struct kib_msg	*txmsg;
 		struct kib_rdma_desc *rd;
-		ibprm_cookie = rxmsg->ibm_u.putreq.ibprm_cookie;
+		struct lnet_libmd *payload_md = lntmsg->msg_md;
 
+		ibprm_cookie = rxmsg->ibm_u.putreq.ibprm_cookie;
 		if (mlen == 0) {
 			lnet_finalize(lntmsg, 0);
 			kiblnd_send_completion(rx->rx_conn, IBLND_MSG_PUT_NAK,
@@ -1930,14 +1937,15 @@ kiblnd_recv(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg,
 		}
 
 		tx = kiblnd_get_idle_tx(ni, conn->ibc_peer->ibp_nid);
-                if (tx == NULL) {
-                        CERROR("Can't allocate tx for %s\n",
-                                libcfs_nid2str(conn->ibc_peer->ibp_nid));
-                        /* Not replying will break the connection */
-                        rc = -ENOMEM;
-                        break;
-                }
+		if (tx == NULL) {
+			CERROR("Can't allocate tx for %s\n",
+			       libcfs_nid2str(conn->ibc_peer->ibp_nid));
+			/* Not replying will break the connection */
+			rc = -ENOMEM;
+			break;
+		}
 
+		tx->tx_gpu = !!(payload_md->md_flags & LNET_MD_FLAG_GPU);
 		txmsg = tx->tx_msg;
 		rd = &txmsg->ibm_u.putack.ibpam_rd;
 		rc = kiblnd_setup_rd_kiov(ni, tx, rd,
diff --git a/lnet/lnet/lib-md.c b/lnet/lnet/lib-md.c
index ba318a2..72b9aa7 100644
--- a/lnet/lnet/lib-md.c
+++ b/lnet/lnet/lib-md.c
@@ -208,6 +208,9 @@ lnet_md_build(const struct lnet_md *umd, int unlink)
 	lmd->md_flags = (unlink == LNET_UNLINK) ? LNET_MD_FLAG_AUTO_UNLINK : 0;
 	lmd->md_bulk_handle = umd->bulk_handle;
 
+	if (umd->options & LNET_MD_GPU_ADDR)
+		lmd->md_flags |= LNET_MD_FLAG_GPU;
+
 	if (umd->options & LNET_MD_KIOV) {
 		memcpy(lmd->md_kiov, umd->start,
 		       niov * sizeof(lmd->md_kiov[0]));
diff --git a/lnet/lnet/lib-move.c b/lnet/lnet/lib-move.c
index 9aae6ac..34aeea8 100644
--- a/lnet/lnet/lib-move.c
+++ b/lnet/lnet/lib-move.c
@@ -1636,11 +1636,13 @@ lnet_get_best_ni(struct lnet_net *local_net, struct lnet_ni *best_ni,
 	__u32 best_sel_prio;
 	unsigned int best_dev_prio;
 	unsigned int dev_idx = UINT_MAX;
-	struct page *page = lnet_get_first_page(md, offset);
-	msg->msg_rdma_force = lnet_is_rdma_only_page(page);
+	bool gpu = md->md_flags & LNET_MD_FLAG_GPU;
+
+	if (gpu) {
+		struct page *page = lnet_get_first_page(md, offset);
 
-	if (msg->msg_rdma_force)
 		dev_idx = lnet_get_dev_idx(page);
+	}
 
 	/*
 	 * If there is no peer_ni that we can send to on this network,
@@ -1692,7 +1694,7 @@ lnet_get_best_ni(struct lnet_net *local_net, struct lnet_ni *best_ni,
 		 * All distances smaller than the NUMA range
 		 * are treated equally.
 		 */
-		if (distance < lnet_numa_range)
+		if (!gpu && distance < lnet_numa_range)
 			distance = lnet_numa_range;
 
 		/*
diff --git a/lustre/include/lustre_net.h b/lustre/include/lustre_net.h
index fa7617c..287eec2 100644
--- a/lustre/include/lustre_net.h
+++ b/lustre/include/lustre_net.h
@@ -1368,7 +1368,9 @@ struct ptlrpc_bulk_desc {
 	/** completed with failure */
 	unsigned long bd_failure:1;
 	/** client side */
-	unsigned long bd_registered:1;
+	unsigned long bd_registered:1,
+	/* bulk request is RDMA transfer, use page->host as real address */
+		      bd_is_rdma:1;
 	/** For serialization with callback */
 	spinlock_t bd_lock;
 	/** {put,get}{source,sink}{kvec,kiov} */
diff --git a/lustre/osc/osc_request.c b/lustre/osc/osc_request.c
index 6bc9389..d80a4a4 100644
--- a/lustre/osc/osc_request.c
+++ b/lustre/osc/osc_request.c
@@ -1406,6 +1406,7 @@ osc_brw_prep_request(int cmd, struct client_obd *cli, struct obdo *oa,
 	const char *obd_name = cli->cl_import->imp_obd->obd_name;
 	struct inode *inode = NULL;
 	bool directio = false;
+	bool gpu = 0;
 	bool enable_checksum = true;
 	struct cl_page *clpage;
 
@@ -1571,6 +1572,7 @@ retry_encrypt:
 	if (brw_page2oap(pga[0])->oap_brw_flags & OBD_BRW_RDMA_ONLY) {
 		enable_checksum = false;
 		short_io_size = 0;
+		gpu = 1;
 	}
 
 	/* Check if read/write is small enough to be a short io. */
@@ -1618,6 +1620,7 @@ retry_encrypt:
 	if (desc == NULL)
 		GOTO(out, rc = -ENOMEM);
 	/* NB request now owns desc and will free it when it gets freed */
+	desc->bd_is_rdma = gpu;
 no_bulk:
 	body = req_capsule_client_get(pill, &RMF_OST_BODY);
 	ioobj = req_capsule_client_get(pill, &RMF_OBD_IOOBJ);
diff --git a/lustre/ptlrpc/pers.c b/lustre/ptlrpc/pers.c
index 973f2b5..45af603 100644
--- a/lustre/ptlrpc/pers.c
+++ b/lustre/ptlrpc/pers.c
@@ -58,6 +58,9 @@ void ptlrpc_fill_bulk_md(struct lnet_md *md, struct ptlrpc_bulk_desc *desc,
 		return;
 	}
 
+	if (desc->bd_is_rdma)
+		md->options |= LNET_MD_GPU_ADDR;
+
 	if (mdidx == (desc->bd_md_count - 1))
 		md->length = desc->bd_iov_count - start;
 	else
-- 
1.8.3.1
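
Note on the new control flow (illustration only, not part of the patch): the GPU hint now
originates in the upper layers instead of being probed per page inside LNet. The stand-alone
model below walks the flag from the OSC down to the ko2iblnd mapping choice. The two constants
match the values introduced above, but every structure and helper here is a deliberately
simplified stand-in: osc_prep(), fill_bulk_md(), md_build() and map_path() are hypothetical
condensations of osc_brw_prep_request(), ptlrpc_fill_bulk_md(), lnet_md_build() and
kiblnd_send()/kiblnd_dma_map_sg(), not the real definitions.

/*
 * Stand-alone model of the GPU-flag plumbing added by this patch.
 * Illustration only: all structures and helpers are simplified stand-ins
 * for the real Lustre/LNet code.
 */
#include <stdbool.h>
#include <stdio.h>

#define LNET_MD_GPU_ADDR	(1 << 13)	/* uapi MD option added by this patch */
#define LNET_MD_FLAG_GPU	(1 << 5)	/* internal libmd flag added by this patch */

struct bulk_desc { bool bd_is_rdma; };		/* ptlrpc_bulk_desc stand-in */
struct lnet_md_model { unsigned int options; };	/* struct lnet_md stand-in */
struct libmd_model { unsigned int md_flags; };	/* struct lnet_libmd stand-in */
struct tx_model { unsigned int tx_gpu:1; };	/* struct kib_tx stand-in */

/* osc_brw_prep_request(): the ULP marks "RDMA only" (GPU) bulk I/O. */
static void osc_prep(struct bulk_desc *desc, bool rdma_only)
{
	desc->bd_is_rdma = rdma_only;
}

/* ptlrpc_fill_bulk_md(): the descriptor flag becomes an MD option. */
static void fill_bulk_md(const struct bulk_desc *desc, struct lnet_md_model *umd)
{
	if (desc->bd_is_rdma)
		umd->options |= LNET_MD_GPU_ADDR;
}

/* lnet_md_build(): the option is latched into the libmd flags. */
static void md_build(const struct lnet_md_model *umd, struct libmd_model *lmd)
{
	if (umd->options & LNET_MD_GPU_ADDR)
		lmd->md_flags |= LNET_MD_FLAG_GPU;
}

/*
 * kiblnd_send()/kiblnd_recv(): the tx inherits the flag, and
 * kiblnd_dma_map_sg() takes the Nvidia path only when it is set.
 */
static const char *map_path(const struct libmd_model *lmd, struct tx_model *tx)
{
	tx->tx_gpu = !!(lmd->md_flags & LNET_MD_FLAG_GPU);
	return tx->tx_gpu ? "lnet_rdma_map_sg_attrs()" : "ib_dma_map_sg()";
}

int main(void)
{
	struct bulk_desc desc = { 0 };
	struct lnet_md_model umd = { 0 };
	struct libmd_model lmd = { 0 };
	struct tx_model tx = { 0 };

	osc_prep(&desc, true);		/* e.g. OBD_BRW_RDMA_ONLY was set */
	fill_bulk_md(&desc, &umd);
	md_build(&umd, &lmd);
	printf("buffers mapped via %s\n", map_path(&lmd, &tx));
	return 0;
}

With this flow a kmalloc'd ptlrpc message never reaches the Nvidia detection path: only bulk
buffers explicitly marked by the upper layer (OBD_BRW_RDMA_ONLY) are mapped through
lnet_rdma_map_sg_attrs(), while everything else goes straight to ib_dma_map_sg().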