From 959304eac7ec5b156b4bfa57f47cbbf9ef3c8315 Mon Sep 17 00:00:00 2001
From: Alexey Lyashkov
Date: Mon, 7 Feb 2022 18:02:14 +0300
Subject: [PATCH] LU-15189 lnet: fix memory mapping.

Nvidia GDS has a bug that causes incorrect page type detection: it may
return the GPU flag for a kmalloc'd buffer (a ptlrpc message in my
case). To work around this, the Whamcloud tree issues both mapping
calls, but that is costly and causes extra RDMA operations because
ko2iblnd trusts the msg_rdma_force flag. Let's drop the extra Nvidia
calls and check only for real "user" pages or the GPU flag.

HPe-bug-id: LUS-10520
Signed-off-by: Alexey Lyashkov
Change-Id: I5d70c5e0630b0f16e130a7db0385de2443c11a63
Reviewed-on: https://review.whamcloud.com/45482
Reviewed-by: Andreas Dilger
Tested-by: jenkins
Tested-by: Maloo
Reviewed-by: Alexander Boyko
Reviewed-by: Oleg Drokin
---
 lnet/include/lnet/lib-types.h             |  3 +-
 lnet/include/uapi/linux/lnet/lnet-types.h |  2 +
 lnet/klnds/o2iblnd/o2iblnd.h              | 43 +++++++++----------
 lnet/klnds/o2iblnd/o2iblnd_cb.c           | 68 +++++++++++++++++--------------
 lnet/lnet/lib-md.c                        |  3 ++
 lnet/lnet/lib-move.c                      | 10 +++--
 lustre/include/lustre_net.h               |  4 +-
 lustre/osc/osc_request.c                  |  3 ++
 lustre/ptlrpc/pers.c                      |  3 ++
 9 files changed, 81 insertions(+), 58 deletions(-)

diff --git a/lnet/include/lnet/lib-types.h b/lnet/include/lnet/lib-types.h
index 0df6857..ff93f87 100644
--- a/lnet/include/lnet/lib-types.h
+++ b/lnet/include/lnet/lib-types.h
@@ -141,8 +141,6 @@ struct lnet_msg {
 	enum lnet_msg_hstatus	msg_health_status;
 	/* This is a recovery message */
 	bool			msg_recovery;
-	/* force an RDMA even if the message size is < 4K */
-	bool			msg_rdma_force;
 	/* the number of times a transmission has been retried */
 	int			msg_retry_count;
 	/* flag to indicate that we do not want to resend this message */
@@ -248,6 +246,7 @@ struct lnet_libmd {
  */
 #define LNET_MD_FLAG_HANDLING	 BIT(3)
 #define LNET_MD_FLAG_DISCARD	 BIT(4)
+#define LNET_MD_FLAG_GPU	 BIT(5) /**< Special mapping needs */
 
 struct lnet_test_peer {
 	/* info about peers we are trying to fail */
diff --git a/lnet/include/uapi/linux/lnet/lnet-types.h b/lnet/include/uapi/linux/lnet/lnet-types.h
index d7fce9d..5a2a2a0 100644
--- a/lnet/include/uapi/linux/lnet/lnet-types.h
+++ b/lnet/include/uapi/linux/lnet/lnet-types.h
@@ -467,6 +467,8 @@ struct lnet_md {
 #define LNET_MD_NO_TRACK_RESPONSE (1 << 11)
 /** See struct lnet_md::options. */
 #define LNET_MD_GNILND (1 << 12)
+/** Special page mapping handling */
+#define LNET_MD_GPU_ADDR (1 << 13)
 
 /** Infinite threshold on MD operations. See struct lnet_md::threshold */
 #define LNET_MD_THRESH_INF (-1)
diff --git a/lnet/klnds/o2iblnd/o2iblnd.h b/lnet/klnds/o2iblnd/o2iblnd.h
index d3f6512..4c8581d 100644
--- a/lnet/klnds/o2iblnd/o2iblnd.h
+++ b/lnet/klnds/o2iblnd/o2iblnd.h
@@ -513,9 +513,11 @@ struct kib_tx {			/* transmit message */
 	/* # tx callbacks outstanding */
 	short			tx_sending;
 	/* queued for sending */
-	short			tx_queued;
+	unsigned long		tx_queued:1,
 	/* waiting for peer_ni */
-	short			tx_waiting;
+				tx_waiting:1,
+	/* force RDMA */
+				tx_gpu:1;
 	/* LNET completion status */
 	int			tx_status;
 	/* health status of the transmit */
@@ -1038,33 +1040,32 @@ static inline void kiblnd_dma_unmap_single(struct ib_device *dev,
 #define KIBLND_UNMAP_ADDR_SET(p, m, a)	do {} while (0)
 #define KIBLND_UNMAP_ADDR(p, m, a)	(a)
 
-static inline int kiblnd_dma_map_sg(struct kib_hca_dev *hdev,
-				    struct scatterlist *sg, int nents,
-				    enum dma_data_direction direction)
+static inline
+int kiblnd_dma_map_sg(struct kib_hca_dev *hdev, struct kib_tx *tx)
 {
-	int count;
+	struct scatterlist *sg = tx->tx_frags;
+	int nents = tx->tx_nfrags;
+	enum dma_data_direction direction = tx->tx_dmadir;
 
-	count = lnet_rdma_map_sg_attrs(hdev->ibh_ibdev->dma_device,
-				       sg, nents, direction);
-
-	if (count != 0)
-		return count;
+	if (tx->tx_gpu)
+		return lnet_rdma_map_sg_attrs(hdev->ibh_ibdev->dma_device,
+					      sg, nents, direction);
 
 	return ib_dma_map_sg(hdev->ibh_ibdev, sg, nents, direction);
 }
 
-static inline void kiblnd_dma_unmap_sg(struct kib_hca_dev *hdev,
-				       struct scatterlist *sg, int nents,
-				       enum dma_data_direction direction)
+static inline
+void kiblnd_dma_unmap_sg(struct kib_hca_dev *hdev, struct kib_tx *tx)
 {
-	int count;
-
-	count = lnet_rdma_unmap_sg(hdev->ibh_ibdev->dma_device,
-				   sg, nents, direction);
-	if (count != 0)
-		return;
+	struct scatterlist *sg = tx->tx_frags;
+	int nents = tx->tx_nfrags;
+	enum dma_data_direction direction = tx->tx_dmadir;
 
-	ib_dma_unmap_sg(hdev->ibh_ibdev, sg, nents, direction);
+	if (tx->tx_gpu)
+		lnet_rdma_unmap_sg(hdev->ibh_ibdev->dma_device,
+				   sg, nents, direction);
+	else
+		ib_dma_unmap_sg(hdev->ibh_ibdev, sg, nents, direction);
 }
 
 #ifndef HAVE_IB_SG_DMA_ADDRESS
diff --git a/lnet/klnds/o2iblnd/o2iblnd_cb.c b/lnet/klnds/o2iblnd/o2iblnd_cb.c
index 2cbc2b8..ac69534 100644
--- a/lnet/klnds/o2iblnd/o2iblnd_cb.c
+++ b/lnet/klnds/o2iblnd/o2iblnd_cb.c
@@ -670,8 +670,7 @@ kiblnd_unmap_tx(struct kib_tx *tx)
 		kiblnd_fmr_pool_unmap(&tx->tx_fmr, tx->tx_status);
 
 	if (tx->tx_nfrags != 0) {
-		kiblnd_dma_unmap_sg(tx->tx_pool->tpo_hdev,
-				    tx->tx_frags, tx->tx_nfrags, tx->tx_dmadir);
+		kiblnd_dma_unmap_sg(tx->tx_pool->tpo_hdev, tx);
 		tx->tx_nfrags = 0;
 	}
 }
@@ -724,9 +723,7 @@ static int kiblnd_map_tx(struct lnet_ni *ni, struct kib_tx *tx,
 	tx->tx_dmadir = (rd != tx->tx_rd) ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
 	tx->tx_nfrags = nfrags;
 
-	rd->rd_nfrags = kiblnd_dma_map_sg(hdev, tx->tx_frags,
-					  tx->tx_nfrags, tx->tx_dmadir);
-
+	rd->rd_nfrags = kiblnd_dma_map_sg(hdev, tx);
 	for (i = 0, nob = 0; i < rd->rd_nfrags; i++) {
 		rd->rd_frags[i].rf_nob = kiblnd_sg_dma_len(
 			hdev->ibh_ibdev, &tx->tx_frags[i]);
@@ -1174,7 +1171,8 @@ kiblnd_init_rdma(struct kib_conn *conn, struct kib_tx *tx, int type,
 		int prev = dstidx;
 
 		if (srcidx >= srcrd->rd_nfrags) {
-			CERROR("Src buffer exhausted: %d frags\n", srcidx);
+			CERROR("Src buffer exhausted: %d frags %px\n",
+			       srcidx, tx);
 			rc = -EPROTO;
 			break;
 		}
@@ -1647,11 +1645,12 @@ kiblnd_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg)
 	struct bio_vec *payload_kiov = lntmsg->msg_kiov;
 	unsigned int payload_offset = lntmsg->msg_offset;
 	unsigned int payload_nob = lntmsg->msg_len;
+	bool gpu;
 	struct kib_msg *ibmsg;
 	struct kib_rdma_desc *rd;
-	struct kib_tx *tx;
-	int nob;
-	int rc;
+	struct kib_tx *tx;
+	int nob;
+	int rc;
 
 	/* NB 'private' is different depending on what we're sending.... */
 
@@ -1672,26 +1671,28 @@ kiblnd_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg)
 		return -ENOMEM;
 	}
 	ibmsg = tx->tx_msg;
+	gpu = (lntmsg->msg_md->md_flags & LNET_MD_FLAG_GPU);
 
 	switch (type) {
 	default:
 		LBUG();
 		return (-EIO);
 
-        case LNET_MSG_ACK:
-                LASSERT (payload_nob == 0);
-                break;
+	case LNET_MSG_ACK:
+		LASSERT(payload_nob == 0);
+		break;
 
-        case LNET_MSG_GET:
-                if (routing || target_is_router)
-                        break;                  /* send IMMEDIATE */
+	case LNET_MSG_GET:
+		if (routing || target_is_router)
+			break;			/* send IMMEDIATE */
 
-                /* is the REPLY message too small for RDMA? */
+		/* is the REPLY message too small for RDMA? */
 		nob = offsetof(struct kib_msg, ibm_u.immediate.ibim_payload[lntmsg->msg_md->md_length]);
-		if (nob <= IBLND_MSG_SIZE && !lntmsg->msg_rdma_force)
+		if (nob <= IBLND_MSG_SIZE && !gpu)
 			break;			/* send IMMEDIATE */
 
 		rd = &ibmsg->ibm_u.get.ibgm_rd;
+		tx->tx_gpu = !!gpu;
 		rc = kiblnd_setup_rd_kiov(ni, tx, rd,
 					  lntmsg->msg_md->md_niov,
 					  lntmsg->msg_md->md_kiov,
@@ -1708,9 +1709,9 @@ kiblnd_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg)
 		ibmsg->ibm_u.get.ibgm_cookie = tx->tx_cookie;
 		lnet_hdr_to_nid4(hdr, &ibmsg->ibm_u.get.ibgm_hdr);
 
-                kiblnd_init_tx_msg(ni, tx, IBLND_MSG_GET_REQ, nob);
+		kiblnd_init_tx_msg(ni, tx, IBLND_MSG_GET_REQ, nob);
 
-                tx->tx_lntmsg[1] = lnet_create_reply_msg(ni, lntmsg);
+		tx->tx_lntmsg[1] = lnet_create_reply_msg(ni, lntmsg);
 		if (tx->tx_lntmsg[1] == NULL) {
 			CERROR("Can't create reply for GET -> %s\n",
 			       libcfs_nidstr(&target->nid));
@@ -1727,9 +1728,12 @@ kiblnd_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg)
 	case LNET_MSG_REPLY:
 	case LNET_MSG_PUT:
 		/* Is the payload small enough not to need RDMA? */
-		nob = offsetof(struct kib_msg, ibm_u.immediate.ibim_payload[payload_nob]);
-		if (nob <= IBLND_MSG_SIZE && !lntmsg->msg_rdma_force)
-			break;                  /* send IMMEDIATE */
+		nob = offsetof(struct kib_msg,
+			       ibm_u.immediate.ibim_payload[payload_nob]);
+		if (nob <= IBLND_MSG_SIZE && !gpu)
+			break;			/* send IMMEDIATE */
+
+		tx->tx_gpu = gpu;
 
 		rc = kiblnd_setup_rd_kiov(ni, tx, tx->tx_rd,
 					  payload_niov, payload_kiov,
@@ -1813,6 +1817,7 @@ kiblnd_reply(struct lnet_ni *ni, struct kib_rx *rx, struct lnet_msg *lntmsg)
 	struct bio_vec *kiov = lntmsg->msg_kiov;
 	unsigned int offset = lntmsg->msg_offset;
 	unsigned int nob = lntmsg->msg_len;
+	struct lnet_libmd *payload_md = lntmsg->msg_md;
 	struct kib_tx *tx;
 	int rc;
 
@@ -1823,6 +1828,7 @@ kiblnd_reply(struct lnet_ni *ni, struct kib_rx *rx, struct lnet_msg *lntmsg)
 		goto failed_0;
 	}
 
+	tx->tx_gpu = !!(payload_md->md_flags & LNET_MD_FLAG_GPU);
 	if (nob == 0)
 		rc = 0;
 	else
@@ -1920,8 +1926,9 @@ kiblnd_recv(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg,
 	case IBLND_MSG_PUT_REQ: {
 		struct kib_msg	*txmsg;
 		struct kib_rdma_desc *rd;
-		ibprm_cookie = rxmsg->ibm_u.putreq.ibprm_cookie;
+		struct lnet_libmd *payload_md = lntmsg->msg_md;
 
+		ibprm_cookie = rxmsg->ibm_u.putreq.ibprm_cookie;
 		if (mlen == 0) {
 			lnet_finalize(lntmsg, 0);
 			kiblnd_send_completion(rx->rx_conn, IBLND_MSG_PUT_NAK,
@@ -1930,14 +1937,15 @@ kiblnd_recv(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg,
 		}
 
 		tx = kiblnd_get_idle_tx(ni, conn->ibc_peer->ibp_nid);
-                if (tx == NULL) {
-                        CERROR("Can't allocate tx for %s\n",
-                                libcfs_nid2str(conn->ibc_peer->ibp_nid));
-                        /* Not replying will break the connection */
-                        rc = -ENOMEM;
-                        break;
-                }
+		if (tx == NULL) {
+			CERROR("Can't allocate tx for %s\n",
+			       libcfs_nid2str(conn->ibc_peer->ibp_nid));
+			/* Not replying will break the connection */
+			rc = -ENOMEM;
+			break;
+		}
 
+		tx->tx_gpu = !!(payload_md->md_flags & LNET_MD_FLAG_GPU);
 		txmsg = tx->tx_msg;
 		rd = &txmsg->ibm_u.putack.ibpam_rd;
 		rc = kiblnd_setup_rd_kiov(ni, tx, rd,
diff --git a/lnet/lnet/lib-md.c b/lnet/lnet/lib-md.c
index ba318a2..72b9aa7 100644
--- a/lnet/lnet/lib-md.c
+++ b/lnet/lnet/lib-md.c
@@ -208,6 +208,9 @@ lnet_md_build(const struct lnet_md *umd, int unlink)
 	lmd->md_flags = (unlink == LNET_UNLINK) ? LNET_MD_FLAG_AUTO_UNLINK : 0;
 	lmd->md_bulk_handle = umd->bulk_handle;
 
+	if (umd->options & LNET_MD_GPU_ADDR)
+		lmd->md_flags |= LNET_MD_FLAG_GPU;
+
 	if (umd->options & LNET_MD_KIOV) {
 		memcpy(lmd->md_kiov, umd->start,
 		       niov * sizeof(lmd->md_kiov[0]));
diff --git a/lnet/lnet/lib-move.c b/lnet/lnet/lib-move.c
index 9aae6ac..34aeea8 100644
--- a/lnet/lnet/lib-move.c
+++ b/lnet/lnet/lib-move.c
@@ -1636,11 +1636,13 @@ lnet_get_best_ni(struct lnet_net *local_net, struct lnet_ni *best_ni,
 	__u32 best_sel_prio;
 	unsigned int best_dev_prio;
 	unsigned int dev_idx = UINT_MAX;
-	struct page *page = lnet_get_first_page(md, offset);
-	msg->msg_rdma_force = lnet_is_rdma_only_page(page);
+	bool gpu = md->md_flags & LNET_MD_FLAG_GPU;
+
+	if (gpu) {
+		struct page *page = lnet_get_first_page(md, offset);
 
-	if (msg->msg_rdma_force)
 		dev_idx = lnet_get_dev_idx(page);
+	}
 
 	/*
 	 * If there is no peer_ni that we can send to on this network,
@@ -1692,7 +1694,7 @@ lnet_get_best_ni(struct lnet_net *local_net, struct lnet_ni *best_ni,
 		 * All distances smaller than the NUMA range
 		 * are treated equally.
 		 */
-		if (distance < lnet_numa_range)
+		if (!gpu && distance < lnet_numa_range)
 			distance = lnet_numa_range;
 
 		/*
diff --git a/lustre/include/lustre_net.h b/lustre/include/lustre_net.h
index fa7617c..287eec2 100644
--- a/lustre/include/lustre_net.h
+++ b/lustre/include/lustre_net.h
@@ -1368,7 +1368,9 @@ struct ptlrpc_bulk_desc {
 	/** completed with failure */
 	unsigned long bd_failure:1;
 	/** client side */
-	unsigned long bd_registered:1;
+	unsigned long bd_registered:1,
+	/* bulk request is RDMA transfer, use page->host as real address */
+		      bd_is_rdma:1;
 	/** For serialization with callback */
 	spinlock_t bd_lock;
 	/** {put,get}{source,sink}{kvec,kiov} */
diff --git a/lustre/osc/osc_request.c b/lustre/osc/osc_request.c
index 6bc9389..d80a4a4 100644
--- a/lustre/osc/osc_request.c
+++ b/lustre/osc/osc_request.c
@@ -1406,6 +1406,7 @@ osc_brw_prep_request(int cmd, struct client_obd *cli, struct obdo *oa,
 	const char *obd_name = cli->cl_import->imp_obd->obd_name;
 	struct inode *inode = NULL;
 	bool directio = false;
+	bool gpu = 0;
 	bool enable_checksum = true;
 	struct cl_page *clpage;
 
@@ -1571,6 +1572,7 @@ retry_encrypt:
 	if (brw_page2oap(pga[0])->oap_brw_flags & OBD_BRW_RDMA_ONLY) {
 		enable_checksum = false;
 		short_io_size = 0;
+		gpu = 1;
 	}
 
 	/* Check if read/write is small enough to be a short io. */
@@ -1618,6 +1620,7 @@ retry_encrypt:
 	if (desc == NULL)
 		GOTO(out, rc = -ENOMEM);
 	/* NB request now owns desc and will free it when it gets freed */
+	desc->bd_is_rdma = gpu;
 no_bulk:
 	body = req_capsule_client_get(pill, &RMF_OST_BODY);
 	ioobj = req_capsule_client_get(pill, &RMF_OBD_IOOBJ);
diff --git a/lustre/ptlrpc/pers.c b/lustre/ptlrpc/pers.c
index 973f2b5..45af603 100644
--- a/lustre/ptlrpc/pers.c
+++ b/lustre/ptlrpc/pers.c
@@ -58,6 +58,9 @@ void ptlrpc_fill_bulk_md(struct lnet_md *md, struct ptlrpc_bulk_desc *desc,
 		return;
 	}
 
+	if (desc->bd_is_rdma)
+		md->options |= LNET_MD_GPU_ADDR;
+
 	if (mdidx == (desc->bd_md_count - 1))
 		md->length = desc->bd_iov_count - start;
 	else
-- 
1.8.3.1
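
Note on the new control flow (illustration only, not part of the patch): the GPU hint now
originates in the upper layers instead of being probed per page inside LNet. The stand-alone
model below walks the flag from the OSC down to the ko2iblnd mapping choice. The two constants
match the values introduced above, but every structure and helper here is a deliberately
simplified stand-in: osc_prep(), fill_bulk_md(), md_build() and map_path() are hypothetical
condensations of osc_brw_prep_request(), ptlrpc_fill_bulk_md(), lnet_md_build() and
kiblnd_send()/kiblnd_dma_map_sg(), not the real definitions.

/*
 * Stand-alone model of the GPU-flag plumbing added by this patch.
 * Illustration only: all structures and helpers are simplified stand-ins
 * for the real Lustre/LNet code.
 */
#include <stdbool.h>
#include <stdio.h>

#define LNET_MD_GPU_ADDR	(1 << 13)	/* uapi MD option added by this patch */
#define LNET_MD_FLAG_GPU	(1 << 5)	/* internal libmd flag added by this patch */

struct bulk_desc { bool bd_is_rdma; };		/* ptlrpc_bulk_desc stand-in */
struct lnet_md_model { unsigned int options; };	/* struct lnet_md stand-in */
struct libmd_model { unsigned int md_flags; };	/* struct lnet_libmd stand-in */
struct tx_model { unsigned int tx_gpu:1; };	/* struct kib_tx stand-in */

/* osc_brw_prep_request(): the ULP marks "RDMA only" (GPU) bulk I/O. */
static void osc_prep(struct bulk_desc *desc, bool rdma_only)
{
	desc->bd_is_rdma = rdma_only;
}

/* ptlrpc_fill_bulk_md(): the descriptor flag becomes an MD option. */
static void fill_bulk_md(const struct bulk_desc *desc, struct lnet_md_model *umd)
{
	if (desc->bd_is_rdma)
		umd->options |= LNET_MD_GPU_ADDR;
}

/* lnet_md_build(): the option is latched into the libmd flags. */
static void md_build(const struct lnet_md_model *umd, struct libmd_model *lmd)
{
	if (umd->options & LNET_MD_GPU_ADDR)
		lmd->md_flags |= LNET_MD_FLAG_GPU;
}

/*
 * kiblnd_send()/kiblnd_recv(): the tx inherits the flag, and
 * kiblnd_dma_map_sg() takes the Nvidia path only when it is set.
 */
static const char *map_path(const struct libmd_model *lmd, struct tx_model *tx)
{
	tx->tx_gpu = !!(lmd->md_flags & LNET_MD_FLAG_GPU);
	return tx->tx_gpu ? "lnet_rdma_map_sg_attrs()" : "ib_dma_map_sg()";
}

int main(void)
{
	struct bulk_desc desc = { 0 };
	struct lnet_md_model umd = { 0 };
	struct libmd_model lmd = { 0 };
	struct tx_model tx = { 0 };

	osc_prep(&desc, true);		/* e.g. OBD_BRW_RDMA_ONLY was set */
	fill_bulk_md(&desc, &umd);
	md_build(&umd, &lmd);
	printf("buffers mapped via %s\n", map_path(&lmd, &tx));
	return 0;
}

With this flow a kmalloc'd ptlrpc message never reaches the Nvidia detection path: only bulk
buffers explicitly marked by the upper layer (OBD_BRW_RDMA_ONLY) are mapped through
lnet_rdma_map_sg_attrs(), while everything else goes straight to ib_dma_map_sg().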