LU-15189 lnet: fix memory mapping. 82/45482/12
author    Alexey Lyashkov <alexey.lyashkov@hpe.com>
          Mon, 7 Feb 2022 15:02:14 +0000 (18:02 +0300)
committer Oleg Drokin <green@whamcloud.com>
          Mon, 30 May 2022 18:44:18 +0000 (18:44 +0000)
Nvidia GDS has a bug that causes incorrect page type detection: it may
report the GPU flag for a kmalloc'd buffer (a ptlrpc_message in my case).
To work around this, Whamcloud currently issues both mapping calls, but
that is costly and causes extra RDMA operations because ko2iblnd trusts
the msg_rdma_force flag.
Let's drop the extra Nvidia calls and check only real "user" pages or
the explicit GPU flag.
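
Below is a minimal userspace sketch, not code from this patch, of the new
decision path: the GPU hint is carried as an explicit MD option
(LNET_MD_GPU_ADDR -> LNET_MD_FLAG_GPU -> tx_gpu), and only a GPU buffer takes
the Nvidia mapping routine. The flag values and the 4K immediate limit follow
the diff below; main(), the helper functions and the printed strings are
illustrative assumptions only.

#include <stdbool.h>
#include <stdio.h>

#define LNET_MD_GPU_ADDR  (1 << 13)  /* set by ptlrpc_fill_bulk_md() when bd_is_rdma */
#define LNET_MD_FLAG_GPU  (1 << 5)   /* copied onto the libmd in lnet_md_build() */
#define IBLND_MSG_SIZE    4096       /* 4K immediate-message limit, per the removed comment */

/* lnet_md_build(): translate the user-visible option into the libmd flag. */
static unsigned int md_build_flags(unsigned int umd_options)
{
	return (umd_options & LNET_MD_GPU_ADDR) ? LNET_MD_FLAG_GPU : 0;
}

/* kiblnd_send(): small payloads stay IMMEDIATE unless the MD is a GPU buffer. */
static bool use_rdma(unsigned int nob, unsigned int md_flags)
{
	return nob > IBLND_MSG_SIZE || (md_flags & LNET_MD_FLAG_GPU);
}

/* kiblnd_dma_map_sg(): pick the mapping routine from the tx GPU bit instead
 * of trusting a per-page GPU lookup that GDS may get wrong for kmalloc'd
 * buffers. */
static const char *map_sg(bool tx_gpu)
{
	return tx_gpu ? "lnet_rdma_map_sg_attrs()" : "ib_dma_map_sg()";
}

int main(void)
{
	unsigned int gpu_md = md_build_flags(LNET_MD_GPU_ADDR);
	unsigned int host_md = md_build_flags(0);

	printf("1KB GPU payload:  %s via %s\n",
	       use_rdma(1024, gpu_md) ? "RDMA" : "IMMEDIATE",
	       map_sg(gpu_md & LNET_MD_FLAG_GPU));
	printf("1KB host payload: %s via %s\n",
	       use_rdma(1024, host_md) ? "RDMA" : "IMMEDIATE",
	       map_sg(host_md & LNET_MD_FLAG_GPU));
	return 0;
}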

HPE-bug-id: LUS-10520
Signed-off-by: Alexey Lyashkov <alexey.lyashkov@hpe.com>
Change-Id: I5d70c5e0630b0f16e130a7db0385de2443c11a63
Reviewed-on: https://review.whamcloud.com/45482
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Alexander Boyko <alexander.boyko@hpe.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lnet/include/lnet/lib-types.h
lnet/include/uapi/linux/lnet/lnet-types.h
lnet/klnds/o2iblnd/o2iblnd.h
lnet/klnds/o2iblnd/o2iblnd_cb.c
lnet/lnet/lib-md.c
lnet/lnet/lib-move.c
lustre/include/lustre_net.h
lustre/osc/osc_request.c
lustre/ptlrpc/pers.c

lnet/include/lnet/lib-types.h
index 0df6857..ff93f87 100644
@@ -141,8 +141,6 @@ struct lnet_msg {
        enum lnet_msg_hstatus   msg_health_status;
        /* This is a recovery message */
        bool                    msg_recovery;
-       /* force an RDMA even if the message size is < 4K */
-       bool                    msg_rdma_force;
        /* the number of times a transmission has been retried */
        int                     msg_retry_count;
        /* flag to indicate that we do not want to resend this message */
@@ -248,6 +246,7 @@ struct lnet_libmd {
  */
 #define LNET_MD_FLAG_HANDLING   BIT(3)
 #define LNET_MD_FLAG_DISCARD    BIT(4)
+#define LNET_MD_FLAG_GPU        BIT(5) /**< Special mapping needs */
 
 struct lnet_test_peer {
        /* info about peers we are trying to fail */
lnet/include/uapi/linux/lnet/lnet-types.h
index d7fce9d..5a2a2a0 100644
@@ -467,6 +467,8 @@ struct lnet_md {
 #define LNET_MD_NO_TRACK_RESPONSE    (1 << 11)
 /** See struct lnet_md::options. */
 #define LNET_MD_GNILND               (1 << 12)
+/** Special page mapping handling */
+#define LNET_MD_GPU_ADDR            (1 << 13)
 
 /** Infinite threshold on MD operations. See struct lnet_md::threshold */
 #define LNET_MD_THRESH_INF      (-1)
lnet/klnds/o2iblnd/o2iblnd.h
index d3f6512..4c8581d 100644
@@ -513,9 +513,11 @@ struct kib_tx {                                    /* transmit message */
        /* # tx callbacks outstanding */
        short                   tx_sending;
        /* queued for sending */
-       short                   tx_queued;
+       unsigned long           tx_queued:1,
        /* waiting for peer_ni */
-       short                   tx_waiting;
+                               tx_waiting:1,
+       /* force RDMA */
+                               tx_gpu:1;
        /* LNET completion status */
        int                     tx_status;
        /* health status of the transmit */
@@ -1038,33 +1040,32 @@ static inline void kiblnd_dma_unmap_single(struct ib_device *dev,
 #define KIBLND_UNMAP_ADDR_SET(p, m, a)  do {} while (0)
 #define KIBLND_UNMAP_ADDR(p, m, a)      (a)
 
-static inline int kiblnd_dma_map_sg(struct kib_hca_dev *hdev,
-                                   struct scatterlist *sg, int nents,
-                                   enum dma_data_direction direction)
+static inline
+int kiblnd_dma_map_sg(struct kib_hca_dev *hdev, struct kib_tx *tx)
 {
-       int count;
+       struct scatterlist *sg = tx->tx_frags;
+       int nents = tx->tx_nfrags;
+       enum dma_data_direction direction = tx->tx_dmadir;
 
-       count = lnet_rdma_map_sg_attrs(hdev->ibh_ibdev->dma_device,
-                                      sg, nents, direction);
-
-       if (count != 0)
-               return count;
+       if (tx->tx_gpu)
+               return lnet_rdma_map_sg_attrs(hdev->ibh_ibdev->dma_device,
+                                             sg, nents, direction);
 
        return ib_dma_map_sg(hdev->ibh_ibdev, sg, nents, direction);
 }
 
-static inline void kiblnd_dma_unmap_sg(struct kib_hca_dev *hdev,
-                                      struct scatterlist *sg, int nents,
-                                      enum dma_data_direction direction)
+static inline
+void kiblnd_dma_unmap_sg(struct kib_hca_dev *hdev, struct kib_tx *tx)
 {
-       int count;
-
-       count = lnet_rdma_unmap_sg(hdev->ibh_ibdev->dma_device,
-                                  sg, nents, direction);
-       if (count != 0)
-               return;
+       struct scatterlist *sg = tx->tx_frags;
+       int nents = tx->tx_nfrags;
+       enum dma_data_direction direction = tx->tx_dmadir;
 
-       ib_dma_unmap_sg(hdev->ibh_ibdev, sg, nents, direction);
+       if (tx->tx_gpu)
+               lnet_rdma_unmap_sg(hdev->ibh_ibdev->dma_device,
+                                         sg, nents, direction);
+       else
+               ib_dma_unmap_sg(hdev->ibh_ibdev, sg, nents, direction);
 }
 
 #ifndef HAVE_IB_SG_DMA_ADDRESS
lnet/klnds/o2iblnd/o2iblnd_cb.c
index 2cbc2b8..ac69534 100644
@@ -670,8 +670,7 @@ kiblnd_unmap_tx(struct kib_tx *tx)
                kiblnd_fmr_pool_unmap(&tx->tx_fmr, tx->tx_status);
 
        if (tx->tx_nfrags != 0) {
-               kiblnd_dma_unmap_sg(tx->tx_pool->tpo_hdev,
-                                   tx->tx_frags, tx->tx_nfrags, tx->tx_dmadir);
+               kiblnd_dma_unmap_sg(tx->tx_pool->tpo_hdev, tx);
                tx->tx_nfrags = 0;
        }
 }
@@ -724,9 +723,7 @@ static int kiblnd_map_tx(struct lnet_ni *ni, struct kib_tx *tx,
         tx->tx_dmadir = (rd != tx->tx_rd) ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
        tx->tx_nfrags = nfrags;
 
-       rd->rd_nfrags = kiblnd_dma_map_sg(hdev, tx->tx_frags,
-                                         tx->tx_nfrags, tx->tx_dmadir);
-
+       rd->rd_nfrags = kiblnd_dma_map_sg(hdev, tx);
         for (i = 0, nob = 0; i < rd->rd_nfrags; i++) {
                 rd->rd_frags[i].rf_nob  = kiblnd_sg_dma_len(
                         hdev->ibh_ibdev, &tx->tx_frags[i]);
@@ -1174,7 +1171,8 @@ kiblnd_init_rdma(struct kib_conn *conn, struct kib_tx *tx, int type,
                int     prev = dstidx;
 
                if (srcidx >= srcrd->rd_nfrags) {
-                       CERROR("Src buffer exhausted: %d frags\n", srcidx);
+                       CERROR("Src buffer exhausted: %d frags %px\n",
+                               srcidx, tx);
                        rc = -EPROTO;
                        break;
                }
@@ -1647,11 +1645,12 @@ kiblnd_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg)
        struct bio_vec   *payload_kiov = lntmsg->msg_kiov;
        unsigned int      payload_offset = lntmsg->msg_offset;
        unsigned int      payload_nob = lntmsg->msg_len;
+       bool             gpu;
        struct kib_msg *ibmsg;
        struct kib_rdma_desc *rd;
-       struct kib_tx *tx;
-       int               nob;
-       int               rc;
+       struct kib_tx   *tx;
+       int              nob;
+       int              rc;
 
        /* NB 'private' is different depending on what we're sending.... */
 
@@ -1672,26 +1671,28 @@ kiblnd_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg)
                return -ENOMEM;
        }
        ibmsg = tx->tx_msg;
+       gpu = (lntmsg->msg_md->md_flags & LNET_MD_FLAG_GPU);
 
        switch (type) {
        default:
                LBUG();
                return (-EIO);
 
-        case LNET_MSG_ACK:
-                LASSERT (payload_nob == 0);
-                break;
+       case LNET_MSG_ACK:
+               LASSERT(payload_nob == 0);
+               break;
 
-        case LNET_MSG_GET:
-                if (routing || target_is_router)
-                        break;                  /* send IMMEDIATE */
+       case LNET_MSG_GET:
+               if (routing || target_is_router)
+                       break;                  /* send IMMEDIATE */
 
-                /* is the REPLY message too small for RDMA? */
+               /* is the REPLY message too small for RDMA? */
                nob = offsetof(struct kib_msg, ibm_u.immediate.ibim_payload[lntmsg->msg_md->md_length]);
-               if (nob <= IBLND_MSG_SIZE && !lntmsg->msg_rdma_force)
+               if (nob <= IBLND_MSG_SIZE && !gpu)
                        break;                  /* send IMMEDIATE */
 
                rd = &ibmsg->ibm_u.get.ibgm_rd;
+               tx->tx_gpu = !!gpu;
                rc = kiblnd_setup_rd_kiov(ni, tx, rd,
                                          lntmsg->msg_md->md_niov,
                                          lntmsg->msg_md->md_kiov,
@@ -1708,9 +1709,9 @@ kiblnd_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg)
                ibmsg->ibm_u.get.ibgm_cookie = tx->tx_cookie;
                lnet_hdr_to_nid4(hdr, &ibmsg->ibm_u.get.ibgm_hdr);
 
-                kiblnd_init_tx_msg(ni, tx, IBLND_MSG_GET_REQ, nob);
+               kiblnd_init_tx_msg(ni, tx, IBLND_MSG_GET_REQ, nob);
 
-                tx->tx_lntmsg[1] = lnet_create_reply_msg(ni, lntmsg);
+               tx->tx_lntmsg[1] = lnet_create_reply_msg(ni, lntmsg);
                if (tx->tx_lntmsg[1] == NULL) {
                        CERROR("Can't create reply for GET -> %s\n",
                               libcfs_nidstr(&target->nid));
@@ -1727,9 +1728,12 @@ kiblnd_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg)
        case LNET_MSG_REPLY:
        case LNET_MSG_PUT:
                /* Is the payload small enough not to need RDMA? */
-               nob = offsetof(struct kib_msg, ibm_u.immediate.ibim_payload[payload_nob]);
-               if (nob <= IBLND_MSG_SIZE && !lntmsg->msg_rdma_force)
-                       break;                  /* send IMMEDIATE */
+               nob = offsetof(struct kib_msg,
+                               ibm_u.immediate.ibim_payload[payload_nob]);
+               if (nob <= IBLND_MSG_SIZE && !gpu)
+                       break;                  /* send IMMEDIATE */
+
+               tx->tx_gpu = gpu;
 
                rc = kiblnd_setup_rd_kiov(ni, tx, tx->tx_rd,
                                          payload_niov, payload_kiov,
@@ -1813,6 +1817,7 @@ kiblnd_reply(struct lnet_ni *ni, struct kib_rx *rx, struct lnet_msg *lntmsg)
        struct bio_vec *kiov = lntmsg->msg_kiov;
        unsigned int offset = lntmsg->msg_offset;
        unsigned int nob = lntmsg->msg_len;
+       struct lnet_libmd *payload_md = lntmsg->msg_md;
        struct kib_tx *tx;
        int rc;
 
@@ -1823,6 +1828,7 @@ kiblnd_reply(struct lnet_ni *ni, struct kib_rx *rx, struct lnet_msg *lntmsg)
                goto failed_0;
        }
 
+       tx->tx_gpu = !!(payload_md->md_flags & LNET_MD_FLAG_GPU);
        if (nob == 0)
                rc = 0;
        else
@@ -1920,8 +1926,9 @@ kiblnd_recv(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg,
        case IBLND_MSG_PUT_REQ: {
                struct kib_msg  *txmsg;
                struct kib_rdma_desc *rd;
-               ibprm_cookie = rxmsg->ibm_u.putreq.ibprm_cookie;
+               struct lnet_libmd *payload_md = lntmsg->msg_md;
 
+               ibprm_cookie = rxmsg->ibm_u.putreq.ibprm_cookie;
                if (mlen == 0) {
                        lnet_finalize(lntmsg, 0);
                        kiblnd_send_completion(rx->rx_conn, IBLND_MSG_PUT_NAK,
@@ -1930,14 +1937,15 @@ kiblnd_recv(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg,
                }
 
                tx = kiblnd_get_idle_tx(ni, conn->ibc_peer->ibp_nid);
-                if (tx == NULL) {
-                        CERROR("Can't allocate tx for %s\n",
-                               libcfs_nid2str(conn->ibc_peer->ibp_nid));
-                        /* Not replying will break the connection */
-                        rc = -ENOMEM;
-                        break;
-                }
+               if (tx == NULL) {
+                       CERROR("Can't allocate tx for %s\n",
+                               libcfs_nid2str(conn->ibc_peer->ibp_nid));
+                       /* Not replying will break the connection */
+                       rc = -ENOMEM;
+                       break;
+               }
 
+               tx->tx_gpu = !!(payload_md->md_flags & LNET_MD_FLAG_GPU);
                txmsg = tx->tx_msg;
                rd = &txmsg->ibm_u.putack.ibpam_rd;
                rc = kiblnd_setup_rd_kiov(ni, tx, rd,
lnet/lnet/lib-md.c
index ba318a2..72b9aa7 100644
@@ -208,6 +208,9 @@ lnet_md_build(const struct lnet_md *umd, int unlink)
        lmd->md_flags = (unlink == LNET_UNLINK) ? LNET_MD_FLAG_AUTO_UNLINK : 0;
        lmd->md_bulk_handle = umd->bulk_handle;
 
+       if (umd->options & LNET_MD_GPU_ADDR)
+               lmd->md_flags |= LNET_MD_FLAG_GPU;
+
        if (umd->options & LNET_MD_KIOV) {
                memcpy(lmd->md_kiov, umd->start,
                       niov * sizeof(lmd->md_kiov[0]));
lnet/lnet/lib-move.c
index 9aae6ac..34aeea8 100644
@@ -1636,11 +1636,13 @@ lnet_get_best_ni(struct lnet_net *local_net, struct lnet_ni *best_ni,
        __u32 best_sel_prio;
        unsigned int best_dev_prio;
        unsigned int dev_idx = UINT_MAX;
-       struct page *page = lnet_get_first_page(md, offset);
-       msg->msg_rdma_force = lnet_is_rdma_only_page(page);
+       bool gpu = md->md_flags & LNET_MD_FLAG_GPU;
+
+       if (gpu) {
+               struct page *page = lnet_get_first_page(md, offset);
 
-       if (msg->msg_rdma_force)
                dev_idx = lnet_get_dev_idx(page);
+       }
 
        /*
         * If there is no peer_ni that we can send to on this network,
@@ -1692,7 +1694,7 @@ lnet_get_best_ni(struct lnet_net *local_net, struct lnet_ni *best_ni,
                 * All distances smaller than the NUMA range
                 * are treated equally.
                 */
-               if (distance < lnet_numa_range)
+               if (!gpu && distance < lnet_numa_range)
                        distance = lnet_numa_range;
 
                /*
lustre/include/lustre_net.h
index fa7617c..287eec2 100644
@@ -1368,7 +1368,9 @@ struct ptlrpc_bulk_desc {
        /** completed with failure */
        unsigned long bd_failure:1;
        /** client side */
-       unsigned long bd_registered:1;
+       unsigned long bd_registered:1,
+       /* bulk request is RDMA transfer, use page->host as real address */
+                       bd_is_rdma:1;
        /** For serialization with callback */
        spinlock_t bd_lock;
        /** {put,get}{source,sink}{kvec,kiov} */
lustre/osc/osc_request.c
index 6bc9389..d80a4a4 100644
@@ -1406,6 +1406,7 @@ osc_brw_prep_request(int cmd, struct client_obd *cli, struct obdo *oa,
        const char *obd_name = cli->cl_import->imp_obd->obd_name;
        struct inode *inode = NULL;
        bool directio = false;
+       bool gpu = 0;
        bool enable_checksum = true;
        struct cl_page *clpage;
 
@@ -1571,6 +1572,7 @@ retry_encrypt:
        if (brw_page2oap(pga[0])->oap_brw_flags & OBD_BRW_RDMA_ONLY) {
                enable_checksum = false;
                short_io_size = 0;
+               gpu = 1;
        }
 
        /* Check if read/write is small enough to be a short io. */
@@ -1618,6 +1620,7 @@ retry_encrypt:
         if (desc == NULL)
                 GOTO(out, rc = -ENOMEM);
         /* NB request now owns desc and will free it when it gets freed */
+       desc->bd_is_rdma = gpu;
 no_bulk:
         body = req_capsule_client_get(pill, &RMF_OST_BODY);
         ioobj = req_capsule_client_get(pill, &RMF_OBD_IOOBJ);
lustre/ptlrpc/pers.c
index 973f2b5..45af603 100644
@@ -58,6 +58,9 @@ void ptlrpc_fill_bulk_md(struct lnet_md *md, struct ptlrpc_bulk_desc *desc,
                return;
        }
 
+       if (desc->bd_is_rdma)
+               md->options |= LNET_MD_GPU_ADDR;
+
        if (mdidx == (desc->bd_md_count - 1))
                md->length = desc->bd_iov_count - start;
        else