/* # tx callbacks outstanding */
short tx_sending;
/* queued for sending */
- short tx_queued;
+ unsigned long tx_queued:1,
/* waiting for peer_ni */
- short tx_waiting;
+ tx_waiting:1,
+ /* force RDMA */
+ tx_gpu:1;
/* LNET completion status */
int tx_status;
/* health status of the transmit */
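Net effect of this hunk: the two shorts collapse into single-bit flags and pick up a third bit, tx_gpu, which records at setup time that the transmit targets GPU memory and must therefore go over RDMA. A sketch of the resulting fragment of struct kib_tx, showing only the fields visible in this hunk:

	struct kib_tx {				/* transmit message */
		...
		/* # tx callbacks outstanding */
		short		tx_sending;
		/* queued for sending */
		unsigned long	tx_queued:1,
		/* waiting for peer_ni */
				tx_waiting:1,
		/* force RDMA */
				tx_gpu:1;
		/* LNET completion status */
		int		tx_status;
		...
	};

Single-bit fields make the boolean intent explicit and leave room for more flags without further layout churn.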
#define KIBLND_UNMAP_ADDR_SET(p, m, a) do {} while (0)
#define KIBLND_UNMAP_ADDR(p, m, a) (a)
-static inline int kiblnd_dma_map_sg(struct kib_hca_dev *hdev,
- struct scatterlist *sg, int nents,
- enum dma_data_direction direction)
+static inline
+int kiblnd_dma_map_sg(struct kib_hca_dev *hdev, struct kib_tx *tx)
{
- int count;
+ struct scatterlist *sg = tx->tx_frags;
+ int nents = tx->tx_nfrags;
+ enum dma_data_direction direction = tx->tx_dmadir;
- count = lnet_rdma_map_sg_attrs(hdev->ibh_ibdev->dma_device,
- sg, nents, direction);
-
- if (count != 0)
- return count;
+ if (tx->tx_gpu)
+ return lnet_rdma_map_sg_attrs(hdev->ibh_ibdev->dma_device,
+ sg, nents, direction);
return ib_dma_map_sg(hdev->ibh_ibdev, sg, nents, direction);
}
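The old version probed lnet_rdma_map_sg_attrs() on every transmit and fell back to ib_dma_map_sg() only when the GPU path mapped nothing; the rewrite makes the dispatch explicit through tx_gpu and moves the (sg, nents, direction) tuple into the tx, so call sites shrink to a single argument. A caller sketch under the usual kernel convention that a failed mapping shows up as a non-positive count (the error check here is an assumption, not part of this hunk):

	/* stage the mapping state in the tx, then map in one call */
	tx->tx_gpu    = gpu;			/* derived from the MD flags */
	tx->tx_dmadir = DMA_TO_DEVICE;
	tx->tx_nfrags = nfrags;
	rd->rd_nfrags = kiblnd_dma_map_sg(hdev, tx);
	if (rd->rd_nfrags <= 0)			/* assumed failure convention */
		return -EFAULT;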
-static inline void kiblnd_dma_unmap_sg(struct kib_hca_dev *hdev,
- struct scatterlist *sg, int nents,
- enum dma_data_direction direction)
+static inline
+void kiblnd_dma_unmap_sg(struct kib_hca_dev *hdev, struct kib_tx *tx)
{
- int count;
-
- count = lnet_rdma_unmap_sg(hdev->ibh_ibdev->dma_device,
- sg, nents, direction);
- if (count != 0)
- return;
+ struct scatterlist *sg = tx->tx_frags;
+ int nents = tx->tx_nfrags;
+ enum dma_data_direction direction = tx->tx_dmadir;
- ib_dma_unmap_sg(hdev->ibh_ibdev, sg, nents, direction);
+ if (tx->tx_gpu)
+ lnet_rdma_unmap_sg(hdev->ibh_ibdev->dma_device,
+ sg, nents, direction);
+ else
+ ib_dma_unmap_sg(hdev->ibh_ibdev, sg, nents, direction);
}
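The unmap side gets the same treatment and additionally sheds the old probe-then-fallback return-value dance: tx_gpu alone picks the path, and the count lnet_rdma_unmap_sg() returns is no longer consulted. Because both directions now read tx_frags, tx_nfrags and tx_dmadir from the same struct kib_tx, a call site can no longer pass mismatched arguments to map and unmap. The implied lifetime contract, as the kiblnd_tx_done() hunk below applies it:

	rd->rd_nfrags = kiblnd_dma_map_sg(hdev, tx);	/* map reads tx state */
	/* ... RDMA traffic in flight ... */
	kiblnd_dma_unmap_sg(hdev, tx);			/* unmap replays it */
	tx->tx_nfrags = 0;				/* latch against double unmap */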
#ifndef HAVE_IB_SG_DMA_ADDRESS
kiblnd_fmr_pool_unmap(&tx->tx_fmr, tx->tx_status);
if (tx->tx_nfrags != 0) {
- kiblnd_dma_unmap_sg(tx->tx_pool->tpo_hdev,
- tx->tx_frags, tx->tx_nfrags, tx->tx_dmadir);
+ kiblnd_dma_unmap_sg(tx->tx_pool->tpo_hdev, tx);
tx->tx_nfrags = 0;
}
}
tx->tx_dmadir = (rd != tx->tx_rd) ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
tx->tx_nfrags = nfrags;
- rd->rd_nfrags = kiblnd_dma_map_sg(hdev, tx->tx_frags,
- tx->tx_nfrags, tx->tx_dmadir);
-
+ rd->rd_nfrags = kiblnd_dma_map_sg(hdev, tx);
for (i = 0, nob = 0; i < rd->rd_nfrags; i++) {
rd->rd_frags[i].rf_nob = kiblnd_sg_dma_len(
hdev->ibh_ibdev, &tx->tx_frags[i]);
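The hunk ends inside the fragment loop; judging from its first statement it presumably goes on to fill each rf_addr next to rf_nob and to accumulate the byte total, along these lines (a reconstruction, not part of the patch):

	for (i = 0, nob = 0; i < rd->rd_nfrags; i++) {
		rd->rd_frags[i].rf_nob  = kiblnd_sg_dma_len(
			hdev->ibh_ibdev, &tx->tx_frags[i]);
		rd->rd_frags[i].rf_addr = kiblnd_sg_dma_address(
			hdev->ibh_ibdev, &tx->tx_frags[i]);
		nob += rd->rd_frags[i].rf_nob;
	}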
int prev = dstidx;
if (srcidx >= srcrd->rd_nfrags) {
- CERROR("Src buffer exhausted: %d frags\n", srcidx);
+ CERROR("Src buffer exhausted: %d frags %px\n",
+ srcidx, tx);
rc = -EPROTO;
break;
}
struct bio_vec *payload_kiov = lntmsg->msg_kiov;
unsigned int payload_offset = lntmsg->msg_offset;
unsigned int payload_nob = lntmsg->msg_len;
+ bool gpu;
struct kib_msg *ibmsg;
struct kib_rdma_desc *rd;
- struct kib_tx *tx;
- int nob;
- int rc;
+ struct kib_tx *tx;
+ int nob;
+ int rc;
/* NB 'private' is different depending on what we're sending.... */
return -ENOMEM;
}
ibmsg = tx->tx_msg;
+ gpu = lntmsg->msg_md != NULL &&
+       (lntmsg->msg_md->md_flags & LNET_MD_FLAG_GPU);
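This flag drives every IMMEDIATE-versus-RDMA decision in the switch below, replacing the old lntmsg->msg_rdma_force hint. Routed messages and ACKs carry no local MD, which is why the assignment above guards msg_md; a NULL-safe accessor would let the send and receive paths share that check. A hypothetical helper, not part of this patch:

	/* hypothetical: routed messages and ACKs have msg_md == NULL */
	static inline bool kiblnd_lntmsg_gpu(struct lnet_msg *lntmsg)
	{
		struct lnet_libmd *md = lntmsg != NULL ? lntmsg->msg_md : NULL;

		return md != NULL && (md->md_flags & LNET_MD_FLAG_GPU);
	}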
switch (type) {
default:
LBUG();
return (-EIO);
- case LNET_MSG_ACK:
- LASSERT (payload_nob == 0);
- break;
+ case LNET_MSG_ACK:
+ LASSERT(payload_nob == 0);
+ break;
- case LNET_MSG_GET:
- if (routing || target_is_router)
- break; /* send IMMEDIATE */
+ case LNET_MSG_GET:
+ if (routing || target_is_router)
+ break; /* send IMMEDIATE */
- /* is the REPLY message too small for RDMA? */
+ /* is the REPLY message too small for RDMA? */
nob = offsetof(struct kib_msg, ibm_u.immediate.ibim_payload[lntmsg->msg_md->md_length]);
- if (nob <= IBLND_MSG_SIZE && !lntmsg->msg_rdma_force)
+ if (nob <= IBLND_MSG_SIZE && !gpu)
break; /* send IMMEDIATE */
rd = &ibmsg->ibm_u.get.ibgm_rd;
+ tx->tx_gpu = gpu;
rc = kiblnd_setup_rd_kiov(ni, tx, rd,
lntmsg->msg_md->md_niov,
lntmsg->msg_md->md_kiov,
ibmsg->ibm_u.get.ibgm_cookie = tx->tx_cookie;
lnet_hdr_to_nid4(hdr, &ibmsg->ibm_u.get.ibgm_hdr);
- kiblnd_init_tx_msg(ni, tx, IBLND_MSG_GET_REQ, nob);
+ kiblnd_init_tx_msg(ni, tx, IBLND_MSG_GET_REQ, nob);
- tx->tx_lntmsg[1] = lnet_create_reply_msg(ni, lntmsg);
+ tx->tx_lntmsg[1] = lnet_create_reply_msg(ni, lntmsg);
if (tx->tx_lntmsg[1] == NULL) {
CERROR("Can't create reply for GET -> %s\n",
libcfs_nidstr(&target->nid));
case LNET_MSG_REPLY:
case LNET_MSG_PUT:
/* Is the payload small enough not to need RDMA? */
- nob = offsetof(struct kib_msg, ibm_u.immediate.ibim_payload[payload_nob]);
- if (nob <= IBLND_MSG_SIZE && !lntmsg->msg_rdma_force)
- break; /* send IMMEDIATE */
+ nob = offsetof(struct kib_msg,
+ ibm_u.immediate.ibim_payload[payload_nob]);
+ if (nob <= IBLND_MSG_SIZE && !gpu)
+ break; /* send IMMEDIATE */
+
+ tx->tx_gpu = gpu;
rc = kiblnd_setup_rd_kiov(ni, tx, tx->tx_rd,
payload_niov, payload_kiov,
struct bio_vec *kiov = lntmsg->msg_kiov;
unsigned int offset = lntmsg->msg_offset;
unsigned int nob = lntmsg->msg_len;
+ struct lnet_libmd *payload_md = lntmsg->msg_md;
struct kib_tx *tx;
int rc;
goto failed_0;
}
+ tx->tx_gpu = !!(payload_md->md_flags & LNET_MD_FLAG_GPU);
if (nob == 0)
rc = 0;
else
case IBLND_MSG_PUT_REQ: {
struct kib_msg *txmsg;
struct kib_rdma_desc *rd;
- ibprm_cookie = rxmsg->ibm_u.putreq.ibprm_cookie;
+ struct lnet_libmd *payload_md;
+
+ ibprm_cookie = rxmsg->ibm_u.putreq.ibprm_cookie;
if (mlen == 0) {
lnet_finalize(lntmsg, 0);
kiblnd_send_completion(rx->rx_conn, IBLND_MSG_PUT_NAK,
}
tx = kiblnd_get_idle_tx(ni, conn->ibc_peer->ibp_nid);
- if (tx == NULL) {
- CERROR("Can't allocate tx for %s\n",
- libcfs_nid2str(conn->ibc_peer->ibp_nid));
- /* Not replying will break the connection */
- rc = -ENOMEM;
- break;
- }
+ if (tx == NULL) {
+ CERROR("Can't allocate tx for %s\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid));
+ /* Not replying will break the connection */
+ rc = -ENOMEM;
+ break;
+ }
+ /* mlen != 0 here, so lntmsg and its MD are valid */
+ payload_md = lntmsg->msg_md;
+ tx->tx_gpu = !!(payload_md->md_flags & LNET_MD_FLAG_GPU);
txmsg = tx->tx_msg;
rd = &txmsg->ibm_u.putack.ibpam_rd;
rc = kiblnd_setup_rd_kiov(ni, tx, rd,