Whamcloud - gitweb
LU-15092 o2iblnd: Fix logic for unaligned transfer 16/45216/4
authorChris Horn <chris.horn@hpe.com>
Thu, 16 Sep 2021 17:12:38 +0000 (12:12 -0500)
committerOleg Drokin <green@whamcloud.com>
Sat, 20 Nov 2021 06:26:07 +0000 (06:26 +0000)
It's possible for there to be an offset for the first page of a
transfer. However, there are two bugs in the o2iblnd code that handles this case.

The first is that this use case will require LNET_MAX_IOV + 1 local
RDMA fragments, but we do not pass the correspondingly larger max page
list size to ib_alloc_fast_reg_page_list(),
ib_alloc_fast_reg_mr(), etc.

The second issue is that the logic in kiblnd_setup_rd_kiov() attempts
to obtain one more scatterlist entry than is actually needed. This
causes the transfer to fail with -EFAULT.

Test-Parameters: trivial
HPE-bug-id: LUS-10407
Fixes: d226464aca ("LU-8057 ko2iblnd: Replace sg++ with sg = sg_next(sg)")
Signed-off-by: Chris Horn <chris.horn@hpe.com>
Change-Id: Ifb843f11ae34a99b7d8f93d94966e3dfa1ce90e5
Reviewed-on: https://review.whamcloud.com/45216
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: James Simmons <jsimmons@infradead.org>
Reviewed-by: Andriy Skulysh <andriy.skulysh@hpe.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lnet/klnds/o2iblnd/o2iblnd.c
lnet/klnds/o2iblnd/o2iblnd.h
lnet/klnds/o2iblnd/o2iblnd_cb.c

index 341bef6..e3a8845 100644 (file)
@@ -1464,7 +1464,7 @@ static int kiblnd_alloc_fmr_pool(struct kib_fmr_poolset *fps,
                                 struct kib_fmr_pool *fpo)
 {
        struct ib_fmr_pool_param param = {
                                 struct kib_fmr_pool *fpo)
 {
        struct ib_fmr_pool_param param = {
-               .max_pages_per_fmr = LNET_MAX_IOV,
+               .max_pages_per_fmr = IBLND_MAX_RDMA_FRAGS,
                .page_shift        = PAGE_SHIFT,
                .access            = (IB_ACCESS_LOCAL_WRITE |
                                      IB_ACCESS_REMOTE_WRITE),
                .page_shift        = PAGE_SHIFT,
                .access            = (IB_ACCESS_LOCAL_WRITE |
                                      IB_ACCESS_REMOTE_WRITE),
@@ -1515,7 +1515,7 @@ static int kiblnd_alloc_freg_pool(struct kib_fmr_poolset *fps,
 
 #ifndef HAVE_IB_MAP_MR_SG
                frd->frd_frpl = ib_alloc_fast_reg_page_list(fpo->fpo_hdev->ibh_ibdev,
 
 #ifndef HAVE_IB_MAP_MR_SG
                frd->frd_frpl = ib_alloc_fast_reg_page_list(fpo->fpo_hdev->ibh_ibdev,
-                                                           LNET_MAX_IOV);
+                                                           IBLND_MAX_RDMA_FRAGS);
                if (IS_ERR(frd->frd_frpl)) {
                        rc = PTR_ERR(frd->frd_frpl);
                        CERROR("Failed to allocate ib_fast_reg_page_list: %d\n",
                if (IS_ERR(frd->frd_frpl)) {
                        rc = PTR_ERR(frd->frd_frpl);
                        CERROR("Failed to allocate ib_fast_reg_page_list: %d\n",
@@ -1527,7 +1527,7 @@ static int kiblnd_alloc_freg_pool(struct kib_fmr_poolset *fps,
 
 #ifdef HAVE_IB_ALLOC_FAST_REG_MR
                frd->frd_mr = ib_alloc_fast_reg_mr(fpo->fpo_hdev->ibh_pd,
 
 #ifdef HAVE_IB_ALLOC_FAST_REG_MR
                frd->frd_mr = ib_alloc_fast_reg_mr(fpo->fpo_hdev->ibh_pd,
-                                                  LNET_MAX_IOV);
+                                                  IBLND_MAX_RDMA_FRAGS);
 #else
                /*
                 * it is expected to get here if this is an MLX-5 card.
 #else
                /*
                 * it is expected to get here if this is an MLX-5 card.
@@ -1545,7 +1545,7 @@ static int kiblnd_alloc_freg_pool(struct kib_fmr_poolset *fps,
 #else
                                                IB_MR_TYPE_MEM_REG,
 #endif
 #else
                                                IB_MR_TYPE_MEM_REG,
 #endif
-                                         LNET_MAX_IOV);
+                                         IBLND_MAX_RDMA_FRAGS);
                if ((*kiblnd_tunables.kib_use_fastreg_gaps == 1) &&
                    (dev_caps & IBLND_DEV_CAPS_FASTREG_GAPS_SUPPORT))
                        CWARN("using IB_MR_TYPE_SG_GAPS, expect a performance drop\n");
                if ((*kiblnd_tunables.kib_use_fastreg_gaps == 1) &&
                    (dev_caps & IBLND_DEV_CAPS_FASTREG_GAPS_SUPPORT))
                        CWARN("using IB_MR_TYPE_SG_GAPS, expect a performance drop\n");
@@ -2223,13 +2223,13 @@ kiblnd_destroy_tx_pool(struct kib_pool *pool)
                        CFS_FREE_PTR_ARRAY(tx->tx_pages, LNET_MAX_IOV);
                if (tx->tx_frags != NULL)
                        CFS_FREE_PTR_ARRAY(tx->tx_frags,
                        CFS_FREE_PTR_ARRAY(tx->tx_pages, LNET_MAX_IOV);
                if (tx->tx_frags != NULL)
                        CFS_FREE_PTR_ARRAY(tx->tx_frags,
-                                          (1 + IBLND_MAX_RDMA_FRAGS));
+                                          IBLND_MAX_RDMA_FRAGS);
                if (tx->tx_wrq != NULL)
                        CFS_FREE_PTR_ARRAY(tx->tx_wrq,
                if (tx->tx_wrq != NULL)
                        CFS_FREE_PTR_ARRAY(tx->tx_wrq,
-                                          (1 + IBLND_MAX_RDMA_FRAGS));
+                                          IBLND_MAX_RDMA_FRAGS);
                if (tx->tx_sge != NULL)
                        CFS_FREE_PTR_ARRAY(tx->tx_sge,
                if (tx->tx_sge != NULL)
                        CFS_FREE_PTR_ARRAY(tx->tx_sge,
-                                          (1 + IBLND_MAX_RDMA_FRAGS) *
+                                          IBLND_MAX_RDMA_FRAGS *
                                           wrq_sge);
                if (tx->tx_rd != NULL)
                        LIBCFS_FREE(tx->tx_rd,
                                           wrq_sge);
                if (tx->tx_rd != NULL)
                        LIBCFS_FREE(tx->tx_rd,
@@ -2304,21 +2304,21 @@ kiblnd_create_tx_pool(struct kib_poolset *ps, int size, struct kib_pool **pp_po)
                }
 
                LIBCFS_CPT_ALLOC(tx->tx_frags, lnet_cpt_table(), ps->ps_cpt,
                }
 
                LIBCFS_CPT_ALLOC(tx->tx_frags, lnet_cpt_table(), ps->ps_cpt,
-                                (1 + IBLND_MAX_RDMA_FRAGS) *
+                                IBLND_MAX_RDMA_FRAGS *
                                 sizeof(*tx->tx_frags));
                if (tx->tx_frags == NULL)
                        break;
 
                                 sizeof(*tx->tx_frags));
                if (tx->tx_frags == NULL)
                        break;
 
-               sg_init_table(tx->tx_frags, IBLND_MAX_RDMA_FRAGS + 1);
+               sg_init_table(tx->tx_frags, IBLND_MAX_RDMA_FRAGS);
 
                LIBCFS_CPT_ALLOC(tx->tx_wrq, lnet_cpt_table(), ps->ps_cpt,
 
                LIBCFS_CPT_ALLOC(tx->tx_wrq, lnet_cpt_table(), ps->ps_cpt,
-                                (1 + IBLND_MAX_RDMA_FRAGS) *
+                                IBLND_MAX_RDMA_FRAGS *
                                 sizeof(*tx->tx_wrq));
                if (tx->tx_wrq == NULL)
                        break;
 
                LIBCFS_CPT_ALLOC(tx->tx_sge, lnet_cpt_table(), ps->ps_cpt,
                                 sizeof(*tx->tx_wrq));
                if (tx->tx_wrq == NULL)
                        break;
 
                LIBCFS_CPT_ALLOC(tx->tx_sge, lnet_cpt_table(), ps->ps_cpt,
-                                (1 + IBLND_MAX_RDMA_FRAGS) * wrq_sge *
+                                IBLND_MAX_RDMA_FRAGS * wrq_sge *
                                 sizeof(*tx->tx_sge));
                if (tx->tx_sge == NULL)
                        break;
                                 sizeof(*tx->tx_sge));
                if (tx->tx_sge == NULL)
                        break;
index 3d3f9f0..f6d2a06 100644 (file)
@@ -149,8 +149,10 @@ extern struct kib_tunables  kiblnd_tunables;
 #define IBLND_OOB_CAPABLE(v)       ((v) != IBLND_MSG_VERSION_1)
 #define IBLND_OOB_MSGS(v)           (IBLND_OOB_CAPABLE(v) ? 2 : 0)
 
 #define IBLND_OOB_CAPABLE(v)       ((v) != IBLND_MSG_VERSION_1)
 #define IBLND_OOB_MSGS(v)           (IBLND_OOB_CAPABLE(v) ? 2 : 0)
 
-#define IBLND_MSG_SIZE              (4<<10)                 /* max size of queued messages (inc hdr) */
-#define IBLND_MAX_RDMA_FRAGS         LNET_MAX_IOV           /* max # of fragments supported */
+/* max size of queued messages (inc hdr) */
+#define IBLND_MSG_SIZE              (4<<10)
+/* max # of fragments supported. + 1 for unaligned case */
+#define IBLND_MAX_RDMA_FRAGS        (LNET_MAX_IOV + 1)
 
 /************************/
 /* derived constants... */
 
 /************************/
 /* derived constants... */
index 07c2efa..bb5910e 100644 (file)
@@ -749,8 +749,9 @@ static int kiblnd_setup_rd_kiov(struct lnet_ni *ni, struct kib_tx *tx,
 {
        struct kib_net *net = ni->ni_data;
        struct scatterlist *sg;
 {
        struct kib_net *net = ni->ni_data;
        struct scatterlist *sg;
-       int                 fragnob;
-       int                 max_nkiov;
+       int fragnob;
+       int max_nkiov;
+       int sg_count = 0;
 
        CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob);
 
 
        CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob);
 
@@ -771,6 +772,12 @@ static int kiblnd_setup_rd_kiov(struct lnet_ni *ni, struct kib_tx *tx,
        do {
                LASSERT(nkiov > 0);
 
        do {
                LASSERT(nkiov > 0);
 
+               if (!sg) {
+                       CERROR("lacking enough sg entries to map tx\n");
+                       return -EFAULT;
+               }
+               sg_count++;
+
                fragnob = min((int)(kiov->bv_len - offset), nob);
 
                /*
                fragnob = min((int)(kiov->bv_len - offset), nob);
 
                /*
@@ -790,10 +797,6 @@ static int kiblnd_setup_rd_kiov(struct lnet_ni *ni, struct kib_tx *tx,
                sg_set_page(sg, kiov->bv_page, fragnob,
                            kiov->bv_offset + offset);
                sg = sg_next(sg);
                sg_set_page(sg, kiov->bv_page, fragnob,
                            kiov->bv_offset + offset);
                sg = sg_next(sg);
-               if (!sg) {
-                       CERROR("lacking enough sg entries to map tx\n");
-                       return -EFAULT;
-               }
 
                offset = 0;
                kiov++;
 
                offset = 0;
                kiov++;
@@ -801,7 +804,7 @@ static int kiblnd_setup_rd_kiov(struct lnet_ni *ni, struct kib_tx *tx,
                nob -= fragnob;
        } while (nob > 0);
 
                nob -= fragnob;
        } while (nob > 0);
 
-       return kiblnd_map_tx(ni, tx, rd, sg - tx->tx_frags);
+       return kiblnd_map_tx(ni, tx, rd, sg_count);
 }
 
 static int
 }
 
 static int
@@ -1100,7 +1103,7 @@ kiblnd_init_tx_msg(struct lnet_ni *ni, struct kib_tx *tx, int type,
 #endif
 
        LASSERT(tx->tx_nwrq >= 0);
 #endif
 
        LASSERT(tx->tx_nwrq >= 0);
-       LASSERT(tx->tx_nwrq < IBLND_MAX_RDMA_FRAGS + 1);
+       LASSERT(tx->tx_nwrq <= IBLND_MAX_RDMA_FRAGS);
        LASSERT(nob <= IBLND_MSG_SIZE);
 #ifdef HAVE_IB_GET_DMA_MR
        LASSERT(mr != NULL);
        LASSERT(nob <= IBLND_MSG_SIZE);
 #ifdef HAVE_IB_GET_DMA_MR
        LASSERT(mr != NULL);