The first page of a transfer may begin at a non-zero offset. However, there are two bugs in how this code in o2iblnd handles that case.
The first is that this use case requires LNET_MAX_IOV + 1 local
RDMA fragments, but we do not pass the correct corresponding
maximum page-list values to ib_alloc_fast_reg_page_list(),
ib_alloc_fast_reg_mr(), etc.
The second issue is that the logic in kiblnd_setup_rd_kiov() attempts
to obtain one more scatterlist entry than is actually needed. This
causes the transfer to fail with -EFAULT.
Test-Parameters: trivial
HPE-bug-id: LUS-10407
Fixes: d226464aca ("LU-8057 ko2iblnd: Replace sg++ with sg = sg_next(sg)")
Signed-off-by: Chris Horn <chris.horn@hpe.com>
Change-Id: Ifb843f11ae34a99b7d8f93d94966e3dfa1ce90e5
Reviewed-on: https://review.whamcloud.com/45216
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: James Simmons <jsimmons@infradead.org>
Reviewed-by: Andriy Skulysh <andriy.skulysh@hpe.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
struct kib_fmr_pool *fpo)
{
struct ib_fmr_pool_param param = {
struct kib_fmr_pool *fpo)
{
struct ib_fmr_pool_param param = {
- .max_pages_per_fmr = LNET_MAX_IOV,
+ .max_pages_per_fmr = IBLND_MAX_RDMA_FRAGS,
.page_shift = PAGE_SHIFT,
.access = (IB_ACCESS_LOCAL_WRITE |
IB_ACCESS_REMOTE_WRITE),
.page_shift = PAGE_SHIFT,
.access = (IB_ACCESS_LOCAL_WRITE |
IB_ACCESS_REMOTE_WRITE),
#ifndef HAVE_IB_MAP_MR_SG
frd->frd_frpl = ib_alloc_fast_reg_page_list(fpo->fpo_hdev->ibh_ibdev,
#ifndef HAVE_IB_MAP_MR_SG
frd->frd_frpl = ib_alloc_fast_reg_page_list(fpo->fpo_hdev->ibh_ibdev,
if (IS_ERR(frd->frd_frpl)) {
rc = PTR_ERR(frd->frd_frpl);
CERROR("Failed to allocate ib_fast_reg_page_list: %d\n",
if (IS_ERR(frd->frd_frpl)) {
rc = PTR_ERR(frd->frd_frpl);
CERROR("Failed to allocate ib_fast_reg_page_list: %d\n",
#ifdef HAVE_IB_ALLOC_FAST_REG_MR
frd->frd_mr = ib_alloc_fast_reg_mr(fpo->fpo_hdev->ibh_pd,
#ifdef HAVE_IB_ALLOC_FAST_REG_MR
frd->frd_mr = ib_alloc_fast_reg_mr(fpo->fpo_hdev->ibh_pd,
#else
/*
* it is expected to get here if this is an MLX-5 card.
#else
/*
* it is expected to get here if this is an MLX-5 card.
#else
IB_MR_TYPE_MEM_REG,
#endif
#else
IB_MR_TYPE_MEM_REG,
#endif
if ((*kiblnd_tunables.kib_use_fastreg_gaps == 1) &&
(dev_caps & IBLND_DEV_CAPS_FASTREG_GAPS_SUPPORT))
CWARN("using IB_MR_TYPE_SG_GAPS, expect a performance drop\n");
if ((*kiblnd_tunables.kib_use_fastreg_gaps == 1) &&
(dev_caps & IBLND_DEV_CAPS_FASTREG_GAPS_SUPPORT))
CWARN("using IB_MR_TYPE_SG_GAPS, expect a performance drop\n");
CFS_FREE_PTR_ARRAY(tx->tx_pages, LNET_MAX_IOV);
if (tx->tx_frags != NULL)
CFS_FREE_PTR_ARRAY(tx->tx_frags,
CFS_FREE_PTR_ARRAY(tx->tx_pages, LNET_MAX_IOV);
if (tx->tx_frags != NULL)
CFS_FREE_PTR_ARRAY(tx->tx_frags,
- (1 + IBLND_MAX_RDMA_FRAGS));
if (tx->tx_wrq != NULL)
CFS_FREE_PTR_ARRAY(tx->tx_wrq,
if (tx->tx_wrq != NULL)
CFS_FREE_PTR_ARRAY(tx->tx_wrq,
- (1 + IBLND_MAX_RDMA_FRAGS));
if (tx->tx_sge != NULL)
CFS_FREE_PTR_ARRAY(tx->tx_sge,
if (tx->tx_sge != NULL)
CFS_FREE_PTR_ARRAY(tx->tx_sge,
- (1 + IBLND_MAX_RDMA_FRAGS) *
wrq_sge);
if (tx->tx_rd != NULL)
LIBCFS_FREE(tx->tx_rd,
wrq_sge);
if (tx->tx_rd != NULL)
LIBCFS_FREE(tx->tx_rd,
}
LIBCFS_CPT_ALLOC(tx->tx_frags, lnet_cpt_table(), ps->ps_cpt,
}
LIBCFS_CPT_ALLOC(tx->tx_frags, lnet_cpt_table(), ps->ps_cpt,
- (1 + IBLND_MAX_RDMA_FRAGS) *
sizeof(*tx->tx_frags));
if (tx->tx_frags == NULL)
break;
sizeof(*tx->tx_frags));
if (tx->tx_frags == NULL)
break;
- sg_init_table(tx->tx_frags, IBLND_MAX_RDMA_FRAGS + 1);
+ sg_init_table(tx->tx_frags, IBLND_MAX_RDMA_FRAGS);
LIBCFS_CPT_ALLOC(tx->tx_wrq, lnet_cpt_table(), ps->ps_cpt,
LIBCFS_CPT_ALLOC(tx->tx_wrq, lnet_cpt_table(), ps->ps_cpt,
- (1 + IBLND_MAX_RDMA_FRAGS) *
sizeof(*tx->tx_wrq));
if (tx->tx_wrq == NULL)
break;
LIBCFS_CPT_ALLOC(tx->tx_sge, lnet_cpt_table(), ps->ps_cpt,
sizeof(*tx->tx_wrq));
if (tx->tx_wrq == NULL)
break;
LIBCFS_CPT_ALLOC(tx->tx_sge, lnet_cpt_table(), ps->ps_cpt,
- (1 + IBLND_MAX_RDMA_FRAGS) * wrq_sge *
+ IBLND_MAX_RDMA_FRAGS * wrq_sge *
sizeof(*tx->tx_sge));
if (tx->tx_sge == NULL)
break;
sizeof(*tx->tx_sge));
if (tx->tx_sge == NULL)
break;
#define IBLND_OOB_CAPABLE(v) ((v) != IBLND_MSG_VERSION_1)
#define IBLND_OOB_MSGS(v) (IBLND_OOB_CAPABLE(v) ? 2 : 0)
#define IBLND_OOB_CAPABLE(v) ((v) != IBLND_MSG_VERSION_1)
#define IBLND_OOB_MSGS(v) (IBLND_OOB_CAPABLE(v) ? 2 : 0)
-#define IBLND_MSG_SIZE (4<<10) /* max size of queued messages (inc hdr) */
-#define IBLND_MAX_RDMA_FRAGS LNET_MAX_IOV /* max # of fragments supported */
+/* max size of queued messages (inc hdr) */
+#define IBLND_MSG_SIZE (4<<10)
+/* max # of fragments supported. + 1 for unaligned case */
+#define IBLND_MAX_RDMA_FRAGS (LNET_MAX_IOV + 1)
/************************/
/* derived constants... */
/************************/
/* derived constants... */
{
struct kib_net *net = ni->ni_data;
struct scatterlist *sg;
{
struct kib_net *net = ni->ni_data;
struct scatterlist *sg;
- int fragnob;
- int max_nkiov;
+ int fragnob;
+ int max_nkiov;
+ int sg_count = 0;
CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob);
CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob);
+ if (!sg) {
+ CERROR("lacking enough sg entries to map tx\n");
+ return -EFAULT;
+ }
+ sg_count++;
+
fragnob = min((int)(kiov->bv_len - offset), nob);
/*
fragnob = min((int)(kiov->bv_len - offset), nob);
/*
sg_set_page(sg, kiov->bv_page, fragnob,
kiov->bv_offset + offset);
sg = sg_next(sg);
sg_set_page(sg, kiov->bv_page, fragnob,
kiov->bv_offset + offset);
sg = sg_next(sg);
- if (!sg) {
- CERROR("lacking enough sg entries to map tx\n");
- return -EFAULT;
- }
nob -= fragnob;
} while (nob > 0);
nob -= fragnob;
} while (nob > 0);
- return kiblnd_map_tx(ni, tx, rd, sg - tx->tx_frags);
+ return kiblnd_map_tx(ni, tx, rd, sg_count);
#endif
LASSERT(tx->tx_nwrq >= 0);
#endif
LASSERT(tx->tx_nwrq >= 0);
- LASSERT(tx->tx_nwrq < IBLND_MAX_RDMA_FRAGS + 1);
+ LASSERT(tx->tx_nwrq <= IBLND_MAX_RDMA_FRAGS);
LASSERT(nob <= IBLND_MSG_SIZE);
#ifdef HAVE_IB_GET_DMA_MR
LASSERT(mr != NULL);
LASSERT(nob <= IBLND_MSG_SIZE);
#ifdef HAVE_IB_GET_DMA_MR
LASSERT(mr != NULL);