peer_ni->ibp_nid = nid;
peer_ni->ibp_error = 0;
peer_ni->ibp_last_alive = 0;
- peer_ni->ibp_max_frags = kiblnd_cfg_rdma_frags(peer_ni->ibp_ni);
+ peer_ni->ibp_max_frags = IBLND_MAX_RDMA_FRAGS;
peer_ni->ibp_queue_depth = ni->ni_net->net_tunables.lct_peer_tx_credits;
atomic_set(&peer_ni->ibp_refcount, 1); /* 1 ref for caller */
kiblnd_get_completion_vector(conn, cpt));
#endif
if (IS_ERR(cq)) {
+ /*
+ * On MLX-5 (and possibly MLX-4 as well) this error can be
+ * hit if concurrent_sends and/or peer_tx_credits are set
+ * too high, or due to an MLX-5 bug which tries to
+ * allocate a 256kb WR cookie array via kmalloc.
+ */
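+ /*
+ * A possible workaround (a sketch only, assuming the standard
+ * ko2iblnd module parameters): lower concurrent_sends and/or
+ * peer_credits in the module options, e.g.
+ *   options ko2iblnd peer_credits=8 concurrent_sends=8
+ */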
CERROR("Failed to create CQ with %d CQEs: %ld\n",
IBLND_CQ_ENTRIES(conn), PTR_ERR(cq));
goto failed_2;
}
}
-#ifdef HAVE_IB_GET_DMA_MR
-struct ib_mr *
-kiblnd_find_rd_dma_mr(struct lnet_ni *ni, kib_rdma_desc_t *rd,
- int negotiated_nfrags)
-{
- kib_net_t *net = ni->ni_data;
- kib_hca_dev_t *hdev = net->ibn_dev->ibd_hdev;
- struct lnet_ioctl_config_o2iblnd_tunables *tunables;
- int mod;
- __u16 nfrags;
-
- tunables = &ni->ni_lnd_tunables.lnd_tun_u.lnd_o2ib;
- mod = tunables->lnd_map_on_demand;
- nfrags = (negotiated_nfrags != -1) ? negotiated_nfrags : mod;
-
- LASSERT(hdev->ibh_mrs != NULL);
-
- if (mod > 0 && nfrags <= rd->rd_nfrags)
- return NULL;
-
- return hdev->ibh_mrs;
-}
-#endif
-
static void
kiblnd_destroy_fmr_pool(kib_fmr_pool_t *fpo)
{
LASSERT(fpo->fpo_map_count == 0);
- if (fpo->fpo_is_fmr) {
- if (fpo->fmr.fpo_fmr_pool)
- ib_destroy_fmr_pool(fpo->fmr.fpo_fmr_pool);
+ if (fpo->fpo_is_fmr && fpo->fmr.fpo_fmr_pool) {
+ ib_destroy_fmr_pool(fpo->fmr.fpo_fmr_pool);
} else {
struct kib_fast_reg_descriptor *frd, *tmp;
int i = 0;
int
kiblnd_fmr_pool_map(kib_fmr_poolset_t *fps, kib_tx_t *tx, kib_rdma_desc_t *rd,
- __u32 nob, __u64 iov, kib_fmr_t *fmr, bool *is_fastreg)
+ __u32 nob, __u64 iov, kib_fmr_t *fmr)
{
kib_fmr_pool_t *fpo;
__u64 *pages = tx->tx_pages;
if (fpo->fpo_is_fmr) {
struct ib_pool_fmr *pfmr;
- *is_fastreg = 0;
spin_unlock(&fps->fps_lock);
if (!tx_pages_mapped) {
}
rc = PTR_ERR(pfmr);
} else {
- *is_fastreg = 1;
if (!list_empty(&fpo->fast_reg.fpo_pool_list)) {
struct kib_fast_reg_descriptor *frd;
#ifdef HAVE_IB_MAP_MR_SG
#ifdef HAVE_IB_GET_DMA_MR
read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
- if (tunables->lnd_map_on_demand == 0) {
+ /*
+ * if lnd_map_on_demand is zero then we have effectively disabled
+ * FMR or FastReg and we're using global memory regions
+ * exclusively.
+ */
+ if (!tunables->lnd_map_on_demand) {
read_unlock_irqrestore(&kiblnd_data.kib_global_lock,
flags);
goto create_tx_pool;
LASSERT (tx->tx_lntmsg[1] == NULL);
LASSERT (tx->tx_nfrags == 0);
+ tx->tx_gaps = false;
+
return tx;
}
kiblnd_fmr_map_tx(kib_net_t *net, kib_tx_t *tx, kib_rdma_desc_t *rd, __u32 nob)
{
kib_hca_dev_t *hdev;
+ kib_dev_t *dev;
kib_fmr_poolset_t *fps;
int cpt;
int rc;
int i;
- bool is_fastreg = 0;
LASSERT(tx->tx_pool != NULL);
LASSERT(tx->tx_pool->tpo_pool.po_owner != NULL);
+ dev = net->ibn_dev;
hdev = tx->tx_pool->tpo_hdev;
cpt = tx->tx_pool->tpo_pool.po_owner->ps_cpt;
+ /*
+ * If we're using FastReg but the device doesn't support
+ * gapped mappings and this tx has gaps, there is no point
+ * in trying to map the memory because it will just fail,
+ * so fail preemptively with an appropriate message.
+ */
+ if ((dev->ibd_dev_caps & IBLND_DEV_CAPS_FASTREG_ENABLED) &&
+ !(dev->ibd_dev_caps & IBLND_DEV_CAPS_FASTREG_GAPS_SUPPORT) &&
+ tx->tx_gaps) {
+ CERROR("Using FastReg with no GAPS support, but tx has gaps\n");
+ return -EPROTONOSUPPORT;
+ }
+
+ /*
+ * FMR does not support gaps, so if the tx has gaps we must make
+ * sure that the number of fragments we'll be sending fits within
+ * the number of fragments negotiated on the connection; otherwise
+ * we won't be able to RDMA the data.
+ * We need to honour the fragment count negotiated on the
+ * connection for backwards compatibility.
+ */
+ if (tx->tx_gaps && (dev->ibd_dev_caps & IBLND_DEV_CAPS_FMR_ENABLED)) {
+ if (tx->tx_conn &&
+ tx->tx_conn->ibc_max_frags <= rd->rd_nfrags) {
+ CERROR("TX number of frags (%d) is <= than connection"
+ " number of frags (%d). Consider setting peer's"
+ " map_on_demand to 256\n", tx->tx_nfrags,
+ tx->tx_conn->ibc_max_frags);
+ return -EFBIG;
+ }
+ }
+
fps = net->ibn_fmr_ps[cpt];
- rc = kiblnd_fmr_pool_map(fps, tx, rd, nob, 0, &tx->fmr, &is_fastreg);
+ rc = kiblnd_fmr_pool_map(fps, tx, rd, nob, 0, &tx->tx_fmr);
if (rc != 0) {
CERROR("Can't map %u pages: %d\n", nob, rc);
return rc;
}
- /* If rd is not tx_rd, it's going to get sent to a peer_ni, who will need
- * the rkey */
- rd->rd_key = tx->fmr.fmr_key;
- if (!is_fastreg) {
+ /*
+ * If rd is not tx_rd, it's going to get sent to a peer_ni, who will
+ * need the rkey
+ */
+ rd->rd_key = tx->tx_fmr.fmr_key;
+ /*
+ * For FastReg, or for FMR with no gaps, we can map all
+ * the fragments as a single FastReg/FMR fragment.
+ */
+ if (((dev->ibd_dev_caps & IBLND_DEV_CAPS_FMR_ENABLED) && !tx->tx_gaps) ||
+ (dev->ibd_dev_caps & IBLND_DEV_CAPS_FASTREG_ENABLED)) {
+ /* FMR requires zero based address */
+ if (dev->ibd_dev_caps & IBLND_DEV_CAPS_FMR_ENABLED)
+ rd->rd_frags[0].rf_addr &= ~hdev->ibh_page_mask;
+ rd->rd_frags[0].rf_nob = nob;
+ rd->rd_nfrags = 1;
+ } else {
+ /*
+ * We're transmitting with gaps using FMR.
+ * We'll need to use multiple fragments and identify the
+ * zero based address of each fragment.
+ */
for (i = 0; i < rd->rd_nfrags; i++) {
rd->rd_frags[i].rf_addr &= ~hdev->ibh_page_mask;
rd->rd_frags[i].rf_addr += i << hdev->ibh_page_shift;
}
- } else {
- rd->rd_frags[0].rf_nob = nob;
- rd->rd_nfrags = 1;
}
return 0;
static void
kiblnd_unmap_tx(kib_tx_t *tx)
{
- if (tx->fmr.fmr_pfmr || tx->fmr.fmr_frd)
- kiblnd_fmr_pool_unmap(&tx->fmr, tx->tx_status);
+ if (tx->tx_fmr.fmr_pfmr || tx->tx_fmr.fmr_frd)
+ kiblnd_fmr_pool_unmap(&tx->tx_fmr, tx->tx_status);
if (tx->tx_nfrags != 0) {
kiblnd_dma_unmap_sg(tx->tx_pool->tpo_hdev->ibh_ibdev,
}
}
+#ifdef HAVE_IB_GET_DMA_MR
+static struct ib_mr *
+kiblnd_find_rd_dma_mr(struct lnet_ni *ni, kib_rdma_desc_t *rd)
+{
+ kib_net_t *net = ni->ni_data;
+ kib_hca_dev_t *hdev = net->ibn_dev->ibd_hdev;
+ struct lnet_ioctl_config_o2iblnd_tunables *tunables;
+
+ tunables = &ni->ni_lnd_tunables.lnd_tun_u.lnd_o2ib;
+
+ /*
+ * If map-on-demand is turned on and the device supports
+ * either FMR or FastReg, use that. Otherwise fall back to
+ * global memory regions. If those are not available either,
+ * the operation fails.
+ */
+ if (tunables->lnd_map_on_demand &&
+ (net->ibn_dev->ibd_dev_caps & IBLND_DEV_CAPS_FASTREG_ENABLED ||
+ net->ibn_dev->ibd_dev_caps & IBLND_DEV_CAPS_FMR_ENABLED))
+ return NULL;
+
+ /*
+ * hdev->ibh_mrs can be NULL. This case is dealt with gracefully
+ * in the call chain. The mapping will fail with an appropriate
+ * error message.
+ */
+ return hdev->ibh_mrs;
+}
+#endif
+
static int
kiblnd_map_tx(struct lnet_ni *ni, kib_tx_t *tx, kib_rdma_desc_t *rd, int nfrags)
{
}
#ifdef HAVE_IB_GET_DMA_MR
- mr = kiblnd_find_rd_dma_mr(ni, rd,
- (tx->tx_conn != NULL) ?
- tx->tx_conn->ibc_max_frags : -1);
+ mr = kiblnd_find_rd_dma_mr(ni, rd);
if (mr != NULL) {
/* found pre-mapping MR */
rd->rd_key = (rd != tx->tx_rd) ? mr->rkey : mr->lkey;
return -EINVAL;
}
-
static int
kiblnd_setup_rd_iov(struct lnet_ni *ni, kib_tx_t *tx, kib_rdma_desc_t *rd,
unsigned int niov, struct kvec *iov, int offset, int nob)
fragnob = min((int)(iov->iov_len - offset), nob);
fragnob = min(fragnob, (int)PAGE_SIZE - page_offset);
+ if ((fragnob < (int)PAGE_SIZE - page_offset) && (niov > 1)) {
+ CDEBUG(D_NET, "fragnob %d < available page %d: with"
+ " remaining %d iovs\n",
+ fragnob, (int)PAGE_SIZE - page_offset, niov);
+ tx->tx_gaps = true;
+ }
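+ /*
+ * Example (illustrative only): a 3000-byte iov followed by
+ * another iov ends partway through its last page, so the
+ * resulting scatterlist is not virtually contiguous and
+ * plain FMR cannot map it as a single fragment.
+ */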
+
sg_set_page(sg, page, fragnob, page_offset);
sg = sg_next(sg);
if (!sg) {
kiblnd_setup_rd_kiov(struct lnet_ni *ni, kib_tx_t *tx, kib_rdma_desc_t *rd,
int nkiov, lnet_kiov_t *kiov, int offset, int nob)
{
- kib_net_t *net = ni->ni_data;
- struct scatterlist *sg;
- int fragnob;
+ kib_net_t *net = ni->ni_data;
+ struct scatterlist *sg;
+ int fragnob;
- CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob);
+ CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob);
- LASSERT (nob > 0);
- LASSERT (nkiov > 0);
- LASSERT (net != NULL);
+ LASSERT(nob > 0);
+ LASSERT(nkiov > 0);
+ LASSERT(net != NULL);
- while (offset >= kiov->kiov_len) {
- offset -= kiov->kiov_len;
- nkiov--;
- kiov++;
- LASSERT (nkiov > 0);
- }
+ while (offset >= kiov->kiov_len) {
+ offset -= kiov->kiov_len;
+ nkiov--;
+ kiov++;
+ LASSERT(nkiov > 0);
+ }
- sg = tx->tx_frags;
- do {
- LASSERT (nkiov > 0);
+ sg = tx->tx_frags;
+ do {
+ LASSERT(nkiov > 0);
+
+ fragnob = min((int)(kiov->kiov_len - offset), nob);
- fragnob = min((int)(kiov->kiov_len - offset), nob);
+ if ((fragnob < (int)(kiov->kiov_len - offset)) && nkiov > 1) {
+ CDEBUG(D_NET, "fragnob %d < available page %d: with"
+ " remaining %d kiovs\n",
+ fragnob, (int)(kiov->kiov_len - offset), nkiov);
+ tx->tx_gaps = true;
+ }
sg_set_page(sg, kiov->kiov_page, fragnob,
kiov->kiov_offset + offset);
return -EFAULT;
}
- offset = 0;
- kiov++;
- nkiov--;
- nob -= fragnob;
- } while (nob > 0);
+ offset = 0;
+ kiov++;
+ nkiov--;
+ nob -= fragnob;
+ } while (nob > 0);
- return kiblnd_map_tx(ni, tx, rd, sg - tx->tx_frags);
+ return kiblnd_map_tx(ni, tx, rd, sg - tx->tx_frags);
}
static int
/* close_conn will launch failover */
rc = -ENETDOWN;
} else {
- struct kib_fast_reg_descriptor *frd = tx->fmr.fmr_frd;
+ struct kib_fast_reg_descriptor *frd = tx->tx_fmr.fmr_frd;
struct ib_send_wr *bad = &tx->tx_wrq[tx->tx_nwrq - 1].wr;
struct ib_send_wr *wr = &tx->tx_wrq[0].wr;
}
if (reqmsg->ibm_u.connparams.ibcp_max_frags >
- kiblnd_rdma_frags(version, ni)) {
+ IBLND_MAX_RDMA_FRAGS) {
CWARN("Can't accept conn from %s (version %x): "
"max_frags %d too large (%d wanted)\n",
libcfs_nid2str(nid), version,
reqmsg->ibm_u.connparams.ibcp_max_frags,
- kiblnd_rdma_frags(version, ni));
+ IBLND_MAX_RDMA_FRAGS);
if (version >= IBLND_MSG_VERSION)
rej.ibr_why = IBLND_REJECT_RDMA_FRAGS;
goto failed;
} else if (reqmsg->ibm_u.connparams.ibcp_max_frags <
- kiblnd_rdma_frags(version, ni) &&
+ IBLND_MAX_RDMA_FRAGS &&
net->ibn_fmr_ps == NULL) {
CWARN("Can't accept conn from %s (version %x): "
"max_frags %d incompatible without FMR pool "
"(%d wanted)\n",
libcfs_nid2str(nid), version,
reqmsg->ibm_u.connparams.ibcp_max_frags,
- kiblnd_rdma_frags(version, ni));
+ IBLND_MAX_RDMA_FRAGS);
if (version == IBLND_MSG_VERSION)
rej.ibr_why = IBLND_REJECT_RDMA_FRAGS;
if (ni != NULL) {
rej.ibr_cp.ibcp_queue_depth =
kiblnd_msg_queue_size(version, ni);
- rej.ibr_cp.ibcp_max_frags = kiblnd_rdma_frags(version, ni);
+ rej.ibr_cp.ibcp_max_frags = IBLND_MAX_RDMA_FRAGS;
lnet_ni_decref(ni);
}
goto out;
}
tunables = &peer_ni->ibp_ni->ni_lnd_tunables.lnd_tun_u.lnd_o2ib;
+#ifdef HAVE_IB_GET_DMA_MR
+ /*
+ * This check only makes sense if the kernel supports global
+ * memory registration. Otherwise, map_on_demand can never be 0.
+ */
if (!tunables->lnd_map_on_demand) {
reason = "map_on_demand must be enabled";
goto out;
}
+#endif
if (conn->ibc_max_frags <= frag_num) {
reason = "unsupported max frags";
goto out;
module_param(concurrent_sends, int, 0444);
MODULE_PARM_DESC(concurrent_sends, "send work-queue sizing");
+/*
+ * map_on_demand is a flag used to determine if we can use FMR or FastReg.
+ * It is only configurable on kernels which support global memory regions;
+ * on later kernels the flag is always enabled, since we will always use
+ * either FMR or FastReg.
+ *
+ * On kernels which support global memory regions, map_on_demand defaults
+ * to 0, which means global memory regions are used exclusively.
+ * If it is set to a value other than 0, then we behave as follows:
+ *   1. Always default the number of fragments to IBLND_MAX_RDMA_FRAGS
+ *   2. Create FMR/FastReg pools
+ *   3. Negotiate the supported number of fragments per connection
+ *   4. Attempt to transmit using global memory regions only if
+ *      map-on-demand is not turned on, otherwise use FMR or FastReg
+ *   5. A tx with gaps sent over FMR must be transmitted with multiple
+ *      fragments; see the comments in kiblnd_fmr_map_tx() for an
+ *      explanation of the behavior.
+ *
+ * On later kernels we default map_on_demand to 1 and do not allow it
+ * to be set to 0, since there is no longer support for global memory
+ * regions. Behavior:
+ *   1. Default the number of fragments to IBLND_MAX_RDMA_FRAGS
+ *   2. Create FMR/FastReg pools
+ *   3. Negotiate the supported number of fragments per connection
+ *   4. See the comments in kiblnd_fmr_map_tx() for an explanation of
+ *      the behavior when transmitting with gaps versus contiguous memory.
+ * An example of setting the parameter is given below.
+ */
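+
+/*
+ * Usage sketch (an assumption about a typical setup, not dictated by the
+ * code above): on kernels that still support global memory regions,
+ * FMR/FastReg can be enabled explicitly through the module options,
+ * e.g. in a modprobe configuration file:
+ *
+ *   options ko2iblnd map_on_demand=1
+ *
+ * Any non-zero value selects FMR/FastReg; the fragment count is then
+ * always defaulted to IBLND_MAX_RDMA_FRAGS.
+ */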
#ifdef HAVE_IB_GET_DMA_MR
#define IBLND_DEFAULT_MAP_ON_DEMAND 0
-#define IBLND_MIN_MAP_ON_DEMAND 0
+#define MOD_STR "map on demand"
#else
-#define IBLND_DEFAULT_MAP_ON_DEMAND IBLND_MAX_RDMA_FRAGS
-#define IBLND_MIN_MAP_ON_DEMAND 1
+#define IBLND_DEFAULT_MAP_ON_DEMAND 1
+#define MOD_STR "map on demand (obsolete)"
#endif
static int map_on_demand = IBLND_DEFAULT_MAP_ON_DEMAND;
module_param(map_on_demand, int, 0444);
-MODULE_PARM_DESC(map_on_demand, "map on demand");
+MODULE_PARM_DESC(map_on_demand, MOD_STR);
/* NB: this value is shared by all CPTs, it can grow at runtime */
static int fmr_pool_size = 512;
net_tunables->lct_peer_tx_credits =
net_tunables->lct_max_tx_credits;
+#ifndef HAVE_IB_GET_DMA_MR
+ /*
+ * For kernels which do not support global memory regions, always
+ * enable map_on_demand
+ */
+ if (tunables->lnd_map_on_demand == 0)
+ tunables->lnd_map_on_demand = 1;
+#endif
+
if (!tunables->lnd_peercredits_hiw)
tunables->lnd_peercredits_hiw = peer_credits_hiw;
if (tunables->lnd_peercredits_hiw >= net_tunables->lct_peer_tx_credits)
tunables->lnd_peercredits_hiw = net_tunables->lct_peer_tx_credits - 1;
- if (tunables->lnd_map_on_demand < IBLND_MIN_MAP_ON_DEMAND ||
- tunables->lnd_map_on_demand > IBLND_MAX_RDMA_FRAGS) {
- /* Use the default */
- CWARN("Invalid map_on_demand (%d), expects %d - %d. Using default of %d\n",
- tunables->lnd_map_on_demand, IBLND_MIN_MAP_ON_DEMAND,
- IBLND_MAX_RDMA_FRAGS, IBLND_DEFAULT_MAP_ON_DEMAND);
- tunables->lnd_map_on_demand = IBLND_DEFAULT_MAP_ON_DEMAND;
- }
-
- if (tunables->lnd_map_on_demand == 1) {
- /* don't make sense to create map if only one fragment */
- tunables->lnd_map_on_demand = 2;
- }
-
- if (tunables->lnd_concurrent_sends == 0) {
- if (tunables->lnd_map_on_demand > 0 &&
- tunables->lnd_map_on_demand <= IBLND_MAX_RDMA_FRAGS / 8) {
- tunables->lnd_concurrent_sends =
- net_tunables->lct_peer_tx_credits * 2;
- } else {
- tunables->lnd_concurrent_sends =
- net_tunables->lct_peer_tx_credits;
- }
- }
+ if (tunables->lnd_concurrent_sends == 0)
+ tunables->lnd_concurrent_sends = net_tunables->lct_peer_tx_credits;
if (tunables->lnd_concurrent_sends > net_tunables->lct_peer_tx_credits * 2)
tunables->lnd_concurrent_sends = net_tunables->lct_peer_tx_credits * 2;
kiblnd_tunables_init(void)
{
default_tunables.lnd_version = CURRENT_LND_VERSION;
- default_tunables.lnd_peercredits_hiw = peer_credits_hiw,
+ default_tunables.lnd_peercredits_hiw = peer_credits_hiw;
default_tunables.lnd_map_on_demand = map_on_demand;
default_tunables.lnd_concurrent_sends = concurrent_sends;
default_tunables.lnd_fmr_pool_size = fmr_pool_size;