}
static void
-kiblnd_destroy_fmr_pool(kib_fmr_pool_t *pool)
+kiblnd_destroy_fmr_pool(kib_fmr_pool_t *fpo)
{
- LASSERT (pool->fpo_map_count == 0);
+ LASSERT(fpo->fpo_map_count == 0);
- if (pool->fpo_fmr_pool != NULL)
- ib_destroy_fmr_pool(pool->fpo_fmr_pool);
+ if (fpo->fpo_is_fmr) {
+ if (fpo->fmr.fpo_fmr_pool)
+ ib_destroy_fmr_pool(fpo->fmr.fpo_fmr_pool);
+ } else {
+ struct kib_fast_reg_descriptor *frd, *tmp;
+ int i = 0;
+
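+ /* Release each descriptor's page list and MR before freeing
+  * the descriptor itself. */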
+ list_for_each_entry_safe(frd, tmp, &fpo->fast_reg.fpo_pool_list,
+ frd_list) {
+ list_del(&frd->frd_list);
+ ib_free_fast_reg_page_list(frd->frd_frpl);
+ ib_dereg_mr(frd->frd_mr);
+ LIBCFS_FREE(frd, sizeof(*frd));
+ i++;
+ }
+ if (i < fpo->fast_reg.fpo_pool_size)
+ CERROR("FastReg pool still has %d regions registered\n",
+ fpo->fast_reg.fpo_pool_size - i);
+ }
- if (pool->fpo_hdev != NULL)
- kiblnd_hdev_decref(pool->fpo_hdev);
+ if (fpo->fpo_hdev)
+ kiblnd_hdev_decref(fpo->fpo_hdev);
- LIBCFS_FREE(pool, sizeof(kib_fmr_pool_t));
+ LIBCFS_FREE(fpo, sizeof(*fpo));
}
static void
kiblnd_destroy_fmr_pool_list(struct list_head *head)
{
- kib_fmr_pool_t *pool;
+ kib_fmr_pool_t *fpo, *tmp;
- while (!list_empty(head)) {
- pool = list_entry(head->next, kib_fmr_pool_t, fpo_list);
- list_del(&pool->fpo_list);
- kiblnd_destroy_fmr_pool(pool);
+ list_for_each_entry_safe(fpo, tmp, head, fpo_list) {
+ list_del(&fpo->fpo_list);
+ kiblnd_destroy_fmr_pool(fpo);
}
}
return max(IBLND_FMR_POOL_FLUSH, size);
}
-static int
-kiblnd_create_fmr_pool(kib_fmr_poolset_t *fps, kib_fmr_pool_t **pp_fpo)
+static int kiblnd_alloc_fmr_pool(kib_fmr_poolset_t *fps, kib_fmr_pool_t *fpo)
{
- /* FMR pool for RDMA */
- kib_dev_t *dev = fps->fps_net->ibn_dev;
- kib_fmr_pool_t *fpo;
- struct ib_fmr_pool_param param = {
- .max_pages_per_fmr = LNET_MAX_PAYLOAD/PAGE_SIZE,
- .page_shift = PAGE_SHIFT,
- .access = (IB_ACCESS_LOCAL_WRITE |
- IB_ACCESS_REMOTE_WRITE),
+ struct ib_fmr_pool_param param = {
+ .max_pages_per_fmr = LNET_MAX_PAYLOAD/PAGE_SIZE,
+ .page_shift = PAGE_SHIFT,
+ .access = (IB_ACCESS_LOCAL_WRITE |
+ IB_ACCESS_REMOTE_WRITE),
.pool_size = fps->fps_pool_size,
.dirty_watermark = fps->fps_flush_trigger,
.flush_function = NULL,
.flush_arg = NULL,
.cache = !!*kiblnd_tunables.kib_fmr_cache};
+ int rc = 0;
+
+ fpo->fmr.fpo_fmr_pool = ib_create_fmr_pool(fpo->fpo_hdev->ibh_pd,
+ &param);
+ if (IS_ERR(fpo->fmr.fpo_fmr_pool)) {
+ rc = PTR_ERR(fpo->fmr.fpo_fmr_pool);
+ if (rc != -ENOSYS)
+ CERROR("Failed to create FMR pool: %d\n", rc);
+ else
+ CERROR("FMRs are not supported\n");
+ }
+
+ return rc;
+}
+
+static int kiblnd_alloc_freg_pool(kib_fmr_poolset_t *fps, kib_fmr_pool_t *fpo)
+{
+ struct kib_fast_reg_descriptor *frd, *tmp;
+ int i, rc;
+
+ INIT_LIST_HEAD(&fpo->fast_reg.fpo_pool_list);
+ fpo->fast_reg.fpo_pool_size = 0;
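+ /* Pre-allocate fps_pool_size descriptors, each with an MR and a
+  * page list large enough for a maximum-size LNet payload. */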
+ for (i = 0; i < fps->fps_pool_size; i++) {
+ LIBCFS_CPT_ALLOC(frd, lnet_cpt_table(), fps->fps_cpt,
+ sizeof(*frd));
+ if (!frd) {
+ CERROR("Failed to allocate a new fast_reg descriptor\n");
+ rc = -ENOMEM;
+ goto out;
+ }
+ frd->frd_mr = NULL;
+
+ frd->frd_frpl = ib_alloc_fast_reg_page_list(fpo->fpo_hdev->ibh_ibdev,
+ LNET_MAX_PAYLOAD/PAGE_SIZE);
+ if (IS_ERR(frd->frd_frpl)) {
+ rc = PTR_ERR(frd->frd_frpl);
+ CERROR("Failed to allocate ib_fast_reg_page_list: %d\n",
+ rc);
+ frd->frd_frpl = NULL; /* don't hand an ERR_PTR to cleanup */
+ goto out_middle;
+ }
+
+ frd->frd_mr = ib_alloc_fast_reg_mr(fpo->fpo_hdev->ibh_pd,
+ LNET_MAX_PAYLOAD/PAGE_SIZE);
+ if (IS_ERR(frd->frd_mr)) {
+ rc = PTR_ERR(frd->frd_mr);
+ CERROR("Failed to allocate ib_fast_reg_mr: %d\n", rc);
+ frd->frd_mr = NULL; /* don't hand an ERR_PTR to cleanup */
+ goto out_middle;
+ }
+
+ frd->frd_valid = true;
+
+ list_add_tail(&frd->frd_list, &fpo->fast_reg.fpo_pool_list);
+ fpo->fast_reg.fpo_pool_size++;
+ }
+
+ return 0;
+
+out_middle:
+ if (frd->frd_mr)
+ ib_dereg_mr(frd->frd_mr);
+ if (frd->frd_frpl)
+ ib_free_fast_reg_page_list(frd->frd_frpl);
+ LIBCFS_FREE(frd, sizeof(*frd));
+
+out:
+ list_for_each_entry_safe(frd, tmp, &fpo->fast_reg.fpo_pool_list,
+ frd_list) {
+ list_del(&frd->frd_list);
+ ib_free_fast_reg_page_list(frd->frd_frpl);
+ ib_dereg_mr(frd->frd_mr);
+ LIBCFS_FREE(frd, sizeof(*frd));
+ }
+
+ return rc;
+}
+
+static int
+kiblnd_create_fmr_pool(kib_fmr_poolset_t *fps, kib_fmr_pool_t **pp_fpo)
+{
+ struct ib_device_attr *dev_attr;
+ kib_dev_t *dev = fps->fps_net->ibn_dev;
+ kib_fmr_pool_t *fpo;
int rc;
- LIBCFS_CPT_ALLOC(fpo, lnet_cpt_table(), fps->fps_cpt, sizeof(*fpo));
- if (fpo == NULL)
+ dev_attr = kmalloc(sizeof(*dev_attr), GFP_KERNEL);
+ if (!dev_attr)
return -ENOMEM;
+ LIBCFS_CPT_ALLOC(fpo, lnet_cpt_table(), fps->fps_cpt, sizeof(*fpo));
+ if (!fpo) {
+ rc = -ENOMEM;
+ goto out_dev_attr;
+ }
+
fpo->fpo_hdev = kiblnd_current_hdev(dev);
- fpo->fpo_fmr_pool = ib_create_fmr_pool(fpo->fpo_hdev->ibh_pd, &param);
- if (IS_ERR(fpo->fpo_fmr_pool)) {
- rc = PTR_ERR(fpo->fpo_fmr_pool);
- CERROR("Failed to create FMR pool: %d\n", rc);
+ rc = ib_query_device(fpo->fpo_hdev->ibh_ibdev, dev_attr);
+ if (rc) {
+ CERROR("Query device failed for %s: %d\n",
+ fpo->fpo_hdev->ibh_ibdev->name, rc);
+ goto out_fpo; /* free fpo and drop the hdev ref, not just dev_attr */
+ }
- kiblnd_hdev_decref(fpo->fpo_hdev);
- LIBCFS_FREE(fpo, sizeof(kib_fmr_pool_t));
- return rc;
- }
+ /* Check for FMR or FastReg support */
+ fpo->fpo_is_fmr = 0;
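+ /* Prefer FMR when the device implements all four FMR verbs;
+  * otherwise fall back to FastReg if the device advertises
+  * IB_DEVICE_MEM_MGT_EXTENSIONS. */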
+ if (fpo->fpo_hdev->ibh_ibdev->alloc_fmr &&
+ fpo->fpo_hdev->ibh_ibdev->dealloc_fmr &&
+ fpo->fpo_hdev->ibh_ibdev->map_phys_fmr &&
+ fpo->fpo_hdev->ibh_ibdev->unmap_fmr) {
+ LCONSOLE_INFO("Using FMR for registration\n");
+ fpo->fpo_is_fmr = 1;
+ } else if (dev_attr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) {
+ LCONSOLE_INFO("Using FastReg for registration\n");
+ } else {
+ rc = -ENOSYS;
+ LCONSOLE_ERROR_MSG(rc, "IB device does not support FMRs or FastRegs, can't register memory\n");
+ goto out_fpo; /* free fpo and drop the hdev ref, not just dev_attr */
+ }
- fpo->fpo_deadline = cfs_time_shift(IBLND_POOL_DEADLINE);
- fpo->fpo_owner = fps;
- *pp_fpo = fpo;
+ if (fpo->fpo_is_fmr)
+ rc = kiblnd_alloc_fmr_pool(fps, fpo);
+ else
+ rc = kiblnd_alloc_freg_pool(fps, fpo);
+ if (rc)
+ goto out_fpo;
- return 0;
+ kfree(dev_attr);
+ fpo->fpo_deadline = cfs_time_shift(IBLND_POOL_DEADLINE);
+ fpo->fpo_owner = fps;
+ *pp_fpo = fpo;
+
+ return 0;
+
+out_fpo:
+ kiblnd_hdev_decref(fpo->fpo_hdev);
+ LIBCFS_FREE(fpo, sizeof(*fpo));
+
+out_dev_attr:
+ kfree(dev_attr);
+
+ return rc;
}
static void
kiblnd_fmr_pool_unmap(kib_fmr_t *fmr, int status)
{
struct list_head zombies = LIST_HEAD_INIT(zombies);
kib_fmr_pool_t *fpo = fmr->fmr_pool;
- kib_fmr_poolset_t *fps = fpo->fpo_owner;
+ kib_fmr_poolset_t *fps;
cfs_time_t now = cfs_time_current();
kib_fmr_pool_t *tmp;
int rc;
- rc = ib_fmr_pool_unmap(fmr->fmr_pfmr);
- LASSERT(rc == 0);
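+ /* An unmapped or already-released fmr has no pool to return to. */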
+ if (!fpo)
+ return;
- if (status != 0) {
- rc = ib_flush_fmr_pool(fpo->fpo_fmr_pool);
- LASSERT(rc == 0);
- }
+ fps = fpo->fpo_owner;
+ if (fpo->fpo_is_fmr) {
+ if (fmr->fmr_pfmr) {
+ rc = ib_fmr_pool_unmap(fmr->fmr_pfmr);
+ LASSERT(!rc);
+ fmr->fmr_pfmr = NULL;
+ }
+ if (status) {
+ rc = ib_flush_fmr_pool(fpo->fmr.fpo_fmr_pool);
+ LASSERT(!rc);
+ }
+ } else {
+ struct kib_fast_reg_descriptor *frd = fmr->fmr_frd;
+
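+ /* Return the descriptor to the pool marked invalid; the next
+  * mapping will post a LOCAL_INV before reusing its rkey. */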
+ if (frd) {
+ frd->frd_valid = false;
+ spin_lock(&fps->fps_lock);
+ list_add_tail(&frd->frd_list, &fpo->fast_reg.fpo_pool_list);
+ spin_unlock(&fps->fps_lock);
+ fmr->fmr_frd = NULL;
+ }
+ }
fmr->fmr_pool = NULL;
- fmr->fmr_pfmr = NULL;
spin_lock(&fps->fps_lock);
fpo->fpo_map_count--; /* decref the pool */
int
kiblnd_fmr_pool_map(kib_fmr_poolset_t *fps, __u64 *pages, int npages,
- __u64 iov, kib_fmr_t *fmr)
+ __u32 nob, __u64 iov, bool is_rx, kib_fmr_t *fmr)
{
- struct ib_pool_fmr *pfmr;
- kib_fmr_pool_t *fpo;
- __u64 version;
- int rc;
+ kib_fmr_pool_t *fpo;
+ __u64 version;
+ int rc;
again:
spin_lock(&fps->fps_lock);
list_for_each_entry(fpo, &fps->fps_pool_list, fpo_list) {
fpo->fpo_deadline = cfs_time_shift(IBLND_POOL_DEADLINE);
fpo->fpo_map_count++;
- spin_unlock(&fps->fps_lock);
- pfmr = ib_fmr_pool_map_phys(fpo->fpo_fmr_pool,
- pages, npages, iov);
- if (likely(!IS_ERR(pfmr))) {
- fmr->fmr_pool = fpo;
- fmr->fmr_pfmr = pfmr;
- return 0;
- }
+ if (fpo->fpo_is_fmr) {
+ struct ib_pool_fmr *pfmr;
+
+ spin_unlock(&fps->fps_lock);
+ pfmr = ib_fmr_pool_map_phys(fpo->fmr.fpo_fmr_pool,
+ pages, npages, iov);
+ if (likely(!IS_ERR(pfmr))) {
+ fmr->fmr_key = is_rx ? pfmr->fmr->rkey
+ : pfmr->fmr->lkey;
+ fmr->fmr_frd = NULL;
+ fmr->fmr_pfmr = pfmr;
+ fmr->fmr_pool = fpo;
+ return 0;
+ }
+ rc = PTR_ERR(pfmr);
+ } else {
+ if (!list_empty(&fpo->fast_reg.fpo_pool_list)) {
+ struct ib_send_wr *wr;
+ struct kib_fast_reg_descriptor *frd;
+ struct ib_fast_reg_page_list *frpl;
+ struct ib_mr *mr;
+
+ frd = list_first_entry(&fpo->fast_reg.fpo_pool_list,
+ struct kib_fast_reg_descriptor,
+ frd_list);
+ list_del(&frd->frd_list);
+ spin_unlock(&fps->fps_lock);
+
+ frpl = frd->frd_frpl;
+ mr = frd->frd_mr;
+
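+ /* A previously used descriptor still holds a live
+  * registration: invalidate the old key, then advance
+  * to a fresh one. */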
+ if (!frd->frd_valid) {
+ struct ib_send_wr *inv_wr;
+ __u32 key = is_rx ? mr->rkey : mr->lkey;
+
+ inv_wr = &frd->frd_inv_wr;
+ memset(inv_wr, 0, sizeof(*inv_wr));
+ inv_wr->opcode = IB_WR_LOCAL_INV;
+ inv_wr->wr_id = IBLND_WID_MR;
+ inv_wr->ex.invalidate_rkey = key;
+
+ /* Bump the key */
+ key = ib_inc_rkey(key);
+ ib_update_fast_reg_key(mr, key);
+ }
+
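+ /* Hand the DMA addresses to the page list that the
+  * FAST_REG work request will register. */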
+ LASSERT(npages <= frpl->max_page_list_len);
+ memcpy(frpl->page_list, pages,
+ sizeof(*pages) * npages);
+
+ /* Prepare FastReg WR */
+ wr = &frd->frd_fastreg_wr;
+ memset(wr, 0, sizeof(*wr));
+ wr->opcode = IB_WR_FAST_REG_MR;
+ wr->wr_id = IBLND_WID_MR;
+ wr->wr.fast_reg.iova_start = iov;
+ wr->wr.fast_reg.page_list = frpl;
+ wr->wr.fast_reg.page_list_len = npages;
+ wr->wr.fast_reg.page_shift = PAGE_SHIFT;
+ wr->wr.fast_reg.length = nob;
+ wr->wr.fast_reg.rkey = is_rx ? mr->rkey
+ : mr->lkey;
+ wr->wr.fast_reg.access_flags =
+ (IB_ACCESS_LOCAL_WRITE |
+ IB_ACCESS_REMOTE_WRITE);
+
+ fmr->fmr_key = is_rx ? mr->rkey : mr->lkey;
+ fmr->fmr_frd = frd;
+ fmr->fmr_pfmr = NULL;
+ fmr->fmr_pool = fpo;
+ return 0;
+ }
+ spin_unlock(&fps->fps_lock);
+ rc = -EAGAIN; /* all descriptors in use; retry, growing the pool set */
+ }
spin_lock(&fps->fps_lock);
fpo->fpo_map_count--;
- if (PTR_ERR(pfmr) != -EAGAIN) {
+ if (rc != -EAGAIN) {
spin_unlock(&fps->fps_lock);
- return PTR_ERR(pfmr);
+ return rc;
}
/* EAGAIN and ... */
cfs_time_t fps_next_retry;
} kib_fmr_poolset_t;
+struct kib_fast_reg_descriptor { /* For fast registration */
+ struct list_head frd_list;
+ struct ib_send_wr frd_inv_wr;
+ struct ib_send_wr frd_fastreg_wr;
+ struct ib_mr *frd_mr;
+ struct ib_fast_reg_page_list *frd_frpl;
+ bool frd_valid;
+};
+
typedef struct
{
struct list_head fpo_list; /* chain on pool list */
struct kib_hca_dev *fpo_hdev; /* device for this pool */
kib_fmr_poolset_t *fpo_owner; /* owner of this pool */
- struct ib_fmr_pool *fpo_fmr_pool; /* IB FMR pool */
+ union {
+ struct {
+ struct ib_fmr_pool *fpo_fmr_pool; /* IB FMR pool */
+ } fmr;
+ struct { /* For fast registration */
+ struct list_head fpo_pool_list;
+ int fpo_pool_size;
+ } fast_reg;
+ };
cfs_time_t fpo_deadline; /* deadline of this pool */
int fpo_failed; /* fmr pool is failed */
int fpo_map_count; /* # of mapped FMR */
+ int fpo_is_fmr; /* true if pool uses FMR */
} kib_fmr_pool_t;
typedef struct {
- struct ib_pool_fmr *fmr_pfmr; /* IB pool fmr */
- kib_fmr_pool_t *fmr_pool; /* pool of FMR */
+ kib_fmr_pool_t *fmr_pool; /* pool of FMR */
+ struct ib_pool_fmr *fmr_pfmr; /* IB pool fmr */
+ struct kib_fast_reg_descriptor *fmr_frd; /* fast reg descriptor */
+ u32 fmr_key; /* registration key (lkey or rkey) */
} kib_fmr_t;
typedef struct kib_net
__u16 ibp_queue_depth;
} kib_peer_t;
+#ifndef HAVE_IB_INC_RKEY
+/**
+ * ib_inc_rkey - increments the key portion of the given rkey. Can be used
+ * for calculating a new rkey for type 2 memory windows.
+ * @rkey - the rkey to increment.
+ */
+static inline u32 ib_inc_rkey(u32 rkey)
+{
+ const u32 mask = 0x000000ff;
+ return ((rkey + 1) & mask) | (rkey & ~mask);
+}
+#endif
+
extern kib_data_t kiblnd_data;
extern void kiblnd_hdev_destroy(kib_hca_dev_t *hdev);
/* CAVEAT EMPTOR: We rely on descriptor alignment to allow us to use the
* lowest bits of the work request id to stash the work item type. */
-#define IBLND_WID_INVAL 0
-#define IBLND_WID_TX 1
-#define IBLND_WID_RX 2
-#define IBLND_WID_RDMA 3
-#define IBLND_WID_MASK 3UL
+#define IBLND_WID_INVAL 0
+#define IBLND_WID_TX 1
+#define IBLND_WID_RX 2
+#define IBLND_WID_RDMA 3
+#define IBLND_WID_MR 4
+#define IBLND_WID_MASK 7UL
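+/* IBLND_WID_MR extends the type space to 3 bits, so the mask grows from
+ * 3UL to 7UL and descriptors must be at least 8-byte aligned. */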
static inline __u64
kiblnd_ptr2wreqid (void *ptr, int type)
void kiblnd_pool_free_node(kib_pool_t *pool, struct list_head *node);
struct list_head *kiblnd_pool_alloc_node(kib_poolset_t *ps);
-int kiblnd_fmr_pool_map(kib_fmr_poolset_t *fps, __u64 *pages,
- int npages, __u64 iov, kib_fmr_t *fmr);
+int kiblnd_fmr_pool_map(kib_fmr_poolset_t *fps, __u64 *pages, int npages,
+ __u32 nob, __u64 iov, bool is_rx, kib_fmr_t *fmr);
void kiblnd_fmr_pool_unmap(kib_fmr_t *fmr, int status);
int kiblnd_tunables_init(void);
}
static int
-kiblnd_fmr_map_tx(kib_net_t *net, kib_tx_t *tx, kib_rdma_desc_t *rd, int nob)
+kiblnd_fmr_map_tx(kib_net_t *net, kib_tx_t *tx, kib_rdma_desc_t *rd, __u32 nob)
{
kib_hca_dev_t *hdev;
__u64 *pages = tx->tx_pages;
cpt = tx->tx_pool->tpo_pool.po_owner->ps_cpt;
fps = net->ibn_fmr_ps[cpt];
- rc = kiblnd_fmr_pool_map(fps, pages, npages, 0, &tx->fmr);
- if (rc != 0) {
- CERROR ("Can't map %d pages: %d\n", npages, rc);
- return rc;
- }
+ rc = kiblnd_fmr_pool_map(fps, pages, npages, nob, 0, (rd != tx->tx_rd),
+ &tx->fmr);
+ if (rc != 0) {
+ CERROR("Can't map %d pages: %d\n", npages, rc);
+ return rc;
+ }
/* If rd is not tx_rd, it's going to get sent to a peer, who will need
* the rkey */
- rd->rd_key = (rd != tx->tx_rd) ? tx->fmr.fmr_pfmr->fmr->rkey :
- tx->fmr.fmr_pfmr->fmr->lkey;
+ rd->rd_key = tx->fmr.fmr_key;
rd->rd_frags[0].rf_addr &= ~hdev->ibh_page_mask;
rd->rd_frags[0].rf_nob = nob;
rd->rd_nfrags = 1;
LASSERT(net != NULL);
- if (net->ibn_fmr_ps != NULL && tx->fmr.fmr_pfmr != NULL) {
+ if (net->ibn_fmr_ps != NULL)
kiblnd_fmr_pool_unmap(&tx->fmr, tx->tx_status);
- tx->fmr.fmr_pfmr = NULL;
- }
if (tx->tx_nfrags != 0) {
kiblnd_dma_unmap_sg(tx->tx_pool->tpo_hdev->ibh_ibdev,
kib_hca_dev_t *hdev = tx->tx_pool->tpo_hdev;
kib_net_t *net = ni->ni_data;
struct ib_mr *mr = NULL;
- __u32 nob;
- int i;
+ __u32 nob;
+ int i;
/* If rd is not tx_rd, it's going to get sent to a peer and I'm the
* RDMA sink */
/* close_conn will launch failover */
rc = -ENETDOWN;
} else {
- struct ib_send_wr *wrq = &tx->tx_wrq[tx->tx_nwrq - 1];
+ struct kib_fast_reg_descriptor *frd = tx->fmr.fmr_frd;
+ struct ib_send_wr *bad = &tx->tx_wrq[tx->tx_nwrq - 1];
+ struct ib_send_wr *wrq = tx->tx_wrq;
+
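+ /* With FastReg, chain the memory registration work ahead of
+  * the transfer: optional LOCAL_INV, then FAST_REG_MR, then
+  * the tx work requests. */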
+ if (frd != NULL) {
+ if (!frd->frd_valid) {
+ wrq = &frd->frd_inv_wr;
+ wrq->next = &frd->frd_fastreg_wr;
+ } else {
+ wrq = &frd->frd_fastreg_wr;
+ }
+ frd->frd_fastreg_wr.next = tx->tx_wrq;
+ }
- LASSERTF(wrq->wr_id == kiblnd_ptr2wreqid(tx, IBLND_WID_TX),
+ LASSERTF(bad->wr_id == kiblnd_ptr2wreqid(tx, IBLND_WID_TX),
"bad wr_id "LPX64", opc %d, flags %d, peer: %s\n",
- wrq->wr_id, wrq->opcode, wrq->send_flags,
+ bad->wr_id, bad->opcode, bad->send_flags,
libcfs_nid2str(conn->ibc_peer->ibp_nid));
- wrq = NULL;
- rc = ib_post_send(conn->ibc_cmid->qp, tx->tx_wrq, &wrq);
+ bad = NULL;
+ rc = ib_post_send(conn->ibc_cmid->qp, wrq, &bad);
}
conn->ibc_last_send = jiffies;
static void
kiblnd_complete (struct ib_wc *wc)
{
- switch (kiblnd_wreqid2type(wc->wr_id)) {
- default:
- LBUG();
+ switch (kiblnd_wreqid2type(wc->wr_id)) {
+ default:
+ LBUG();
+
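+ /* MR work requests carry no tx to complete; just log real
+  * failures (flush errors are expected on teardown). */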
+ case IBLND_WID_MR:
+ if (wc->status != IB_WC_SUCCESS &&
+ wc->status != IB_WC_WR_FLUSH_ERR)
+ CNETERR("FastReg failed: %d\n", wc->status);
+ return;
case IBLND_WID_RDMA:
/* We only get RDMA completion notification if it fails. All