From 9b02beed217855f295eb51555fb9b4d2110af233 Mon Sep 17 00:00:00 2001 From: liangzhen Date: Fri, 3 Jul 2009 06:32:48 +0000 Subject: [PATCH] Branch HEAD b=18451 Growing TX/FMR/PMR pool at runtime for o2iblnd, these runtime allocated pools can be freed if not accessed for long time. i=isaac --- lnet/klnds/o2iblnd/o2iblnd.c | 949 +++++++++++++++++++++++---------- lnet/klnds/o2iblnd/o2iblnd.h | 163 ++++-- lnet/klnds/o2iblnd/o2iblnd_cb.c | 117 ++-- lnet/klnds/o2iblnd/o2iblnd_modparams.c | 9 +- 4 files changed, 820 insertions(+), 418 deletions(-) diff --git a/lnet/klnds/o2iblnd/o2iblnd.c b/lnet/klnds/o2iblnd/o2iblnd.c index 3dddc0e..419448d 100644 --- a/lnet/klnds/o2iblnd/o2iblnd.c +++ b/lnet/klnds/o2iblnd/o2iblnd.c @@ -1216,32 +1216,31 @@ kiblnd_map_rx_descs(kib_conn_t *conn) conn->ibc_rx_pages->ibp_device = conn->ibc_cmid->device; } -void -kiblnd_unmap_tx_descs(lnet_ni_t *ni) +static void +kiblnd_unmap_tx_pool(kib_tx_pool_t *tpo) { - int i; kib_tx_t *tx; - kib_net_t *net = ni->ni_data; - - LASSERT (net->ibn_tx_pages != NULL); - LASSERT (net->ibn_tx_pages->ibp_device != NULL); + int i; - for (i = 0; i < IBLND_TX_MSGS(); i++) { - tx = &net->ibn_tx_descs[i]; + LASSERT (tpo->tpo_pool.po_allocated == 0); + LASSERT (tpo->tpo_tx_pages->ibp_device != NULL); - kiblnd_dma_unmap_single(net->ibn_tx_pages->ibp_device, + for (i = 0; i < tpo->tpo_pool.po_size; i++) { + tx = &tpo->tpo_tx_descs[i]; + kiblnd_dma_unmap_single(tpo->tpo_tx_pages->ibp_device, KIBLND_UNMAP_ADDR(tx, tx_msgunmap, tx->tx_msgaddr), IBLND_MSG_SIZE, DMA_TO_DEVICE); } - - net->ibn_tx_pages->ibp_device = NULL; + tpo->tpo_tx_pages->ibp_device = NULL; } -void -kiblnd_map_tx_descs (lnet_ni_t *ni) +static void +kiblnd_map_tx_pool(kib_tx_pool_t *tpo) { - kib_net_t *net = ni->ni_data; + kib_pages_t *txpgs = tpo->tpo_tx_pages; + kib_pool_t *pool = &tpo->tpo_pool; + kib_net_t *net = pool->po_owner->ps_net; struct page *page; kib_tx_t *tx; int page_offset; @@ -1249,6 +1248,8 @@ kiblnd_map_tx_descs (lnet_ni_t *ni) int i; LASSERT (net != NULL); + LASSERT (net->ibn_dev->ibd_cmid != NULL && + net->ibn_dev->ibd_cmid->device != NULL); /* pre-mapped messages are not bigger than 1 page */ CLASSERT (IBLND_MSG_SIZE <= PAGE_SIZE); @@ -1256,22 +1257,23 @@ kiblnd_map_tx_descs (lnet_ni_t *ni) /* No fancy arithmetic when we do the buffer calculations */ CLASSERT (PAGE_SIZE % IBLND_MSG_SIZE == 0); + txpgs->ibp_device = net->ibn_dev->ibd_cmid->device; - for (ipage = page_offset = i = 0; i < IBLND_TX_MSGS(); i++) { - page = net->ibn_tx_pages->ibp_pages[ipage]; - tx = &net->ibn_tx_descs[i]; + for (ipage = page_offset = i = 0; i < pool->po_size; i++) { + page = txpgs->ibp_pages[ipage]; + tx = &tpo->tpo_tx_descs[i]; tx->tx_msg = (kib_msg_t *)(((char *)page_address(page)) + page_offset); tx->tx_msgaddr = kiblnd_dma_map_single( - net->ibn_dev->ibd_cmid->device, - tx->tx_msg, IBLND_MSG_SIZE, DMA_TO_DEVICE); - LASSERT (!kiblnd_dma_mapping_error(net->ibn_dev->ibd_cmid->device, + txpgs->ibp_device, tx->tx_msg, + IBLND_MSG_SIZE, DMA_TO_DEVICE); + LASSERT (!kiblnd_dma_mapping_error(txpgs->ibp_device, tx->tx_msgaddr)); KIBLND_UNMAP_ADDR_SET(tx, tx_msgunmap, tx->tx_msgaddr); - list_add(&tx->tx_list, &net->ibn_idle_txs); + list_add(&tx->tx_list, &pool->po_free_list); page_offset += IBLND_MSG_SIZE; LASSERT (page_offset <= PAGE_SIZE); @@ -1279,135 +1281,9 @@ kiblnd_map_tx_descs (lnet_ni_t *ni) if (page_offset == PAGE_SIZE) { page_offset = 0; ipage++; - LASSERT (ipage <= IBLND_TX_MSG_PAGES()); + LASSERT (ipage <= txpgs->ibp_npages); } } - - net->ibn_tx_pages->ibp_device = 
net->ibn_dev->ibd_cmid->device; -} - -void -kiblnd_free_tx_descs (lnet_ni_t *ni) -{ - int i; - kib_net_t *net = ni->ni_data; - - LASSERT (net != NULL); - - if (net->ibn_tx_pages != NULL) - kiblnd_free_pages(net->ibn_tx_pages); - - if (net->ibn_tx_descs == NULL) - return; - - for (i = 0; i < IBLND_TX_MSGS(); i++) { - kib_tx_t *tx = &net->ibn_tx_descs[i]; - - if (tx->tx_pages != NULL) - LIBCFS_FREE(tx->tx_pages, - LNET_MAX_IOV * - sizeof(*tx->tx_pages)); - - if (tx->tx_ipb != NULL) - LIBCFS_FREE(tx->tx_ipb, - IBLND_MAX_RDMA_FRAGS * - sizeof(*tx->tx_ipb)); - - if (tx->tx_frags != NULL) - LIBCFS_FREE(tx->tx_frags, - IBLND_MAX_RDMA_FRAGS * - sizeof(*tx->tx_frags)); - - if (tx->tx_wrq != NULL) - LIBCFS_FREE(tx->tx_wrq, - (1 + IBLND_MAX_RDMA_FRAGS) * - sizeof(*tx->tx_wrq)); - - if (tx->tx_sge != NULL) - LIBCFS_FREE(tx->tx_sge, - (1 + IBLND_MAX_RDMA_FRAGS) * - sizeof(*tx->tx_sge)); - - if (tx->tx_rd != NULL) - LIBCFS_FREE(tx->tx_rd, - offsetof(kib_rdma_desc_t, - rd_frags[IBLND_MAX_RDMA_FRAGS])); - } - - LIBCFS_FREE(net->ibn_tx_descs, - IBLND_TX_MSGS() * sizeof(kib_tx_t)); -} - -int -kiblnd_alloc_tx_descs (lnet_ni_t *ni) -{ - int i; - int rc; - kib_net_t *net = ni->ni_data; - - LASSERT (net != NULL); - - rc = kiblnd_alloc_pages(&net->ibn_tx_pages, IBLND_TX_MSG_PAGES()); - - if (rc != 0) { - CERROR("Can't allocate tx pages\n"); - return rc; - } - - LIBCFS_ALLOC (net->ibn_tx_descs, - IBLND_TX_MSGS() * sizeof(kib_tx_t)); - if (net->ibn_tx_descs == NULL) { - CERROR("Can't allocate %d tx descriptors\n", IBLND_TX_MSGS()); - return -ENOMEM; - } - - memset(net->ibn_tx_descs, 0, - IBLND_TX_MSGS() * sizeof(kib_tx_t)); - - for (i = 0; i < IBLND_TX_MSGS(); i++) { - kib_tx_t *tx = &net->ibn_tx_descs[i]; - - if (net->ibn_fmrpool != NULL) { - LIBCFS_ALLOC(tx->tx_pages, LNET_MAX_IOV * - sizeof(*tx->tx_pages)); - if (tx->tx_pages == NULL) - return -ENOMEM; - } - - if (net->ibn_pmrpool != NULL) { - LIBCFS_ALLOC(tx->tx_ipb, - IBLND_MAX_RDMA_FRAGS * - sizeof(*tx->tx_ipb)); - if (tx->tx_ipb == NULL) - return -ENOMEM; - } - - LIBCFS_ALLOC(tx->tx_frags, - IBLND_MAX_RDMA_FRAGS * - sizeof(*tx->tx_frags)); - if (tx->tx_frags == NULL) - return -ENOMEM; - - LIBCFS_ALLOC(tx->tx_wrq, - (1 + IBLND_MAX_RDMA_FRAGS) * - sizeof(*tx->tx_wrq)); - if (tx->tx_wrq == NULL) - return -ENOMEM; - - LIBCFS_ALLOC(tx->tx_sge, - (1 + IBLND_MAX_RDMA_FRAGS) * - sizeof(*tx->tx_sge)); - if (tx->tx_sge == NULL) - return -ENOMEM; - - LIBCFS_ALLOC(tx->tx_rd, - offsetof(kib_rdma_desc_t, - rd_frags[IBLND_MAX_RDMA_FRAGS])); - if (tx->tx_rd == NULL) - return -ENOMEM; - } - - return 0; } struct ib_mr * @@ -1464,29 +1340,33 @@ kiblnd_find_rd_dma_mr(kib_net_t *net, kib_rdma_desc_t *rd) } void -kiblnd_dev_cleanup(kib_dev_t *ibdev) +kiblnd_destroy_fmr_pool(kib_fmr_pool_t *pool) { - int i; + LASSERT (pool->fpo_map_count == 0); - if (ibdev->ibd_mrs == NULL) - return; + if (pool->fpo_fmr_pool != NULL) + ib_destroy_fmr_pool(pool->fpo_fmr_pool); - for (i = 0; i < ibdev->ibd_nmrs; i++) { - if (ibdev->ibd_mrs[i] == NULL) - break; + LIBCFS_FREE(pool, sizeof(kib_fmr_pool_t)); +} - ib_dereg_mr(ibdev->ibd_mrs[i]); - } +void +kiblnd_destroy_fmr_pool_list(struct list_head *head) +{ + kib_fmr_pool_t *pool; - LIBCFS_FREE(ibdev->ibd_mrs, sizeof(*ibdev->ibd_mrs) * ibdev->ibd_nmrs); - ibdev->ibd_mrs = NULL; + while (!list_empty(head)) { + pool = list_entry(head->next, kib_fmr_pool_t, fpo_list); + list_del(&pool->fpo_list); + kiblnd_destroy_fmr_pool(pool); + } } int -kiblnd_ib_create_fmr_pool(kib_dev_t *ibdev, struct ib_fmr_pool **fmrpp) +kiblnd_create_fmr_pool(kib_fmr_poolset_t 
*fps, kib_fmr_pool_t **pp_fpo) { /* FMR pool for RDMA */ - struct ib_fmr_pool *fmrpool; + kib_fmr_pool_t *fpo; struct ib_fmr_pool_param param = { .max_pages_per_fmr = LNET_MAX_PAYLOAD/PAGE_SIZE, .page_shift = PAGE_SHIFT, @@ -1496,158 +1376,686 @@ kiblnd_ib_create_fmr_pool(kib_dev_t *ibdev, struct ib_fmr_pool **fmrpp) .dirty_watermark = *kiblnd_tunables.kib_fmr_flush_trigger, .flush_function = NULL, .flush_arg = NULL, - .cache = *kiblnd_tunables.kib_fmr_cache}; + .cache = !!*kiblnd_tunables.kib_fmr_cache}; + int rc; - if (*kiblnd_tunables.kib_fmr_pool_size < - *kiblnd_tunables.kib_ntx) { - CERROR("Can't set fmr pool size (%d) < ntx(%d)\n", - *kiblnd_tunables.kib_fmr_pool_size, - *kiblnd_tunables.kib_ntx); - return -EINVAL; - } + LASSERT (fps->fps_net->ibn_dev != NULL && + fps->fps_net->ibn_dev->ibd_pd != NULL); - fmrpool = ib_create_fmr_pool(ibdev->ibd_pd, ¶m); - if (IS_ERR(fmrpool)) - return PTR_ERR(fmrpool); + LIBCFS_ALLOC(fpo, sizeof(kib_fmr_pool_t)); + if (fpo == NULL) + return -ENOMEM; - *fmrpp = fmrpool; + memset(fpo, 0, sizeof(kib_fmr_pool_t)); + fpo->fpo_fmr_pool = ib_create_fmr_pool(fps->fps_net->ibn_dev->ibd_pd, ¶m); + if (IS_ERR(fpo->fpo_fmr_pool)) { + CERROR("Failed to create FMR pool: %ld\n", + PTR_ERR(fpo->fpo_fmr_pool)); + rc = PTR_ERR(fpo->fpo_fmr_pool); + LIBCFS_FREE(fpo, sizeof(kib_fmr_pool_t)); + return rc; + } + + fpo->fpo_deadline = cfs_time_shift(IBLND_POOL_DEADLINE); + fpo->fpo_owner = fps; + *pp_fpo = fpo; return 0; } +static void +kiblnd_fini_fmr_pool_set(kib_fmr_poolset_t *fps) +{ + kiblnd_destroy_fmr_pool_list(&fps->fps_pool_list); +} + +static int +kiblnd_init_fmr_pool_set(kib_fmr_poolset_t *fps, kib_net_t *net) +{ + kib_fmr_pool_t *fpo; + int rc; + + memset(fps, 0, sizeof(kib_fmr_poolset_t)); + + fps->fps_net = net; + spin_lock_init(&fps->fps_lock); + CFS_INIT_LIST_HEAD(&fps->fps_pool_list); + rc = kiblnd_create_fmr_pool(fps, &fpo); + if (rc == 0) + list_add_tail(&fpo->fpo_list, &fps->fps_pool_list); + + return rc; +} + void -kiblnd_phys_mr_unmap(kib_net_t *net, kib_phys_mr_t *pmr) +kiblnd_fmr_pool_unmap(kib_fmr_t *fmr, int status) { - kib_phys_mr_pool_t *pool = net->ibn_pmrpool; - struct ib_mr *mr; + CFS_LIST_HEAD (zombies); + kib_fmr_pool_t *fpo = fmr->fmr_pool; + kib_fmr_poolset_t *fps = fpo->fpo_owner; + kib_fmr_pool_t *tmp; + int rc; + + rc = ib_fmr_pool_unmap(fmr->fmr_pfmr); + LASSERT (rc == 0); - spin_lock(&pool->ibmp_lock); + if (status != 0) { + rc = ib_flush_fmr_pool(fpo->fpo_fmr_pool); + LASSERT (rc == 0); + } + + fmr->fmr_pool = NULL; + fmr->fmr_pfmr = NULL; - mr = pmr->ibpm_mr; + spin_lock(&fps->fps_lock); + fpo->fpo_map_count --; /* decref the pool */ - list_add(&pmr->ibpm_link, &pool->ibmp_free_list); - pool->ibmp_allocated --; + list_for_each_entry_safe(fpo, tmp, &fps->fps_pool_list, fpo_list) { + /* the first pool is persistent */ + if (fps->fps_pool_list.next == &fpo->fpo_list) + continue; - spin_unlock(&pool->ibmp_lock); + if (fpo->fpo_map_count == 0 && /* no more reference */ + cfs_time_aftereq(cfs_time_current(), fpo->fpo_deadline)) { + list_move(&fpo->fpo_list, &zombies); + fps->fps_version ++; + } + } + spin_unlock(&fps->fps_lock); + + if (!list_empty(&zombies)) + kiblnd_destroy_fmr_pool_list(&zombies); +} +int +kiblnd_fmr_pool_map(kib_fmr_poolset_t *fps, __u64 *pages, int npages, + __u64 iov, kib_fmr_t *fmr) +{ + struct ib_pool_fmr *pfmr; + kib_fmr_pool_t *fpo; + __u64 version; + int rc; + + LASSERT (fps->fps_net->ibn_with_fmr); + again: + spin_lock(&fps->fps_lock); + version = fps->fps_version; + list_for_each_entry(fpo, 
&fps->fps_pool_list, fpo_list) { + fpo->fpo_deadline = cfs_time_shift(IBLND_POOL_DEADLINE); + fpo->fpo_map_count ++; + spin_unlock(&fps->fps_lock); + + pfmr = ib_fmr_pool_map_phys(fpo->fpo_fmr_pool, + pages, npages, iov); + if (likely(!IS_ERR(pfmr))) { + fmr->fmr_pool = fpo; + fmr->fmr_pfmr = pfmr; + return 0; + } + + spin_lock(&fps->fps_lock); + fpo->fpo_map_count --; + if (PTR_ERR(pfmr) != -EAGAIN) { + spin_unlock(&fps->fps_lock); + return PTR_ERR(pfmr); + } + + /* EAGAIN and ... */ + if (version != fps->fps_version) { + spin_unlock(&fps->fps_lock); + goto again; + } + } + + if (fps->fps_increasing) { + spin_unlock(&fps->fps_lock); + CDEBUG(D_NET, "Another thread is allocating new " + "FMR pool, waiting for her to complete\n"); + schedule(); + goto again; + + } + + if (cfs_time_before(cfs_time_current(), fps->fps_next_retry)) { + /* someone failed recently */ + spin_unlock(&fps->fps_lock); + return -EAGAIN; + } + + fps->fps_increasing = 1; + spin_unlock(&fps->fps_lock); + + CDEBUG(D_NET, "Allocate new FMR pool\n"); + rc = kiblnd_create_fmr_pool(fps, &fpo); + spin_lock(&fps->fps_lock); + fps->fps_increasing = 0; + if (rc == 0) { + fps->fps_version ++; + list_add_tail(&fpo->fpo_list, &fps->fps_pool_list); + } else { + fps->fps_next_retry = cfs_time_shift(10); + } + spin_unlock(&fps->fps_lock); + + goto again; +} + +static void +kiblnd_fini_pool(kib_pool_t *pool) +{ + LASSERT (list_empty(&pool->po_free_list)); + LASSERT (pool->po_allocated == 0); + + CDEBUG(D_NET, "Finalize %s pool\n", pool->po_owner->ps_name); +} + +static void +kiblnd_init_pool(kib_poolset_t *ps, kib_pool_t *pool, int size) +{ + CDEBUG(D_NET, "Initialize %s pool\n", ps->ps_name); + + memset(pool, 0, sizeof(kib_pool_t)); + CFS_INIT_LIST_HEAD(&pool->po_free_list); + pool->po_deadline = cfs_time_shift(IBLND_POOL_DEADLINE); + pool->po_owner = ps; + pool->po_size = size; +} + +void +kiblnd_destroy_pool_list(kib_poolset_t *ps, struct list_head *head) +{ + kib_pool_t *pool; + + while (!list_empty(head)) { + pool = list_entry(head->next, kib_pool_t, po_list); + list_del(&pool->po_list); + ps->ps_pool_destroy(pool); + } +} + +static void +kiblnd_fini_pool_set(kib_poolset_t *ps) +{ + kiblnd_destroy_pool_list(ps, &ps->ps_pool_list); +} + +static int +kiblnd_init_pool_set(kib_poolset_t *ps, kib_net_t *net, + char *name, int size, + kib_ps_pool_create_t po_create, + kib_ps_pool_destroy_t po_destroy, + kib_ps_node_init_t nd_init, + kib_ps_node_fini_t nd_fini) +{ + kib_pool_t *pool; + int rc; + + memset(ps, 0, sizeof(kib_poolset_t)); + + ps->ps_net = net; + ps->ps_pool_create = po_create; + ps->ps_pool_destroy = po_destroy; + ps->ps_node_init = nd_init; + ps->ps_node_fini = nd_fini; + ps->ps_pool_size = size; + strncpy(ps->ps_name, name, IBLND_POOL_NAME_LEN); + spin_lock_init(&ps->ps_lock); + CFS_INIT_LIST_HEAD(&ps->ps_pool_list); + + rc = ps->ps_pool_create(ps, size, &pool); + if (rc == 0) + list_add(&pool->po_list, &ps->ps_pool_list); + else + CERROR("Failed to create the first pool for %s\n", ps->ps_name); + + return rc; +} + +void +kiblnd_pool_free_node(kib_pool_t *pool, struct list_head *node) +{ + CFS_LIST_HEAD (zombies); + kib_poolset_t *ps = pool->po_owner; + kib_pool_t *tmp; + cfs_time_t now = cfs_time_current(); + + spin_lock(&ps->ps_lock); + + if (ps->ps_node_fini != NULL) + ps->ps_node_fini(pool, node); + + LASSERT (pool->po_allocated > 0); + list_add(node, &pool->po_free_list); + pool->po_allocated --; + + list_for_each_entry_safe(pool, tmp, &ps->ps_pool_list, po_list) { + /* the first pool is persistent */ + if 
(ps->ps_pool_list.next == &pool->po_list) + continue; + + if (pool->po_allocated == 0 && + cfs_time_aftereq(now, pool->po_deadline)) + list_move(&pool->po_list, &zombies); + } + spin_unlock(&ps->ps_lock); + + if (!list_empty(&zombies)) + kiblnd_destroy_pool_list(ps, &zombies); +} + +struct list_head * +kiblnd_pool_alloc_node(kib_poolset_t *ps) +{ + struct list_head *node; + kib_pool_t *pool; + int rc; + + again: + spin_lock(&ps->ps_lock); + list_for_each_entry(pool, &ps->ps_pool_list, po_list) { + if (list_empty(&pool->po_free_list)) + continue; + + pool->po_allocated ++; + pool->po_deadline = cfs_time_shift(IBLND_POOL_DEADLINE); + node = pool->po_free_list.next; + list_del(node); + + if (ps->ps_node_init != NULL) { + /* still hold the lock */ + ps->ps_node_init(pool, node); + } + spin_unlock(&ps->ps_lock); + return node; + } + + /* no available tx pool and ... */ + if (ps->ps_increasing) { + /* another thread is allocating a new pool */ + spin_unlock(&ps->ps_lock); + CDEBUG(D_NET, "Another thread is allocating new " + "%s pool, waiting for her to complete\n", + ps->ps_name); + schedule(); + goto again; + } + + if (cfs_time_before(cfs_time_current(), ps->ps_next_retry)) { + /* someone failed recently */ + spin_unlock(&ps->ps_lock); + return NULL; + } + + ps->ps_increasing = 1; + spin_unlock(&ps->ps_lock); + + CDEBUG(D_NET, "%s pool exhausted, allocate new pool\n", ps->ps_name); + + rc = ps->ps_pool_create(ps, ps->ps_pool_size, &pool); + + spin_lock(&ps->ps_lock); + ps->ps_increasing = 0; + if (rc == 0) { + list_add_tail(&pool->po_list, &ps->ps_pool_list); + } else { + /* retry 10 seconds later */ + ps->ps_next_retry = cfs_time_shift(10); + CERROR("Can't allocate new %s pool because out of memory\n", + ps->ps_name); + } + spin_unlock(&ps->ps_lock); + + goto again; +} + +void +kiblnd_pmr_pool_unmap(kib_phys_mr_t *pmr) +{ + kib_pmr_pool_t *ppo = pmr->pmr_pool; + struct ib_mr *mr = pmr->pmr_mr; + + pmr->pmr_mr = NULL; + kiblnd_pool_free_node(&ppo->ppo_pool, &pmr->pmr_list); if (mr != NULL) ib_dereg_mr(mr); } -kib_phys_mr_t * -kiblnd_phys_mr_map(kib_net_t *net, kib_rdma_desc_t *rd, - struct ib_phys_buf *ipb, __u64 *iova) +int +kiblnd_pmr_pool_map(kib_pmr_poolset_t *pps, kib_rdma_desc_t *rd, + __u64 *iova, kib_phys_mr_t **pp_pmr) { - kib_phys_mr_pool_t *pool = net->ibn_pmrpool; kib_phys_mr_t *pmr; + struct list_head *node; + int rc; int i; - LASSERT (ipb != NULL); + node = kiblnd_pool_alloc_node(&pps->pps_poolset); + if (node == NULL) { + CERROR("Failed to allocate PMR descriptor\n"); + return -ENOMEM; + } - spin_lock(&pool->ibmp_lock); - if (list_empty(&pool->ibmp_free_list)) { - spin_unlock(&pool->ibmp_lock); - CERROR("pre-allocated MRs is not enough\n"); + pmr = container_of(node, kib_phys_mr_t, pmr_list); + for (i = 0; i < rd->rd_nfrags; i ++) { + pmr->pmr_ipb[i].addr = rd->rd_frags[i].rf_addr; + pmr->pmr_ipb[i].size = rd->rd_frags[i].rf_nob; + } + + pmr->pmr_mr = ib_reg_phys_mr(pps->pps_poolset.ps_net->ibn_dev->ibd_pd, + pmr->pmr_ipb, rd->rd_nfrags, + IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE, + iova); + if (!IS_ERR(pmr->pmr_mr)) { + pmr->pmr_iova = *iova; + *pp_pmr = pmr; + return 0; + } - return NULL; + rc = PTR_ERR(pmr->pmr_mr); + CERROR("Failed ib_reg_phys_mr: %d\n", rc); + + pmr->pmr_mr = NULL; + kiblnd_pool_free_node(&pmr->pmr_pool->ppo_pool, node); + + return rc; +} + +static void +kiblnd_destroy_pmr_pool(kib_pool_t *pool) +{ + kib_pmr_pool_t *ppo = container_of(pool, kib_pmr_pool_t, ppo_pool); + kib_phys_mr_t *pmr; + + LASSERT (pool->po_allocated == 0); + + while 
(!list_empty(&pool->po_free_list)) { + pmr = list_entry(pool->po_free_list.next, + kib_phys_mr_t, pmr_list); + + LASSERT (pmr->pmr_mr == NULL); + list_del(&pmr->pmr_list); + + if (pmr->pmr_ipb != NULL) { + LIBCFS_FREE(pmr->pmr_ipb, + IBLND_MAX_RDMA_FRAGS * + sizeof(struct ib_phys_buf)); + } + + LIBCFS_FREE(pmr, sizeof(kib_phys_mr_t)); } - pmr = list_entry(pool->ibmp_free_list.next, - kib_phys_mr_t, ibpm_link); - list_del_init(&pmr->ibpm_link); - pool->ibmp_allocated ++; + kiblnd_fini_pool(pool); + LIBCFS_FREE(ppo, sizeof(kib_pmr_pool_t)); +} - spin_unlock(&pool->ibmp_lock); +static int +kiblnd_create_pmr_pool(kib_poolset_t *ps, int size, kib_pool_t **pp_po) +{ + kib_pmr_pool_t *ppo; + kib_pool_t *pool; + kib_phys_mr_t *pmr; + int i; - for (i = 0; i < rd->rd_nfrags; i ++) { - ipb[i].addr = rd->rd_frags[i].rf_addr; - ipb[i].size = rd->rd_frags[i].rf_nob; + LIBCFS_ALLOC(ppo, sizeof(kib_pmr_pool_t)); + if (ppo == NULL) { + CERROR("Failed to allocate PMR pool\n"); + return -ENOMEM; + } + + pool = &ppo->ppo_pool; + kiblnd_init_pool(ps, pool, size); + + for (i = 0; i < size; i++) { + LIBCFS_ALLOC(pmr, sizeof(kib_phys_mr_t)); + if (pmr == NULL) + break; + + memset(pmr, 0, sizeof(kib_phys_mr_t)); + pmr->pmr_pool = ppo; + LIBCFS_ALLOC(pmr->pmr_ipb, + IBLND_MAX_RDMA_FRAGS * + sizeof(struct ib_phys_buf)); + if (pmr->pmr_ipb == NULL) + break; + + list_add(&pmr->pmr_list, &pool->po_free_list); } - pmr->ibpm_mr = ib_reg_phys_mr(net->ibn_dev->ibd_pd, ipb, - rd->rd_nfrags, - IB_ACCESS_LOCAL_WRITE | - IB_ACCESS_REMOTE_WRITE, - iova); - if (!IS_ERR(pmr->ibpm_mr)) { - pmr->ibpm_iova = *iova; - return pmr; + if (i < size) { + ps->ps_pool_destroy(pool); + return -ENOMEM; } - CERROR("Failed ib_reg_phys_mr: %ld\n", PTR_ERR(pmr->ibpm_mr)); - pmr->ibpm_mr = NULL; + *pp_po = pool; + return 0; +} - spin_lock(&pool->ibmp_lock); +static void +kiblnd_destroy_tx_pool(kib_pool_t *pool) +{ + kib_tx_pool_t *tpo = container_of(pool, kib_tx_pool_t, tpo_pool); + int i; - list_add(&pmr->ibpm_link, &pool->ibmp_free_list); - pool->ibmp_allocated --; + LASSERT (pool->po_allocated == 0); - spin_unlock(&pool->ibmp_lock); + if (tpo->tpo_tx_pages != NULL) { + if (tpo->tpo_tx_pages->ibp_device != NULL) + kiblnd_unmap_tx_pool(tpo); + kiblnd_free_pages(tpo->tpo_tx_pages); + } - return NULL; + if (tpo->tpo_tx_descs == NULL) + goto out; + + for (i = 0; i < pool->po_size; i++) { + kib_tx_t *tx = &tpo->tpo_tx_descs[i]; + + list_del(&tx->tx_list); + if (tx->tx_pages != NULL) + LIBCFS_FREE(tx->tx_pages, + LNET_MAX_IOV * + sizeof(*tx->tx_pages)); + if (tx->tx_frags != NULL) + LIBCFS_FREE(tx->tx_frags, + IBLND_MAX_RDMA_FRAGS * + sizeof(*tx->tx_frags)); + if (tx->tx_wrq != NULL) + LIBCFS_FREE(tx->tx_wrq, + (1 + IBLND_MAX_RDMA_FRAGS) * + sizeof(*tx->tx_wrq)); + if (tx->tx_sge != NULL) + LIBCFS_FREE(tx->tx_sge, + (1 + IBLND_MAX_RDMA_FRAGS) * + sizeof(*tx->tx_sge)); + if (tx->tx_rd != NULL) + LIBCFS_FREE(tx->tx_rd, + offsetof(kib_rdma_desc_t, + rd_frags[IBLND_MAX_RDMA_FRAGS])); + } + + LIBCFS_FREE(tpo->tpo_tx_descs, + pool->po_size * sizeof(kib_tx_t)); +out: + kiblnd_fini_pool(pool); + LIBCFS_FREE(tpo, sizeof(kib_tx_pool_t)); } -void -kiblnd_destroy_pmr_pool(kib_phys_mr_pool_t *pool) +static int +kiblnd_create_tx_pool(kib_poolset_t *ps, int size, kib_pool_t **pp_po) { - kib_phys_mr_t *pmr; + int i; + int npg; + kib_pool_t *pool; + kib_tx_pool_t *tpo; - LASSERT (pool->ibmp_allocated == 0); + LIBCFS_ALLOC(tpo, sizeof(kib_tx_pool_t)); + if (tpo == NULL) { + CERROR("Failed to allocate TX pool\n"); + return -ENOMEM; + } - while 
(!list_empty(&pool->ibmp_free_list)) { - pmr = list_entry(pool->ibmp_free_list.next, - kib_phys_mr_t, ibpm_link); + pool = &tpo->tpo_pool; + kiblnd_init_pool(ps, pool, size); + tpo->tpo_tx_descs = NULL; + tpo->tpo_tx_pages = NULL; - LASSERT (pmr->ibpm_mr == NULL); + npg = (size * IBLND_MSG_SIZE + PAGE_SIZE - 1) / PAGE_SIZE; + if (kiblnd_alloc_pages(&tpo->tpo_tx_pages, npg) != 0) { + CERROR("Can't allocate tx pages: %d\n", npg); + LIBCFS_FREE(tpo, sizeof(kib_tx_pool_t)); + return -ENOMEM; + } - list_del(&pmr->ibpm_link); + LIBCFS_ALLOC (tpo->tpo_tx_descs, size * sizeof(kib_tx_t)); + if (tpo->tpo_tx_descs == NULL) { + CERROR("Can't allocate %d tx descriptors\n", size); + ps->ps_pool_destroy(pool); + return -ENOMEM; + } - LIBCFS_FREE(pmr, sizeof(kib_phys_mr_t)); + memset(tpo->tpo_tx_descs, 0, size * sizeof(kib_tx_t)); + + for (i = 0; i < size; i++) { + kib_tx_t *tx = &tpo->tpo_tx_descs[i]; + + tx->tx_pool = tpo; + if (ps->ps_net->ibn_with_fmr){ + LIBCFS_ALLOC(tx->tx_pages, LNET_MAX_IOV * + sizeof(*tx->tx_pages)); + if (tx->tx_pages == NULL) + break; + } + + LIBCFS_ALLOC(tx->tx_frags, + IBLND_MAX_RDMA_FRAGS * + sizeof(*tx->tx_frags)); + if (tx->tx_frags == NULL) + break; + + LIBCFS_ALLOC(tx->tx_wrq, + (1 + IBLND_MAX_RDMA_FRAGS) * + sizeof(*tx->tx_wrq)); + if (tx->tx_wrq == NULL) + break; + + LIBCFS_ALLOC(tx->tx_sge, + (1 + IBLND_MAX_RDMA_FRAGS) * + sizeof(*tx->tx_sge)); + if (tx->tx_sge == NULL) + break; + + LIBCFS_ALLOC(tx->tx_rd, + offsetof(kib_rdma_desc_t, + rd_frags[IBLND_MAX_RDMA_FRAGS])); + if (tx->tx_rd == NULL) + break; } - LIBCFS_FREE(pool, sizeof(kib_phys_mr_pool_t)); + if (i == size) { + kiblnd_map_tx_pool(tpo); + *pp_po = pool; + return 0; + } + + ps->ps_pool_destroy(pool); + return -ENOMEM; +} + +static void +kiblnd_tx_init(kib_pool_t *pool, struct list_head *node) +{ + kib_tx_poolset_t *tps = container_of(pool->po_owner, kib_tx_poolset_t, tps_poolset); + kib_tx_t *tx = list_entry(node, kib_tx_t, tx_list); + + tx->tx_cookie = tps->tps_next_tx_cookie ++; +} + +void +kiblnd_ni_fini_pools(kib_net_t *net) +{ + kiblnd_fini_pool_set(&net->ibn_tx_ps.tps_poolset); + if (net->ibn_with_fmr) + kiblnd_fini_fmr_pool_set(&net->ibn_fmr_ps); + else if (net->ibn_with_pmr) + kiblnd_fini_pool_set(&net->ibn_pmr_ps.pps_poolset); } int -kiblnd_create_pmr_pool(kib_dev_t *ibdev, kib_phys_mr_pool_t **poolpp) +kiblnd_net_init_pools(kib_net_t *net) { - kib_phys_mr_pool_t *pool; - kib_phys_mr_t *pmr; - int i; + kib_fmr_poolset_t *fps = &net->ibn_fmr_ps; + kib_pmr_poolset_t *pps = &net->ibn_pmr_ps; + kib_tx_poolset_t *tps = &net->ibn_tx_ps; + int rc; + + if (*kiblnd_tunables.kib_fmr_pool_size < + *kiblnd_tunables.kib_ntx / 4) { + CERROR("Can't set fmr pool size (%d) < ntx / 4(%d)\n", + *kiblnd_tunables.kib_fmr_pool_size, + *kiblnd_tunables.kib_ntx / 4); + return -EINVAL; + } if (*kiblnd_tunables.kib_pmr_pool_size < - *kiblnd_tunables.kib_ntx) { - CERROR("Can't set pmr pool size (%d) < ntx(%d)\n", + *kiblnd_tunables.kib_ntx / 4) { + CERROR("Can't set pmr pool size (%d) < ntx / 4(%d)\n", *kiblnd_tunables.kib_pmr_pool_size, - *kiblnd_tunables.kib_ntx); + *kiblnd_tunables.kib_ntx / 4); return -EINVAL; } - LIBCFS_ALLOC(pool, sizeof(kib_phys_mr_pool_t)); - if (pool == NULL) - return -ENOMEM; + if (*kiblnd_tunables.kib_map_on_demand > 0 || + net->ibn_dev->ibd_nmrs > 1) { /* premapping can fail if ibd_nmr > 1, + * so we always create FMR/PMR pool and + * map-on-demand if premapping failed */ + rc = kiblnd_init_fmr_pool_set(fps, net); + if (rc == 0) { + net->ibn_with_fmr = 1; + } else if (rc == -ENOSYS) { + rc = 
kiblnd_init_pool_set(&pps->pps_poolset, net, "PMR", + *kiblnd_tunables.kib_pmr_pool_size, + kiblnd_create_pmr_pool, + kiblnd_destroy_pmr_pool, + NULL, NULL); + if (rc == 0) + net->ibn_with_pmr = 1; + } + if (rc != 0) + return rc; + } - spin_lock_init(&pool->ibmp_lock); + rc = kiblnd_init_pool_set(&tps->tps_poolset, net, "TX", IBLND_TX_MSGS(), + kiblnd_create_tx_pool, kiblnd_destroy_tx_pool, + kiblnd_tx_init, NULL); + if (rc == 0) + return 0; - pool->ibmp_allocated = 0; - CFS_INIT_LIST_HEAD(&pool->ibmp_free_list); + if (net->ibn_with_fmr) + kiblnd_fini_fmr_pool_set(fps); + else if (net->ibn_with_pmr) + kiblnd_fini_pool_set(&pps->pps_poolset); - for (i = 0; i < *kiblnd_tunables.kib_pmr_pool_size; i++) { - LIBCFS_ALLOC(pmr, sizeof(kib_phys_mr_t)); + return rc; +} - if (pmr == NULL) { - kiblnd_destroy_pmr_pool(pool); - return -ENOMEM; - } +void +kiblnd_dev_cleanup(kib_dev_t *ibdev) +{ + int i; - memset(pmr, 0, sizeof(kib_phys_mr_t)); + if (ibdev->ibd_mrs == NULL) + return; - list_add(&pmr->ibpm_link, &pool->ibmp_free_list); - } + for (i = 0; i < ibdev->ibd_nmrs; i++) { + if (ibdev->ibd_mrs[i] == NULL) + break; - *poolpp = pool; + ib_dereg_mr(ibdev->ibd_mrs[i]); + } - return 0; + LIBCFS_FREE(ibdev->ibd_mrs, sizeof(*ibdev->ibd_mrs) * ibdev->ibd_nmrs); + ibdev->ibd_mrs = NULL; } static int @@ -1907,7 +2315,7 @@ kiblnd_shutdown (lnet_ni_t *ni) cfs_pause(cfs_time_seconds(1)); } - kiblnd_unmap_tx_descs(ni); + kiblnd_ni_fini_pools(net); LASSERT (net->ibn_dev->ibd_nnets > 0); net->ibn_dev->ibd_nnets--; @@ -1917,11 +2325,6 @@ kiblnd_shutdown (lnet_ni_t *ni) case IBLND_INIT_NOTHING: LASSERT (atomic_read(&net->ibn_nconns) == 0); - if (net->ibn_fmrpool != NULL) - ib_destroy_fmr_pool(net->ibn_fmrpool); - if (net->ibn_pmrpool != NULL) - kiblnd_destroy_pmr_pool(net->ibn_pmrpool); - if (net->ibn_dev != NULL && net->ibn_dev->ibd_nnets == 0) kiblnd_destroy_dev(net->ibn_dev); @@ -1929,14 +2332,12 @@ kiblnd_shutdown (lnet_ni_t *ni) break; } - kiblnd_free_tx_descs(ni); - CDEBUG(D_MALLOC, "after LND net cleanup: kmem %d\n", atomic_read(&libcfs_kmemory)); net->ibn_init = IBLND_INIT_NOTHING; ni->ni_data = NULL; - + LIBCFS_FREE(net, sizeof(*net)); out: @@ -2043,9 +2444,6 @@ kiblnd_startup (lnet_ni_t *ni) ni->ni_peertxcredits = *kiblnd_tunables.kib_peertxcredits; ni->ni_peerrtrcredits = *kiblnd_tunables.kib_peerrtrcredits; - spin_lock_init(&net->ibn_tx_lock); - INIT_LIST_HEAD(&net->ibn_idle_txs); - if (ni->ni_interfaces[0] != NULL) { /* Use the IPoIB interface specified in 'networks=' */ @@ -2154,34 +2552,11 @@ kiblnd_startup (lnet_ni_t *ni) ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), ibdev->ibd_ifip); net->ibn_dev = ibdev; - if (*kiblnd_tunables.kib_map_on_demand > 0 || - ibdev->ibd_nmrs > 1) { /* premapping can fail if ibd_nmr > 1, - * so we always create FMR/PMR pool and - * map-on-demand if premapping failed */ - /* Map on demand */ - rc = kiblnd_ib_create_fmr_pool(ibdev, &net->ibn_fmrpool); - if (rc == -ENOSYS) { - CDEBUG(D_CONSOLE, "No FMR, creating physical mapping\n"); - - rc = kiblnd_create_pmr_pool(ibdev, &net->ibn_pmrpool); - } - - if (rc != 0) { - CERROR("Can't create FMR or physical mapping pool: %d, " - "please disable map_on_demand and retry\n", rc); - goto failed; - } - - } - - rc = kiblnd_alloc_tx_descs(ni); + rc = kiblnd_net_init_pools(net); if (rc != 0) { - CERROR("Can't allocate tx descs\n"); + CERROR("Failed to initialize NI pools: %d\n", rc); goto failed; } - - kiblnd_map_tx_descs(ni); - ibdev->ibd_nnets++; net->ibn_init = IBLND_INIT_ALL; diff --git a/lnet/klnds/o2iblnd/o2iblnd.h 
b/lnet/klnds/o2iblnd/o2iblnd.h index c48306b..7d81190 100644 --- a/lnet/klnds/o2iblnd/o2iblnd.h +++ b/lnet/klnds/o2iblnd/o2iblnd.h @@ -174,8 +174,6 @@ kiblnd_concurrent_sends_v1(void) /* TX messages (shared by all connections) */ #define IBLND_TX_MSGS() (*kiblnd_tunables.kib_ntx) -#define IBLND_TX_MSG_BYTES() (IBLND_TX_MSGS() * IBLND_MSG_SIZE) -#define IBLND_TX_MSG_PAGES() ((IBLND_TX_MSG_BYTES() + PAGE_SIZE - 1) / PAGE_SIZE) /* RX messages (per connection) */ #define IBLND_RX_MSGS(v) (IBLND_MSG_QUEUE_SIZE(v) * 2 + IBLND_OOB_MSGS(v)) @@ -189,26 +187,6 @@ kiblnd_concurrent_sends_v1(void) typedef struct { - struct ib_device *ibp_device; /* device for mapping */ - int ibp_npages; /* # pages */ - struct page *ibp_pages[0]; -} kib_pages_t; - -typedef struct { - spinlock_t ibmp_lock; /* serialize */ - int ibmp_allocated; /* MR in use */ - struct list_head ibmp_free_list; /* pre-allocated MR */ -} kib_phys_mr_pool_t; - -typedef struct { - struct list_head ibpm_link; /* link node */ - struct ib_mr *ibpm_mr; /* MR */ - __u64 ibpm_iova; /* Virtual I/O address */ - int ibpm_refcount; /* reference count */ -} kib_phys_mr_t; - -typedef struct -{ struct list_head ibd_list; /* chain on kib_devs */ __u32 ibd_ifip; /* IPoIB interface IP */ char ibd_ifname[32]; /* IPoIB interface name */ @@ -226,23 +204,121 @@ typedef struct struct ib_mr **ibd_mrs; /* MR for non RDMA I/O */ } kib_dev_t; +#define IBLND_POOL_DEADLINE 300 /* # of seconds to keep pool alive */ + +typedef struct +{ + struct ib_device *ibp_device; /* device for mapping */ + int ibp_npages; /* # pages */ + struct page *ibp_pages[0]; /* page array */ +} kib_pages_t; + +struct kib_pmr_pool; + +typedef struct { + struct list_head pmr_list; /* chain node */ + struct ib_phys_buf *pmr_ipb; /* physical buffer */ + struct ib_mr *pmr_mr; /* IB MR */ + struct kib_pmr_pool *pmr_pool; /* owner of this MR */ + __u64 pmr_iova; /* Virtual I/O address */ + int pmr_refcount; /* reference count */ +} kib_phys_mr_t; + +struct kib_pool; +struct kib_poolset; + +typedef int (*kib_ps_pool_create_t)(struct kib_poolset *ps, int inc, struct kib_pool **pp_po); +typedef void (*kib_ps_pool_destroy_t)(struct kib_pool *po); +typedef void (*kib_ps_node_init_t)(struct kib_pool *po, struct list_head *node); +typedef void (*kib_ps_node_fini_t)(struct kib_pool *po, struct list_head *node); + +struct kib_net; + +#define IBLND_POOL_NAME_LEN 32 + +typedef struct kib_poolset +{ + spinlock_t ps_lock; /* serialize */ + struct kib_net *ps_net; /* network it belongs to */ + char ps_name[IBLND_POOL_NAME_LEN]; /* pool set name */ + struct list_head ps_pool_list; /* list of pools */ + cfs_time_t ps_next_retry; /* time stamp for retry if failed to allocate */ + int ps_increasing; /* is allocating new pool */ + int ps_pool_size; /* new pool size */ + + kib_ps_pool_create_t ps_pool_create; /* create a new pool */ + kib_ps_pool_destroy_t ps_pool_destroy; /* destroy a pool */ + kib_ps_node_init_t ps_node_init; /* initialize new allocated node */ + kib_ps_node_fini_t ps_node_fini; /* finalize node */ +} kib_poolset_t; + +typedef struct kib_pool +{ + struct list_head po_list; /* chain on pool list */ + struct list_head po_free_list; /* pre-allocated node */ + kib_poolset_t *po_owner; /* pool_set of this pool */ + cfs_time_t po_deadline; /* deadline of this pool */ + int po_allocated; /* # of elements in use */ + int po_size; /* # of pre-allocated elements */ +} kib_pool_t; + +typedef struct { + kib_poolset_t tps_poolset; /* pool-set */ + __u64 tps_next_tx_cookie; /* cookie of TX */ +} 
kib_tx_poolset_t; + +typedef struct { + kib_pool_t tpo_pool; /* pool */ + struct kib_tx *tpo_tx_descs; /* all the tx descriptors */ + kib_pages_t *tpo_tx_pages; /* premapped tx msg pages */ +} kib_tx_pool_t; + +typedef struct { + kib_poolset_t pps_poolset; /* pool-set */ +} kib_pmr_poolset_t; + +typedef struct kib_pmr_pool { + kib_pool_t ppo_pool; /* pool */ +} kib_pmr_pool_t; + +typedef struct +{ + spinlock_t fps_lock; /* serialize */ + struct kib_net *fps_net; /* IB network */ + struct list_head fps_pool_list; /* FMR pool list */ + __u64 fps_version; /* validity stamp */ + int fps_increasing; /* is allocating new pool */ + cfs_time_t fps_next_retry; /* time stamp for retry if failed to allocate */ +} kib_fmr_poolset_t; + typedef struct { + struct list_head fpo_list; /* chain on pool list */ + kib_fmr_poolset_t *fpo_owner; /* owner of this pool */ + struct ib_fmr_pool *fpo_fmr_pool; /* IB FMR pool */ + cfs_time_t fpo_deadline; /* deadline of this pool */ + int fpo_map_count; /* # of mapped FMR */ +} kib_fmr_pool_t; + +typedef struct { + struct ib_pool_fmr *fmr_pfmr; /* IB pool fmr */ + kib_fmr_pool_t *fmr_pool; /* pool of FMR */ +} kib_fmr_t; + +typedef struct kib_net +{ __u64 ibn_incarnation; /* my epoch */ int ibn_init; /* initialisation state */ int ibn_shutdown; /* shutting down? */ + unsigned int ibn_with_fmr:1; /* FMR? */ + unsigned int ibn_with_pmr:1; /* PMR? */ atomic_t ibn_npeers; /* # peers extant */ atomic_t ibn_nconns; /* # connections extant */ - __u64 ibn_tx_next_cookie; /* RDMA completion cookie */ - struct kib_tx *ibn_tx_descs; /* all the tx descriptors */ - kib_pages_t *ibn_tx_pages; /* premapped tx msg pages */ - struct list_head ibn_idle_txs; /* idle tx descriptors */ - spinlock_t ibn_tx_lock; /* serialise */ - - struct ib_fmr_pool *ibn_fmrpool; /* FMR pool for RDMA I/O */ - kib_phys_mr_pool_t *ibn_pmrpool; /* Physical MR pool for RDMA I/O */ + kib_tx_poolset_t ibn_tx_ps; /* tx pool-set */ + kib_fmr_poolset_t ibn_fmr_ps; /* fmr pool-set */ + kib_pmr_poolset_t ibn_pmr_ps; /* pmr pool-set */ kib_dev_t *ibn_dev; /* underlying IB device */ } kib_net_t; @@ -417,10 +493,11 @@ typedef struct kib_rx /* receive message */ typedef struct kib_tx /* transmit message */ { struct list_head tx_list; /* queue on idle_txs ibc_tx_queue etc. */ + kib_tx_pool_t *tx_pool; /* pool I'm from */ struct kib_conn *tx_conn; /* owning conn */ - int tx_sending; /* # tx callbacks outstanding */ - int tx_queued; /* queued for sending */ - int tx_waiting; /* waiting for peer */ + short tx_sending; /* # tx callbacks outstanding */ + short tx_queued; /* queued for sending */ + short tx_waiting; /* waiting for peer */ int tx_status; /* LNET completion status */ unsigned long tx_deadline; /* completion deadline */ __u64 tx_cookie; /* completion cookie */ @@ -434,11 +511,10 @@ typedef struct kib_tx /* transmit message */ kib_rdma_desc_t *tx_rd; /* rdma descriptor */ int tx_nfrags; /* # entries in... 
*/ struct scatterlist *tx_frags; /* dma_map_sg descriptor */ - struct ib_phys_buf *tx_ipb; /* physical buffer (for iWARP) */ __u64 *tx_pages; /* rdma phys page addrs */ union { - kib_phys_mr_t *pmr; /* MR for physical buffer */ - struct ib_pool_fmr *fmr; /* rdma mapping (mapped if != NULL) */ + kib_phys_mr_t *pmr; /* MR for physical buffer */ + kib_fmr_t fmr; /* FMR */ } tx_u; int tx_dmadir; /* dma direction */ } kib_tx_t; @@ -849,14 +925,19 @@ struct ib_mr *kiblnd_find_dma_mr(kib_net_t *net, __u64 addr, __u64 size); void kiblnd_map_rx_descs(kib_conn_t *conn); void kiblnd_unmap_rx_descs(kib_conn_t *conn); -void kiblnd_map_tx_descs (lnet_ni_t *ni); -void kiblnd_unmap_tx_descs(lnet_ni_t *ni); int kiblnd_map_tx(lnet_ni_t *ni, kib_tx_t *tx, kib_rdma_desc_t *rd, int nfrags); void kiblnd_unmap_tx(lnet_ni_t *ni, kib_tx_t *tx); -kib_phys_mr_t *kiblnd_phys_mr_map(kib_net_t *net, kib_rdma_desc_t *rd, - struct ib_phys_buf *ipb, __u64 *iova); -void kiblnd_phys_mr_unmap(kib_net_t *net, kib_phys_mr_t *pmr); +void kiblnd_pool_free_node(kib_pool_t *pool, struct list_head *node); +struct list_head *kiblnd_pool_alloc_node(kib_poolset_t *ps); + +int kiblnd_fmr_pool_map(kib_fmr_poolset_t *fps, __u64 *pages, + int npages, __u64 iov, kib_fmr_t *fmr); +void kiblnd_fmr_pool_unmap(kib_fmr_t *fmr, int status); + +int kiblnd_pmr_pool_map(kib_pmr_poolset_t *pps, kib_rdma_desc_t *rd, + __u64 *iova, kib_phys_mr_t **pp_pmr); +void kiblnd_pmr_pool_unmap(kib_phys_mr_t *pmr); int kiblnd_startup (lnet_ni_t *ni); void kiblnd_shutdown (lnet_ni_t *ni); diff --git a/lnet/klnds/o2iblnd/o2iblnd_cb.c b/lnet/klnds/o2iblnd/o2iblnd_cb.c index c93a11c..73b4c13 100644 --- a/lnet/klnds/o2iblnd/o2iblnd_cb.c +++ b/lnet/klnds/o2iblnd/o2iblnd_cb.c @@ -53,6 +53,7 @@ kiblnd_tx_done (lnet_ni_t *ni, kib_tx_t *tx) LASSERT (!tx->tx_queued); /* mustn't be queued for sending */ LASSERT (tx->tx_sending == 0); /* mustn't be awaiting sent callback */ LASSERT (!tx->tx_waiting); /* mustn't be awaiting peer response */ + LASSERT (tx->tx_pool != NULL); kiblnd_unmap_tx(ni, tx); @@ -71,11 +72,7 @@ kiblnd_tx_done (lnet_ni_t *ni, kib_tx_t *tx) tx->tx_nwrq = 0; tx->tx_status = 0; - spin_lock(&net->ibn_tx_lock); - - list_add(&tx->tx_list, &net->ibn_idle_txs); - - spin_unlock(&net->ibn_tx_lock); + kiblnd_pool_free_node(&tx->tx_pool->tpo_pool, &tx->tx_list); /* delay finalize until my descs have been freed */ for (i = 0; i < 2; i++) { @@ -105,27 +102,14 @@ kiblnd_txlist_done (lnet_ni_t *ni, struct list_head *txlist, int status) kib_tx_t * kiblnd_get_idle_tx (lnet_ni_t *ni) { - kib_net_t *net = ni->ni_data; - kib_tx_t *tx; - - LASSERT (net != NULL); - - spin_lock(&net->ibn_tx_lock); + kib_net_t *net = (kib_net_t *)ni->ni_data; + struct list_head *node; + kib_tx_t *tx; - if (list_empty(&net->ibn_idle_txs)) { - spin_unlock(&net->ibn_tx_lock); + node = kiblnd_pool_alloc_node(&net->ibn_tx_ps.tps_poolset); + if (node == NULL) return NULL; - } - - tx = list_entry(net->ibn_idle_txs.next, kib_tx_t, tx_list); - list_del(&tx->tx_list); - - /* Allocate a new completion cookie. It might not be needed, - * but we've got a lock right now and we're unlikely to - * wrap... 
*/ - tx->tx_cookie = net->ibn_tx_next_cookie++; - - spin_unlock(&net->ibn_tx_lock); + tx = container_of(node, kib_tx_t, tx_list); LASSERT (tx->tx_nwrq == 0); LASSERT (!tx->tx_queued); @@ -135,7 +119,7 @@ kiblnd_get_idle_tx (lnet_ni_t *ni) LASSERT (tx->tx_conn == NULL); LASSERT (tx->tx_lntmsg[0] == NULL); LASSERT (tx->tx_lntmsg[1] == NULL); - LASSERT (tx->tx_u.fmr == NULL); + LASSERT (tx->tx_u.pmr == NULL); LASSERT (tx->tx_nfrags == 0); return tx; @@ -548,33 +532,14 @@ kiblnd_kvaddr_to_page (unsigned long vaddr) return page; } -static void -kiblnd_fmr_unmap_tx(kib_net_t *net, kib_tx_t *tx) -{ - int rc; - - if (tx->tx_u.fmr == NULL) - return; - - rc = ib_fmr_pool_unmap(tx->tx_u.fmr); - LASSERT (rc == 0); - - if (tx->tx_status != 0) { - rc = ib_flush_fmr_pool(net->ibn_fmrpool); - LASSERT (rc == 0); - } - - tx->tx_u.fmr = NULL; -} - static int kiblnd_fmr_map_tx(kib_net_t *net, kib_tx_t *tx, kib_rdma_desc_t *rd, int nob) { - struct ib_pool_fmr *fmr; kib_dev_t *ibdev = net->ibn_dev; __u64 *pages = tx->tx_pages; int npages; int size; + int rc; int i; for (i = 0, npages = 0; i < rd->rd_nfrags; i++) { @@ -585,59 +550,45 @@ kiblnd_fmr_map_tx(kib_net_t *net, kib_tx_t *tx, kib_rdma_desc_t *rd, int nob) } } - fmr = ib_fmr_pool_map_phys(net->ibn_fmrpool, pages, npages, 0); - - if (IS_ERR(fmr)) { - CERROR ("Can't map %d pages: %ld\n", npages, PTR_ERR(fmr)); - return PTR_ERR(fmr); + rc = kiblnd_fmr_pool_map(&net->ibn_fmr_ps, pages, npages, 0, &tx->tx_u.fmr); + if (rc != 0) { + CERROR ("Can't map %d pages: %d\n", npages, rc); + return rc; } /* If rd is not tx_rd, it's going to get sent to a peer, who will need * the rkey */ - rd->rd_key = (rd != tx->tx_rd) ? fmr->fmr->rkey : - fmr->fmr->lkey; + rd->rd_key = (rd != tx->tx_rd) ? tx->tx_u.fmr.fmr_pfmr->fmr->rkey : + tx->tx_u.fmr.fmr_pfmr->fmr->lkey; rd->rd_frags[0].rf_addr &= ~ibdev->ibd_page_mask; rd->rd_frags[0].rf_nob = nob; rd->rd_nfrags = 1; - tx->tx_u.fmr = fmr; - return 0; } -static void -kiblnd_pmr_unmap_tx(kib_net_t *net, kib_tx_t *tx) -{ - if (tx->tx_u.pmr == NULL) - return; - - kiblnd_phys_mr_unmap(net, tx->tx_u.pmr); - - tx->tx_u.pmr = NULL; -} - static int kiblnd_pmr_map_tx(kib_net_t *net, kib_tx_t *tx, kib_rdma_desc_t *rd, int nob) { - kib_phys_mr_t *pmr; - __u64 iova; + __u64 iova; + int rc; iova = rd->rd_frags[0].rf_addr & ~net->ibn_dev->ibd_page_mask; - pmr = kiblnd_phys_mr_map(net, rd, tx->tx_ipb, &iova); - if (pmr == NULL) { - CERROR("Failed to create MR by phybuf\n"); - return -ENOMEM; + rc = kiblnd_pmr_pool_map(&net->ibn_pmr_ps, rd, &iova, &tx->tx_u.pmr); + if (rc != 0) { + CERROR("Failed to create MR by phybuf: %d\n", rc); + return rc; } - rd->rd_key = (rd != tx->tx_rd) ? pmr->ibpm_mr->rkey : - pmr->ibpm_mr->lkey; + /* If rd is not tx_rd, it's going to get sent to a peer, who will need + * the rkey */ + rd->rd_key = (rd != tx->tx_rd) ? 
tx->tx_u.pmr->pmr_mr->rkey : + tx->tx_u.pmr->pmr_mr->lkey; rd->rd_nfrags = 1; rd->rd_frags[0].rf_addr = iova; rd->rd_frags[0].rf_nob = nob; - tx->tx_u.pmr = pmr; - return 0; } @@ -648,10 +599,13 @@ kiblnd_unmap_tx(lnet_ni_t *ni, kib_tx_t *tx) LASSERT (net != NULL); - if (net->ibn_fmrpool != NULL) - kiblnd_fmr_unmap_tx(net, tx); - else if (net->ibn_pmrpool != NULL) - kiblnd_pmr_unmap_tx(net, tx); + if (net->ibn_with_fmr && tx->tx_u.fmr.fmr_pfmr != NULL) { + kiblnd_fmr_pool_unmap(&tx->tx_u.fmr, tx->tx_status); + tx->tx_u.fmr.fmr_pfmr = NULL; + } else if (net->ibn_with_pmr && tx->tx_u.pmr != NULL) { + kiblnd_pmr_pool_unmap(tx->tx_u.pmr); + tx->tx_u.pmr = NULL; + } if (tx->tx_nfrags != 0) { kiblnd_dma_unmap_sg(net->ibn_dev->ibd_cmid->device, @@ -694,10 +648,9 @@ kiblnd_map_tx(lnet_ni_t *ni, kib_tx_t *tx, return 0; } - if (net->ibn_fmrpool != NULL) + if (net->ibn_with_fmr) return kiblnd_fmr_map_tx(net, tx, rd, nob); - - if (net->ibn_pmrpool != NULL); + else if (net->ibn_with_pmr) return kiblnd_pmr_map_tx(net, tx, rd, nob); return -EINVAL; diff --git a/lnet/klnds/o2iblnd/o2iblnd_modparams.c b/lnet/klnds/o2iblnd/o2iblnd_modparams.c index 6995761..f7081a9 100644 --- a/lnet/klnds/o2iblnd/o2iblnd_modparams.c +++ b/lnet/klnds/o2iblnd/o2iblnd_modparams.c @@ -106,7 +106,7 @@ CFS_MODULE_PARM(map_on_demand, "i", int, 0444, static int fmr_pool_size = 512; CFS_MODULE_PARM(fmr_pool_size, "i", int, 0444, - "size of the fmr pool (>= ntx)"); + "size of the fmr pool (>= ntx / 4)"); static int fmr_flush_trigger = 384; CFS_MODULE_PARM(fmr_flush_trigger, "i", int, 0444, @@ -417,13 +417,6 @@ kiblnd_sysctl_fini (void) int kiblnd_tunables_init (void) { - if (*kiblnd_tunables.kib_credits > *kiblnd_tunables.kib_ntx) { - CERROR("Can't set credits(%d) > ntx(%d)\n", - *kiblnd_tunables.kib_credits, - *kiblnd_tunables.kib_ntx); - return -EINVAL; - } - if (kiblnd_translate_mtu(*kiblnd_tunables.kib_ib_mtu) < 0) { CERROR("Invalid ib_mtu %d, expected 256/512/1024/2048/4096\n", *kiblnd_tunables.kib_ib_mtu); -- 1.8.3.1
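
Note on the mechanism the patch introduces: allocation (kiblnd_pool_alloc_node) scans the pool list for a free node and, only when every pool is exhausted, lets a single thread (guarded by ps_increasing) create one additional pool; freeing (kiblnd_pool_free_node) returns the node and retires any extra pool that has sat idle past its deadline, so only the first pool is permanent. The sketch below is a minimal user-space model of that pattern, not the kernel code: pool_set, pool_alloc_node, pool_free_node, the pthread mutex, sched_yield() and calloc() are illustrative stand-ins for the kib_poolset_t machinery, spinlocks and LIBCFS allocators, and initialisation of the persistent first pool is omitted.

/* A minimal user-space sketch of the grow/shrink pool pattern added above
 * (cf. kiblnd_pool_alloc_node()/kiblnd_pool_free_node()).  Illustrative only:
 * pthread mutexes and calloc() stand in for kernel spinlocks and LIBCFS. */
#include <pthread.h>
#include <sched.h>
#include <stdlib.h>
#include <time.h>

#define POOL_DEADLINE 300           /* seconds a spare pool may stay idle */

struct node { struct node *next; };

struct pool {
        struct pool *next;          /* chain on the pool set */
        struct node *nodes;         /* backing array, freed with the pool */
        struct node *free_list;     /* nodes not currently in use */
        int          allocated;     /* nodes currently in use */
        time_t       deadline;      /* pool may be retired after this */
};

struct pool_set {
        pthread_mutex_t lock;
        struct pool    *pools;      /* the first pool is persistent */
        int             pool_size;  /* nodes per pool */
        int             increasing; /* a grower is already at work */
};

static struct pool *pool_create(int size)
{
        struct pool *po = calloc(1, sizeof(*po));

        if (po == NULL || (po->nodes = calloc(size, sizeof(*po->nodes))) == NULL) {
                free(po);
                return NULL;
        }
        for (int i = 0; i < size; i++) {        /* thread nodes onto the free list */
                po->nodes[i].next = po->free_list;
                po->free_list = &po->nodes[i];
        }
        po->deadline = time(NULL) + POOL_DEADLINE;
        return po;
}

/* Take a node, growing the pool set only when every pool is exhausted. */
struct node *pool_alloc_node(struct pool_set *ps)
{
        struct pool *po;
        struct node *nd;
again:
        pthread_mutex_lock(&ps->lock);
        for (po = ps->pools; po != NULL; po = po->next) {
                if (po->free_list == NULL)
                        continue;
                nd = po->free_list;             /* grab a free node, touch the pool */
                po->free_list = nd->next;
                po->allocated++;
                po->deadline = time(NULL) + POOL_DEADLINE;
                pthread_mutex_unlock(&ps->lock);
                return nd;
        }
        if (ps->increasing) {                   /* another thread is already growing */
                pthread_mutex_unlock(&ps->lock);
                sched_yield();
                goto again;
        }
        ps->increasing = 1;
        pthread_mutex_unlock(&ps->lock);
        po = pool_create(ps->pool_size);        /* grow outside the lock */
        pthread_mutex_lock(&ps->lock);
        ps->increasing = 0;
        if (po != NULL) {                       /* keep the persistent pool at the head */
                po->next = ps->pools->next;
                ps->pools->next = po;
        }
        pthread_mutex_unlock(&ps->lock);
        if (po == NULL)                         /* the kernel version waits before retrying */
                return NULL;
        goto again;
}

/* Give a node back; retire spare pools left idle past their deadline. */
void pool_free_node(struct pool_set *ps, struct pool *po, struct node *nd)
{
        struct pool *prev, *p;

        pthread_mutex_lock(&ps->lock);
        nd->next = po->free_list;
        po->free_list = nd;
        po->allocated--;
        prev = ps->pools;                       /* the first pool is never retired */
        while (prev != NULL && (p = prev->next) != NULL) {
                if (p->allocated != 0 || time(NULL) < p->deadline) {
                        prev = p;
                        continue;
                }
                prev->next = p->next;           /* idle too long: unlink and free */
                free(p->nodes);
                free(p);
        }
        pthread_mutex_unlock(&ps->lock);
}

The design keeps the first pool alive forever so the LND never loses its baseline capacity, while extra pools only exist for as long as the workload needs them; this is also why the patch relaxes the fmr_pool_size/pmr_pool_size checks from ">= ntx" to ">= ntx / 4".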