+ cfs_list_add(&pool->po_list, &ps->ps_pool_list);
+ else
+ CERROR("Failed to create the first pool for %s\n", ps->ps_name);
+
+ return rc;
+}
+
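+/* A pool may be reaped once it has no outstanding allocations and either
+ * it has failed or its deadline has passed; the deadline is refreshed on
+ * every successful allocation (see kiblnd_pool_alloc_node). */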
+static int
+kiblnd_pool_is_idle(kib_pool_t *pool, cfs_time_t now)
+{
+ if (pool->po_allocated != 0) /* still in use */
+ return 0;
+ if (pool->po_failed)
+ return 1;
+ return cfs_time_aftereq(now, pool->po_deadline);
+}
+
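+/* Return a node to its pool, then scan the pool set and destroy any
+ * non-persistent pool that has become idle; only the first pool in
+ * ps_pool_list is persistent. */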
+void
+kiblnd_pool_free_node(kib_pool_t *pool, cfs_list_t *node)
+{
+ CFS_LIST_HEAD(zombies);
+ kib_poolset_t *ps = pool->po_owner;
+ kib_pool_t *tmp;
+ cfs_time_t now = cfs_time_current();
+
+ cfs_spin_lock(&ps->ps_lock);
+
+ if (ps->ps_node_fini != NULL)
+ ps->ps_node_fini(pool, node);
+
+ LASSERT (pool->po_allocated > 0);
+ cfs_list_add(node, &pool->po_free_list);
+ pool->po_allocated--;
+
+ cfs_list_for_each_entry_safe(pool, tmp, &ps->ps_pool_list, po_list) {
+ /* the first pool is persistent */
+ if (ps->ps_pool_list.next == &pool->po_list)
+ continue;
+
+ if (kiblnd_pool_is_idle(pool, now))
+ cfs_list_move(&pool->po_list, &zombies);
+ }
+ cfs_spin_unlock(&ps->ps_lock);
+
+ if (!cfs_list_empty(&zombies))
+ kiblnd_destroy_pool_list(&zombies);
+}
+
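+/* Take a free node from the first pool that has one, growing the pool
+ * set on demand. Only one thread at a time may create a new pool
+ * (ps_increasing); other threads yield the CPU and retry. After a
+ * failed create, new attempts are throttled until ps_next_retry. */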
+cfs_list_t *
+kiblnd_pool_alloc_node(kib_poolset_t *ps)
+{
+ cfs_list_t *node;
+ kib_pool_t *pool;
+ int rc;
+
+ again:
+ cfs_spin_lock(&ps->ps_lock);
+ cfs_list_for_each_entry(pool, &ps->ps_pool_list, po_list) {
+ if (cfs_list_empty(&pool->po_free_list))
+ continue;
+
+ pool->po_allocated++;
+ pool->po_deadline = cfs_time_shift(IBLND_POOL_DEADLINE);
+ node = pool->po_free_list.next;
+ cfs_list_del(node);
+
+ if (ps->ps_node_init != NULL) {
+ /* still hold the lock */
+ ps->ps_node_init(pool, node);
+ }
+ cfs_spin_unlock(&ps->ps_lock);
+ return node;
+ }
+
+ /* no free node in any pool and ... */
+ if (ps->ps_increasing) {
+ /* another thread is allocating a new pool */
+ cfs_spin_unlock(&ps->ps_lock);
+ CDEBUG(D_NET, "Another thread is allocating new "
+ "%s pool, waiting for it to complete\n",
+ ps->ps_name);
+ cfs_schedule();
+ goto again;
+ }
+
+ if (cfs_time_before(cfs_time_current(), ps->ps_next_retry)) {
+ /* someone failed recently */
+ cfs_spin_unlock(&ps->ps_lock);
+ return NULL;
+ }
+
+ ps->ps_increasing = 1;
+ cfs_spin_unlock(&ps->ps_lock);
+
+ CDEBUG(D_NET, "%s pool exhausted, allocate new pool\n", ps->ps_name);
+
+ rc = ps->ps_pool_create(ps, ps->ps_pool_size, &pool);
+
+ cfs_spin_lock(&ps->ps_lock);
+ ps->ps_increasing = 0;
+ if (rc == 0) {
+ cfs_list_add_tail(&pool->po_list, &ps->ps_pool_list);
+ } else {
+ /* retry 10 seconds later */
+ ps->ps_next_retry = cfs_time_shift(10);
+ CERROR("Can't allocate new %s pool because out of memory\n",
+ ps->ps_name);
+ }
+ cfs_spin_unlock(&ps->ps_lock);
+
+ goto again;
+}
+
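+/* Release a physical MR: return its descriptor to the pool first, then
+ * deregister the MR outside the pool lock. */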
+void
+kiblnd_pmr_pool_unmap(kib_phys_mr_t *pmr)
+{
+ kib_pmr_pool_t *ppo = pmr->pmr_pool;
+ struct ib_mr *mr = pmr->pmr_mr;
+
+ pmr->pmr_mr = NULL;
+ kiblnd_pool_free_node(&ppo->ppo_pool, &pmr->pmr_list);
+ if (mr != NULL)
+ ib_dereg_mr(mr);
+}
+
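+/* Map the fragments of an RDMA descriptor with a physical MR taken from
+ * the PMR pool set. If the pooled descriptor was created against a stale
+ * HCA handle, return -EAGAIN so the caller can retry. */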
+int
+kiblnd_pmr_pool_map(kib_pmr_poolset_t *pps, kib_hca_dev_t *hdev,
+ kib_rdma_desc_t *rd, __u64 *iova, kib_phys_mr_t **pp_pmr)
+{
+ kib_phys_mr_t *pmr;
+ cfs_list_t *node;
+ int rc;
+ int i;
+
+ node = kiblnd_pool_alloc_node(&pps->pps_poolset);
+ if (node == NULL) {
+ CERROR("Failed to allocate PMR descriptor\n");
+ return -ENOMEM;
+ }
+
+ pmr = container_of(node, kib_phys_mr_t, pmr_list);
+ if (pmr->pmr_pool->ppo_hdev != hdev) {
+ kiblnd_pool_free_node(&pmr->pmr_pool->ppo_pool, node);
+ return -EAGAIN;
+ }
+
+ for (i = 0; i < rd->rd_nfrags; i++) {
+ pmr->pmr_ipb[i].addr = rd->rd_frags[i].rf_addr;
+ pmr->pmr_ipb[i].size = rd->rd_frags[i].rf_nob;
+ }
+
+ pmr->pmr_mr = ib_reg_phys_mr(hdev->ibh_pd,
+ pmr->pmr_ipb, rd->rd_nfrags,
+ IB_ACCESS_LOCAL_WRITE |
+ IB_ACCESS_REMOTE_WRITE,
+ iova);
+ if (!IS_ERR(pmr->pmr_mr)) {
+ pmr->pmr_iova = *iova;
+ *pp_pmr = pmr;
+ return 0;
+ }
+
+ rc = PTR_ERR(pmr->pmr_mr);
+ CERROR("Failed ib_reg_phys_mr: %d\n", rc);
+
+ pmr->pmr_mr = NULL;
+ kiblnd_pool_free_node(&pmr->pmr_pool->ppo_pool, node);
+
+ return rc;
+}
+
+static void
+kiblnd_destroy_pmr_pool(kib_pool_t *pool)
+{
+ kib_pmr_pool_t *ppo = container_of(pool, kib_pmr_pool_t, ppo_pool);
+ kib_phys_mr_t *pmr;
+
+ LASSERT (pool->po_allocated == 0);
+
+ while (!cfs_list_empty(&pool->po_free_list)) {
+ pmr = cfs_list_entry(pool->po_free_list.next,
+ kib_phys_mr_t, pmr_list);
+
+ LASSERT (pmr->pmr_mr == NULL);
+ cfs_list_del(&pmr->pmr_list);
+
+ if (pmr->pmr_ipb != NULL) {
+ LIBCFS_FREE(pmr->pmr_ipb,
+ IBLND_MAX_RDMA_FRAGS *
+ sizeof(struct ib_phys_buf));
+ }
+
+ LIBCFS_FREE(pmr, sizeof(kib_phys_mr_t));
+ }
+
+ kiblnd_fini_pool(pool);
+ if (ppo->ppo_hdev != NULL)
+ kiblnd_hdev_decref(ppo->ppo_hdev);
+
+ LIBCFS_FREE(ppo, sizeof(kib_pmr_pool_t));
+}
+
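+/* Create a PMR pool of "size" physical-MR descriptors, each carrying a
+ * buffer large enough for IBLND_MAX_RDMA_FRAGS physical fragments. */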
+static int
+kiblnd_create_pmr_pool(kib_poolset_t *ps, int size, kib_pool_t **pp_po)
+{
+ kib_pmr_pool_t *ppo;
+ kib_pool_t *pool;
+ kib_phys_mr_t *pmr;
+ int i;
+
+ LIBCFS_ALLOC(ppo, sizeof(kib_pmr_pool_t));
+ if (ppo == NULL) {
+ CERROR("Failed to allocate PMR pool\n");
+ return -ENOMEM;
+ }
+
+ pool = &ppo->ppo_pool;
+ kiblnd_init_pool(ps, pool, size);
+
+ for (i = 0; i < size; i++) {
+ LIBCFS_ALLOC(pmr, sizeof(kib_phys_mr_t));
+ if (pmr == NULL)
+ break;
+
+ memset(pmr, 0, sizeof(kib_phys_mr_t));
+ pmr->pmr_pool = ppo;
+ LIBCFS_ALLOC(pmr->pmr_ipb,
+ IBLND_MAX_RDMA_FRAGS *
+ sizeof(struct ib_phys_buf));
+ if (pmr->pmr_ipb == NULL)
+ break;
+
+ cfs_list_add(&pmr->pmr_list, &pool->po_free_list);
+ }
+
+ if (i < size) {
+ ps->ps_pool_destroy(pool);
+ return -ENOMEM;
+ }
+
+ ppo->ppo_hdev = kiblnd_current_hdev(ps->ps_net->ibn_dev);
+ *pp_po = pool;
+ return 0;
+}
+
+static void
+kiblnd_destroy_tx_pool(kib_pool_t *pool)
+{
+ kib_tx_pool_t *tpo = container_of(pool, kib_tx_pool_t, tpo_pool);
+ int i;
+
+ LASSERT (pool->po_allocated == 0);
+
+ if (tpo->tpo_tx_pages != NULL) {
+ kiblnd_unmap_tx_pool(tpo);
+ kiblnd_free_pages(tpo->tpo_tx_pages);
+ }
+
+ if (tpo->tpo_tx_descs == NULL)
+ goto out;
+
+ for (i = 0; i < pool->po_size; i++) {
+ kib_tx_t *tx = &tpo->tpo_tx_descs[i];
+
+ cfs_list_del(&tx->tx_list);
+ if (tx->tx_pages != NULL)
+ LIBCFS_FREE(tx->tx_pages,
+ LNET_MAX_IOV *
+ sizeof(*tx->tx_pages));
+ if (tx->tx_frags != NULL)
+ LIBCFS_FREE(tx->tx_frags,
+ IBLND_MAX_RDMA_FRAGS *
+ sizeof(*tx->tx_frags));
+ if (tx->tx_wrq != NULL)
+ LIBCFS_FREE(tx->tx_wrq,
+ (1 + IBLND_MAX_RDMA_FRAGS) *
+ sizeof(*tx->tx_wrq));
+ if (tx->tx_sge != NULL)
+ LIBCFS_FREE(tx->tx_sge,
+ (1 + IBLND_MAX_RDMA_FRAGS) *
+ sizeof(*tx->tx_sge));
+ if (tx->tx_rd != NULL)
+ LIBCFS_FREE(tx->tx_rd,
+ offsetof(kib_rdma_desc_t,
+ rd_frags[IBLND_MAX_RDMA_FRAGS]));
+ }
+
+ LIBCFS_FREE(tpo->tpo_tx_descs,
+ pool->po_size * sizeof(kib_tx_t));
+out:
+ kiblnd_fini_pool(pool);
+ LIBCFS_FREE(tpo, sizeof(kib_tx_pool_t));
+}
+
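+/* Create a TX pool: message buffers are allocated page-wise for the whole
+ * pool, then each tx descriptor gets its own fragment list, work request
+ * queue, scatter/gather list and RDMA descriptor; tx_pages is only needed
+ * when FMR is in use. */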
+static int
+kiblnd_create_tx_pool(kib_poolset_t *ps, int size, kib_pool_t **pp_po)
+{
+ int i;
+ int npg;
+ kib_pool_t *pool;
+ kib_tx_pool_t *tpo;
+
+ LIBCFS_ALLOC(tpo, sizeof(kib_tx_pool_t));
+ if (tpo == NULL) {
+ CERROR("Failed to allocate TX pool\n");
+ return -ENOMEM;
+ }
+
+ pool = &tpo->tpo_pool;
+ kiblnd_init_pool(ps, pool, size);
+ tpo->tpo_tx_descs = NULL;
+ tpo->tpo_tx_pages = NULL;
+
+ npg = (size * IBLND_MSG_SIZE + PAGE_SIZE - 1) / PAGE_SIZE;
+ if (kiblnd_alloc_pages(&tpo->tpo_tx_pages, npg) != 0) {
+ CERROR("Can't allocate tx pages: %d\n", npg);
+ LIBCFS_FREE(tpo, sizeof(kib_tx_pool_t));
+ return -ENOMEM;
+ }
+
+ LIBCFS_ALLOC(tpo->tpo_tx_descs, size * sizeof(kib_tx_t));
+ if (tpo->tpo_tx_descs == NULL) {
+ CERROR("Can't allocate %d tx descriptors\n", size);
+ ps->ps_pool_destroy(pool);
+ return -ENOMEM;
+ }
+
+ memset(tpo->tpo_tx_descs, 0, size * sizeof(kib_tx_t));
+
+ for (i = 0; i < size; i++) {
+ kib_tx_t *tx = &tpo->tpo_tx_descs[i];
+
+ tx->tx_pool = tpo;
+ if (ps->ps_net->ibn_with_fmr) {
+ LIBCFS_ALLOC(tx->tx_pages, LNET_MAX_IOV *
+ sizeof(*tx->tx_pages));
+ if (tx->tx_pages == NULL)
+ break;
+ }
+
+ LIBCFS_ALLOC(tx->tx_frags,
+ IBLND_MAX_RDMA_FRAGS *
+ sizeof(*tx->tx_frags));
+ if (tx->tx_frags == NULL)
+ break;
+
+ LIBCFS_ALLOC(tx->tx_wrq,
+ (1 + IBLND_MAX_RDMA_FRAGS) *
+ sizeof(*tx->tx_wrq));
+ if (tx->tx_wrq == NULL)
+ break;
+
+ LIBCFS_ALLOC(tx->tx_sge,
+ (1 + IBLND_MAX_RDMA_FRAGS) *
+ sizeof(*tx->tx_sge));
+ if (tx->tx_sge == NULL)
+ break;
+
+ LIBCFS_ALLOC(tx->tx_rd,
+ offsetof(kib_rdma_desc_t,
+ rd_frags[IBLND_MAX_RDMA_FRAGS]));
+ if (tx->tx_rd == NULL)
+ break;
+ }
+
+ if (i == size) {
+ kiblnd_map_tx_pool(tpo);
+ *pp_po = pool;
+ return 0;
+ }
+
+ ps->ps_pool_destroy(pool);
+ return -ENOMEM;
+}
+
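+/* Node initializer for the TX pool set: stamp each allocated tx with a
+ * unique cookie; called with ps_lock still held. */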
+static void
+kiblnd_tx_init(kib_pool_t *pool, cfs_list_t *node)
+{
+ kib_tx_poolset_t *tps = container_of(pool->po_owner, kib_tx_poolset_t,
+ tps_poolset);
+ kib_tx_t *tx = cfs_list_entry(node, kib_tx_t, tx_list);
+
+ tx->tx_cookie = tps->tps_next_tx_cookie++;
+}
+
+void
+kiblnd_ni_fini_pools(kib_net_t *net)
+{
+ kiblnd_fini_pool_set(&net->ibn_tx_ps.tps_poolset);
+ if (net->ibn_with_fmr)
+ kiblnd_fini_fmr_pool_set(&net->ibn_fmr_ps);
+ else if (net->ibn_with_pmr)
+ kiblnd_fini_pool_set(&net->ibn_pmr_ps.pps_poolset);
+}
+
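+/* Set up pool sets for a NI: prefer FMR when map-on-demand is enabled or
+ * the HCA has more than one MR, fall back to PMR if FMR is unsupported
+ * (-ENOSYS), and always create the TX pool set. */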
+int
+kiblnd_net_init_pools(kib_net_t *net)
+{
+ kib_fmr_poolset_t *fps = &net->ibn_fmr_ps;
+ kib_pmr_poolset_t *pps = &net->ibn_pmr_ps;
+ kib_tx_poolset_t *tps = &net->ibn_tx_ps;
+ unsigned long flags;
+ int rc;
+
+ if (*kiblnd_tunables.kib_fmr_pool_size <
+ *kiblnd_tunables.kib_ntx / 4) {
+ CERROR("Can't set fmr pool size (%d) < ntx / 4(%d)\n",
+ *kiblnd_tunables.kib_fmr_pool_size,
+ *kiblnd_tunables.kib_ntx / 4);
+ return -EINVAL;
+ }
+
+ if (*kiblnd_tunables.kib_pmr_pool_size <
+ *kiblnd_tunables.kib_ntx / 4) {
+ CERROR("Can't set pmr pool size (%d) < ntx / 4(%d)\n",
+ *kiblnd_tunables.kib_pmr_pool_size,
+ *kiblnd_tunables.kib_ntx / 4);
+ return -EINVAL;
+ }
+
+ cfs_read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+ if (*kiblnd_tunables.kib_map_on_demand > 0 ||
+ net->ibn_dev->ibd_hdev->ibh_nmrs > 1) {
+ /* premapping can fail if ibh_nmrs > 1, so we always create
+ * an FMR/PMR pool and map on demand when premapping fails */
+ cfs_read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+ rc = kiblnd_init_fmr_pool_set(fps, net);
+ if (rc == 0) {
+ net->ibn_with_fmr = 1;
+ } else if (rc == -ENOSYS) {
+ rc = kiblnd_init_pool_set(&pps->pps_poolset, net, "PMR",
+ *kiblnd_tunables.kib_pmr_pool_size,
+ kiblnd_create_pmr_pool,
+ kiblnd_destroy_pmr_pool,
+ NULL, NULL);
+ if (rc == 0)
+ net->ibn_with_pmr = 1;
+ }
+ if (rc != 0)
+ return rc;
+ } else {
+ cfs_read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+ }
+
+ rc = kiblnd_init_pool_set(&tps->tps_poolset, net, "TX", IBLND_TX_MSGS(),
+ kiblnd_create_tx_pool, kiblnd_destroy_tx_pool,
+ kiblnd_tx_init, NULL);
+ if (rc == 0)
+ return 0;
+
+ if (net->ibn_with_fmr)
+ kiblnd_fini_fmr_pool_set(fps);
+ else if (net->ibn_with_pmr)
+ kiblnd_fini_pool_set(&pps->pps_poolset);
+
+ return rc;
+}
+
+static int
+kiblnd_hdev_get_attr(kib_hca_dev_t *hdev)
+{
+ struct ib_device_attr *attr;
+ int rc;
+
+ /* It's safe to assume an HCA can handle a page size
+ * matching that of the native system */
+ hdev->ibh_page_shift = PAGE_SHIFT;
+ hdev->ibh_page_size = 1 << PAGE_SHIFT;
+ hdev->ibh_page_mask = ~((__u64)hdev->ibh_page_size - 1);
+
+ LIBCFS_ALLOC(attr, sizeof(*attr));
+ if (attr == NULL) {
+ CERROR("Out of memory\n");
+ return -ENOMEM;
+ }
+
+ rc = ib_query_device(hdev->ibh_ibdev, attr);
+ if (rc == 0)
+ hdev->ibh_mr_size = attr->max_mr_size;