From b43a6b1800265608cfa18159d4d0d006a1c23015 Mon Sep 17 00:00:00 2001 From: Liang Zhen Date: Thu, 10 May 2012 21:44:51 +0800 Subject: [PATCH] LU-56 o2iblnd: CPT affinity o2iblnd this patch covered a few things: - implement percpt scheduler threads for o2iblnd - decrease overall threads number for fat core machine - increase thread number only if there are more than one NIC Signed-off-by: Liang Zhen Change-Id: Ic4b72258f73baabed2e59746639e271cab4467fc Reviewed-on: http://review.whamcloud.com/2725 Reviewed-by: Lai Siyao Reviewed-by: Doug Oucharek Tested-by: Hudson Tested-by: Maloo Reviewed-by: Oleg Drokin --- lnet/klnds/o2iblnd/o2iblnd.c | 791 ++++++++++++++++++++++----------- lnet/klnds/o2iblnd/o2iblnd.h | 124 ++++-- lnet/klnds/o2iblnd/o2iblnd_cb.c | 361 ++++++++------- lnet/klnds/o2iblnd/o2iblnd_modparams.c | 28 +- 4 files changed, 830 insertions(+), 474 deletions(-) diff --git a/lnet/klnds/o2iblnd/o2iblnd.c b/lnet/klnds/o2iblnd/o2iblnd.c index e9ebd5f..a3cd78a 100644 --- a/lnet/klnds/o2iblnd/o2iblnd.c +++ b/lnet/klnds/o2iblnd/o2iblnd.c @@ -324,16 +324,17 @@ kiblnd_unpack_msg(kib_msg_t *msg, int nob) } int -kiblnd_create_peer (lnet_ni_t *ni, kib_peer_t **peerp, lnet_nid_t nid) +kiblnd_create_peer(lnet_ni_t *ni, kib_peer_t **peerp, lnet_nid_t nid) { - kib_peer_t *peer; - kib_net_t *net = ni->ni_data; - unsigned long flags; + kib_peer_t *peer; + kib_net_t *net = ni->ni_data; + int cpt = lnet_cpt_of_nid(nid); + unsigned long flags; - LASSERT (net != NULL); - LASSERT (nid != LNET_NID_ANY); + LASSERT(net != NULL); + LASSERT(nid != LNET_NID_ANY); - LIBCFS_ALLOC(peer, sizeof(*peer)); + LIBCFS_CPT_ALLOC(peer, lnet_cpt_table(), cpt, sizeof(*peer)); if (peer == NULL) { CERROR("Cannot allocate peer\n"); return -ENOMEM; @@ -686,6 +687,33 @@ kiblnd_setup_mtu_locked(struct rdma_cm_id *cmid) cmid->route.path_rec->mtu = mtu; } +#ifdef HAVE_OFED_IB_COMP_VECTOR +static int +kiblnd_get_completion_vector(kib_conn_t *conn, int cpt) +{ + cpumask_t *mask; + int vectors; + int off; + int i; + + vectors = conn->ibc_cmid->device->num_comp_vectors; + if (vectors <= 1) + return 0; + + mask = cfs_cpt_cpumask(lnet_cpt_table(), cpt); + + /* hash NID to CPU id in this partition... 
*/ + off = conn->ibc_peer->ibp_nid % cpus_weight(*mask); + for_each_cpu_mask(i, *mask) { + if (off-- == 0) + return i % vectors; + } + + LBUG(); + return 1; +} +#endif + kib_conn_t * kiblnd_create_conn(kib_peer_t *peer, struct rdma_cm_id *cmid, int state, int version) @@ -701,31 +729,37 @@ kiblnd_create_conn(kib_peer_t *peer, struct rdma_cm_id *cmid, kib_net_t *net = peer->ibp_ni->ni_data; kib_dev_t *dev = net->ibn_dev; struct ib_qp_init_attr *init_qp_attr; - kib_conn_t *conn; - struct ib_cq *cq; - unsigned long flags; - int rc; - int i; - - LASSERT (net != NULL); - LASSERT (!cfs_in_interrupt()); - - LIBCFS_ALLOC(init_qp_attr, sizeof(*init_qp_attr)); - if (init_qp_attr == NULL) { - CERROR("Can't allocate qp_attr for %s\n", - libcfs_nid2str(peer->ibp_nid)); - goto failed_0; - } - - LIBCFS_ALLOC(conn, sizeof(*conn)); + struct kib_sched_info *sched; + kib_conn_t *conn; + struct ib_cq *cq; + unsigned long flags; + int cpt; + int rc; + int i; + + LASSERT(net != NULL); + LASSERT(!cfs_in_interrupt()); + + cpt = lnet_cpt_of_nid(peer->ibp_nid); + sched = kiblnd_data.kib_scheds[cpt]; + + LASSERT(sched->ibs_nthreads > 0); + + LIBCFS_CPT_ALLOC(init_qp_attr, lnet_cpt_table(), cpt, + sizeof(*init_qp_attr)); + if (init_qp_attr == NULL) { + CERROR("Can't allocate qp_attr for %s\n", + libcfs_nid2str(peer->ibp_nid)); + goto failed_0; + } + + LIBCFS_CPT_ALLOC(conn, lnet_cpt_table(), cpt, sizeof(*conn)); if (conn == NULL) { CERROR("Can't allocate connection for %s\n", libcfs_nid2str(peer->ibp_nid)); goto failed_1; } - memset(conn, 0, sizeof(*conn)); /* zero flags, NULL pointers etc... */ - conn->ibc_state = IBLND_CONN_INIT; conn->ibc_version = version; conn->ibc_peer = peer; /* I take the caller's ref */ @@ -740,12 +774,12 @@ kiblnd_create_conn(kib_peer_t *peer, struct rdma_cm_id *cmid, CFS_INIT_LIST_HEAD(&conn->ibc_active_txs); cfs_spin_lock_init(&conn->ibc_lock); - LIBCFS_ALLOC(conn->ibc_connvars, sizeof(*conn->ibc_connvars)); + LIBCFS_CPT_ALLOC(conn->ibc_connvars, lnet_cpt_table(), cpt, + sizeof(*conn->ibc_connvars)); if (conn->ibc_connvars == NULL) { CERROR("Can't allocate in-progress connection state\n"); goto failed_2; } - memset(conn->ibc_connvars, 0, sizeof(*conn->ibc_connvars)); cfs_write_lock_irqsave(glock, flags); if (dev->ibd_failover) { @@ -775,24 +809,25 @@ kiblnd_create_conn(kib_peer_t *peer, struct rdma_cm_id *cmid, cfs_write_unlock_irqrestore(glock, flags); - LIBCFS_ALLOC(conn->ibc_rxs, IBLND_RX_MSGS(version) * sizeof(kib_rx_t)); - if (conn->ibc_rxs == NULL) { - CERROR("Cannot allocate RX buffers\n"); - goto failed_2; - } - memset(conn->ibc_rxs, 0, IBLND_RX_MSGS(version) * sizeof(kib_rx_t)); + LIBCFS_CPT_ALLOC(conn->ibc_rxs, lnet_cpt_table(), cpt, + IBLND_RX_MSGS(version) * sizeof(kib_rx_t)); + if (conn->ibc_rxs == NULL) { + CERROR("Cannot allocate RX buffers\n"); + goto failed_2; + } - rc = kiblnd_alloc_pages(&conn->ibc_rx_pages, - IBLND_RX_MSG_PAGES(version)); + rc = kiblnd_alloc_pages(&conn->ibc_rx_pages, cpt, + IBLND_RX_MSG_PAGES(version)); if (rc != 0) goto failed_2; kiblnd_map_rx_descs(conn); #ifdef HAVE_OFED_IB_COMP_VECTOR - cq = ib_create_cq(cmid->device, - kiblnd_cq_completion, kiblnd_cq_event, conn, - IBLND_CQ_ENTRIES(version), 0); + cq = ib_create_cq(cmid->device, + kiblnd_cq_completion, kiblnd_cq_event, conn, + IBLND_CQ_ENTRIES(version), + kiblnd_get_completion_vector(conn, cpt)); #else cq = ib_create_cq(cmid->device, kiblnd_cq_completion, kiblnd_cq_event, conn, @@ -812,7 +847,6 @@ kiblnd_create_conn(kib_peer_t *peer, struct rdma_cm_id *cmid, goto failed_2; } - 
memset(init_qp_attr, 0, sizeof(*init_qp_attr)); init_qp_attr->event_handler = kiblnd_qp_event; init_qp_attr->qp_context = conn; init_qp_attr->cap.max_send_wr = IBLND_SEND_WRS(version); @@ -824,6 +858,8 @@ kiblnd_create_conn(kib_peer_t *peer, struct rdma_cm_id *cmid, init_qp_attr->send_cq = cq; init_qp_attr->recv_cq = cq; + conn->ibc_sched = sched; + rc = rdma_create_qp(cmid, conn->ibc_hdev->ibh_pd, init_qp_attr); if (rc != 0) { CERROR("Can't create QP: %d, send_wr: %d, recv_wr: %d\n", @@ -850,11 +886,9 @@ kiblnd_create_conn(kib_peer_t *peer, struct rdma_cm_id *cmid, /* correct # of posted buffers * NB locking needed now I'm racing with completion */ - cfs_spin_lock_irqsave(&kiblnd_data.kib_sched_lock, - flags); - conn->ibc_nrx -= IBLND_RX_MSGS(version) - i; - cfs_spin_unlock_irqrestore(&kiblnd_data.kib_sched_lock, - flags); + cfs_spin_lock_irqsave(&sched->ibs_lock, flags); + conn->ibc_nrx -= IBLND_RX_MSGS(version) - i; + cfs_spin_unlock_irqrestore(&sched->ibs_lock, flags); /* cmid will be destroyed by CM(ofed) after cm_callback * returned, so we can't refer it anymore @@ -1145,25 +1179,27 @@ kiblnd_query (lnet_ni_t *ni, lnet_nid_t nid, cfs_time_t *when) } void -kiblnd_free_pages (kib_pages_t *p) +kiblnd_free_pages(kib_pages_t *p) { - int npages = p->ibp_npages; - int i; + int npages = p->ibp_npages; + int i; - for (i = 0; i < npages; i++) - if (p->ibp_pages[i] != NULL) - __free_page(p->ibp_pages[i]); + for (i = 0; i < npages; i++) { + if (p->ibp_pages[i] != NULL) + cfs_free_page(p->ibp_pages[i]); + } - LIBCFS_FREE (p, offsetof(kib_pages_t, ibp_pages[npages])); + LIBCFS_FREE(p, offsetof(kib_pages_t, ibp_pages[npages])); } int -kiblnd_alloc_pages (kib_pages_t **pp, int npages) +kiblnd_alloc_pages(kib_pages_t **pp, int cpt, int npages) { - kib_pages_t *p; - int i; + kib_pages_t *p; + int i; - LIBCFS_ALLOC(p, offsetof(kib_pages_t, ibp_pages[npages])); + LIBCFS_CPT_ALLOC(p, lnet_cpt_table(), cpt, + offsetof(kib_pages_t, ibp_pages[npages])); if (p == NULL) { CERROR("Can't allocate descriptor for %d pages\n", npages); return -ENOMEM; @@ -1173,7 +1209,8 @@ kiblnd_alloc_pages (kib_pages_t **pp, int npages) p->ibp_npages = npages; for (i = 0; i < npages; i++) { - p->ibp_pages[i] = alloc_page(GFP_KERNEL); + p->ibp_pages[i] = cfs_page_cpt_alloc(lnet_cpt_table(), cpt, + CFS_ALLOC_IO); if (p->ibp_pages[i] == NULL) { CERROR("Can't allocate page %d of %d\n", i, npages); kiblnd_free_pages(p); @@ -1427,6 +1464,20 @@ kiblnd_destroy_fmr_pool_list(cfs_list_t *head) } } +static int kiblnd_fmr_pool_size(int ncpts) +{ + int size = *kiblnd_tunables.kib_fmr_pool_size / ncpts; + + return max(IBLND_FMR_POOL, size); +} + +static int kiblnd_fmr_flush_trigger(int ncpts) +{ + int size = *kiblnd_tunables.kib_fmr_flush_trigger / ncpts; + + return max(IBLND_FMR_POOL_FLUSH, size); +} + int kiblnd_create_fmr_pool(kib_fmr_poolset_t *fps, kib_fmr_pool_t **pp_fpo) { @@ -1438,24 +1489,24 @@ kiblnd_create_fmr_pool(kib_fmr_poolset_t *fps, kib_fmr_pool_t **pp_fpo) .page_shift = PAGE_SHIFT, .access = (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE), - .pool_size = *kiblnd_tunables.kib_fmr_pool_size, - .dirty_watermark = *kiblnd_tunables.kib_fmr_flush_trigger, - .flush_function = NULL, - .flush_arg = NULL, - .cache = !!*kiblnd_tunables.kib_fmr_cache}; - int rc; - - LIBCFS_ALLOC(fpo, sizeof(kib_fmr_pool_t)); - if (fpo == NULL) - return -ENOMEM; + .pool_size = fps->fps_pool_size, + .dirty_watermark = fps->fps_flush_trigger, + .flush_function = NULL, + .flush_arg = NULL, + .cache = !!*kiblnd_tunables.kib_fmr_cache}; + int rc; + + 
LIBCFS_CPT_ALLOC(fpo, lnet_cpt_table(), fps->fps_cpt, sizeof(*fpo)); + if (fpo == NULL) + return -ENOMEM; + + fpo->fpo_hdev = kiblnd_current_hdev(dev); + + fpo->fpo_fmr_pool = ib_create_fmr_pool(fpo->fpo_hdev->ibh_pd, ¶m); + if (IS_ERR(fpo->fpo_fmr_pool)) { + rc = PTR_ERR(fpo->fpo_fmr_pool); + CERROR("Failed to create FMR pool: %d\n", rc); - memset(fpo, 0, sizeof(kib_fmr_pool_t)); - fpo->fpo_hdev = kiblnd_current_hdev(dev); - fpo->fpo_fmr_pool = ib_create_fmr_pool(fpo->fpo_hdev->ibh_pd, ¶m); - if (IS_ERR(fpo->fpo_fmr_pool)) { - CERROR("Failed to create FMR pool: %ld\n", - PTR_ERR(fpo->fpo_fmr_pool)); - rc = PTR_ERR(fpo->fpo_fmr_pool); kiblnd_hdev_decref(fpo->fpo_hdev); LIBCFS_FREE(fpo, sizeof(kib_fmr_pool_t)); return rc; @@ -1469,8 +1520,11 @@ kiblnd_create_fmr_pool(kib_fmr_poolset_t *fps, kib_fmr_pool_t **pp_fpo) } static void -kiblnd_fail_fmr_pool_set(kib_fmr_poolset_t *fps, cfs_list_t *zombies) +kiblnd_fail_fmr_poolset(kib_fmr_poolset_t *fps, cfs_list_t *zombies) { + if (fps->fps_net == NULL) /* intialized? */ + return; + cfs_spin_lock(&fps->fps_lock); while (!cfs_list_empty(&fps->fps_pool_list)) { @@ -1488,14 +1542,17 @@ kiblnd_fail_fmr_pool_set(kib_fmr_poolset_t *fps, cfs_list_t *zombies) } static void -kiblnd_fini_fmr_pool_set(kib_fmr_poolset_t *fps) +kiblnd_fini_fmr_poolset(kib_fmr_poolset_t *fps) { - kiblnd_destroy_fmr_pool_list(&fps->fps_failed_pool_list); - kiblnd_destroy_fmr_pool_list(&fps->fps_pool_list); + if (fps->fps_net != NULL) { /* initialized? */ + kiblnd_destroy_fmr_pool_list(&fps->fps_failed_pool_list); + kiblnd_destroy_fmr_pool_list(&fps->fps_pool_list); + } } static int -kiblnd_init_fmr_pool_set(kib_fmr_poolset_t *fps, kib_net_t *net) +kiblnd_init_fmr_poolset(kib_fmr_poolset_t *fps, int cpt, kib_net_t *net, + int pool_size, int flush_trigger) { kib_fmr_pool_t *fpo; int rc; @@ -1503,9 +1560,13 @@ kiblnd_init_fmr_pool_set(kib_fmr_poolset_t *fps, kib_net_t *net) memset(fps, 0, sizeof(kib_fmr_poolset_t)); fps->fps_net = net; - cfs_spin_lock_init(&fps->fps_lock); - CFS_INIT_LIST_HEAD(&fps->fps_pool_list); - CFS_INIT_LIST_HEAD(&fps->fps_failed_pool_list); + fps->fps_cpt = cpt; + fps->fps_pool_size = pool_size; + fps->fps_flush_trigger = flush_trigger; + cfs_spin_lock_init(&fps->fps_lock); + CFS_INIT_LIST_HEAD(&fps->fps_pool_list); + CFS_INIT_LIST_HEAD(&fps->fps_failed_pool_list); + rc = kiblnd_create_fmr_pool(fps, &fpo); if (rc == 0) cfs_list_add_tail(&fpo->fpo_list, &fps->fps_pool_list); @@ -1572,7 +1633,6 @@ kiblnd_fmr_pool_map(kib_fmr_poolset_t *fps, __u64 *pages, int npages, __u64 version; int rc; - LASSERT (fps->fps_net->ibn_with_fmr); again: cfs_spin_lock(&fps->fps_lock); version = fps->fps_version; @@ -1672,8 +1732,11 @@ kiblnd_destroy_pool_list(cfs_list_t *head) } static void -kiblnd_fail_pool_set(kib_poolset_t *ps, cfs_list_t *zombies) +kiblnd_fail_poolset(kib_poolset_t *ps, cfs_list_t *zombies) { + if (ps->ps_net == NULL) /* intialized? */ + return; + cfs_spin_lock(&ps->ps_lock); while (!cfs_list_empty(&ps->ps_pool_list)) { kib_pool_t *po = cfs_list_entry(ps->ps_pool_list.next, @@ -1689,25 +1752,28 @@ kiblnd_fail_pool_set(kib_poolset_t *ps, cfs_list_t *zombies) } static void -kiblnd_fini_pool_set(kib_poolset_t *ps) +kiblnd_fini_poolset(kib_poolset_t *ps) { - kiblnd_destroy_pool_list(&ps->ps_failed_pool_list); - kiblnd_destroy_pool_list(&ps->ps_pool_list); + if (ps->ps_net != NULL) { /* initialized? 
*/ + kiblnd_destroy_pool_list(&ps->ps_failed_pool_list); + kiblnd_destroy_pool_list(&ps->ps_pool_list); + } } static int -kiblnd_init_pool_set(kib_poolset_t *ps, kib_net_t *net, - char *name, int size, - kib_ps_pool_create_t po_create, - kib_ps_pool_destroy_t po_destroy, - kib_ps_node_init_t nd_init, - kib_ps_node_fini_t nd_fini) +kiblnd_init_poolset(kib_poolset_t *ps, int cpt, + kib_net_t *net, char *name, int size, + kib_ps_pool_create_t po_create, + kib_ps_pool_destroy_t po_destroy, + kib_ps_node_init_t nd_init, + kib_ps_node_fini_t nd_fini) { - kib_pool_t *pool; - int rc; + kib_pool_t *pool; + int rc; - memset(ps, 0, sizeof(kib_poolset_t)); + memset(ps, 0, sizeof(kib_poolset_t)); + ps->ps_cpt = cpt; ps->ps_net = net; ps->ps_pool_create = po_create; ps->ps_pool_destroy = po_destroy; @@ -1922,15 +1988,23 @@ kiblnd_destroy_pmr_pool(kib_pool_t *pool) LIBCFS_FREE(ppo, sizeof(kib_pmr_pool_t)); } +static inline int kiblnd_pmr_pool_size(int ncpts) +{ + int size = *kiblnd_tunables.kib_pmr_pool_size / ncpts; + + return max(IBLND_PMR_POOL, size); +} + static int kiblnd_create_pmr_pool(kib_poolset_t *ps, int size, kib_pool_t **pp_po) { - kib_pmr_pool_t *ppo; - kib_pool_t *pool; - kib_phys_mr_t *pmr; - int i; + struct kib_pmr_pool *ppo; + struct kib_pool *pool; + kib_phys_mr_t *pmr; + int i; - LIBCFS_ALLOC(ppo, sizeof(kib_pmr_pool_t)); + LIBCFS_CPT_ALLOC(ppo, lnet_cpt_table(), + ps->ps_cpt, sizeof(kib_pmr_pool_t)); if (ppo == NULL) { CERROR("Failed to allocate PMR pool\n"); return -ENOMEM; @@ -1940,15 +2014,14 @@ kiblnd_create_pmr_pool(kib_poolset_t *ps, int size, kib_pool_t **pp_po) kiblnd_init_pool(ps, pool, size); for (i = 0; i < size; i++) { - LIBCFS_ALLOC(pmr, sizeof(kib_phys_mr_t)); - if (pmr == NULL) - break; - - memset(pmr, 0, sizeof(kib_phys_mr_t)); - pmr->pmr_pool = ppo; - LIBCFS_ALLOC(pmr->pmr_ipb, - IBLND_MAX_RDMA_FRAGS * - sizeof(struct ib_phys_buf)); + LIBCFS_CPT_ALLOC(pmr, lnet_cpt_table(), + ps->ps_cpt, sizeof(kib_phys_mr_t)); + if (pmr == NULL) + break; + + pmr->pmr_pool = ppo; + LIBCFS_CPT_ALLOC(pmr->pmr_ipb, lnet_cpt_table(), ps->ps_cpt, + IBLND_MAX_RDMA_FRAGS * sizeof(*pmr->pmr_ipb)); if (pmr->pmr_ipb == NULL) break; @@ -2014,6 +2087,13 @@ out: LIBCFS_FREE(tpo, sizeof(kib_tx_pool_t)); } +static int kiblnd_tx_pool_size(int ncpts) +{ + int ntx = *kiblnd_tunables.kib_ntx / ncpts; + + return max(IBLND_TX_POOL, ntx); +} + static int kiblnd_create_tx_pool(kib_poolset_t *ps, int size, kib_pool_t **pp_po) { @@ -2022,7 +2102,7 @@ kiblnd_create_tx_pool(kib_poolset_t *ps, int size, kib_pool_t **pp_po) kib_pool_t *pool; kib_tx_pool_t *tpo; - LIBCFS_ALLOC(tpo, sizeof(kib_tx_pool_t)); + LIBCFS_CPT_ALLOC(tpo, lnet_cpt_table(), ps->ps_cpt, sizeof(*tpo)); if (tpo == NULL) { CERROR("Failed to allocate TX pool\n"); return -ENOMEM; @@ -2034,13 +2114,14 @@ kiblnd_create_tx_pool(kib_poolset_t *ps, int size, kib_pool_t **pp_po) tpo->tpo_tx_pages = NULL; npg = (size * IBLND_MSG_SIZE + PAGE_SIZE - 1) / PAGE_SIZE; - if (kiblnd_alloc_pages(&tpo->tpo_tx_pages, npg) != 0) { - CERROR("Can't allocate tx pages: %d\n", npg); - LIBCFS_FREE(tpo, sizeof(kib_tx_pool_t)); - return -ENOMEM; - } - - LIBCFS_ALLOC (tpo->tpo_tx_descs, size * sizeof(kib_tx_t)); + if (kiblnd_alloc_pages(&tpo->tpo_tx_pages, ps->ps_cpt, npg) != 0) { + CERROR("Can't allocate tx pages: %d\n", npg); + LIBCFS_FREE(tpo, sizeof(kib_tx_pool_t)); + return -ENOMEM; + } + + LIBCFS_CPT_ALLOC(tpo->tpo_tx_descs, lnet_cpt_table(), ps->ps_cpt, + size * sizeof(kib_tx_t)); if (tpo->tpo_tx_descs == NULL) { CERROR("Can't allocate %d tx descriptors\n", size); 
ps->ps_pool_destroy(pool); @@ -2053,36 +2134,36 @@ kiblnd_create_tx_pool(kib_poolset_t *ps, int size, kib_pool_t **pp_po) kib_tx_t *tx = &tpo->tpo_tx_descs[i]; tx->tx_pool = tpo; - if (ps->ps_net->ibn_with_fmr){ - LIBCFS_ALLOC(tx->tx_pages, LNET_MAX_IOV * - sizeof(*tx->tx_pages)); - if (tx->tx_pages == NULL) - break; - } - - LIBCFS_ALLOC(tx->tx_frags, - IBLND_MAX_RDMA_FRAGS * - sizeof(*tx->tx_frags)); - if (tx->tx_frags == NULL) - break; - - LIBCFS_ALLOC(tx->tx_wrq, - (1 + IBLND_MAX_RDMA_FRAGS) * - sizeof(*tx->tx_wrq)); - if (tx->tx_wrq == NULL) - break; - - LIBCFS_ALLOC(tx->tx_sge, - (1 + IBLND_MAX_RDMA_FRAGS) * - sizeof(*tx->tx_sge)); - if (tx->tx_sge == NULL) - break; - - LIBCFS_ALLOC(tx->tx_rd, - offsetof(kib_rdma_desc_t, - rd_frags[IBLND_MAX_RDMA_FRAGS])); - if (tx->tx_rd == NULL) - break; + if (ps->ps_net->ibn_fmr_ps != NULL) { + LIBCFS_CPT_ALLOC(tx->tx_pages, + lnet_cpt_table(), ps->ps_cpt, + LNET_MAX_IOV * sizeof(*tx->tx_pages)); + if (tx->tx_pages == NULL) + break; + } + + LIBCFS_CPT_ALLOC(tx->tx_frags, lnet_cpt_table(), ps->ps_cpt, + IBLND_MAX_RDMA_FRAGS * sizeof(*tx->tx_frags)); + if (tx->tx_frags == NULL) + break; + + LIBCFS_CPT_ALLOC(tx->tx_wrq, lnet_cpt_table(), ps->ps_cpt, + (1 + IBLND_MAX_RDMA_FRAGS) * + sizeof(*tx->tx_wrq)); + if (tx->tx_wrq == NULL) + break; + + LIBCFS_CPT_ALLOC(tx->tx_sge, lnet_cpt_table(), ps->ps_cpt, + (1 + IBLND_MAX_RDMA_FRAGS) * + sizeof(*tx->tx_sge)); + if (tx->tx_sge == NULL) + break; + + LIBCFS_CPT_ALLOC(tx->tx_rd, lnet_cpt_table(), ps->ps_cpt, + offsetof(kib_rdma_desc_t, + rd_frags[IBLND_MAX_RDMA_FRAGS])); + if (tx->tx_rd == NULL) + break; } if (i == size) { @@ -2106,76 +2187,149 @@ kiblnd_tx_init(kib_pool_t *pool, cfs_list_t *node) } void -kiblnd_ni_fini_pools(kib_net_t *net) +kiblnd_net_fini_pools(kib_net_t *net) { - kiblnd_fini_pool_set(&net->ibn_tx_ps.tps_poolset); - if (net->ibn_with_fmr) - kiblnd_fini_fmr_pool_set(&net->ibn_fmr_ps); - else if (net->ibn_with_pmr) - kiblnd_fini_pool_set(&net->ibn_pmr_ps.pps_poolset); -} + int i; -int -kiblnd_net_init_pools(kib_net_t *net) -{ - kib_fmr_poolset_t *fps = &net->ibn_fmr_ps; - kib_pmr_poolset_t *pps = &net->ibn_pmr_ps; - kib_tx_poolset_t *tps = &net->ibn_tx_ps; - unsigned long flags; - int rc; + cfs_cpt_for_each(i, lnet_cpt_table()) { + kib_tx_poolset_t *tps; + kib_fmr_poolset_t *fps; + kib_pmr_poolset_t *pps; - if (*kiblnd_tunables.kib_fmr_pool_size < - *kiblnd_tunables.kib_ntx / 4) { - CERROR("Can't set fmr pool size (%d) < ntx / 4(%d)\n", - *kiblnd_tunables.kib_fmr_pool_size, - *kiblnd_tunables.kib_ntx / 4); - return -EINVAL; - } + if (net->ibn_tx_ps != NULL) { + tps = net->ibn_tx_ps[i]; + kiblnd_fini_poolset(&tps->tps_poolset); + } - if (*kiblnd_tunables.kib_pmr_pool_size < - *kiblnd_tunables.kib_ntx / 4) { - CERROR("Can't set pmr pool size (%d) < ntx / 4(%d)\n", - *kiblnd_tunables.kib_pmr_pool_size, - *kiblnd_tunables.kib_ntx / 4); - return -EINVAL; - } + if (net->ibn_fmr_ps != NULL) { + fps = net->ibn_fmr_ps[i]; + kiblnd_fini_fmr_poolset(fps); + } - cfs_read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); - if (*kiblnd_tunables.kib_map_on_demand > 0 || - net->ibn_dev->ibd_hdev->ibh_nmrs > 1) { - /* premapping can fail if ibd_nmr > 1, so we always create - * FMR/PMR pool and map-on-demand if premapping failed */ - cfs_read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); - rc = kiblnd_init_fmr_pool_set(fps, net); - if (rc == 0) { - net->ibn_with_fmr = 1; - } else if (rc == -ENOSYS) { - rc = kiblnd_init_pool_set(&pps->pps_poolset, net, "PMR", - *kiblnd_tunables.kib_pmr_pool_size, - 
kiblnd_create_pmr_pool, - kiblnd_destroy_pmr_pool, - NULL, NULL); - if (rc == 0) - net->ibn_with_pmr = 1; - } - if (rc != 0) - return rc; - } else { - cfs_read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); - } + if (net->ibn_pmr_ps != NULL) { + pps = net->ibn_pmr_ps[i]; + kiblnd_fini_poolset(&pps->pps_poolset); + } + } - rc = kiblnd_init_pool_set(&tps->tps_poolset, net, "TX", IBLND_TX_MSGS(), - kiblnd_create_tx_pool, kiblnd_destroy_tx_pool, - kiblnd_tx_init, NULL); - if (rc == 0) - return 0; + if (net->ibn_tx_ps != NULL) { + cfs_percpt_free(net->ibn_tx_ps); + net->ibn_tx_ps = NULL; + } - if (net->ibn_with_fmr) - kiblnd_fini_fmr_pool_set(fps); - else if (net->ibn_with_pmr) - kiblnd_fini_pool_set(&pps->pps_poolset); + if (net->ibn_fmr_ps != NULL) { + cfs_percpt_free(net->ibn_fmr_ps); + net->ibn_fmr_ps = NULL; + } - return rc; + if (net->ibn_pmr_ps != NULL) { + cfs_percpt_free(net->ibn_pmr_ps); + net->ibn_pmr_ps = NULL; + } +} + +int +kiblnd_net_init_pools(kib_net_t *net, __u32 *cpts, int ncpts) +{ + unsigned long flags; + int cpt; + int rc; + int i; + + net->ibn_tx_ps = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(kib_tx_poolset_t)); + if (net->ibn_tx_ps == NULL) { + CERROR("Failed to allocate tx pool array\n"); + return -ENOMEM; + } + + for (i = 0; i < ncpts; i++) { + cpt = (cpts == NULL) ? i : cpts[i]; + rc = kiblnd_init_poolset(&net->ibn_tx_ps[cpt]->tps_poolset, + cpt, net, "TX", + kiblnd_tx_pool_size(ncpts), + kiblnd_create_tx_pool, + kiblnd_destroy_tx_pool, + kiblnd_tx_init, NULL); + if (rc != 0) { + CERROR("Failed to initialize TX pool\n"); + goto failed; + } + } + + cfs_read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + if (*kiblnd_tunables.kib_map_on_demand == 0 && + net->ibn_dev->ibd_hdev->ibh_nmrs == 1) { + cfs_read_unlock_irqrestore(&kiblnd_data.kib_global_lock, + flags); + return 0; + } + + cfs_read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + if (*kiblnd_tunables.kib_fmr_pool_size < + *kiblnd_tunables.kib_ntx / 4) { + CERROR("Can't set fmr pool size (%d) < ntx / 4(%d)\n", + *kiblnd_tunables.kib_fmr_pool_size, + *kiblnd_tunables.kib_ntx / 4); + goto failed; + } + + /* premapping can fail if ibd_nmr > 1, so we always create + * FMR/PMR pool and map-on-demand if premapping failed */ + + net->ibn_fmr_ps = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(kib_fmr_poolset_t)); + if (net->ibn_fmr_ps == NULL) { + CERROR("Failed to allocate FMR pool array\n"); + goto failed; + } + + for (i = 0; i < ncpts; i++) { + cpt = (cpts == NULL) ? i : cpts[i]; + rc = kiblnd_init_fmr_poolset(net->ibn_fmr_ps[cpt], cpt, net, + kiblnd_fmr_pool_size(ncpts), + kiblnd_fmr_flush_trigger(ncpts)); + if (rc == -ENOSYS && i == 0) /* no FMR */ + break; /* create PMR pool */ + if (rc != 0) + goto failed; /* a real error */ + } + + cfs_percpt_free(net->ibn_fmr_ps); + net->ibn_fmr_ps = NULL; + + if (*kiblnd_tunables.kib_pmr_pool_size < + *kiblnd_tunables.kib_ntx / 4) { + CERROR("Can't set pmr pool size (%d) < ntx / 4(%d)\n", + *kiblnd_tunables.kib_pmr_pool_size, + *kiblnd_tunables.kib_ntx / 4); + goto failed; + } + + net->ibn_pmr_ps = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(kib_pmr_poolset_t)); + if (net->ibn_pmr_ps == NULL) { + CERROR("Failed to allocate PMR pool array\n"); + goto failed; + } + + for (i = 0; i < ncpts; i++) { + cpt = (cpts == NULL) ? 
i : cpts[i]; + rc = kiblnd_init_poolset(&net->ibn_pmr_ps[cpt]->pps_poolset, + cpt, net, "PMR", + kiblnd_pmr_pool_size(ncpts), + kiblnd_create_pmr_pool, + kiblnd_destroy_pmr_pool, NULL, NULL); + if (rc != 0) + goto failed; + } + + return 0; + + failed: + kiblnd_net_fini_pools(net); + return rc; } static int @@ -2421,6 +2575,7 @@ kiblnd_dev_failover(kib_dev_t *dev) struct sockaddr_in addr; unsigned long flags; int rc = 0; + int i; LASSERT (*kiblnd_tunables.kib_dev_failover > 1 || dev->ibd_can_failover || @@ -2469,14 +2624,14 @@ kiblnd_dev_failover(kib_dev_t *dev) goto out; } - LIBCFS_ALLOC(hdev, sizeof(*hdev)); + LIBCFS_ALLOC(hdev, sizeof(*hdev)); if (hdev == NULL) { CERROR("Failed to allocate kib_hca_dev\n"); rdma_destroy_id(cmid); rc = -ENOMEM; goto out; } - memset(hdev, 0, sizeof(*hdev)); + atomic_set(&hdev->ibh_ref, 1); hdev->ibh_dev = dev; hdev->ibh_cmid = cmid; @@ -2510,12 +2665,20 @@ kiblnd_dev_failover(kib_dev_t *dev) hdev = old; cfs_list_for_each_entry(net, &dev->ibd_nets, ibn_list) { - kiblnd_fail_pool_set(&net->ibn_tx_ps.tps_poolset, &zombie_tpo); - if (net->ibn_with_pmr) - kiblnd_fail_pool_set(&net->ibn_pmr_ps.pps_poolset, &zombie_ppo); - if (net->ibn_with_fmr) - kiblnd_fail_fmr_pool_set(&net->ibn_fmr_ps, &zombie_fpo); - } + cfs_cpt_for_each(i, lnet_cpt_table()) { + kiblnd_fail_poolset(&net->ibn_tx_ps[i]->tps_poolset, + &zombie_tpo); + + if (net->ibn_fmr_ps != NULL) { + kiblnd_fail_fmr_poolset(net->ibn_fmr_ps[i], + &zombie_fpo); + + } else if (net->ibn_pmr_ps != NULL) { + kiblnd_fail_poolset(&net->ibn_pmr_ps[i]-> + pps_poolset, &zombie_ppo); + } + } + } cfs_write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); out: @@ -2610,9 +2773,10 @@ kiblnd_create_dev(char *ifname) } void -kiblnd_base_shutdown (void) +kiblnd_base_shutdown(void) { - int i; + struct kib_sched_info *sched; + int i; LASSERT (cfs_list_empty(&kiblnd_data.kib_devs)); @@ -2634,7 +2798,13 @@ kiblnd_base_shutdown (void) /* flag threads to terminate; wake and wait for them to die */ kiblnd_data.kib_shutdown = 1; - cfs_waitq_broadcast(&kiblnd_data.kib_sched_waitq); + + /* NB: we really want to stop scheduler threads net by net + * instead of the whole module, this should be improved + * with dynamic configuration LNet */ + cfs_percpt_for_each(sched, i, kiblnd_data.kib_scheds) + cfs_waitq_broadcast(&sched->ibs_waitq); + cfs_waitq_broadcast(&kiblnd_data.kib_connd_waitq); cfs_waitq_broadcast(&kiblnd_data.kib_failover_waitq); @@ -2653,10 +2823,14 @@ kiblnd_base_shutdown (void) break; } - if (kiblnd_data.kib_peers != NULL) - LIBCFS_FREE(kiblnd_data.kib_peers, - sizeof(cfs_list_t) * - kiblnd_data.kib_peer_hash_size); + if (kiblnd_data.kib_peers != NULL) { + LIBCFS_FREE(kiblnd_data.kib_peers, + sizeof(cfs_list_t) * + kiblnd_data.kib_peer_hash_size); + } + + if (kiblnd_data.kib_scheds != NULL) + cfs_percpt_free(kiblnd_data.kib_scheds); CDEBUG(D_MALLOC, "after LND base cleanup: kmem %d\n", cfs_atomic_read(&libcfs_kmemory)); @@ -2704,7 +2878,7 @@ kiblnd_shutdown (lnet_ni_t *ni) cfs_pause(cfs_time_seconds(1)); } - kiblnd_ni_fini_pools(net); + kiblnd_net_fini_pools(net); cfs_write_lock_irqsave(g_lock, flags); LASSERT (net->ibn_dev->ibd_nnets > 0); @@ -2739,10 +2913,11 @@ out: } int -kiblnd_base_startup (void) +kiblnd_base_startup(void) { - int i; - int rc; + struct kib_sched_info *sched; + int rc; + int i; LASSERT (kiblnd_data.kib_init == IBLND_INIT_NOTHING); @@ -2768,11 +2943,32 @@ kiblnd_base_startup (void) CFS_INIT_LIST_HEAD(&kiblnd_data.kib_connd_conns); CFS_INIT_LIST_HEAD(&kiblnd_data.kib_connd_zombies); 
cfs_waitq_init(&kiblnd_data.kib_connd_waitq); + cfs_waitq_init(&kiblnd_data.kib_failover_waitq); + + kiblnd_data.kib_scheds = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(*sched)); + if (kiblnd_data.kib_scheds == NULL) + goto failed; - cfs_spin_lock_init(&kiblnd_data.kib_sched_lock); - CFS_INIT_LIST_HEAD(&kiblnd_data.kib_sched_conns); - cfs_waitq_init(&kiblnd_data.kib_sched_waitq); - cfs_waitq_init(&kiblnd_data.kib_failover_waitq); + cfs_percpt_for_each(sched, i, kiblnd_data.kib_scheds) { + int nthrs; + + cfs_spin_lock_init(&sched->ibs_lock); + CFS_INIT_LIST_HEAD(&sched->ibs_conns); + cfs_waitq_init(&sched->ibs_waitq); + + nthrs = cfs_cpt_weight(lnet_cpt_table(), i); + if (*kiblnd_tunables.kib_nscheds > 0) { + nthrs = min(nthrs, *kiblnd_tunables.kib_nscheds); + } else { + /* max to half of CPUs, another half is reserved for + * upper layer modules */ + nthrs = min(max(IBLND_N_SCHED, nthrs >> 1), nthrs); + } + + sched->ibs_nthreads_max = nthrs; + sched->ibs_cpt = i; + } kiblnd_data.kib_error_qpa.qp_state = IB_QPS_ERR; @@ -2780,15 +2976,6 @@ kiblnd_base_startup (void) kiblnd_data.kib_init = IBLND_INIT_DATA; /*****************************************************/ - for (i = 0; i < IBLND_N_SCHED; i++) { - rc = kiblnd_thread_start(kiblnd_scheduler, (void *)((long)i)); - if (rc != 0) { - CERROR("Can't spawn o2iblnd scheduler[%d]: %d\n", - i, rc); - goto failed; - } - } - rc = kiblnd_thread_start(kiblnd_connd, NULL); if (rc != 0) { CERROR("Can't spawn o2iblnd connd: %d\n", rc); @@ -2815,15 +3002,113 @@ kiblnd_base_startup (void) } int +kiblnd_start_schedulers(struct kib_sched_info *sched) +{ + int rc = 0; + int nthrs; + int i; + + if (sched->ibs_nthreads == 0) { + if (*kiblnd_tunables.kib_nscheds > 0) { + nthrs = sched->ibs_nthreads_max; + } else { + nthrs = cfs_cpt_weight(lnet_cpt_table(), + sched->ibs_cpt); + nthrs = min(max(IBLND_N_SCHED, nthrs >> 1), nthrs); + nthrs = min(IBLND_N_SCHED_HIGH, nthrs); + } + } else { + LASSERT(sched->ibs_nthreads <= sched->ibs_nthreads_max); + /* increase one thread if there is new interface */ + nthrs = (sched->ibs_nthreads < sched->ibs_nthreads_max); + } + + for (i = 0; i < nthrs; i++) { + long id; + + id = KIB_THREAD_ID(sched->ibs_cpt, sched->ibs_nthreads + i); + rc = kiblnd_thread_start(kiblnd_scheduler, (void *)id); + if (rc == 0) + continue; + + CERROR("Can't spawn thread %d for scheduler[%d]: %d\n", + sched->ibs_cpt, sched->ibs_nthreads + i, rc); + break; + } + + sched->ibs_nthreads += i; + return rc; +} + +int +kiblnd_dev_start_threads(kib_dev_t *dev, int newdev, __u32 *cpts, int ncpts) +{ + int cpt; + int rc; + int i; + + for (i = 0; i < ncpts; i++) { + struct kib_sched_info *sched; + + cpt = (cpts == NULL) ? 
i : cpts[i]; + sched = kiblnd_data.kib_scheds[cpt]; + + if (!newdev && sched->ibs_nthreads > 0) + continue; + + rc = kiblnd_start_schedulers(kiblnd_data.kib_scheds[cpt]); + if (rc != 0) { + CERROR("Failed to start scheduler threads for %s\n", + dev->ibd_ifname); + return rc; + } + } + return 0; +} + +kib_dev_t * +kiblnd_dev_search(char *ifname) +{ + kib_dev_t *alias = NULL; + kib_dev_t *dev; + char *colon; + char *colon2; + + colon = strchr(ifname, ':'); + cfs_list_for_each_entry(dev, &kiblnd_data.kib_devs, ibd_list) { + if (strcmp(&dev->ibd_ifname[0], ifname) == 0) + return dev; + + if (alias != NULL) + continue; + + colon2 = strchr(dev->ibd_ifname, ':'); + if (colon != NULL) + *colon = 0; + if (colon2 != NULL) + *colon2 = 0; + + if (strcmp(&dev->ibd_ifname[0], ifname) == 0) + alias = dev; + + if (colon != NULL) + *colon = ':'; + if (colon2 != NULL) + *colon2 = ':'; + } + return alias; +} + +int kiblnd_startup (lnet_ni_t *ni) { char *ifname; kib_dev_t *ibdev = NULL; kib_net_t *net; - cfs_list_t *tmp; struct timeval tv; unsigned long flags; int rc; + int newdev; LASSERT (ni->ni_lnd == &the_o2iblnd); @@ -2867,16 +3152,11 @@ kiblnd_startup (lnet_ni_t *ni) goto failed; } - cfs_list_for_each (tmp, &kiblnd_data.kib_devs) { - ibdev = cfs_list_entry(tmp, kib_dev_t, ibd_list); - - if (!strcmp(&ibdev->ibd_ifname[0], ifname)) - break; - - ibdev = NULL; - } + ibdev = kiblnd_dev_search(ifname); - if (ibdev == NULL) + newdev = ibdev == NULL; + /* hmm...create kib_dev even for alias */ + if (ibdev == NULL || strcmp(&ibdev->ibd_ifname[0], ifname) != 0) ibdev = kiblnd_create_dev(ifname); if (ibdev == NULL) @@ -2885,7 +3165,12 @@ kiblnd_startup (lnet_ni_t *ni) net->ibn_dev = ibdev; ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), ibdev->ibd_ifip); - rc = kiblnd_net_init_pools(net); + rc = kiblnd_dev_start_threads(ibdev, newdev, + ni->ni_cpts, ni->ni_ncpts); + if (rc != 0) + goto failed; + + rc = kiblnd_net_init_pools(net, ni->ni_cpts, ni->ni_ncpts); if (rc != 0) { CERROR("Failed to initialize NI pools: %d\n", rc); goto failed; diff --git a/lnet/klnds/o2iblnd/o2iblnd.h b/lnet/klnds/o2iblnd/o2iblnd.h index 4ae2a30..c7a4f8c 100644 --- a/lnet/klnds/o2iblnd/o2iblnd.h +++ b/lnet/klnds/o2iblnd/o2iblnd.h @@ -79,15 +79,12 @@ #include #include -/* tunables fixed at compile time */ -#ifdef CONFIG_SMP -# define IBLND_N_SCHED cfs_num_online_cpus() /* # schedulers */ -#else -# define IBLND_N_SCHED 1 /* # schedulers */ -#endif +#define IBLND_PEER_HASH_SIZE 101 /* # peer lists */ +/* # scheduler loops before reschedule */ +#define IBLND_RESCHED 100 -#define IBLND_PEER_HASH_SIZE 101 /* # peer lists */ -#define IBLND_RESCHED 100 /* # scheduler loops before reschedule */ +#define IBLND_N_SCHED 2 +#define IBLND_N_SCHED_HIGH 4 typedef struct { @@ -120,6 +117,8 @@ typedef struct #endif int *kib_require_priv_port;/* accept only privileged ports */ int *kib_use_priv_port; /* use privileged port for active connect */ + /* # threads on each CPT */ + int *kib_nscheds; } kib_tunables_t; extern kib_tunables_t kiblnd_tunables; @@ -172,6 +171,12 @@ kiblnd_concurrent_sends_v1(void) /************************/ /* derived constants... 
*/ +/* Pools (shared by connections on each CPT) */ +/* These pools can grow at runtime, so don't need give a very large value */ +#define IBLND_TX_POOL 256 +#define IBLND_PMR_POOL 256 +#define IBLND_FMR_POOL 256 +#define IBLND_FMR_POOL_FLUSH 192 /* TX messages (shared by all connections) */ #define IBLND_TX_MSGS() (*kiblnd_tunables.kib_ntx) @@ -253,12 +258,11 @@ typedef struct { struct kib_pool; struct kib_poolset; -typedef int (*kib_ps_pool_create_t)(struct kib_poolset *ps, int inc, struct kib_pool **pp_po); +typedef int (*kib_ps_pool_create_t)(struct kib_poolset *ps, + int inc, struct kib_pool **pp_po); typedef void (*kib_ps_pool_destroy_t)(struct kib_pool *po); -typedef void (*kib_ps_node_init_t)(struct kib_pool *po, - cfs_list_t *node); -typedef void (*kib_ps_node_fini_t)(struct kib_pool *po, - cfs_list_t *node); +typedef void (*kib_ps_node_init_t)(struct kib_pool *po, cfs_list_t *node); +typedef void (*kib_ps_node_fini_t)(struct kib_pool *po, cfs_list_t *node); struct kib_net; @@ -274,6 +278,7 @@ typedef struct kib_poolset cfs_time_t ps_next_retry; /* time stamp for retry if failed to allocate */ int ps_increasing; /* is allocating new pool */ int ps_pool_size; /* new pool size */ + int ps_cpt; /* CPT id */ kib_ps_pool_create_t ps_pool_create; /* create a new pool */ kib_ps_pool_destroy_t ps_pool_destroy; /* destroy a pool */ @@ -320,8 +325,13 @@ typedef struct cfs_list_t fps_pool_list; /* FMR pool list */ cfs_list_t fps_failed_pool_list; /* FMR pool list */ __u64 fps_version; /* validity stamp */ - int fps_increasing; /* is allocating new pool */ - cfs_time_t fps_next_retry; /* time stamp for retry if failed to allocate */ + int fps_cpt; /* CPT id */ + int fps_pool_size; + int fps_flush_trigger; + /* is allocating new pool */ + int fps_increasing; + /* time stamp for retry if failed to allocate */ + cfs_time_t fps_next_retry; } kib_fmr_poolset_t; typedef struct @@ -346,43 +356,64 @@ typedef struct kib_net __u64 ibn_incarnation; /* my epoch */ int ibn_init; /* initialisation state */ int ibn_shutdown; /* shutting down? */ - unsigned int ibn_with_fmr:1; /* FMR? */ - unsigned int ibn_with_pmr:1; /* PMR? */ - cfs_atomic_t ibn_npeers; /* # peers extant */ - cfs_atomic_t ibn_nconns; /* # connections extant */ + cfs_atomic_t ibn_npeers; /* # peers extant */ + cfs_atomic_t ibn_nconns; /* # connections extant */ - kib_tx_poolset_t ibn_tx_ps; /* tx pool-set */ - kib_fmr_poolset_t ibn_fmr_ps; /* fmr pool-set */ - kib_pmr_poolset_t ibn_pmr_ps; /* pmr pool-set */ + kib_tx_poolset_t **ibn_tx_ps; /* tx pool-set */ + kib_fmr_poolset_t **ibn_fmr_ps; /* fmr pool-set */ + kib_pmr_poolset_t **ibn_pmr_ps; /* pmr pool-set */ - kib_dev_t *ibn_dev; /* underlying IB device */ + kib_dev_t *ibn_dev; /* underlying IB device */ } kib_net_t; +#define KIB_THREAD_SHIFT 16 +#define KIB_THREAD_ID(cpt, tid) ((cpt) << KIB_THREAD_SHIFT | (tid)) +#define KIB_THREAD_CPT(id) ((id) >> KIB_THREAD_SHIFT) +#define KIB_THREAD_TID(id) ((id) & ((1UL << KIB_THREAD_SHIFT) - 1)) + +struct kib_sched_info { + /* serialise */ + cfs_spinlock_t ibs_lock; + /* schedulers sleep here */ + cfs_waitq_t ibs_waitq; + /* conns to check for rx completions */ + cfs_list_t ibs_conns; + /* number of scheduler threads */ + int ibs_nthreads; + /* max allowed scheduler threads */ + int ibs_nthreads_max; + int ibs_cpt; /* CPT id */ +}; + typedef struct { - int kib_init; /* initialisation state */ - int kib_shutdown; /* shut down? 
*/ - cfs_list_t kib_devs; /* IB devices extant */ - cfs_list_t kib_failed_devs; /* list head of failed devices */ - cfs_atomic_t kib_nthreads; /* # live threads */ - cfs_rwlock_t kib_global_lock; /* stabilize net/dev/peer/conn ops */ - - cfs_list_t *kib_peers; /* hash table of all my known peers */ - int kib_peer_hash_size;/* size of kib_peers */ - - void *kib_connd; /* the connd task (serialisation assertions) */ - cfs_list_t kib_connd_conns; /* connections to setup/teardown */ - cfs_list_t kib_connd_zombies;/* connections with zero refcount */ - cfs_waitq_t kib_connd_waitq; /* connection daemon sleeps here */ - cfs_spinlock_t kib_connd_lock; /* serialise */ - - cfs_waitq_t kib_sched_waitq; /* schedulers sleep here */ - cfs_list_t kib_sched_conns; /* conns to check for rx completions */ - cfs_spinlock_t kib_sched_lock; /* serialise */ - cfs_waitq_t kib_failover_waitq; /* schedulers sleep here */ - - struct ib_qp_attr kib_error_qpa; /* QP->ERROR */ + int kib_init; /* initialisation state */ + int kib_shutdown; /* shut down? */ + cfs_list_t kib_devs; /* IB devices extant */ + /* list head of failed devices */ + cfs_list_t kib_failed_devs; + /* schedulers sleep here */ + cfs_waitq_t kib_failover_waitq; + cfs_atomic_t kib_nthreads; /* # live threads */ + /* stabilize net/dev/peer/conn ops */ + cfs_rwlock_t kib_global_lock; + /* hash table of all my known peers */ + cfs_list_t *kib_peers; + /* size of kib_peers */ + int kib_peer_hash_size; + /* the connd task (serialisation assertions) */ + void *kib_connd; + /* connections to setup/teardown */ + cfs_list_t kib_connd_conns; + /* connections with zero refcount */ + cfs_list_t kib_connd_zombies; + /* connection daemon sleeps here */ + cfs_waitq_t kib_connd_waitq; + cfs_spinlock_t kib_connd_lock; /* serialise */ + struct ib_qp_attr kib_error_qpa; /* QP->ERROR */ + /* percpt data for schedulers */ + struct kib_sched_info **kib_scheds; } kib_data_t; #define IBLND_INIT_NOTHING 0 @@ -565,6 +596,7 @@ typedef struct kib_connvars typedef struct kib_conn { + struct kib_sched_info *ibc_sched; /* scheduler information */ struct kib_peer *ibc_peer; /* owning peer */ kib_hca_dev_t *ibc_hdev; /* HCA bound on */ cfs_list_t ibc_list; /* stash on peer's conn list */ @@ -1040,7 +1072,7 @@ int kiblnd_scheduler(void *arg); int kiblnd_thread_start (int (*fn)(void *arg), void *arg); int kiblnd_failover_thread (void *arg); -int kiblnd_alloc_pages (kib_pages_t **pp, int npages); +int kiblnd_alloc_pages(kib_pages_t **pp, int cpt, int npages); void kiblnd_free_pages (kib_pages_t *p); int kiblnd_cm_callback(struct rdma_cm_id *cmid, diff --git a/lnet/klnds/o2iblnd/o2iblnd_cb.c b/lnet/klnds/o2iblnd/o2iblnd_cb.c index 8304e71..90e35e6 100644 --- a/lnet/klnds/o2iblnd/o2iblnd_cb.c +++ b/lnet/klnds/o2iblnd/o2iblnd_cb.c @@ -98,13 +98,15 @@ kiblnd_txlist_done (lnet_ni_t *ni, cfs_list_t *txlist, int status) } kib_tx_t * -kiblnd_get_idle_tx (lnet_ni_t *ni) +kiblnd_get_idle_tx(lnet_ni_t *ni, lnet_nid_t target) { - kib_net_t *net = (kib_net_t *)ni->ni_data; - cfs_list_t *node; - kib_tx_t *tx; + kib_net_t *net = (kib_net_t *)ni->ni_data; + cfs_list_t *node; + kib_tx_t *tx; + kib_tx_poolset_t *tps; - node = kiblnd_pool_alloc_node(&net->ibn_tx_ps.tps_poolset); + tps = net->ibn_tx_ps[lnet_cpt_of_nid(target)]; + node = kiblnd_pool_alloc_node(&tps->tps_poolset); if (node == NULL) return NULL; tx = container_of(node, kib_tx_t, tx_list); @@ -124,17 +126,18 @@ kiblnd_get_idle_tx (lnet_ni_t *ni) } void -kiblnd_drop_rx (kib_rx_t *rx) +kiblnd_drop_rx(kib_rx_t *rx) { - kib_conn_t *conn = 
rx->rx_conn; - unsigned long flags; - - cfs_spin_lock_irqsave(&kiblnd_data.kib_sched_lock, flags); - LASSERT (conn->ibc_nrx > 0); - conn->ibc_nrx--; - cfs_spin_unlock_irqrestore(&kiblnd_data.kib_sched_lock, flags); + kib_conn_t *conn = rx->rx_conn; + struct kib_sched_info *sched = conn->ibc_sched; + unsigned long flags; - kiblnd_conn_decref(conn); + cfs_spin_lock_irqsave(&sched->ibs_lock, flags); + LASSERT(conn->ibc_nrx > 0); + conn->ibc_nrx--; + cfs_spin_unlock_irqrestore(&sched->ibs_lock, flags); + + kiblnd_conn_decref(conn); } int @@ -269,10 +272,10 @@ kiblnd_handle_completion(kib_conn_t *conn, int txtype, int status, __u64 cookie) } void -kiblnd_send_completion (kib_conn_t *conn, int type, int status, __u64 cookie) +kiblnd_send_completion(kib_conn_t *conn, int type, int status, __u64 cookie) { - lnet_ni_t *ni = conn->ibc_peer->ibp_ni; - kib_tx_t *tx = kiblnd_get_idle_tx(ni); + lnet_ni_t *ni = conn->ibc_peer->ibp_ni; + kib_tx_t *tx = kiblnd_get_idle_tx(ni, conn->ibc_peer->ibp_nid); if (tx == NULL) { CERROR("Can't get tx for completion %x for %s\n", @@ -546,12 +549,14 @@ kiblnd_kvaddr_to_page (unsigned long vaddr) static int kiblnd_fmr_map_tx(kib_net_t *net, kib_tx_t *tx, kib_rdma_desc_t *rd, int nob) { - kib_hca_dev_t *hdev = tx->tx_pool->tpo_hdev; - __u64 *pages = tx->tx_pages; - int npages; - int size; - int rc; - int i; + kib_hca_dev_t *hdev = tx->tx_pool->tpo_hdev; + __u64 *pages = tx->tx_pages; + kib_fmr_poolset_t *fps; + int npages; + int size; + int cpt; + int rc; + int i; for (i = 0, npages = 0; i < rd->rd_nfrags; i++) { for (size = 0; size < rd->rd_frags[i].rf_nob; @@ -561,7 +566,13 @@ kiblnd_fmr_map_tx(kib_net_t *net, kib_tx_t *tx, kib_rdma_desc_t *rd, int nob) } } - rc = kiblnd_fmr_pool_map(&net->ibn_fmr_ps, pages, npages, 0, &tx->tx_u.fmr); + LASSERT(tx->tx_pool != NULL); + LASSERT(tx->tx_pool->tpo_pool.po_owner != NULL); + + cpt = tx->tx_pool->tpo_pool.po_owner->ps_cpt; + + fps = net->ibn_fmr_ps[cpt]; + rc = kiblnd_fmr_pool_map(fps, pages, npages, 0, &tx->tx_u.fmr); if (rc != 0) { CERROR ("Can't map %d pages: %d\n", npages, rc); return rc; @@ -581,13 +592,21 @@ kiblnd_fmr_map_tx(kib_net_t *net, kib_tx_t *tx, kib_rdma_desc_t *rd, int nob) static int kiblnd_pmr_map_tx(kib_net_t *net, kib_tx_t *tx, kib_rdma_desc_t *rd, int nob) { - kib_hca_dev_t *hdev = tx->tx_pool->tpo_hdev; - __u64 iova; - int rc; + kib_hca_dev_t *hdev = tx->tx_pool->tpo_hdev; + kib_pmr_poolset_t *pps; + __u64 iova; + int cpt; + int rc; + + iova = rd->rd_frags[0].rf_addr & ~hdev->ibh_page_mask; - iova = rd->rd_frags[0].rf_addr & ~hdev->ibh_page_mask; + LASSERT(tx->tx_pool != NULL); + LASSERT(tx->tx_pool->tpo_pool.po_owner != NULL); - rc = kiblnd_pmr_pool_map(&net->ibn_pmr_ps, hdev, rd, &iova, &tx->tx_u.pmr); + cpt = tx->tx_pool->tpo_pool.po_owner->ps_cpt; + + pps = net->ibn_pmr_ps[cpt]; + rc = kiblnd_pmr_pool_map(pps, hdev, rd, &iova, &tx->tx_u.pmr); if (rc != 0) { CERROR("Failed to create MR by phybuf: %d\n", rc); return rc; @@ -607,17 +626,18 @@ kiblnd_pmr_map_tx(kib_net_t *net, kib_tx_t *tx, kib_rdma_desc_t *rd, int nob) void kiblnd_unmap_tx(lnet_ni_t *ni, kib_tx_t *tx) { - kib_net_t *net = ni->ni_data; + kib_net_t *net = ni->ni_data; - LASSERT (net != NULL); + LASSERT(net != NULL); - if (net->ibn_with_fmr && tx->tx_u.fmr.fmr_pfmr != NULL) { - kiblnd_fmr_pool_unmap(&tx->tx_u.fmr, tx->tx_status); - tx->tx_u.fmr.fmr_pfmr = NULL; - } else if (net->ibn_with_pmr && tx->tx_u.pmr != NULL) { - kiblnd_pmr_pool_unmap(tx->tx_u.pmr); - tx->tx_u.pmr = NULL; - } + if (net->ibn_fmr_ps != NULL && tx->tx_u.fmr.fmr_pfmr != 
NULL) { + kiblnd_fmr_pool_unmap(&tx->tx_u.fmr, tx->tx_status); + tx->tx_u.fmr.fmr_pfmr = NULL; + + } else if (net->ibn_pmr_ps != NULL && tx->tx_u.pmr != NULL) { + kiblnd_pmr_pool_unmap(tx->tx_u.pmr); + tx->tx_u.pmr = NULL; + } if (tx->tx_nfrags != 0) { kiblnd_dma_unmap_sg(tx->tx_pool->tpo_hdev->ibh_ibdev, @@ -661,12 +681,12 @@ kiblnd_map_tx(lnet_ni_t *ni, kib_tx_t *tx, return 0; } - if (net->ibn_with_fmr) - return kiblnd_fmr_map_tx(net, tx, rd, nob); - else if (net->ibn_with_pmr) - return kiblnd_pmr_map_tx(net, tx, rd, nob); + if (net->ibn_fmr_ps != NULL) + return kiblnd_fmr_map_tx(net, tx, rd, nob); + else if (net->ibn_pmr_ps != NULL) + return kiblnd_pmr_map_tx(net, tx, rd, nob); - return -EINVAL; + return -EINVAL; } @@ -929,7 +949,7 @@ kiblnd_check_sends (kib_conn_t *conn) if (kiblnd_need_noop(conn)) { cfs_spin_unlock(&conn->ibc_lock); - tx = kiblnd_get_idle_tx(ni); + tx = kiblnd_get_idle_tx(ni, conn->ibc_peer->ibp_nid); if (tx != NULL) kiblnd_init_tx_msg(ni, tx, IBLND_MSG_NOOP, 0); @@ -1482,12 +1502,12 @@ kiblnd_send (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg) if (nob <= IBLND_MSG_SIZE) break; /* send IMMEDIATE */ - tx = kiblnd_get_idle_tx(ni); - if (tx == NULL) { - CERROR("Can't allocate txd for GET to %s: \n", - libcfs_nid2str(target.nid)); - return -ENOMEM; - } + tx = kiblnd_get_idle_tx(ni, target.nid); + if (tx == NULL) { + CERROR("Can't allocate txd for GET to %s\n", + libcfs_nid2str(target.nid)); + return -ENOMEM; + } ibmsg = tx->tx_msg; @@ -1536,7 +1556,7 @@ kiblnd_send (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg) if (nob <= IBLND_MSG_SIZE) break; /* send IMMEDIATE */ - tx = kiblnd_get_idle_tx(ni); + tx = kiblnd_get_idle_tx(ni, target.nid); if (tx == NULL) { CERROR("Can't allocate %s txd for %s\n", type == LNET_MSG_PUT ? "PUT" : "REPLY", @@ -1575,7 +1595,7 @@ kiblnd_send (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg) LASSERT (offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]) <= IBLND_MSG_SIZE); - tx = kiblnd_get_idle_tx(ni); + tx = kiblnd_get_idle_tx(ni, target.nid); if (tx == NULL) { CERROR ("Can't send %d to %s: tx descs exhausted\n", type, libcfs_nid2str(target.nid)); @@ -1616,7 +1636,7 @@ kiblnd_reply (lnet_ni_t *ni, kib_rx_t *rx, lnet_msg_t *lntmsg) kib_tx_t *tx; int rc; - tx = kiblnd_get_idle_tx(ni); + tx = kiblnd_get_idle_tx(ni, rx->rx_conn->ibc_peer->ibp_nid); if (tx == NULL) { CERROR("Can't get tx for REPLY to %s\n", libcfs_nid2str(target.nid)); @@ -1720,7 +1740,7 @@ kiblnd_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, int delayed, break; } - tx = kiblnd_get_idle_tx(ni); + tx = kiblnd_get_idle_tx(ni, conn->ibc_peer->ibp_nid); if (tx == NULL) { CERROR("Can't allocate tx for %s\n", libcfs_nid2str(conn->ibc_peer->ibp_nid)); @@ -3260,33 +3280,35 @@ kiblnd_complete (struct ib_wc *wc) } void -kiblnd_cq_completion (struct ib_cq *cq, void *arg) +kiblnd_cq_completion(struct ib_cq *cq, void *arg) { - /* NB I'm not allowed to schedule this conn once its refcount has - * reached 0. Since fundamentally I'm racing with scheduler threads - * consuming my CQ I could be called after all completions have - * occurred. But in this case, ibc_nrx == 0 && ibc_nsends_posted == 0 - * and this CQ is about to be destroyed so I NOOP. */ - kib_conn_t *conn = (kib_conn_t *)arg; - unsigned long flags; + /* NB I'm not allowed to schedule this conn once its refcount has + * reached 0. Since fundamentally I'm racing with scheduler threads + * consuming my CQ I could be called after all completions have + * occurred. 
But in this case, ibc_nrx == 0 && ibc_nsends_posted == 0 + * and this CQ is about to be destroyed so I NOOP. */ + kib_conn_t *conn = (kib_conn_t *)arg; + struct kib_sched_info *sched = conn->ibc_sched; + unsigned long flags; - LASSERT (cq == conn->ibc_cq); + LASSERT(cq == conn->ibc_cq); - cfs_spin_lock_irqsave(&kiblnd_data.kib_sched_lock, flags); + cfs_spin_lock_irqsave(&sched->ibs_lock, flags); - conn->ibc_ready = 1; + conn->ibc_ready = 1; - if (!conn->ibc_scheduled && - (conn->ibc_nrx > 0 || - conn->ibc_nsends_posted > 0)) { - kiblnd_conn_addref(conn); /* +1 ref for sched_conns */ - conn->ibc_scheduled = 1; - cfs_list_add_tail(&conn->ibc_sched_list, - &kiblnd_data.kib_sched_conns); - cfs_waitq_signal(&kiblnd_data.kib_sched_waitq); - } + if (!conn->ibc_scheduled && + (conn->ibc_nrx > 0 || + conn->ibc_nsends_posted > 0)) { + kiblnd_conn_addref(conn); /* +1 ref for sched_conns */ + conn->ibc_scheduled = 1; + cfs_list_add_tail(&conn->ibc_sched_list, &sched->ibs_conns); + + if (cfs_waitq_active(&sched->ibs_waitq)) + cfs_waitq_signal(&sched->ibs_waitq); + } - cfs_spin_unlock_irqrestore(&kiblnd_data.kib_sched_lock, flags); + cfs_spin_unlock_irqrestore(&sched->ibs_lock, flags); } void @@ -3301,48 +3323,58 @@ kiblnd_cq_event(struct ib_event *event, void *arg) int kiblnd_scheduler(void *arg) { - long id = (long)arg; - cfs_waitlink_t wait; - char name[16]; - unsigned long flags; - kib_conn_t *conn; - struct ib_wc wc; - int rc; - int did_something; - int busy_loops = 0; - - snprintf(name, sizeof(name), "kiblnd_sd_%02ld", id); - cfs_daemonize(name); - cfs_block_allsigs(); + long id = (long)arg; + struct kib_sched_info *sched; + kib_conn_t *conn; + cfs_waitlink_t wait; + unsigned long flags; + struct ib_wc wc; + char name[20]; + int did_something; + int busy_loops = 0; + int rc; - cfs_waitlink_init(&wait); + snprintf(name, sizeof(name), "kiblnd_sd_%02ld_%02ld", + KIB_THREAD_CPT(id), KIB_THREAD_TID(id)); - cfs_spin_lock_irqsave(&kiblnd_data.kib_sched_lock, flags); + cfs_daemonize(name); + cfs_block_allsigs(); - while (!kiblnd_data.kib_shutdown) { - if (busy_loops++ >= IBLND_RESCHED) { - cfs_spin_unlock_irqrestore(&kiblnd_data.kib_sched_lock, - flags); + cfs_waitlink_init(&wait); - cfs_cond_resched(); - busy_loops = 0; + sched = kiblnd_data.kib_scheds[KIB_THREAD_CPT(id)]; - cfs_spin_lock_irqsave(&kiblnd_data.kib_sched_lock, - flags); - } + rc = cfs_cpt_bind(lnet_cpt_table(), sched->ibs_cpt); + if (rc != 0) { + CWARN("Failed to bind %s on CPT %d, please verify whether " + "all CPUs are healthy and reload modules if necessary, " + "otherwise your system might under risk of low " + "performance\n", name, sched->ibs_cpt); + } - did_something = 0; + cfs_spin_lock_irqsave(&sched->ibs_lock, flags); - if (!cfs_list_empty(&kiblnd_data.kib_sched_conns)) { - conn = cfs_list_entry(kiblnd_data.kib_sched_conns.next, - kib_conn_t, ibc_sched_list); - /* take over kib_sched_conns' ref on conn... */ - LASSERT(conn->ibc_scheduled); - cfs_list_del(&conn->ibc_sched_list); - conn->ibc_ready = 0; + while (!kiblnd_data.kib_shutdown) { + if (busy_loops++ >= IBLND_RESCHED) { + cfs_spin_unlock_irqrestore(&sched->ibs_lock, flags); - cfs_spin_unlock_irqrestore(&kiblnd_data.kib_sched_lock, - flags); + cfs_cond_resched(); + busy_loops = 0; + + cfs_spin_lock_irqsave(&sched->ibs_lock, flags); + } + + did_something = 0; + + if (!cfs_list_empty(&sched->ibs_conns)) { + conn = cfs_list_entry(sched->ibs_conns.next, + kib_conn_t, ibc_sched_list); + /* take over kib_sched_conns' ref on conn... 
*/ + LASSERT(conn->ibc_scheduled); + cfs_list_del(&conn->ibc_sched_list); + conn->ibc_ready = 0; + + cfs_spin_unlock_irqrestore(&sched->ibs_lock, flags); rc = ib_poll_cq(conn->ibc_cq, 1, &wc); if (rc == 0) { @@ -3354,52 +3386,47 @@ kiblnd_scheduler(void *arg) libcfs_nid2str(conn->ibc_peer->ibp_nid), rc); kiblnd_close_conn(conn, -EIO); kiblnd_conn_decref(conn); - cfs_spin_lock_irqsave(&kiblnd_data. \ - kib_sched_lock, - flags); - continue; - } - - rc = ib_poll_cq(conn->ibc_cq, 1, &wc); - } - - if (rc < 0) { - CWARN("%s: ib_poll_cq failed: %d, " - "closing connection\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid), - rc); - kiblnd_close_conn(conn, -EIO); - kiblnd_conn_decref(conn); - cfs_spin_lock_irqsave(&kiblnd_data. \ - kib_sched_lock, flags); - continue; - } - - cfs_spin_lock_irqsave(&kiblnd_data.kib_sched_lock, - flags); - - if (rc != 0 || conn->ibc_ready) { - /* There may be another completion waiting; get - * another scheduler to check while I handle - * this one... */ - kiblnd_conn_addref(conn); /* +1 ref for sched_conns */ - cfs_list_add_tail(&conn->ibc_sched_list, - &kiblnd_data.kib_sched_conns); - cfs_waitq_signal(&kiblnd_data.kib_sched_waitq); - } else { - conn->ibc_scheduled = 0; - } - - if (rc != 0) { - cfs_spin_unlock_irqrestore(&kiblnd_data. \ - kib_sched_lock, - flags); - - kiblnd_complete(&wc); - - cfs_spin_lock_irqsave(&kiblnd_data. \ - kib_sched_lock, - flags); + cfs_spin_lock_irqsave(&sched->ibs_lock, + flags); + continue; + } + + rc = ib_poll_cq(conn->ibc_cq, 1, &wc); + } + + if (rc < 0) { + CWARN("%s: ib_poll_cq failed: %d, " + "closing connection\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), + rc); + kiblnd_close_conn(conn, -EIO); + kiblnd_conn_decref(conn); + cfs_spin_lock_irqsave(&sched->ibs_lock, flags); + continue; + } + + cfs_spin_lock_irqsave(&sched->ibs_lock, flags); + + if (rc != 0 || conn->ibc_ready) { + /* There may be another completion waiting; get + * another scheduler to check while I handle + * this one... 
*/ + /* +1 ref for sched_conns */ + kiblnd_conn_addref(conn); + cfs_list_add_tail(&conn->ibc_sched_list, + &sched->ibs_conns); + if (cfs_waitq_active(&sched->ibs_waitq)) + cfs_waitq_signal(&sched->ibs_waitq); + } else { + conn->ibc_scheduled = 0; + } + + if (rc != 0) { + cfs_spin_unlock_irqrestore(&sched->ibs_lock, + flags); + kiblnd_complete(&wc); + + cfs_spin_lock_irqsave(&sched->ibs_lock, flags); } kiblnd_conn_decref(conn); /* ...drop my ref from above */ @@ -3409,22 +3436,22 @@ kiblnd_scheduler(void *arg) if (did_something) continue; - cfs_set_current_state(CFS_TASK_INTERRUPTIBLE); - cfs_waitq_add_exclusive(&kiblnd_data.kib_sched_waitq, &wait); - cfs_spin_unlock_irqrestore(&kiblnd_data.kib_sched_lock, flags); + cfs_set_current_state(CFS_TASK_INTERRUPTIBLE); + cfs_waitq_add_exclusive(&sched->ibs_waitq, &wait); + cfs_spin_unlock_irqrestore(&sched->ibs_lock, flags); - cfs_waitq_wait(&wait, CFS_TASK_INTERRUPTIBLE); - busy_loops = 0; + cfs_waitq_wait(&wait, CFS_TASK_INTERRUPTIBLE); + busy_loops = 0; - cfs_waitq_del(&kiblnd_data.kib_sched_waitq, &wait); - cfs_set_current_state(CFS_TASK_RUNNING); - cfs_spin_lock_irqsave(&kiblnd_data.kib_sched_lock, flags); - } + cfs_waitq_del(&sched->ibs_waitq, &wait); + cfs_set_current_state(CFS_TASK_RUNNING); + cfs_spin_lock_irqsave(&sched->ibs_lock, flags); + } - cfs_spin_unlock_irqrestore(&kiblnd_data.kib_sched_lock, flags); + cfs_spin_unlock_irqrestore(&sched->ibs_lock, flags); - kiblnd_thread_fini(); - return (0); + kiblnd_thread_fini(); + return 0; } int diff --git a/lnet/klnds/o2iblnd/o2iblnd_modparams.c b/lnet/klnds/o2iblnd/o2iblnd_modparams.c index 1b96ff3..c654ab6 100644 --- a/lnet/klnds/o2iblnd/o2iblnd_modparams.c +++ b/lnet/klnds/o2iblnd/o2iblnd_modparams.c @@ -50,11 +50,19 @@ static int timeout = 50; CFS_MODULE_PARM(timeout, "i", int, 0644, "timeout (seconds)"); -static int ntx = 256; +/* Number of threads in each scheduler pool which is percpt, + * we will estimate reasonable value based on CPUs if it's set to zero. 
*/ +static int nscheds; +CFS_MODULE_PARM(nscheds, "i", int, 0444, + "number of threads in each scheduler pool"); + +/* NB: this value is shared by all CPTs, it can grow at runtime */ +static int ntx = 512; CFS_MODULE_PARM(ntx, "i", int, 0444, - "# of message descriptors"); + "# of message descriptors allocated for each pool"); -static int credits = 64; +/* NB: this value is shared by all CPTs */ +static int credits = 256; CFS_MODULE_PARM(credits, "i", int, 0444, "# concurrent sends"); @@ -102,21 +110,24 @@ static int map_on_demand = 0; CFS_MODULE_PARM(map_on_demand, "i", int, 0444, "map on demand"); +/* NB: this value is shared by all CPTs, it can grow at runtime */ static int fmr_pool_size = 512; CFS_MODULE_PARM(fmr_pool_size, "i", int, 0444, - "size of the fmr pool (>= ntx / 4)"); + "size of fmr pool on each CPT (>= ntx / 4)"); +/* NB: this value is shared by all CPTs, it can grow at runtime */ static int fmr_flush_trigger = 384; CFS_MODULE_PARM(fmr_flush_trigger, "i", int, 0444, - "# dirty FMRs that triggers pool flush"); + "# dirty FMRs that triggers pool flush"); static int fmr_cache = 1; CFS_MODULE_PARM(fmr_cache, "i", int, 0444, - "non-zero to enable FMR caching"); + "non-zero to enable FMR caching"); +/* NB: this value is shared by all CPTs, it can grow at runtime */ static int pmr_pool_size = 512; CFS_MODULE_PARM(pmr_pool_size, "i", int, 0444, - "size of the MR cache pmr pool"); + "size of MR cache pmr pool on each CPT"); /* * 0: disable failover @@ -159,7 +170,8 @@ kib_tunables_t kiblnd_tunables = { .kib_fmr_cache = &fmr_cache, .kib_pmr_pool_size = &pmr_pool_size, .kib_require_priv_port = &require_privileged_port, - .kib_use_priv_port = &use_privileged_port + .kib_use_priv_port = &use_privileged_port, + .kib_nscheds = &nscheds }; #if defined(CONFIG_SYSCTL) && !CFS_SYSFS_MODULE_PARM -- 1.8.3.1
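
For readers skimming the diff above, the following is a minimal standalone sketch (not part of the patch) of the two mappings the change introduces: packing a CPT id and a per-CPT thread index into the single `long` passed to kiblnd_scheduler(), and hashing a peer NID onto one of the HCA's CQ completion vectors restricted to the CPUs of that peer's CPT, as kiblnd_get_completion_vector() does. The KIB_THREAD_* macros are copied from the o2iblnd.h hunk above; demo_completion_vector(), the plain CPU-id array standing in for the kernel cpumask, and the sample values are illustrative assumptions only.

```c
/*
 * Standalone illustration of the patch's CPT-affinity mappings.
 * Kernel dependencies (cpumask, lnet_cpt_table) are replaced by a
 * plain array of CPU ids; this is a sketch, not the driver code.
 */
#include <stdio.h>

/* copied from the o2iblnd.h hunk in the patch above */
#define KIB_THREAD_SHIFT	16
#define KIB_THREAD_ID(cpt, tid)	((cpt) << KIB_THREAD_SHIFT | (tid))
#define KIB_THREAD_CPT(id)	((id) >> KIB_THREAD_SHIFT)
#define KIB_THREAD_TID(id)	((id) & ((1UL << KIB_THREAD_SHIFT) - 1))

/*
 * Hash @nid onto one of @nvectors completion vectors, using only the
 * CPUs listed in @cpus[] (the CPUs belonging to the peer's CPT).
 * Mirrors the off = nid % cpus_weight(); i % vectors logic of
 * kiblnd_get_completion_vector(), with the cpumask walk flattened.
 */
static int
demo_completion_vector(unsigned long long nid, const int *cpus, int ncpus,
		       int nvectors)
{
	int off;

	if (nvectors <= 1)
		return 0;

	off = (int)(nid % ncpus);	/* pick a CPU slot within this CPT */
	return cpus[off] % nvectors;	/* map that CPU to a CQ vector */
}

int main(void)
{
	int cpt1_cpus[] = { 8, 9, 10, 11 };	/* hypothetical CPT #1 */
	long id = KIB_THREAD_ID(1, 3);		/* scheduler #3 on CPT #1 */

	printf("thread id 0x%lx -> cpt %ld tid %ld\n",
	       id, (long)KIB_THREAD_CPT(id), (long)KIB_THREAD_TID(id));
	printf("nid 12345 -> completion vector %d\n",
	       demo_completion_vector(12345ULL, cpt1_cpus, 4, 2));
	return 0;
}
```

With KIB_THREAD_SHIFT at 16, a thread id of 0x10003 decodes back to CPT 1, thread 3, which is how the per-CPT scheduler threads started by kiblnd_start_schedulers() identify their partition after cfs_cpt_bind(); the NID hash keeps all CQ interrupts for a given peer on CPUs inside that peer's partition.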