diff --git a/lnet/klnds/o2iblnd/o2iblnd.c b/lnet/klnds/o2iblnd/o2iblnd.c
index ee5a01f..12dade5 100644
--- a/lnet/klnds/o2iblnd/o2iblnd.c
+++ b/lnet/klnds/o2iblnd/o2iblnd.c
@@ -37,7 +37,7 @@
 #include <asm/page.h>
 #include "o2iblnd.h"
 
-static lnd_t the_o2iblnd;
+static struct lnet_lnd the_o2iblnd;
 
 kib_data_t kiblnd_data;
 
@@ -176,8 +176,8 @@ kiblnd_unpack_rd(kib_msg_t *msg, int flip)
 }
 
 void
-kiblnd_pack_msg (lnet_ni_t *ni, kib_msg_t *msg, int version,
-		 int credits, lnet_nid_t dstnid, __u64 dststamp)
+kiblnd_pack_msg(struct lnet_ni *ni, kib_msg_t *msg, int version,
+		int credits, lnet_nid_t dstnid, __u64 dststamp)
 {
 	kib_net_t *net = ni->ni_data;
 
@@ -255,7 +255,7 @@ kiblnd_unpack_msg(kib_msg_t *msg, int nob)
 	msg->ibm_cksum = msg_cksum;
 
 	if (flip) {
-		/* leave magic unflipped as a clue to peer endianness */
+		/* leave magic unflipped as a clue to peer_ni endianness */
 		msg->ibm_version = version;
 		CLASSERT (sizeof(msg->ibm_type) == 1);
 		CLASSERT (sizeof(msg->ibm_credits) == 1);
@@ -313,33 +313,33 @@ kiblnd_unpack_msg(kib_msg_t *msg, int nob)
 }
 
 int
-kiblnd_create_peer(lnet_ni_t *ni, kib_peer_t **peerp, lnet_nid_t nid)
+kiblnd_create_peer(struct lnet_ni *ni, kib_peer_ni_t **peerp, lnet_nid_t nid)
 {
-	kib_peer_t *peer;
+	kib_peer_ni_t *peer_ni;
 	kib_net_t *net = ni->ni_data;
-	int cpt = lnet_cpt_of_nid(nid);
+	int cpt = lnet_cpt_of_nid(nid, ni);
 	unsigned long flags;
 
 	LASSERT(net != NULL);
 	LASSERT(nid != LNET_NID_ANY);
 
-	LIBCFS_CPT_ALLOC(peer, lnet_cpt_table(), cpt, sizeof(*peer));
-	if (peer == NULL) {
-		CERROR("Cannot allocate peer\n");
+	LIBCFS_CPT_ALLOC(peer_ni, lnet_cpt_table(), cpt, sizeof(*peer_ni));
+	if (peer_ni == NULL) {
+		CERROR("Cannot allocate peer_ni\n");
 		return -ENOMEM;
 	}
 
-	peer->ibp_ni = ni;
-	peer->ibp_nid = nid;
-	peer->ibp_error = 0;
-	peer->ibp_last_alive = 0;
-	peer->ibp_max_frags = kiblnd_cfg_rdma_frags(peer->ibp_ni);
-	peer->ibp_queue_depth = ni->ni_peertxcredits;
-	atomic_set(&peer->ibp_refcount, 1);	/* 1 ref for caller */
+	peer_ni->ibp_ni = ni;
+	peer_ni->ibp_nid = nid;
+	peer_ni->ibp_error = 0;
+	peer_ni->ibp_last_alive = 0;
+	peer_ni->ibp_max_frags = kiblnd_cfg_rdma_frags(peer_ni->ibp_ni);
+	peer_ni->ibp_queue_depth = ni->ni_net->net_tunables.lct_peer_tx_credits;
+	atomic_set(&peer_ni->ibp_refcount, 1);	/* 1 ref for caller */
 
-	INIT_LIST_HEAD(&peer->ibp_list);	/* not in the peer table yet */
-	INIT_LIST_HEAD(&peer->ibp_conns);
-	INIT_LIST_HEAD(&peer->ibp_tx_queue);
+	INIT_LIST_HEAD(&peer_ni->ibp_list);	/* not in the peer_ni table yet */
+	INIT_LIST_HEAD(&peer_ni->ibp_conns);
+	INIT_LIST_HEAD(&peer_ni->ibp_tx_queue);
 
 	write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
 
@@ -351,72 +351,79 @@ kiblnd_create_peer(lnet_ni_t *ni, kib_peer_t **peerp, lnet_nid_t nid)
 
 	write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
 
-	*peerp = peer;
+	*peerp = peer_ni;
 	return 0;
 }
 
 void
-kiblnd_destroy_peer (kib_peer_t *peer)
+kiblnd_destroy_peer (kib_peer_ni_t *peer_ni)
 {
-	kib_net_t *net = peer->ibp_ni->ni_data;
+	kib_net_t *net = peer_ni->ibp_ni->ni_data;
 
 	LASSERT(net != NULL);
-	LASSERT (atomic_read(&peer->ibp_refcount) == 0);
-	LASSERT(!kiblnd_peer_active(peer));
-	LASSERT(kiblnd_peer_idle(peer));
-	LASSERT(list_empty(&peer->ibp_tx_queue));
+	LASSERT (atomic_read(&peer_ni->ibp_refcount) == 0);
+	LASSERT(!kiblnd_peer_active(peer_ni));
+	LASSERT(kiblnd_peer_idle(peer_ni));
+	LASSERT(list_empty(&peer_ni->ibp_tx_queue));
 
-	LIBCFS_FREE(peer, sizeof(*peer));
+	LIBCFS_FREE(peer_ni, sizeof(*peer_ni));
 
-	/* NB a peer's connections keep a reference on their peer until
+	/* NB a peer_ni's connections keep a reference on their peer_ni until
 	 * they are destroyed, so we can be assured that _all_ state to do
-	 * with this peer has been cleaned up when its refcount drops to
+	 * with this peer_ni has been cleaned up when its refcount drops to
 	 * zero. */
 	atomic_dec(&net->ibn_npeers);
 }
 
-kib_peer_t *
-kiblnd_find_peer_locked (lnet_nid_t nid)
+kib_peer_ni_t *
+kiblnd_find_peer_locked(struct lnet_ni *ni, lnet_nid_t nid)
 {
 	/* the caller is responsible for accounting the additional reference
 	 * that this creates */
 	struct list_head *peer_list = kiblnd_nid2peerlist(nid);
 	struct list_head *tmp;
-	kib_peer_t *peer;
+	kib_peer_ni_t *peer_ni;
 
 	list_for_each(tmp, peer_list) {
-		peer = list_entry(tmp, kib_peer_t, ibp_list);
-		LASSERT(!kiblnd_peer_idle(peer));
-
-		if (peer->ibp_nid != nid)
+		peer_ni = list_entry(tmp, kib_peer_ni_t, ibp_list);
+		LASSERT(!kiblnd_peer_idle(peer_ni));
+
+		/*
+		 * Match a peer if its NID and the NID of the local NI it
+		 * communicates over are the same. Otherwise don't match
+		 * the peer, which will result in a new lnd peer being
+		 * created.
+		 */
+		if (peer_ni->ibp_nid != nid ||
+		    peer_ni->ibp_ni->ni_nid != ni->ni_nid)
 			continue;
 
-		CDEBUG(D_NET, "got peer [%p] -> %s (%d) version: %x\n",
-		       peer, libcfs_nid2str(nid),
-		       atomic_read(&peer->ibp_refcount),
-		       peer->ibp_version);
-		return peer;
+		CDEBUG(D_NET, "got peer_ni [%p] -> %s (%d) version: %x\n",
+		       peer_ni, libcfs_nid2str(nid),
+		       atomic_read(&peer_ni->ibp_refcount),
+		       peer_ni->ibp_version);
+		return peer_ni;
 	}
 	return NULL;
 }
 
 void
-kiblnd_unlink_peer_locked (kib_peer_t *peer)
+kiblnd_unlink_peer_locked (kib_peer_ni_t *peer_ni)
 {
-	LASSERT(list_empty(&peer->ibp_conns));
+	LASSERT(list_empty(&peer_ni->ibp_conns));
 
-	LASSERT (kiblnd_peer_active(peer));
-	list_del_init(&peer->ibp_list);
+	LASSERT (kiblnd_peer_active(peer_ni));
+	list_del_init(&peer_ni->ibp_list);
 	/* lose peerlist's ref */
-	kiblnd_peer_decref(peer);
+	kiblnd_peer_decref(peer_ni);
 }
 
 static int
-kiblnd_get_peer_info(lnet_ni_t *ni, int index,
+kiblnd_get_peer_info(struct lnet_ni *ni, int index,
 		     lnet_nid_t *nidp, int *count)
 {
-	kib_peer_t *peer;
+	kib_peer_ni_t *peer_ni;
 	struct list_head *ptmp;
 	int i;
 	unsigned long flags;
@@ -427,17 +434,17 @@ kiblnd_get_peer_info(lnet_ni_t *ni, int index,
 
 		list_for_each(ptmp, &kiblnd_data.kib_peers[i]) {
 
-			peer = list_entry(ptmp, kib_peer_t, ibp_list);
-			LASSERT(!kiblnd_peer_idle(peer));
+			peer_ni = list_entry(ptmp, kib_peer_ni_t, ibp_list);
+			LASSERT(!kiblnd_peer_idle(peer_ni));
 
-			if (peer->ibp_ni != ni)
+			if (peer_ni->ibp_ni != ni)
 				continue;
 
 			if (index-- > 0)
 				continue;
 
-			*nidp = peer->ibp_nid;
-			*count = atomic_read(&peer->ibp_refcount);
+			*nidp = peer_ni->ibp_nid;
+			*count = atomic_read(&peer_ni->ibp_refcount);
 
 			read_unlock_irqrestore(&kiblnd_data.kib_global_lock,
 					       flags);
@@ -450,33 +457,33 @@ kiblnd_get_peer_info(lnet_ni_t *ni, int index,
 }
 
 static void
-kiblnd_del_peer_locked (kib_peer_t *peer)
+kiblnd_del_peer_locked (kib_peer_ni_t *peer_ni)
 {
 	struct list_head *ctmp;
 	struct list_head *cnxt;
 	kib_conn_t *conn;
 
-	if (list_empty(&peer->ibp_conns)) {
-		kiblnd_unlink_peer_locked(peer);
+	if (list_empty(&peer_ni->ibp_conns)) {
+		kiblnd_unlink_peer_locked(peer_ni);
 	} else {
-		list_for_each_safe(ctmp, cnxt, &peer->ibp_conns) {
+		list_for_each_safe(ctmp, cnxt, &peer_ni->ibp_conns) {
 			conn = list_entry(ctmp, kib_conn_t, ibc_list);
 
 			kiblnd_close_conn_locked(conn, 0);
 		}
-		/* NB closing peer's last conn unlinked it. */
+		/* NB closing peer_ni's last conn unlinked it. */
 	}
-	/* NB peer now unlinked; might even be freed if the peer table had the
+	/* NB peer_ni now unlinked; might even be freed if the peer_ni table had the
 	 * last ref on it. */
 }
 
 static int
-kiblnd_del_peer (lnet_ni_t *ni, lnet_nid_t nid)
+kiblnd_del_peer(struct lnet_ni *ni, lnet_nid_t nid)
 {
 	struct list_head zombies = LIST_HEAD_INIT(zombies);
 	struct list_head *ptmp;
 	struct list_head *pnxt;
-	kib_peer_t *peer;
+	kib_peer_ni_t *peer_ni;
 	int lo;
 	int hi;
 	int i;
@@ -494,38 +501,38 @@ kiblnd_del_peer (lnet_ni_t *ni, lnet_nid_t nid)
 
 	for (i = lo; i <= hi; i++) {
 		list_for_each_safe(ptmp, pnxt, &kiblnd_data.kib_peers[i]) {
-			peer = list_entry(ptmp, kib_peer_t, ibp_list);
-			LASSERT(!kiblnd_peer_idle(peer));
+			peer_ni = list_entry(ptmp, kib_peer_ni_t, ibp_list);
+			LASSERT(!kiblnd_peer_idle(peer_ni));
 
-			if (peer->ibp_ni != ni)
+			if (peer_ni->ibp_ni != ni)
 				continue;
 
-			if (!(nid == LNET_NID_ANY || peer->ibp_nid == nid))
+			if (!(nid == LNET_NID_ANY || peer_ni->ibp_nid == nid))
 				continue;
 
-			if (!list_empty(&peer->ibp_tx_queue)) {
-				LASSERT(list_empty(&peer->ibp_conns));
+			if (!list_empty(&peer_ni->ibp_tx_queue)) {
+				LASSERT(list_empty(&peer_ni->ibp_conns));
 
-				list_splice_init(&peer->ibp_tx_queue,
+				list_splice_init(&peer_ni->ibp_tx_queue,
 						 &zombies);
 			}
 
-			kiblnd_del_peer_locked(peer);
+			kiblnd_del_peer_locked(peer_ni);
 			rc = 0;		/* matched something */
 		}
 	}
 
 	write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
 
-	kiblnd_txlist_done(ni, &zombies, -EIO);
+	kiblnd_txlist_done(&zombies, -EIO);
 
 	return rc;
 }
 
 static kib_conn_t *
-kiblnd_get_conn_by_idx(lnet_ni_t *ni, int index)
+kiblnd_get_conn_by_idx(struct lnet_ni *ni, int index)
 {
-	kib_peer_t *peer;
+	kib_peer_ni_t *peer_ni;
 	struct list_head *ptmp;
 	kib_conn_t *conn;
 	struct list_head *ctmp;
@@ -537,13 +544,13 @@ kiblnd_get_conn_by_idx(lnet_ni_t *ni, int index)
 
 	for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++) {
 		list_for_each(ptmp, &kiblnd_data.kib_peers[i]) {
-			peer = list_entry(ptmp, kib_peer_t, ibp_list);
-			LASSERT(!kiblnd_peer_idle(peer));
+			peer_ni = list_entry(ptmp, kib_peer_ni_t, ibp_list);
+			LASSERT(!kiblnd_peer_idle(peer_ni));
 
-			if (peer->ibp_ni != ni)
+			if (peer_ni->ibp_ni != ni)
 				continue;
 
-			list_for_each(ctmp, &peer->ibp_conns) {
+			list_for_each(ctmp, &peer_ni->ibp_conns) {
 				if (index-- > 0)
 					continue;
 
@@ -691,19 +698,49 @@ kiblnd_get_completion_vector(kib_conn_t *conn, int cpt)
 	return 1;
 }
 
+/*
+ * Get the scheduler bound to this CPT. If the scheduler has no
+ * threads, which means that the CPT has no CPUs, then grab the
+ * next scheduler that we can use.
+ *
+ * This case would be triggered if a NUMA node is configured with
+ * no associated CPUs.
+ */
+static struct kib_sched_info *
+kiblnd_get_scheduler(int cpt)
+{
+	struct kib_sched_info *sched;
+	int i;
+
+	sched = kiblnd_data.kib_scheds[cpt];
+
+	if (sched->ibs_nthreads > 0)
+		return sched;
+
+	cfs_percpt_for_each(sched, i, kiblnd_data.kib_scheds) {
+		if (sched->ibs_nthreads > 0) {
+			CDEBUG(D_NET, "scheduler[%d] has no threads. selected scheduler[%d]\n",
+			       cpt, sched->ibs_cpt);
+			return sched;
+		}
+	}
+
+	return NULL;
+}
+
 kib_conn_t *
-kiblnd_create_conn(kib_peer_t *peer, struct rdma_cm_id *cmid,
+kiblnd_create_conn(kib_peer_ni_t *peer_ni, struct rdma_cm_id *cmid,
 		   int state, int version)
 {
 	/* CAVEAT EMPTOR:
 	 * If the new conn is created successfully it takes over the caller's
-	 * ref on 'peer'. It also "owns" 'cmid' and destroys it when it itself
-	 * is destroyed. On failure, the caller's ref on 'peer' remains and
+	 * ref on 'peer_ni'. It also "owns" 'cmid' and destroys it when it itself
+	 * is destroyed. On failure, the caller's ref on 'peer_ni' remains and
 	 * she must dispose of 'cmid'. (Actually I'd block forever if I tried
 	 * to destroy 'cmid' here since I'm called from the CM which still has
 	 * its ref on 'cmid'). */
 	rwlock_t *glock = &kiblnd_data.kib_global_lock;
-	kib_net_t *net = peer->ibp_ni->ni_data;
+	kib_net_t *net = peer_ni->ibp_ni->ni_data;
 	kib_dev_t *dev;
 	struct ib_qp_init_attr *init_qp_attr;
 	struct kib_sched_info *sched;
@@ -722,33 +759,42 @@ kiblnd_create_conn(kib_peer_t *peer, struct rdma_cm_id *cmid,
 
 	dev = net->ibn_dev;
 
-	cpt = lnet_cpt_of_nid(peer->ibp_nid);
-	sched = kiblnd_data.kib_scheds[cpt];
+	cpt = lnet_cpt_of_nid(peer_ni->ibp_nid, peer_ni->ibp_ni);
+	sched = kiblnd_get_scheduler(cpt);
 
-	LASSERT(sched->ibs_nthreads > 0);
+	if (sched == NULL) {
+		CERROR("no schedulers available. node is unhealthy\n");
+		goto failed_0;
+	}
+
+	/*
+	 * The cpt might have changed if we ended up selecting a non cpt
+	 * native scheduler. So use the scheduler's cpt instead.
+	 */
+	cpt = sched->ibs_cpt;
 
 	LIBCFS_CPT_ALLOC(init_qp_attr, lnet_cpt_table(), cpt,
 			 sizeof(*init_qp_attr));
 	if (init_qp_attr == NULL) {
 		CERROR("Can't allocate qp_attr for %s\n",
-		       libcfs_nid2str(peer->ibp_nid));
+		       libcfs_nid2str(peer_ni->ibp_nid));
 		goto failed_0;
 	}
 
 	LIBCFS_CPT_ALLOC(conn, lnet_cpt_table(), cpt, sizeof(*conn));
 	if (conn == NULL) {
 		CERROR("Can't allocate connection for %s\n",
-		       libcfs_nid2str(peer->ibp_nid));
+		       libcfs_nid2str(peer_ni->ibp_nid));
 		goto failed_1;
 	}
 
 	conn->ibc_state = IBLND_CONN_INIT;
 	conn->ibc_version = version;
-	conn->ibc_peer = peer;			/* I take the caller's ref */
+	conn->ibc_peer = peer_ni;		/* I take the caller's ref */
 	cmid->context = conn;			/* for future CM callbacks */
 	conn->ibc_cmid = cmid;
-	conn->ibc_max_frags = peer->ibp_max_frags;
-	conn->ibc_queue_depth = peer->ibp_queue_depth;
+	conn->ibc_max_frags = peer_ni->ibp_max_frags;
+	conn->ibc_queue_depth = peer_ni->ibp_queue_depth;
 
 	INIT_LIST_HEAD(&conn->ibc_early_rxs);
 	INIT_LIST_HEAD(&conn->ibc_tx_noops);
@@ -833,16 +879,16 @@ kiblnd_create_conn(kib_peer_t *peer, struct rdma_cm_id *cmid,
 		goto failed_2;
 	}
 
-	init_qp_attr->event_handler = kiblnd_qp_event;
-	init_qp_attr->qp_context = conn;
+	init_qp_attr->event_handler = kiblnd_qp_event;
+	init_qp_attr->qp_context = conn;
 	init_qp_attr->cap.max_send_wr = IBLND_SEND_WRS(conn);
 	init_qp_attr->cap.max_recv_wr = IBLND_RECV_WRS(conn);
-	init_qp_attr->cap.max_send_sge = 1;
-	init_qp_attr->cap.max_recv_sge = 1;
-	init_qp_attr->sq_sig_type = IB_SIGNAL_REQ_WR;
-	init_qp_attr->qp_type = IB_QPT_RC;
-	init_qp_attr->send_cq = cq;
-	init_qp_attr->recv_cq = cq;
+	init_qp_attr->cap.max_send_sge = *kiblnd_tunables.kib_wrq_sge;
+	init_qp_attr->cap.max_recv_sge = 1;
+	init_qp_attr->sq_sig_type = IB_SIGNAL_REQ_WR;
+	init_qp_attr->qp_type = IB_QPT_RC;
+	init_qp_attr->send_cq = cq;
+	init_qp_attr->recv_cq = cq;
 
 	conn->ibc_sched = sched;
 
@@ -855,9 +901,12 @@ kiblnd_create_conn(kib_peer_t *peer, struct rdma_cm_id *cmid,
 	} while (rc);
 
 	if (rc) {
- CERROR("Can't create QP: %d, send_wr: %d, recv_wr: %d\n", - rc, init_qp_attr->cap.max_send_wr, - init_qp_attr->cap.max_recv_wr); + CERROR("Can't create QP: %d, send_wr: %d, recv_wr: %d, " + "send_sge: %d, recv_sge: %d\n", + rc, init_qp_attr->cap.max_send_wr, + init_qp_attr->cap.max_recv_wr, + init_qp_attr->cap.max_send_sge, + init_qp_attr->cap.max_recv_sge); goto failed_2; } @@ -921,7 +970,7 @@ void kiblnd_destroy_conn(kib_conn_t *conn, bool free_conn) { struct rdma_cm_id *cmid = conn->ibc_cmid; - kib_peer_t *peer = conn->ibc_peer; + kib_peer_ni_t *peer_ni = conn->ibc_peer; int rc; LASSERT (!in_interrupt()); @@ -975,9 +1024,9 @@ kiblnd_destroy_conn(kib_conn_t *conn, bool free_conn) /* See CAVEAT EMPTOR above in kiblnd_create_conn */ if (conn->ibc_state != IBLND_CONN_INIT) { - kib_net_t *net = peer->ibp_ni->ni_data; + kib_net_t *net = peer_ni->ibp_ni->ni_data; - kiblnd_peer_decref(peer); + kiblnd_peer_decref(peer_ni); rdma_destroy_id(cmid); atomic_dec(&net->ibn_nconns); } @@ -987,19 +1036,19 @@ kiblnd_destroy_conn(kib_conn_t *conn, bool free_conn) } int -kiblnd_close_peer_conns_locked(kib_peer_t *peer, int why) +kiblnd_close_peer_conns_locked(kib_peer_ni_t *peer_ni, int why) { kib_conn_t *conn; struct list_head *ctmp; struct list_head *cnxt; int count = 0; - list_for_each_safe(ctmp, cnxt, &peer->ibp_conns) { + list_for_each_safe(ctmp, cnxt, &peer_ni->ibp_conns) { conn = list_entry(ctmp, kib_conn_t, ibc_list); CDEBUG(D_NET, "Closing conn -> %s, " "version: %x, reason: %d\n", - libcfs_nid2str(peer->ibp_nid), + libcfs_nid2str(peer_ni->ibp_nid), conn->ibc_version, why); kiblnd_close_conn_locked(conn, why); @@ -1010,7 +1059,7 @@ kiblnd_close_peer_conns_locked(kib_peer_t *peer, int why) } int -kiblnd_close_stale_conns_locked(kib_peer_t *peer, +kiblnd_close_stale_conns_locked(kib_peer_ni_t *peer_ni, int version, __u64 incarnation) { kib_conn_t *conn; @@ -1018,7 +1067,7 @@ kiblnd_close_stale_conns_locked(kib_peer_t *peer, struct list_head *cnxt; int count = 0; - list_for_each_safe(ctmp, cnxt, &peer->ibp_conns) { + list_for_each_safe(ctmp, cnxt, &peer_ni->ibp_conns) { conn = list_entry(ctmp, kib_conn_t, ibc_list); if (conn->ibc_version == version && @@ -1027,7 +1076,7 @@ kiblnd_close_stale_conns_locked(kib_peer_t *peer, CDEBUG(D_NET, "Closing stale conn -> %s version: %x, " "incarnation:%#llx(%x, %#llx)\n", - libcfs_nid2str(peer->ibp_nid), + libcfs_nid2str(peer_ni->ibp_nid), conn->ibc_version, conn->ibc_incarnation, version, incarnation); @@ -1039,9 +1088,9 @@ kiblnd_close_stale_conns_locked(kib_peer_t *peer, } static int -kiblnd_close_matching_conns(lnet_ni_t *ni, lnet_nid_t nid) +kiblnd_close_matching_conns(struct lnet_ni *ni, lnet_nid_t nid) { - kib_peer_t *peer; + kib_peer_ni_t *peer_ni; struct list_head *ptmp; struct list_head *pnxt; int lo; @@ -1062,16 +1111,16 @@ kiblnd_close_matching_conns(lnet_ni_t *ni, lnet_nid_t nid) for (i = lo; i <= hi; i++) { list_for_each_safe(ptmp, pnxt, &kiblnd_data.kib_peers[i]) { - peer = list_entry(ptmp, kib_peer_t, ibp_list); - LASSERT(!kiblnd_peer_idle(peer)); + peer_ni = list_entry(ptmp, kib_peer_ni_t, ibp_list); + LASSERT(!kiblnd_peer_idle(peer_ni)); - if (peer->ibp_ni != ni) + if (peer_ni->ibp_ni != ni) continue; - if (!(nid == LNET_NID_ANY || nid == peer->ibp_nid)) + if (!(nid == LNET_NID_ANY || nid == peer_ni->ibp_nid)) continue; - count += kiblnd_close_peer_conns_locked(peer, 0); + count += kiblnd_close_peer_conns_locked(peer_ni, 0); } } @@ -1085,7 +1134,7 @@ kiblnd_close_matching_conns(lnet_ni_t *ni, lnet_nid_t nid) } static int -kiblnd_ctl(lnet_ni_t 
+kiblnd_ctl(struct lnet_ni *ni, unsigned int cmd, void *arg)
 {
 	struct libcfs_ioctl_data *data = arg;
 	int rc = -EINVAL;
@@ -1116,15 +1165,15 @@ kiblnd_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg)
 			break;
 		}
 
-		LASSERT (conn->ibc_cmid != NULL);
-		data->ioc_nid = conn->ibc_peer->ibp_nid;
-		if (conn->ibc_cmid->route.path_rec == NULL)
-			data->ioc_u32[0] = 0; /* iWarp has no path MTU */
-		else
-			data->ioc_u32[0] =
-			ib_mtu_enum_to_int(conn->ibc_cmid->route.path_rec->mtu);
-		kiblnd_conn_decref(conn);
-		break;
+		LASSERT(conn->ibc_cmid != NULL);
+		data->ioc_nid = conn->ibc_peer->ibp_nid;
+		if (conn->ibc_cmid->route.path_rec == NULL)
+			data->ioc_u32[0] = 0; /* iWarp has no path MTU */
+		else
+			data->ioc_u32[0] =
+			ib_mtu_enum_to_int(conn->ibc_cmid->route.path_rec->mtu);
+		kiblnd_conn_decref(conn);
+		break;
 	}
 	case IOC_LIBCFS_CLOSE_CONNECTION: {
 		rc = kiblnd_close_matching_conns(ni, data->ioc_nid);
@@ -1139,32 +1188,32 @@ kiblnd_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg)
 }
 
 static void
-kiblnd_query(lnet_ni_t *ni, lnet_nid_t nid, cfs_time_t *when)
+kiblnd_query(struct lnet_ni *ni, lnet_nid_t nid, cfs_time_t *when)
 {
 	cfs_time_t last_alive = 0;
 	cfs_time_t now = cfs_time_current();
 	rwlock_t *glock = &kiblnd_data.kib_global_lock;
-	kib_peer_t *peer;
+	kib_peer_ni_t *peer_ni;
 	unsigned long flags;
 
 	read_lock_irqsave(glock, flags);
 
-	peer = kiblnd_find_peer_locked(nid);
-	if (peer != NULL)
-		last_alive = peer->ibp_last_alive;
+	peer_ni = kiblnd_find_peer_locked(ni, nid);
+	if (peer_ni != NULL)
+		last_alive = peer_ni->ibp_last_alive;
 
 	read_unlock_irqrestore(glock, flags);
 
 	if (last_alive != 0)
 		*when = last_alive;
 
-	/* peer is not persistent in hash, trigger peer creation
+	/* peer_ni is not persistent in hash, trigger peer_ni creation
 	 * and connection establishment with a NULL tx */
-	if (peer == NULL)
+	if (peer_ni == NULL)
 		kiblnd_launch_tx(ni, NULL, nid);
 
-	CDEBUG(D_NET, "Peer %s %p, alive %ld secs ago\n",
-	       libcfs_nid2str(nid), peer,
+	CDEBUG(D_NET, "peer_ni %s %p, alive %ld secs ago\n",
+	       libcfs_nid2str(nid), peer_ni,
 	       last_alive ? cfs_duration_sec(now - last_alive) : -1);
 	return;
 }
@@ -1381,6 +1430,7 @@ kiblnd_map_tx_pool(kib_tx_pool_t *tpo)
 	}
 }
 
+#ifdef HAVE_IB_GET_DMA_MR
 struct ib_mr *
 kiblnd_find_rd_dma_mr(struct lnet_ni *ni, kib_rdma_desc_t *rd,
 		      int negotiated_nfrags)
@@ -1391,7 +1441,7 @@ kiblnd_find_rd_dma_mr(struct lnet_ni *ni, kib_rdma_desc_t *rd,
 	int	mod;
 	__u16	nfrags;
 
-	tunables = &ni->ni_lnd_tunables->lt_tun_u.lt_o2ib;
+	tunables = &ni->ni_lnd_tunables.lnd_tun_u.lnd_o2ib;
 	mod = tunables->lnd_map_on_demand;
 	nfrags = (negotiated_nfrags != -1) ? negotiated_nfrags : mod;
@@ -1402,6 +1452,7 @@ kiblnd_find_rd_dma_mr(struct lnet_ni *ni, kib_rdma_desc_t *rd,
 
 	return hdev->ibh_mrs;
 }
+#endif
 
 static void
 kiblnd_destroy_fmr_pool(kib_fmr_pool_t *fpo)
@@ -1937,7 +1988,7 @@ again:
 			return 0;
 		}
 		spin_unlock(&fps->fps_lock);
-		rc = -EBUSY;
+		rc = -EAGAIN;
 	}
 
 	spin_lock(&fps->fps_lock);
@@ -2221,7 +2272,8 @@ kiblnd_destroy_tx_pool(kib_pool_t *pool)
 		goto out;
 
 	for (i = 0; i < pool->po_size; i++) {
-		kib_tx_t *tx = &tpo->tpo_tx_descs[i];
+		kib_tx_t *tx = &tpo->tpo_tx_descs[i];
+		int	  wrq_sge = *kiblnd_tunables.kib_wrq_sge;
 
 		list_del(&tx->tx_list);
 		if (tx->tx_pages != NULL)
@@ -2236,10 +2288,10 @@ kiblnd_destroy_tx_pool(kib_pool_t *pool)
 			LIBCFS_FREE(tx->tx_wrq,
 				    (1 + IBLND_MAX_RDMA_FRAGS) *
 				    sizeof(*tx->tx_wrq));
-		if (tx->tx_sge != NULL)
-			LIBCFS_FREE(tx->tx_sge,
-				    (1 + IBLND_MAX_RDMA_FRAGS) *
-				    sizeof(*tx->tx_sge));
+		if (tx->tx_sge != NULL)
+			LIBCFS_FREE(tx->tx_sge,
+				    (1 + IBLND_MAX_RDMA_FRAGS) * wrq_sge *
+				    sizeof(*tx->tx_sge));
 		if (tx->tx_rd != NULL)
 			LIBCFS_FREE(tx->tx_rd,
 				    offsetof(kib_rdma_desc_t,
@@ -2297,7 +2349,8 @@ kiblnd_create_tx_pool(kib_poolset_t *ps, int size, kib_pool_t **pp_po)
 	memset(tpo->tpo_tx_descs, 0, size * sizeof(kib_tx_t));
 
 	for (i = 0; i < size; i++) {
-		kib_tx_t *tx = &tpo->tpo_tx_descs[i];
+		kib_tx_t *tx = &tpo->tpo_tx_descs[i];
+		int	  wrq_sge = *kiblnd_tunables.kib_wrq_sge;
 
 		tx->tx_pool = tpo;
 		if (ps->ps_net->ibn_fmr_ps != NULL) {
@@ -2323,7 +2376,7 @@ kiblnd_create_tx_pool(kib_poolset_t *ps, int size, kib_pool_t **pp_po)
 			break;
 
 		LIBCFS_CPT_ALLOC(tx->tx_sge, lnet_cpt_table(), ps->ps_cpt,
-				 (1 + IBLND_MAX_RDMA_FRAGS) *
+				 (1 + IBLND_MAX_RDMA_FRAGS) * wrq_sge *
 				 sizeof(*tx->tx_sge));
 		if (tx->tx_sge == NULL)
 			break;
@@ -2387,16 +2440,20 @@ kiblnd_net_fini_pools(kib_net_t *net)
 }
 
 static int
-kiblnd_net_init_pools(kib_net_t *net, lnet_ni_t *ni, __u32 *cpts, int ncpts)
+kiblnd_net_init_pools(kib_net_t *net, struct lnet_ni *ni, __u32 *cpts,
+		      int ncpts)
 {
 	struct lnet_ioctl_config_o2iblnd_tunables *tunables;
+#ifdef HAVE_IB_GET_DMA_MR
 	unsigned long	flags;
+#endif
 	int		cpt;
 	int		rc;
 	int		i;
 
-	tunables = &ni->ni_lnd_tunables->lt_tun_u.lt_o2ib;
+	tunables = &ni->ni_lnd_tunables.lnd_tun_u.lnd_o2ib;
 
+#ifdef HAVE_IB_GET_DMA_MR
 	read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
 	if (tunables->lnd_map_on_demand == 0) {
 		read_unlock_irqrestore(&kiblnd_data.kib_global_lock,
@@ -2405,6 +2462,7 @@ kiblnd_net_init_pools(kib_net_t *net, lnet_ni_t *ni, __u32 *cpts, int ncpts)
 	}
 
 	read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+#endif
 
 	if (tunables->lnd_fmr_pool_size < *kiblnd_tunables.kib_ntx / 4) {
 		CERROR("Can't set fmr pool size (%d) < ntx / 4(%d)\n",
@@ -2443,7 +2501,9 @@ kiblnd_net_init_pools(kib_net_t *net, lnet_ni_t *ni, __u32 *cpts, int ncpts)
 	if (i > 0)
 		LASSERT(i == ncpts);
 
+#ifdef HAVE_IB_GET_DMA_MR
 create_tx_pool:
+#endif
 	net->ibn_tx_ps = cfs_percpt_alloc(lnet_cpt_table(),
 					  sizeof(kib_tx_poolset_t));
 	if (net->ibn_tx_ps == NULL) {
@@ -2518,6 +2578,7 @@ kiblnd_hdev_get_attr(kib_hca_dev_t *hdev)
 	return -EINVAL;
 }
 
+#ifdef HAVE_IB_GET_DMA_MR
 static void
 kiblnd_hdev_cleanup_mrs(kib_hca_dev_t *hdev)
 {
@@ -2528,11 +2589,14 @@ kiblnd_hdev_cleanup_mrs(kib_hca_dev_t *hdev)
 
 	hdev->ibh_mrs = NULL;
 }
+#endif
 
 void
 kiblnd_hdev_destroy(kib_hca_dev_t *hdev)
 {
+#ifdef HAVE_IB_GET_DMA_MR
 	kiblnd_hdev_cleanup_mrs(hdev);
+#endif
 
 	if (hdev->ibh_pd != NULL)
 		ib_dealloc_pd(hdev->ibh_pd);
@@ -2543,6 +2607,7 @@ kiblnd_hdev_destroy(kib_hca_dev_t *hdev)
 	LIBCFS_FREE(hdev, sizeof(*hdev));
 }
 
+#ifdef HAVE_IB_GET_DMA_MR
 static int
 kiblnd_hdev_setup_mrs(kib_hca_dev_t *hdev)
 {
@@ -2566,6 +2631,7 @@ kiblnd_hdev_setup_mrs(kib_hca_dev_t *hdev)
 
 	return 0;
 }
+#endif
 
 static int
 kiblnd_dummy_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event)
@@ -2702,12 +2768,16 @@ kiblnd_dev_failover(kib_dev_t *dev)
 	hdev->ibh_cmid  = cmid;
 	hdev->ibh_ibdev = cmid->device;
 
-	pd = ib_alloc_pd(cmid->device);
-	if (IS_ERR(pd)) {
-		rc = PTR_ERR(pd);
-		CERROR("Can't allocate PD: %d\n", rc);
-		goto out;
-	}
+#ifdef HAVE_IB_ALLOC_PD_2ARGS
+	pd = ib_alloc_pd(cmid->device, 0);
+#else
+	pd = ib_alloc_pd(cmid->device);
+#endif
+	if (IS_ERR(pd)) {
+		rc = PTR_ERR(pd);
+		CERROR("Can't allocate PD: %d\n", rc);
+		goto out;
+	}
 
 	hdev->ibh_pd = pd;
 
@@ -2717,11 +2787,19 @@ kiblnd_dev_failover(kib_dev_t *dev)
 		goto out;
 	}
 
-	rc = kiblnd_hdev_setup_mrs(hdev);
-	if (rc != 0) {
-		CERROR("Can't setup device: %d\n", rc);
-		goto out;
-	}
+#ifdef HAVE_IB_GET_DMA_MR
+	rc = kiblnd_hdev_setup_mrs(hdev);
+	if (rc != 0) {
+		CERROR("Can't setup device: %d\n", rc);
+		goto out;
+	}
+#else
+	rc = kiblnd_hdev_get_attr(hdev);
+	if (rc != 0) {
+		CERROR("Can't get device attributes: %d\n", rc);
+		goto out;
+	}
+#endif
 
 	write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
 
@@ -2899,7 +2977,7 @@ kiblnd_base_shutdown(void)
 }
 
 static void
-kiblnd_shutdown (lnet_ni_t *ni)
+kiblnd_shutdown(struct lnet_ni *ni)
 {
 	kib_net_t *net = ni->ni_data;
 	rwlock_t *g_lock = &kiblnd_data.kib_global_lock;
@@ -2926,7 +3004,7 @@ kiblnd_shutdown (lnet_ni_t *ni)
 	/* nuke all existing peers within this net */
 	kiblnd_del_peer(ni, LNET_NID_ANY);
 
-	/* Wait for all peer state to clean up */
+	/* Wait for all peer_ni state to clean up */
 	i = 2;
 	while (atomic_read(&net->ibn_npeers) != 0) {
 		i++;
@@ -3167,17 +3245,17 @@ kiblnd_dev_search(char *ifname)
 }
 
 static int
-kiblnd_startup (lnet_ni_t *ni)
+kiblnd_startup(struct lnet_ni *ni)
 {
 	char *ifname;
 	kib_dev_t *ibdev = NULL;
 	kib_net_t *net;
-	struct timeval tv;
 	unsigned long flags;
 	int rc;
 	int newdev;
+	int node_id;
 
-	LASSERT (ni->ni_lnd == &the_o2iblnd);
+	LASSERT (ni->ni_net->net_lnd == &the_o2iblnd);
 
 	if (kiblnd_data.kib_init == IBLND_INIT_NOTHING) {
 		rc = kiblnd_base_startup();
@@ -3190,24 +3268,23 @@ kiblnd_startup (lnet_ni_t *ni)
 	if (net == NULL)
 		goto failed;
 
-	do_gettimeofday(&tv);
-	net->ibn_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
+	net->ibn_incarnation = ktime_get_real_ns() / NSEC_PER_USEC;
 
 	kiblnd_tunables_setup(ni);
 
-	if (ni->ni_interfaces[0] != NULL) {
-		/* Use the IPoIB interface specified in 'networks=' */
+	if (ni->ni_interfaces[0] != NULL) {
+		/* Use the IPoIB interface specified in 'networks=' */
 
-		CLASSERT (LNET_MAX_INTERFACES > 1);
-		if (ni->ni_interfaces[1] != NULL) {
-			CERROR("Multiple interfaces not supported\n");
-			goto failed;
-		}
+		CLASSERT(LNET_NUM_INTERFACES > 1);
+		if (ni->ni_interfaces[1] != NULL) {
+			CERROR("Multiple interfaces not supported\n");
+			goto failed;
+		}
 
-		ifname = ni->ni_interfaces[0];
-	} else {
-		ifname = *kiblnd_tunables.kib_default_ipif;
-	}
+		ifname = ni->ni_interfaces[0];
+	} else {
+		ifname = *kiblnd_tunables.kib_default_ipif;
+	}
 
 	if (strlen(ifname) >= sizeof(ibdev->ibd_ifname)) {
 		CERROR("IPoIB interface name too long: %s\n", ifname);
@@ -3219,13 +3296,16 @@ kiblnd_startup (lnet_ni_t *ni)
 	newdev = ibdev == NULL;
 	/* hmm...create kib_dev even for alias */
 	if (ibdev == NULL || strcmp(&ibdev->ibd_ifname[0], ifname) != 0)
-		ibdev = kiblnd_create_dev(ifname);
+		ibdev = kiblnd_create_dev(ifname);
 
-	if (ibdev == NULL)
-		goto failed;
+	if (ibdev == NULL)
+		goto failed;
+
+	node_id = dev_to_node(ibdev->ibd_hdev->ibh_ibdev->dma_device);
+	ni->ni_dev_cpt = cfs_cpt_of_node(lnet_cpt_table(), node_id);
 
-	net->ibn_dev = ibdev;
-	ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), ibdev->ibd_ifip);
+	net->ibn_dev = ibdev;
+	ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), ibdev->ibd_ifip);
 
 	rc = kiblnd_dev_start_threads(ibdev, newdev,
 				      ni->ni_cpts, ni->ni_ncpts);
@@ -3257,7 +3337,7 @@ failed:
 	return -ENETDOWN;
}
 
-static lnd_t the_o2iblnd = {
+static struct lnet_lnd the_o2iblnd = {
 	.lnd_type	= O2IBLND,
 	.lnd_startup	= kiblnd_startup,
 	.lnd_shutdown	= kiblnd_shutdown,
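
The kiblnd_get_scheduler() helper added above prefers the scheduler bound to
the connection's CPT and, when that partition has no service threads (a NUMA
node configured with no CPUs), falls back to the first scheduler that does.
A minimal user-space sketch of the same fallback; struct sched_info and the
scheds[] array are hypothetical stand-ins for struct kib_sched_info and the
kiblnd_data.kib_scheds percpt array:

#include <stdio.h>

/* Hypothetical stand-in for struct kib_sched_info. */
struct sched_info {
	int nthreads;	/* like ibs_nthreads: 0 => CPT has no CPUs */
	int cpt;	/* like ibs_cpt: the partition this scheduler serves */
};

#define NCPTS 4

/* Stand-in percpt array; CPT 1 is deliberately CPU-less. */
static struct sched_info scheds[NCPTS] = {
	{ 2, 0 }, { 0, 1 }, { 2, 2 }, { 2, 3 },
};

static struct sched_info *get_scheduler(int cpt)
{
	int i;

	if (scheds[cpt].nthreads > 0)
		return &scheds[cpt];	/* CPT-native scheduler is usable */

	for (i = 0; i < NCPTS; i++)	/* like cfs_percpt_for_each() */
		if (scheds[i].nthreads > 0)
			return &scheds[i];

	return NULL;	/* no scheduler has threads: node is unhealthy */
}

int main(void)
{
	struct sched_info *sched = get_scheduler(1);

	/* As in kiblnd_create_conn(), adopt the scheduler's own CPT when
	 * the fallback selected a non-native partition. */
	if (sched != NULL)
		printf("cpt 1 served by scheduler for cpt %d\n", sched->cpt);
	return 0;
}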
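
The HAVE_IB_GET_DMA_MR and HAVE_IB_ALLOC_PD_2ARGS guards introduced above
follow the usual configure-time probe pattern for kernel API drift: a compile
test defines the macro, and each call site selects the matching signature
(ib_alloc_pd() gained a flags argument). A self-contained sketch of the
pattern; HAVE_NEW_ALLOC_PD and both alloc_pd_*() functions are invented for
illustration and are not Lustre or RDMA APIs:

#include <stdio.h>

/* Two invented generations of the same API: the newer one grew a
 * 'flags' argument, as ib_alloc_pd() did. */
static int alloc_pd_new(const char *dev, unsigned int flags)
{
	printf("new API: %s flags=%u\n", dev, flags);
	return 0;
}

static int alloc_pd_old(const char *dev)
{
	printf("old API: %s\n", dev);
	return 0;
}

/* A configure-time compile probe would define HAVE_NEW_ALLOC_PD when a
 * two-argument call compiles, mirroring how HAVE_IB_ALLOC_PD_2ARGS is
 * detected; call sites then collapse to a single spelling. */
#ifdef HAVE_NEW_ALLOC_PD
# define ALLOC_PD(dev)	alloc_pd_new(dev, 0)
#else
# define ALLOC_PD(dev)	alloc_pd_old(dev)
#endif

int main(void)
{
	return ALLOC_PD("mlx5_0");
}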
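
The tx_sge allocations above grow from (1 + IBLND_MAX_RDMA_FRAGS) entries to
(1 + IBLND_MAX_RDMA_FRAGS) * wrq_sge, since each work request may now carry
wrq_sge scatter/gather entries. A quick check of the arithmetic; the constants
are illustrative only (IBLND_MAX_RDMA_FRAGS is typically LNET_MAX_IOV, i.e.
256, and the entry size stands in for sizeof(struct ib_sge)):

#include <stdio.h>

#define MAX_RDMA_FRAGS	256	/* illustrative IBLND_MAX_RDMA_FRAGS */

int main(void)
{
	int wrq_sge = 2;	/* illustrative value of *kib_wrq_sge */
	size_t sge_size = 16;	/* stand-in for sizeof(struct ib_sge) */

	/* One WR per RDMA fragment plus one for the message descriptor,
	 * each with up to wrq_sge SGEs: the product used for tx_sge. */
	size_t nsge = (size_t)(1 + MAX_RDMA_FRAGS) * wrq_sge;

	printf("%zu SGEs, %zu bytes per tx descriptor\n",
	       nsge, nsge * sge_size);
	return 0;
}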
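
The ibn_incarnation change replaces do_gettimeofday() with
ktime_get_real_ns() / NSEC_PER_USEC; both expressions yield microseconds since
the epoch, so incarnations remain comparable between old and new modules. A
user-space check of that equivalence, with clock_gettime() standing in for
ktime_get_real_ns():

#include <stdio.h>
#include <time.h>

int main(void)
{
	struct timespec ts;
	unsigned long long old_style, new_style;

	clock_gettime(CLOCK_REALTIME, &ts);

	/* do_gettimeofday() form: seconds * 10^6 + microseconds */
	old_style = (unsigned long long)ts.tv_sec * 1000000 +
		    ts.tv_nsec / 1000;

	/* ktime_get_real_ns() / NSEC_PER_USEC form */
	new_style = ((unsigned long long)ts.tv_sec * 1000000000ULL +
		     ts.tv_nsec) / 1000;

	printf("%llu == %llu\n", old_style, new_style);
	return 0;
}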