X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=blobdiff_plain;f=lnet%2Fklnds%2Fo2iblnd%2Fo2iblnd.c;h=d939510fe64bf03aefbea39cacd080cc6d558ab0;hp=3437f30120b538dc75135e7716d5936cbc365e49;hb=8a3ef5713cc4aed1ac7bd3ce177895caa597cc4c;hpb=a1282a0d8a5361932c25c37172d360679fbd8232 diff --git a/lnet/klnds/o2iblnd/o2iblnd.c b/lnet/klnds/o2iblnd/o2iblnd.c index 3437f30..d939510 100644 --- a/lnet/klnds/o2iblnd/o2iblnd.c +++ b/lnet/klnds/o2iblnd/o2iblnd.c @@ -39,7 +39,7 @@ #include "o2iblnd.h" -static struct lnet_lnd the_o2iblnd; +static const struct lnet_lnd the_o2iblnd; struct kib_data kiblnd_data; @@ -335,6 +335,7 @@ kiblnd_create_peer(struct lnet_ni *ni, struct kib_peer_ni **peerp, peer_ni->ibp_last_alive = 0; peer_ni->ibp_max_frags = IBLND_MAX_RDMA_FRAGS; peer_ni->ibp_queue_depth = ni->ni_net->net_tunables.lct_peer_tx_credits; + peer_ni->ibp_queue_depth_mod = 0; /* try to use the default */ atomic_set(&peer_ni->ibp_refcount, 1); /* 1 ref for caller */ INIT_LIST_HEAD(&peer_ni->ibp_list); /* not in the peer_ni table yet */ @@ -372,7 +373,8 @@ kiblnd_destroy_peer(struct kib_peer_ni *peer_ni) * they are destroyed, so we can be assured that _all_ state to do * with this peer_ni has been cleaned up when its refcount drops to * zero. */ - atomic_dec(&net->ibn_npeers); + if (atomic_dec_and_test(&net->ibn_npeers)) + wake_up_var(&net->ibn_npeers); } struct kib_peer_ni * @@ -459,18 +461,15 @@ kiblnd_get_peer_info(struct lnet_ni *ni, int index, static void kiblnd_del_peer_locked(struct kib_peer_ni *peer_ni) { - struct list_head *ctmp; - struct list_head *cnxt; + struct kib_conn *cnxt; struct kib_conn *conn; if (list_empty(&peer_ni->ibp_conns)) { kiblnd_unlink_peer_locked(peer_ni); } else { - list_for_each_safe(ctmp, cnxt, &peer_ni->ibp_conns) { - conn = list_entry(ctmp, struct kib_conn, ibc_list); - + list_for_each_entry_safe(conn, cnxt, &peer_ni->ibp_conns, + ibc_list) kiblnd_close_conn_locked(conn, 0); - } /* NB closing peer_ni's last conn unlinked it. */ } /* NB peer_ni now unlinked; might even be freed if the peer_ni table had the @@ -635,40 +634,16 @@ kiblnd_debug_conn(struct kib_conn *conn) spin_unlock(&conn->ibc_lock); } -int -kiblnd_translate_mtu(int value) -{ - switch (value) { - default: - return -1; - case 0: - return 0; - case 256: - return IB_MTU_256; - case 512: - return IB_MTU_512; - case 1024: - return IB_MTU_1024; - case 2048: - return IB_MTU_2048; - case 4096: - return IB_MTU_4096; - } -} - static void kiblnd_setup_mtu_locked(struct rdma_cm_id *cmid) { - int mtu; - /* XXX There is no path record for iWARP, set by netdev->change_mtu? 
*/ if (cmid->route.path_rec == NULL) return; - mtu = kiblnd_translate_mtu(*kiblnd_tunables.kib_ib_mtu); - LASSERT (mtu >= 0); - if (mtu != 0) - cmid->route.path_rec->mtu = mtu; + if (*kiblnd_tunables.kib_ib_mtu) + cmid->route.path_rec->mtu = + ib_mtu_int_to_enum(*kiblnd_tunables.kib_ib_mtu); } static int @@ -772,7 +747,7 @@ kiblnd_create_conn(struct kib_peer_ni *peer_ni, struct rdma_cm_id *cmid, rwlock_t *glock = &kiblnd_data.kib_global_lock; struct kib_net *net = peer_ni->ibp_ni->ni_data; struct kib_dev *dev; - struct ib_qp_init_attr *init_qp_attr; + struct ib_qp_init_attr init_qp_attr = {}; struct kib_sched_info *sched; #ifdef HAVE_IB_CQ_INIT_ATTR struct ib_cq_init_attr cq_attr = {}; @@ -803,19 +778,11 @@ kiblnd_create_conn(struct kib_peer_ni *peer_ni, struct rdma_cm_id *cmid, */ cpt = sched->ibs_cpt; - LIBCFS_CPT_ALLOC(init_qp_attr, lnet_cpt_table(), cpt, - sizeof(*init_qp_attr)); - if (init_qp_attr == NULL) { - CERROR("Can't allocate qp_attr for %s\n", - libcfs_nid2str(peer_ni->ibp_nid)); - goto failed_0; - } - LIBCFS_CPT_ALLOC(conn, lnet_cpt_table(), cpt, sizeof(*conn)); if (conn == NULL) { CERROR("Can't allocate connection for %s\n", libcfs_nid2str(peer_ni->ibp_nid)); - goto failed_1; + goto failed_0; } conn->ibc_state = IBLND_CONN_INIT; @@ -904,40 +871,57 @@ kiblnd_create_conn(struct kib_peer_ni *peer_ni, struct rdma_cm_id *cmid, goto failed_2; } - init_qp_attr->event_handler = kiblnd_qp_event; - init_qp_attr->qp_context = conn; - init_qp_attr->cap.max_send_sge = *kiblnd_tunables.kib_wrq_sge; - init_qp_attr->cap.max_recv_sge = 1; - init_qp_attr->sq_sig_type = IB_SIGNAL_REQ_WR; - init_qp_attr->qp_type = IB_QPT_RC; - init_qp_attr->send_cq = cq; - init_qp_attr->recv_cq = cq; - /* - * kiblnd_send_wrs() can change the connection's queue depth if - * the maximum work requests for the device is maxed out - */ - init_qp_attr->cap.max_send_wr = kiblnd_send_wrs(conn); - init_qp_attr->cap.max_recv_wr = IBLND_RECV_WRS(conn); + init_qp_attr.event_handler = kiblnd_qp_event; + init_qp_attr.qp_context = conn; + init_qp_attr.cap.max_send_sge = *kiblnd_tunables.kib_wrq_sge; + init_qp_attr.cap.max_recv_sge = 1; + init_qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR; + init_qp_attr.qp_type = IB_QPT_RC; + init_qp_attr.send_cq = cq; + init_qp_attr.recv_cq = cq; + + if (peer_ni->ibp_queue_depth_mod && + peer_ni->ibp_queue_depth_mod < peer_ni->ibp_queue_depth) { + conn->ibc_queue_depth = peer_ni->ibp_queue_depth_mod; + CDEBUG(D_NET, "Use reduced queue depth %u (from %u)\n", + peer_ni->ibp_queue_depth_mod, + peer_ni->ibp_queue_depth); + } + + do { + /* kiblnd_send_wrs() can change the connection's queue depth if + * the maximum work requests for the device is maxed out + */ + init_qp_attr.cap.max_send_wr = kiblnd_send_wrs(conn); + init_qp_attr.cap.max_recv_wr = IBLND_RECV_WRS(conn); + rc = rdma_create_qp(cmid, conn->ibc_hdev->ibh_pd, + &init_qp_attr); + if (rc != -ENOMEM || conn->ibc_queue_depth < 2) + break; + conn->ibc_queue_depth--; + } while (rc); - rc = rdma_create_qp(cmid, conn->ibc_hdev->ibh_pd, init_qp_attr); if (rc) { CERROR("Can't create QP: %d, send_wr: %d, recv_wr: %d, " "send_sge: %d, recv_sge: %d\n", - rc, init_qp_attr->cap.max_send_wr, - init_qp_attr->cap.max_recv_wr, - init_qp_attr->cap.max_send_sge, - init_qp_attr->cap.max_recv_sge); + rc, init_qp_attr.cap.max_send_wr, + init_qp_attr.cap.max_recv_wr, + init_qp_attr.cap.max_send_sge, + init_qp_attr.cap.max_recv_sge); goto failed_2; } conn->ibc_sched = sched; - if (conn->ibc_queue_depth != peer_ni->ibp_queue_depth) + if (!peer_ni->ibp_queue_depth_mod 
&& + conn->ibc_queue_depth != peer_ni->ibp_queue_depth) { CWARN("peer %s - queue depth reduced from %u to %u" " to allow for qp creation\n", libcfs_nid2str(peer_ni->ibp_nid), peer_ni->ibp_queue_depth, conn->ibc_queue_depth); + peer_ni->ibp_queue_depth_mod = conn->ibc_queue_depth; + } LIBCFS_CPT_ALLOC(conn->ibc_rxs, lnet_cpt_table(), cpt, IBLND_RX_MSGS(conn) * sizeof(struct kib_rx)); @@ -953,8 +937,6 @@ kiblnd_create_conn(struct kib_peer_ni *peer_ni, struct rdma_cm_id *cmid, kiblnd_map_rx_descs(conn); - LIBCFS_FREE(init_qp_attr, sizeof(*init_qp_attr)); - /* 1 ref for caller and each rxmsg */ atomic_set(&conn->ibc_refcount, 1 + IBLND_RX_MSGS(conn)); conn->ibc_nrx = IBLND_RX_MSGS(conn); @@ -1000,8 +982,6 @@ kiblnd_create_conn(struct kib_peer_ni *peer_ni, struct rdma_cm_id *cmid, failed_2: kiblnd_destroy_conn(conn); LIBCFS_FREE(conn, sizeof(*conn)); - failed_1: - LIBCFS_FREE(init_qp_attr, sizeof(*init_qp_attr)); failed_0: return NULL; } @@ -1050,10 +1030,8 @@ kiblnd_destroy_conn(struct kib_conn *conn) if (conn->ibc_rx_pages != NULL) kiblnd_unmap_rx_descs(conn); - if (conn->ibc_rxs != NULL) { - LIBCFS_FREE(conn->ibc_rxs, - IBLND_RX_MSGS(conn) * sizeof(struct kib_rx)); - } + if (conn->ibc_rxs != NULL) + CFS_FREE_PTR_ARRAY(conn->ibc_rxs, IBLND_RX_MSGS(conn)); if (conn->ibc_connvars != NULL) LIBCFS_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars)); @@ -1075,13 +1053,11 @@ int kiblnd_close_peer_conns_locked(struct kib_peer_ni *peer_ni, int why) { struct kib_conn *conn; - struct list_head *ctmp; - struct list_head *cnxt; - int count = 0; - - list_for_each_safe(ctmp, cnxt, &peer_ni->ibp_conns) { - conn = list_entry(ctmp, struct kib_conn, ibc_list); + struct kib_conn *cnxt; + int count = 0; + list_for_each_entry_safe(conn, cnxt, &peer_ni->ibp_conns, + ibc_list) { CDEBUG(D_NET, "Closing conn -> %s, " "version: %x, reason: %d\n", libcfs_nid2str(peer_ni->ibp_nid), @@ -1099,13 +1075,11 @@ kiblnd_close_stale_conns_locked(struct kib_peer_ni *peer_ni, int version, __u64 incarnation) { struct kib_conn *conn; - struct list_head *ctmp; - struct list_head *cnxt; - int count = 0; - - list_for_each_safe(ctmp, cnxt, &peer_ni->ibp_conns) { - conn = list_entry(ctmp, struct kib_conn, ibc_list); + struct kib_conn *cnxt; + int count = 0; + list_for_each_entry_safe(conn, cnxt, &peer_ni->ibp_conns, + ibc_list) { if (conn->ibc_version == version && conn->ibc_incarnation == incarnation) continue; @@ -1224,36 +1198,6 @@ kiblnd_ctl(struct lnet_ni *ni, unsigned int cmd, void *arg) } static void -kiblnd_query(struct lnet_ni *ni, lnet_nid_t nid, time64_t *when) -{ - time64_t last_alive = 0; - time64_t now = ktime_get_seconds(); - rwlock_t *glock = &kiblnd_data.kib_global_lock; - struct kib_peer_ni *peer_ni; - unsigned long flags; - - read_lock_irqsave(glock, flags); - - peer_ni = kiblnd_find_peer_locked(ni, nid); - if (peer_ni != NULL) - last_alive = peer_ni->ibp_last_alive; - - read_unlock_irqrestore(glock, flags); - - if (last_alive != 0) - *when = last_alive; - - /* peer_ni is not persistent in hash, trigger peer_ni creation - * and connection establishment with a NULL tx */ - if (peer_ni == NULL) - kiblnd_launch_tx(ni, NULL, nid); - - CDEBUG(D_NET, "peer_ni %s %p, alive %lld secs ago\n", - libcfs_nid2str(nid), peer_ni, - last_alive ? 
now - last_alive : -1); -} - -static void kiblnd_free_pages(struct kib_pages *p) { int npages = p->ibp_npages; @@ -1398,8 +1342,7 @@ kiblnd_current_hdev(struct kib_dev *dev) if (i++ % 50 == 0) CDEBUG(D_NET, "%s: Wait for failover\n", dev->ibd_ifname); - set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(cfs_time_seconds(1) / 100); + schedule_timeout_interruptible(cfs_time_seconds(1) / 100); read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); } @@ -1918,14 +1861,14 @@ again: #ifdef HAVE_IB_MAP_MR_SG #ifdef HAVE_IB_MAP_MR_SG_5ARGS n = ib_map_mr_sg(mr, tx->tx_frags, - tx->tx_nfrags, NULL, PAGE_SIZE); + rd->rd_nfrags, NULL, PAGE_SIZE); #else n = ib_map_mr_sg(mr, tx->tx_frags, - tx->tx_nfrags, PAGE_SIZE); + rd->rd_nfrags, PAGE_SIZE); #endif - if (unlikely(n != tx->tx_nfrags)) { + if (unlikely(n != rd->rd_nfrags)) { CERROR("Failed to map mr %d/%d " - "elements\n", n, tx->tx_nfrags); + "elements\n", n, rd->rd_nfrags); return n < 0 ? n : -EINVAL; } @@ -1997,7 +1940,7 @@ again: spin_unlock(&fps->fps_lock); CDEBUG(D_NET, "Another thread is allocating new " "FMR pool, waiting for her to complete\n"); - schedule(); + wait_var_event(fps, !fps->fps_increasing); goto again; } @@ -2015,6 +1958,7 @@ again: rc = kiblnd_create_fmr_pool(fps, &fpo); spin_lock(&fps->fps_lock); fps->fps_increasing = 0; + wake_up_var(fps); if (rc == 0) { fps->fps_version++; list_add_tail(&fpo->fpo_list, &fps->fps_pool_list); @@ -2202,13 +2146,11 @@ again: /* another thread is allocating a new pool */ spin_unlock(&ps->ps_lock); trips++; - CDEBUG(D_NET, "Another thread is allocating new " - "%s pool, waiting %d HZs for her to complete." - "trips = %d\n", + CDEBUG(D_NET, + "Another thread is allocating new %s pool, waiting %d jiffies for her to complete. trips = %d\n", ps->ps_name, interval, trips); - set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(interval); + schedule_timeout_interruptible(interval); if (interval < cfs_time_seconds(1)) interval *= 2; @@ -2261,37 +2203,32 @@ kiblnd_destroy_tx_pool(struct kib_pool *pool) if (tpo->tpo_tx_descs == NULL) goto out; - for (i = 0; i < pool->po_size; i++) { + for (i = 0; i < pool->po_size; i++) { struct kib_tx *tx = &tpo->tpo_tx_descs[i]; int wrq_sge = *kiblnd_tunables.kib_wrq_sge; list_del(&tx->tx_list); - if (tx->tx_pages != NULL) - LIBCFS_FREE(tx->tx_pages, - LNET_MAX_IOV * - sizeof(*tx->tx_pages)); - if (tx->tx_frags != NULL) - LIBCFS_FREE(tx->tx_frags, - (1 + IBLND_MAX_RDMA_FRAGS) * - sizeof(*tx->tx_frags)); - if (tx->tx_wrq != NULL) - LIBCFS_FREE(tx->tx_wrq, - (1 + IBLND_MAX_RDMA_FRAGS) * - sizeof(*tx->tx_wrq)); + if (tx->tx_pages != NULL) + CFS_FREE_PTR_ARRAY(tx->tx_pages, LNET_MAX_IOV); + if (tx->tx_frags != NULL) + CFS_FREE_PTR_ARRAY(tx->tx_frags, + (1 + IBLND_MAX_RDMA_FRAGS)); + if (tx->tx_wrq != NULL) + CFS_FREE_PTR_ARRAY(tx->tx_wrq, + (1 + IBLND_MAX_RDMA_FRAGS)); if (tx->tx_sge != NULL) - LIBCFS_FREE(tx->tx_sge, - (1 + IBLND_MAX_RDMA_FRAGS) * wrq_sge * - sizeof(*tx->tx_sge)); - if (tx->tx_rd != NULL) - LIBCFS_FREE(tx->tx_rd, + CFS_FREE_PTR_ARRAY(tx->tx_sge, + (1 + IBLND_MAX_RDMA_FRAGS) * + wrq_sge); + if (tx->tx_rd != NULL) + LIBCFS_FREE(tx->tx_rd, offsetof(struct kib_rdma_desc, - rd_frags[IBLND_MAX_RDMA_FRAGS])); - } + rd_frags[IBLND_MAX_RDMA_FRAGS])); + } - LIBCFS_FREE(tpo->tpo_tx_descs, - pool->po_size * sizeof(struct kib_tx)); + CFS_FREE_PTR_ARRAY(tpo->tpo_tx_descs, pool->po_size); out: - kiblnd_fini_pool(pool); + kiblnd_fini_pool(pool); CFS_FREE_PTR(tpo); } @@ -2535,10 +2472,95 @@ kiblnd_net_init_pools(struct kib_net *net, struct lnet_ni *ni, __u32 
*cpts, } static int +kiblnd_port_get_attr(struct kib_hca_dev *hdev) +{ + struct ib_port_attr *port_attr; + int rc; + unsigned long flags; + rwlock_t *g_lock = &kiblnd_data.kib_global_lock; + + LIBCFS_ALLOC(port_attr, sizeof(*port_attr)); + if (port_attr == NULL) { + CDEBUG(D_NETERROR, "Out of memory\n"); + return -ENOMEM; + } + + rc = ib_query_port(hdev->ibh_ibdev, hdev->ibh_port, port_attr); + + write_lock_irqsave(g_lock, flags); + + if (rc == 0) + hdev->ibh_state = port_attr->state == IB_PORT_ACTIVE + ? IBLND_DEV_PORT_ACTIVE + : IBLND_DEV_PORT_DOWN; + + write_unlock_irqrestore(g_lock, flags); + LIBCFS_FREE(port_attr, sizeof(*port_attr)); + + if (rc != 0) { + CDEBUG(D_NETERROR, "Failed to query IB port: %d\n", rc); + return rc; + } + return 0; +} + +static inline void +kiblnd_set_ni_fatal_on(struct kib_hca_dev *hdev, int val) +{ + struct kib_net *net; + + /* for health check */ + list_for_each_entry(net, &hdev->ibh_dev->ibd_nets, ibn_list) { + if (val) + CDEBUG(D_NETERROR, "Fatal device error for NI %s\n", + libcfs_nid2str(net->ibn_ni->ni_nid)); + atomic_set(&net->ibn_ni->ni_fatal_error_on, val); + } +} + +void +kiblnd_event_handler(struct ib_event_handler *handler, struct ib_event *event) +{ + rwlock_t *g_lock = &kiblnd_data.kib_global_lock; + struct kib_hca_dev *hdev; + unsigned long flags; + + hdev = container_of(handler, struct kib_hca_dev, ibh_event_handler); + + write_lock_irqsave(g_lock, flags); + + switch (event->event) { + case IB_EVENT_DEVICE_FATAL: + CDEBUG(D_NET, "IB device fatal\n"); + hdev->ibh_state = IBLND_DEV_FATAL; + kiblnd_set_ni_fatal_on(hdev, 1); + break; + case IB_EVENT_PORT_ACTIVE: + CDEBUG(D_NET, "IB port active\n"); + if (event->element.port_num == hdev->ibh_port) { + hdev->ibh_state = IBLND_DEV_PORT_ACTIVE; + kiblnd_set_ni_fatal_on(hdev, 0); + } + break; + case IB_EVENT_PORT_ERR: + CDEBUG(D_NET, "IB port err\n"); + if (event->element.port_num == hdev->ibh_port) { + hdev->ibh_state = IBLND_DEV_PORT_DOWN; + kiblnd_set_ni_fatal_on(hdev, 1); + } + break; + default: + break; + } + write_unlock_irqrestore(g_lock, flags); +} + +static int kiblnd_hdev_get_attr(struct kib_hca_dev *hdev) { struct ib_device_attr *dev_attr; int rc = 0; + int rc2 = 0; /* It's safe to assume a HCA can handle a page size * matching that of the native system */ @@ -2592,6 +2614,10 @@ kiblnd_hdev_get_attr(struct kib_hca_dev *hdev) rc = -ENOSYS; } + rc2 = kiblnd_port_get_attr(hdev); + if (rc2 != 0) + return rc2; + if (rc != 0) rc = -EINVAL; @@ -2624,6 +2650,9 @@ kiblnd_hdev_cleanup_mrs(struct kib_hca_dev *hdev) void kiblnd_hdev_destroy(struct kib_hca_dev *hdev) { + if (hdev->ibh_event_handler.device != NULL) + ib_unregister_event_handler(&hdev->ibh_event_handler); + #ifdef HAVE_IB_GET_DMA_MR kiblnd_hdev_cleanup_mrs(hdev); #endif @@ -2792,6 +2821,7 @@ kiblnd_dev_failover(struct kib_dev *dev, struct net *ns) hdev->ibh_dev = dev; hdev->ibh_cmid = cmid; hdev->ibh_ibdev = cmid->device; + hdev->ibh_port = cmid->port_num; #ifdef HAVE_IB_ALLOC_PD_2ARGS pd = ib_alloc_pd(cmid->device, 0); @@ -2826,6 +2856,10 @@ kiblnd_dev_failover(struct kib_dev *dev, struct net *ns) } #endif + INIT_IB_EVENT_HANDLER(&hdev->ibh_event_handler, + hdev->ibh_ibdev, kiblnd_event_handler); + ib_register_event_handler(&hdev->ibh_event_handler); + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); old = dev->ibd_hdev; @@ -2885,8 +2919,8 @@ kiblnd_base_shutdown(void) LASSERT(list_empty(&kiblnd_data.kib_devs)); - CDEBUG(D_MALLOC, "before LND base cleanup: kmem %d\n", - atomic_read(&libcfs_kmemory)); + CDEBUG(D_MALLOC, "before 
LND base cleanup: kmem %lld\n", + libcfs_kmem_read()); switch (kiblnd_data.kib_init) { default: @@ -2915,34 +2949,25 @@ kiblnd_base_shutdown(void) wake_up_all(&kiblnd_data.kib_connd_waitq); wake_up_all(&kiblnd_data.kib_failover_waitq); - i = 2; - while (atomic_read(&kiblnd_data.kib_nthreads) != 0) { - i++; - /* power of 2? */ - CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, - "Waiting for %d threads to terminate\n", - atomic_read(&kiblnd_data.kib_nthreads)); - set_current_state(TASK_UNINTERRUPTIBLE); - schedule_timeout(cfs_time_seconds(1)); - } - - /* fall through */ + wait_var_event_warning(&kiblnd_data.kib_nthreads, + !atomic_read(&kiblnd_data.kib_nthreads), + "Waiting for %d threads to terminate\n", + atomic_read(&kiblnd_data.kib_nthreads)); + /* fall through */ case IBLND_INIT_NOTHING: break; } - if (kiblnd_data.kib_peers != NULL) { - LIBCFS_FREE(kiblnd_data.kib_peers, - sizeof(struct list_head) * - kiblnd_data.kib_peer_hash_size); - } + if (kiblnd_data.kib_peers) + CFS_FREE_PTR_ARRAY(kiblnd_data.kib_peers, + kiblnd_data.kib_peer_hash_size); if (kiblnd_data.kib_scheds != NULL) cfs_percpt_free(kiblnd_data.kib_scheds); - CDEBUG(D_MALLOC, "after LND base cleanup: kmem %d\n", - atomic_read(&libcfs_kmemory)); + CDEBUG(D_MALLOC, "after LND base cleanup: kmem %lld\n", + libcfs_kmem_read()); kiblnd_data.kib_init = IBLND_INIT_NOTHING; module_put(THIS_MODULE); @@ -2953,16 +2978,15 @@ kiblnd_shutdown(struct lnet_ni *ni) { struct kib_net *net = ni->ni_data; rwlock_t *g_lock = &kiblnd_data.kib_global_lock; - int i; - unsigned long flags; + unsigned long flags; LASSERT(kiblnd_data.kib_init == IBLND_INIT_ALL); if (net == NULL) goto out; - CDEBUG(D_MALLOC, "before LND net cleanup: kmem %d\n", - atomic_read(&libcfs_kmemory)); + CDEBUG(D_MALLOC, "before LND net cleanup: kmem %lld\n", + libcfs_kmem_read()); write_lock_irqsave(g_lock, flags); net->ibn_shutdown = 1; @@ -2972,22 +2996,16 @@ kiblnd_shutdown(struct lnet_ni *ni) default: LBUG(); - case IBLND_INIT_ALL: - /* nuke all existing peers within this net */ - kiblnd_del_peer(ni, LNET_NID_ANY); + case IBLND_INIT_ALL: + /* nuke all existing peers within this net */ + kiblnd_del_peer(ni, LNET_NID_ANY); /* Wait for all peer_ni state to clean up */ - i = 2; - while (atomic_read(&net->ibn_npeers) != 0) { - i++; - /* power of 2? */ - CDEBUG(((i & (-i)) == i) ? 
D_WARNING : D_NET, - "%s: waiting for %d peers to disconnect\n", - libcfs_nid2str(ni->ni_nid), - atomic_read(&net->ibn_npeers)); - set_current_state(TASK_UNINTERRUPTIBLE); - schedule_timeout(cfs_time_seconds(1)); - } + wait_var_event_warning(&net->ibn_npeers, + atomic_read(&net->ibn_npeers) == 0, + "%s: waiting for %d peers to disconnect\n", + libcfs_nid2str(ni->ni_nid), + atomic_read(&net->ibn_npeers)); kiblnd_net_fini_pools(net); @@ -2997,7 +3015,7 @@ kiblnd_shutdown(struct lnet_ni *ni) list_del(&net->ibn_list); write_unlock_irqrestore(g_lock, flags); - /* fall through */ + /* fall through */ case IBLND_INIT_NOTHING: LASSERT (atomic_read(&net->ibn_nconns) == 0); @@ -3009,8 +3027,8 @@ kiblnd_shutdown(struct lnet_ni *ni) break; } - CDEBUG(D_MALLOC, "after LND net cleanup: kmem %d\n", - atomic_read(&libcfs_kmemory)); + CDEBUG(D_MALLOC, "after LND net cleanup: kmem %lld\n", + libcfs_kmem_read()); net->ibn_init = IBLND_INIT_NOTHING; ni->ni_data = NULL; @@ -3042,9 +3060,8 @@ kiblnd_base_startup(struct net *ns) INIT_LIST_HEAD(&kiblnd_data.kib_failed_devs); kiblnd_data.kib_peer_hash_size = IBLND_PEER_HASH_SIZE; - LIBCFS_ALLOC(kiblnd_data.kib_peers, - sizeof(struct list_head) * - kiblnd_data.kib_peer_hash_size); + CFS_ALLOC_PTR_ARRAY(kiblnd_data.kib_peers, + kiblnd_data.kib_peer_hash_size); if (kiblnd_data.kib_peers == NULL) goto failed; @@ -3053,6 +3070,7 @@ kiblnd_base_startup(struct net *ns) spin_lock_init(&kiblnd_data.kib_connd_lock); INIT_LIST_HEAD(&kiblnd_data.kib_connd_conns); + INIT_LIST_HEAD(&kiblnd_data.kib_connd_waits); INIT_LIST_HEAD(&kiblnd_data.kib_connd_zombies); INIT_LIST_HEAD(&kiblnd_data.kib_reconn_list); INIT_LIST_HEAD(&kiblnd_data.kib_reconn_wait); @@ -3244,6 +3262,7 @@ kiblnd_startup(struct lnet_ni *ni) goto failed; } + net->ibn_ni = ni; net->ibn_incarnation = ktime_get_real_ns() / NSEC_PER_USEC; kiblnd_tunables_setup(ni); @@ -3335,6 +3354,9 @@ kiblnd_startup(struct lnet_ni *ni) write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); ibdev->ibd_nnets++; list_add_tail(&net->ibn_list, &ibdev->ibd_nets); + /* for health check */ + if (ibdev->ibd_hdev->ibh_state == IBLND_DEV_PORT_DOWN) + kiblnd_set_ni_fatal_on(ibdev->ibd_hdev, 1); write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); net->ibn_init = IBLND_INIT_ALL; @@ -3354,12 +3376,11 @@ failed: return -ENETDOWN; } -static struct lnet_lnd the_o2iblnd = { +static const struct lnet_lnd the_o2iblnd = { .lnd_type = O2IBLND, .lnd_startup = kiblnd_startup, .lnd_shutdown = kiblnd_shutdown, .lnd_ctl = kiblnd_ctl, - .lnd_query = kiblnd_query, .lnd_send = kiblnd_send, .lnd_recv = kiblnd_recv, };
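
For reference, the sleep-and-poll shutdown loops removed above (ibn_npeers in kiblnd_shutdown(), kib_nthreads in kiblnd_base_shutdown(), and the fps_increasing spin in the FMR path) are all rebuilt on the kernel's wait_var_event()/wake_up_var() pairing. A minimal sketch of that pattern, with hypothetical demo_* names standing in for the kiblnd structures rather than reproducing the patch itself:

/*
 * Sketch of the wait_var_event()/wake_up_var() refcount-drain pattern;
 * the demo_* names are illustrative only and not part of the patch.
 */
#include <linux/atomic.h>
#include <linux/wait_bit.h>

struct demo_net {
	atomic_t npeers;	/* plays the role of ibn_npeers */
};

/* Teardown path: drop one peer reference; the last drop wakes the waiter. */
static void demo_peer_destroyed(struct demo_net *net)
{
	if (atomic_dec_and_test(&net->npeers))
		wake_up_var(&net->npeers);
}

/* Shutdown path: sleep until every peer reference has been dropped. */
static void demo_net_shutdown(struct demo_net *net)
{
	wait_var_event(&net->npeers, atomic_read(&net->npeers) == 0);
}

The same pairing backs the FMR pool wait above, where the pool set pointer itself is used as the wait variable (wait_var_event(fps, !fps->fps_increasing) matched by wake_up_var(fps)), so any address can serve as the rendezvous as long as both sides agree on it.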