Whamcloud - gitweb
LU-12768 o2iblnd: wait properly for fps->increasing.
[fs/lustre-release.git] / lnet / klnds / o2iblnd / o2iblnd.c
index 6da280e..fd6f5d8 100644 (file)
@@ -39,7 +39,7 @@
 
 #include "o2iblnd.h"
 
-static struct lnet_lnd the_o2iblnd;
+static const struct lnet_lnd the_o2iblnd;
 
 struct kib_data kiblnd_data;
 
@@ -256,8 +256,8 @@ int kiblnd_unpack_msg(struct kib_msg *msg, int nob)
         if (flip) {
                 /* leave magic unflipped as a clue to peer_ni endianness */
                 msg->ibm_version = version;
-                CLASSERT (sizeof(msg->ibm_type) == 1);
-                CLASSERT (sizeof(msg->ibm_credits) == 1);
+               BUILD_BUG_ON(sizeof(msg->ibm_type) != 1);
+               BUILD_BUG_ON(sizeof(msg->ibm_credits) != 1);
                 msg->ibm_nob     = msg_nob;
                 __swab64s(&msg->ibm_srcnid);
                 __swab64s(&msg->ibm_srcstamp);
@@ -372,7 +372,8 @@ kiblnd_destroy_peer(struct kib_peer_ni *peer_ni)
         * they are destroyed, so we can be assured that _all_ state to do
         * with this peer_ni has been cleaned up when its refcount drops to
         * zero. */
-       atomic_dec(&net->ibn_npeers);
+       if (atomic_dec_and_test(&net->ibn_npeers))
+               wake_up_var(&net->ibn_npeers);
 }
 
 struct kib_peer_ni *
@@ -480,7 +481,7 @@ kiblnd_del_peer_locked(struct kib_peer_ni *peer_ni)
 static int
 kiblnd_del_peer(struct lnet_ni *ni, lnet_nid_t nid)
 {
-       struct list_head        zombies = LIST_HEAD_INIT(zombies);
+       LIST_HEAD(zombies);
        struct list_head        *ptmp;
        struct list_head        *pnxt;
        struct kib_peer_ni              *peer_ni;
@@ -570,9 +571,9 @@ kiblnd_get_conn_by_idx(struct lnet_ni *ni, int index)
 static void
 kiblnd_debug_rx(struct kib_rx *rx)
 {
-        CDEBUG(D_CONSOLE, "      %p status %d msg_type %x cred %d\n",
-               rx, rx->rx_status, rx->rx_msg->ibm_type,
-               rx->rx_msg->ibm_credits);
+       CDEBUG(D_CONSOLE, "      %p msg_type %x cred %d\n",
+              rx, rx->rx_msg->ibm_type,
+              rx->rx_msg->ibm_credits);
 }
 
 static void
@@ -635,46 +636,22 @@ kiblnd_debug_conn(struct kib_conn *conn)
        spin_unlock(&conn->ibc_lock);
 }
 
-int
-kiblnd_translate_mtu(int value)
-{
-        switch (value) {
-        default:
-                return -1;
-        case 0:
-                return 0;
-        case 256:
-                return IB_MTU_256;
-        case 512:
-                return IB_MTU_512;
-        case 1024:
-                return IB_MTU_1024;
-        case 2048:
-                return IB_MTU_2048;
-        case 4096:
-                return IB_MTU_4096;
-        }
-}
-
 static void
 kiblnd_setup_mtu_locked(struct rdma_cm_id *cmid)
 {
-        int           mtu;
-
         /* XXX There is no path record for iWARP, set by netdev->change_mtu? */
         if (cmid->route.path_rec == NULL)
                 return;
 
-        mtu = kiblnd_translate_mtu(*kiblnd_tunables.kib_ib_mtu);
-        LASSERT (mtu >= 0);
-        if (mtu != 0)
-                cmid->route.path_rec->mtu = mtu;
+       if (*kiblnd_tunables.kib_ib_mtu)
+               cmid->route.path_rec->mtu =
+                       ib_mtu_int_to_enum(*kiblnd_tunables.kib_ib_mtu);
 }
 
 static int
 kiblnd_get_completion_vector(struct kib_conn *conn, int cpt)
 {
-       cpumask_t       *mask;
+       cpumask_var_t   *mask;
        int             vectors;
        int             off;
        int             i;
@@ -688,8 +665,8 @@ kiblnd_get_completion_vector(struct kib_conn *conn, int cpt)
 
        /* hash NID to CPU id in this partition... */
        ibp_nid = conn->ibc_peer->ibp_nid;
-       off = do_div(ibp_nid, cpumask_weight(mask));
-       for_each_cpu(i, mask) {
+       off = do_div(ibp_nid, cpumask_weight(*mask));
+       for_each_cpu(i, *mask) {
                if (off-- == 0)
                        return i % vectors;
        }
@@ -734,16 +711,28 @@ static unsigned int kiblnd_send_wrs(struct kib_conn *conn)
         * One WR for the LNet message
         * And ibc_max_frags for the transfer WRs
         */
-       unsigned int ret = 1 + conn->ibc_max_frags;
+       int ret;
+       int multiplier = 1 + conn->ibc_max_frags;
        enum kib_dev_caps dev_caps = conn->ibc_hdev->ibh_dev->ibd_dev_caps;
 
        /* FastReg needs two extra WRs for map and invalidate */
        if (dev_caps & IBLND_DEV_CAPS_FASTREG_ENABLED)
-               ret += 2;
+               multiplier += 2;
 
        /* account for a maximum of ibc_queue_depth in-flight transfers */
-       ret *= conn->ibc_queue_depth;
-       return ret;
+       ret = multiplier * conn->ibc_queue_depth;
+
+       if (ret > conn->ibc_hdev->ibh_max_qp_wr) {
+               CDEBUG(D_NET, "peer_credits %u will result in send work "
+                      "request size %d larger than maximum %d device "
+                      "can handle\n", conn->ibc_queue_depth, ret,
+                      conn->ibc_hdev->ibh_max_qp_wr);
+               conn->ibc_queue_depth =
+                       conn->ibc_hdev->ibh_max_qp_wr / multiplier;
+       }
+
+       /* don't go beyond the maximum the device can handle */
+       return min(ret, conn->ibc_hdev->ibh_max_qp_wr);
 }
 
 struct kib_conn *
@@ -760,7 +749,7 @@ kiblnd_create_conn(struct kib_peer_ni *peer_ni, struct rdma_cm_id *cmid,
        rwlock_t               *glock = &kiblnd_data.kib_global_lock;
        struct kib_net              *net = peer_ni->ibp_ni->ni_data;
        struct kib_dev *dev;
-       struct ib_qp_init_attr *init_qp_attr;
+       struct ib_qp_init_attr init_qp_attr = {};
        struct kib_sched_info   *sched;
 #ifdef HAVE_IB_CQ_INIT_ATTR
        struct ib_cq_init_attr  cq_attr = {};
@@ -791,19 +780,11 @@ kiblnd_create_conn(struct kib_peer_ni *peer_ni, struct rdma_cm_id *cmid,
         */
        cpt = sched->ibs_cpt;
 
-       LIBCFS_CPT_ALLOC(init_qp_attr, lnet_cpt_table(), cpt,
-                        sizeof(*init_qp_attr));
-       if (init_qp_attr == NULL) {
-               CERROR("Can't allocate qp_attr for %s\n",
-                      libcfs_nid2str(peer_ni->ibp_nid));
-               goto failed_0;
-       }
-
        LIBCFS_CPT_ALLOC(conn, lnet_cpt_table(), cpt, sizeof(*conn));
        if (conn == NULL) {
                CERROR("Can't allocate connection for %s\n",
                       libcfs_nid2str(peer_ni->ibp_nid));
-               goto failed_1;
+               goto failed_0;
        }
 
        conn->ibc_state = IBLND_CONN_INIT;
@@ -892,38 +873,34 @@ kiblnd_create_conn(struct kib_peer_ni *peer_ni, struct rdma_cm_id *cmid,
                goto failed_2;
        }
 
-       init_qp_attr->event_handler = kiblnd_qp_event;
-       init_qp_attr->qp_context = conn;
-       init_qp_attr->cap.max_send_sge = *kiblnd_tunables.kib_wrq_sge;
-       init_qp_attr->cap.max_recv_sge = 1;
-       init_qp_attr->sq_sig_type = IB_SIGNAL_REQ_WR;
-       init_qp_attr->qp_type = IB_QPT_RC;
-       init_qp_attr->send_cq = cq;
-       init_qp_attr->recv_cq = cq;
-
-       conn->ibc_sched = sched;
-
-       do {
-               init_qp_attr->cap.max_send_wr = kiblnd_send_wrs(conn);
-               init_qp_attr->cap.max_recv_wr = IBLND_RECV_WRS(conn);
-
-               rc = rdma_create_qp(cmid, conn->ibc_hdev->ibh_pd, init_qp_attr);
-               if (!rc || conn->ibc_queue_depth < 2)
-                       break;
-
-               conn->ibc_queue_depth--;
-       } while (rc);
+       init_qp_attr.event_handler = kiblnd_qp_event;
+       init_qp_attr.qp_context = conn;
+       init_qp_attr.cap.max_send_sge = *kiblnd_tunables.kib_wrq_sge;
+       init_qp_attr.cap.max_recv_sge = 1;
+       init_qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
+       init_qp_attr.qp_type = IB_QPT_RC;
+       init_qp_attr.send_cq = cq;
+       init_qp_attr.recv_cq = cq;
+       /*
+        * kiblnd_send_wrs() can change the connection's queue depth if
+        * the maximum work requests for the device is maxed out
+        */
+       init_qp_attr.cap.max_send_wr = kiblnd_send_wrs(conn);
+       init_qp_attr.cap.max_recv_wr = IBLND_RECV_WRS(conn);
 
+       rc = rdma_create_qp(cmid, conn->ibc_hdev->ibh_pd, &init_qp_attr);
        if (rc) {
                CERROR("Can't create QP: %d, send_wr: %d, recv_wr: %d, "
                       "send_sge: %d, recv_sge: %d\n",
-                      rc, init_qp_attr->cap.max_send_wr,
-                      init_qp_attr->cap.max_recv_wr,
-                      init_qp_attr->cap.max_send_sge,
-                      init_qp_attr->cap.max_recv_sge);
+                      rc, init_qp_attr.cap.max_send_wr,
+                      init_qp_attr.cap.max_recv_wr,
+                      init_qp_attr.cap.max_send_sge,
+                      init_qp_attr.cap.max_recv_sge);
                goto failed_2;
        }
 
+       conn->ibc_sched = sched;
+
        if (conn->ibc_queue_depth != peer_ni->ibp_queue_depth)
                CWARN("peer %s - queue depth reduced from %u to %u"
                      "  to allow for qp creation\n",
@@ -945,8 +922,6 @@ kiblnd_create_conn(struct kib_peer_ni *peer_ni, struct rdma_cm_id *cmid,
 
        kiblnd_map_rx_descs(conn);
 
-       LIBCFS_FREE(init_qp_attr, sizeof(*init_qp_attr));
-
        /* 1 ref for caller and each rxmsg */
        atomic_set(&conn->ibc_refcount, 1 + IBLND_RX_MSGS(conn));
        conn->ibc_nrx = IBLND_RX_MSGS(conn);
@@ -992,8 +967,6 @@ kiblnd_create_conn(struct kib_peer_ni *peer_ni, struct rdma_cm_id *cmid,
  failed_2:
        kiblnd_destroy_conn(conn);
        LIBCFS_FREE(conn, sizeof(*conn));
- failed_1:
-        LIBCFS_FREE(init_qp_attr, sizeof(*init_qp_attr));
  failed_0:
         return NULL;
 }
@@ -1002,8 +975,7 @@ void
 kiblnd_destroy_conn(struct kib_conn *conn)
 {
        struct rdma_cm_id *cmid = conn->ibc_cmid;
-       struct kib_peer_ni        *peer_ni = conn->ibc_peer;
-       int                rc;
+       struct kib_peer_ni *peer_ni = conn->ibc_peer;
 
        LASSERT (!in_interrupt());
        LASSERT (atomic_read(&conn->ibc_refcount) == 0);
@@ -1034,11 +1006,8 @@ kiblnd_destroy_conn(struct kib_conn *conn)
        if (cmid != NULL && cmid->qp != NULL)
                rdma_destroy_qp(cmid);
 
-       if (conn->ibc_cq != NULL) {
-               rc = ib_destroy_cq(conn->ibc_cq);
-               if (rc != 0)
-                       CWARN("Error destroying CQ: %d\n", rc);
-       }
+       if (conn->ibc_cq)
+               ib_destroy_cq(conn->ibc_cq);
 
        kiblnd_txlist_done(&conn->ibc_zombie_txs, -ECONNABORTED,
                           LNET_MSG_STATUS_OK);
@@ -1046,10 +1015,8 @@ kiblnd_destroy_conn(struct kib_conn *conn)
        if (conn->ibc_rx_pages != NULL)
                kiblnd_unmap_rx_descs(conn);
 
-       if (conn->ibc_rxs != NULL) {
-               LIBCFS_FREE(conn->ibc_rxs,
-                           IBLND_RX_MSGS(conn) * sizeof(struct kib_rx));
-       }
+       if (conn->ibc_rxs != NULL)
+               CFS_FREE_PTR_ARRAY(conn->ibc_rxs, IBLND_RX_MSGS(conn));
 
        if (conn->ibc_connvars != NULL)
                LIBCFS_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars));
@@ -1220,37 +1187,6 @@ kiblnd_ctl(struct lnet_ni *ni, unsigned int cmd, void *arg)
 }
 
 static void
-kiblnd_query(struct lnet_ni *ni, lnet_nid_t nid, time64_t *when)
-{
-       time64_t last_alive = 0;
-       time64_t now = ktime_get_seconds();
-       rwlock_t *glock = &kiblnd_data.kib_global_lock;
-       struct kib_peer_ni *peer_ni;
-       unsigned long flags;
-
-       read_lock_irqsave(glock, flags);
-
-       peer_ni = kiblnd_find_peer_locked(ni, nid);
-       if (peer_ni != NULL)
-               last_alive = peer_ni->ibp_last_alive;
-
-       read_unlock_irqrestore(glock, flags);
-
-       if (last_alive != 0)
-               *when = last_alive;
-
-       /* peer_ni is not persistent in hash, trigger peer_ni creation
-        * and connection establishment with a NULL tx */
-       if (peer_ni == NULL)
-               kiblnd_launch_tx(ni, NULL, nid);
-
-       CDEBUG(D_NET, "peer_ni %s %p, alive %lld secs ago\n",
-              libcfs_nid2str(nid), peer_ni,
-              last_alive ? now - last_alive : -1);
-       return;
-}
-
-static void
 kiblnd_free_pages(struct kib_pages *p)
 {
        int     npages = p->ibp_npages;
@@ -1395,8 +1331,7 @@ kiblnd_current_hdev(struct kib_dev *dev)
                if (i++ % 50 == 0)
                        CDEBUG(D_NET, "%s: Wait for failover\n",
                               dev->ibd_ifname);
-               set_current_state(TASK_INTERRUPTIBLE);
-               schedule_timeout(cfs_time_seconds(1) / 100);
+               schedule_timeout_interruptible(cfs_time_seconds(1) / 100);
 
                read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
        }
@@ -1426,11 +1361,11 @@ kiblnd_map_tx_pool(struct kib_tx_pool *tpo)
 
        dev = net->ibn_dev;
 
-        /* pre-mapped messages are not bigger than 1 page */
-        CLASSERT (IBLND_MSG_SIZE <= PAGE_SIZE);
+       /* pre-mapped messages are not bigger than 1 page */
+       BUILD_BUG_ON(IBLND_MSG_SIZE > PAGE_SIZE);
 
-        /* No fancy arithmetic when we do the buffer calculations */
-        CLASSERT (PAGE_SIZE % IBLND_MSG_SIZE == 0);
+       /* No fancy arithmetic when we do the buffer calculations */
+       BUILD_BUG_ON(PAGE_SIZE % IBLND_MSG_SIZE != 0);
 
         tpo->tpo_hdev = kiblnd_current_hdev(dev);
 
@@ -1699,11 +1634,10 @@ kiblnd_fail_fmr_poolset(struct kib_fmr_poolset *fps, struct list_head *zombies)
                                                      fpo_list);
 
                fpo->fpo_failed = 1;
-               list_del(&fpo->fpo_list);
                if (fpo->fpo_map_count == 0)
-                       list_add(&fpo->fpo_list, zombies);
+                       list_move(&fpo->fpo_list, zombies);
                else
-                       list_add(&fpo->fpo_list, &fps->fps_failed_pool_list);
+                       list_move(&fpo->fpo_list, &fps->fps_failed_pool_list);
        }
 
        spin_unlock(&fps->fps_lock);
@@ -1781,7 +1715,7 @@ kiblnd_map_tx_pages(struct kib_tx *tx, struct kib_rdma_desc *rd)
 void
 kiblnd_fmr_pool_unmap(struct kib_fmr *fmr, int status)
 {
-       struct list_head zombies = LIST_HEAD_INIT(zombies);
+       LIST_HEAD(zombies);
        struct kib_fmr_pool *fpo = fmr->fmr_pool;
        struct kib_fmr_poolset *fps;
        time64_t now = ktime_get_seconds();
@@ -1794,8 +1728,7 @@ kiblnd_fmr_pool_unmap(struct kib_fmr *fmr, int status)
        fps = fpo->fpo_owner;
        if (fpo->fpo_is_fmr) {
                if (fmr->fmr_pfmr) {
-                       rc = ib_fmr_pool_unmap(fmr->fmr_pfmr);
-                       LASSERT(!rc);
+                       ib_fmr_pool_unmap(fmr->fmr_pfmr);
                        fmr->fmr_pfmr = NULL;
                }
 
@@ -1864,8 +1797,8 @@ again:
                                tx_pages_mapped = 1;
                        }
 
-                       pfmr = ib_fmr_pool_map_phys(fpo->fmr.fpo_fmr_pool,
-                                                   pages, npages, iov);
+                       pfmr = kib_fmr_pool_map(fpo->fmr.fpo_fmr_pool,
+                                               pages, npages, iov);
                        if (likely(!IS_ERR(pfmr))) {
                                fmr->fmr_key  = is_rx ? pfmr->fmr->rkey
                                                      : pfmr->fmr->lkey;
@@ -1917,14 +1850,14 @@ again:
 #ifdef HAVE_IB_MAP_MR_SG
 #ifdef HAVE_IB_MAP_MR_SG_5ARGS
                                n = ib_map_mr_sg(mr, tx->tx_frags,
-                                                tx->tx_nfrags, NULL, PAGE_SIZE);
+                                                rd->rd_nfrags, NULL, PAGE_SIZE);
 #else
                                n = ib_map_mr_sg(mr, tx->tx_frags,
-                                                tx->tx_nfrags, PAGE_SIZE);
+                                                rd->rd_nfrags, PAGE_SIZE);
 #endif
-                               if (unlikely(n != tx->tx_nfrags)) {
+                               if (unlikely(n != rd->rd_nfrags)) {
                                        CERROR("Failed to map mr %d/%d "
-                                              "elements\n", n, tx->tx_nfrags);
+                                              "elements\n", n, rd->rd_nfrags);
                                        return n < 0 ? n : -EINVAL;
                                }
 
@@ -1996,7 +1929,7 @@ again:
                spin_unlock(&fps->fps_lock);
                CDEBUG(D_NET, "Another thread is allocating new "
                       "FMR pool, waiting for her to complete\n");
-               schedule();
+               wait_var_event(fps, !fps->fps_increasing);
                goto again;
 
        }
@@ -2014,6 +1947,7 @@ again:
        rc = kiblnd_create_fmr_pool(fps, &fpo);
        spin_lock(&fps->fps_lock);
        fps->fps_increasing = 0;
+       wake_up_var(fps);
        if (rc == 0) {
                fps->fps_version++;
                list_add_tail(&fpo->fpo_list, &fps->fps_pool_list);
@@ -2072,11 +2006,10 @@ kiblnd_fail_poolset(struct kib_poolset *ps, struct list_head *zombies)
                                                 struct kib_pool, po_list);
 
                po->po_failed = 1;
-               list_del(&po->po_list);
                if (po->po_allocated == 0)
-                       list_add(&po->po_list, zombies);
+                       list_move(&po->po_list, zombies);
                else
-                       list_add(&po->po_list, &ps->ps_failed_pool_list);
+                       list_move(&po->po_list, &ps->ps_failed_pool_list);
        }
        spin_unlock(&ps->ps_lock);
 }
@@ -2139,7 +2072,7 @@ kiblnd_pool_is_idle(struct kib_pool *pool, time64_t now)
 void
 kiblnd_pool_free_node(struct kib_pool *pool, struct list_head *node)
 {
-       struct list_head zombies = LIST_HEAD_INIT(zombies);
+       LIST_HEAD(zombies);
        struct kib_poolset *ps = pool->po_owner;
        struct kib_pool *tmp;
        time64_t now = ktime_get_seconds();
@@ -2202,13 +2135,11 @@ again:
                /* another thread is allocating a new pool */
                spin_unlock(&ps->ps_lock);
                trips++;
-                CDEBUG(D_NET, "Another thread is allocating new "
-                      "%s pool, waiting %d HZs for her to complete."
-                      "trips = %d\n",
+               CDEBUG(D_NET,
+                      "Another thread is allocating new %s pool, waiting %d jiffies for her to complete. trips = %d\n",
                       ps->ps_name, interval, trips);
 
-               set_current_state(TASK_INTERRUPTIBLE);
-               schedule_timeout(interval);
+               schedule_timeout_interruptible(interval);
                if (interval < cfs_time_seconds(1))
                        interval *= 2;
 
@@ -2261,38 +2192,33 @@ kiblnd_destroy_tx_pool(struct kib_pool *pool)
         if (tpo->tpo_tx_descs == NULL)
                 goto out;
 
-        for (i = 0; i < pool->po_size; i++) {
+       for (i = 0; i < pool->po_size; i++) {
                struct kib_tx *tx = &tpo->tpo_tx_descs[i];
                int       wrq_sge = *kiblnd_tunables.kib_wrq_sge;
 
                list_del(&tx->tx_list);
-                if (tx->tx_pages != NULL)
-                        LIBCFS_FREE(tx->tx_pages,
-                                    LNET_MAX_IOV *
-                                    sizeof(*tx->tx_pages));
-                if (tx->tx_frags != NULL)
-                        LIBCFS_FREE(tx->tx_frags,
-                                   (1 + IBLND_MAX_RDMA_FRAGS) *
-                                   sizeof(*tx->tx_frags));
-                if (tx->tx_wrq != NULL)
-                        LIBCFS_FREE(tx->tx_wrq,
-                                    (1 + IBLND_MAX_RDMA_FRAGS) *
-                                    sizeof(*tx->tx_wrq));
+               if (tx->tx_pages != NULL)
+                       CFS_FREE_PTR_ARRAY(tx->tx_pages, LNET_MAX_IOV);
+               if (tx->tx_frags != NULL)
+                       CFS_FREE_PTR_ARRAY(tx->tx_frags,
+                                          (1 + IBLND_MAX_RDMA_FRAGS));
+               if (tx->tx_wrq != NULL)
+                       CFS_FREE_PTR_ARRAY(tx->tx_wrq,
+                                          (1 + IBLND_MAX_RDMA_FRAGS));
                if (tx->tx_sge != NULL)
-                       LIBCFS_FREE(tx->tx_sge,
-                                   (1 + IBLND_MAX_RDMA_FRAGS) * wrq_sge *
-                                   sizeof(*tx->tx_sge));
-                if (tx->tx_rd != NULL)
-                        LIBCFS_FREE(tx->tx_rd,
+                       CFS_FREE_PTR_ARRAY(tx->tx_sge,
+                                          (1 + IBLND_MAX_RDMA_FRAGS) *
+                                          wrq_sge);
+               if (tx->tx_rd != NULL)
+                       LIBCFS_FREE(tx->tx_rd,
                                    offsetof(struct kib_rdma_desc,
-                                             rd_frags[IBLND_MAX_RDMA_FRAGS]));
-        }
+                                            rd_frags[IBLND_MAX_RDMA_FRAGS]));
+       }
 
-        LIBCFS_FREE(tpo->tpo_tx_descs,
-                   pool->po_size * sizeof(struct kib_tx));
+       CFS_FREE_PTR_ARRAY(tpo->tpo_tx_descs, pool->po_size);
 out:
-        kiblnd_fini_pool(pool);
-       LIBCFS_FREE(tpo, sizeof(struct kib_tx_pool));
+       kiblnd_fini_pool(pool);
+       CFS_FREE_PTR(tpo);
 }
 
 static int kiblnd_tx_pool_size(struct lnet_ni *ni, int ncpts)
@@ -2328,7 +2254,7 @@ kiblnd_create_tx_pool(struct kib_poolset *ps, int size, struct kib_pool **pp_po)
         npg = (size * IBLND_MSG_SIZE + PAGE_SIZE - 1) / PAGE_SIZE;
        if (kiblnd_alloc_pages(&tpo->tpo_tx_pages, ps->ps_cpt, npg) != 0) {
                CERROR("Can't allocate tx pages: %d\n", npg);
-               LIBCFS_FREE(tpo, sizeof(struct kib_tx_pool));
+               CFS_FREE_PTR(tpo);
                return -ENOMEM;
        }
 
@@ -2535,10 +2461,95 @@ kiblnd_net_init_pools(struct kib_net *net, struct lnet_ni *ni, __u32 *cpts,
 }
 
 static int
+kiblnd_port_get_attr(struct kib_hca_dev *hdev)
+{
+       struct ib_port_attr *port_attr;
+       int rc;
+       unsigned long flags;
+       rwlock_t *g_lock = &kiblnd_data.kib_global_lock;
+
+       LIBCFS_ALLOC(port_attr, sizeof(*port_attr));
+       if (port_attr == NULL) {
+               CDEBUG(D_NETERROR, "Out of memory\n");
+               return -ENOMEM;
+       }
+
+       rc = ib_query_port(hdev->ibh_ibdev, hdev->ibh_port, port_attr);
+
+       write_lock_irqsave(g_lock, flags);
+
+       if (rc == 0)
+               hdev->ibh_state = port_attr->state == IB_PORT_ACTIVE
+                                ? IBLND_DEV_PORT_ACTIVE
+                                : IBLND_DEV_PORT_DOWN;
+
+       write_unlock_irqrestore(g_lock, flags);
+       LIBCFS_FREE(port_attr, sizeof(*port_attr));
+
+       if (rc != 0) {
+               CDEBUG(D_NETERROR, "Failed to query IB port: %d\n", rc);
+               return rc;
+       }
+       return 0;
+}
+
+static inline void
+kiblnd_set_ni_fatal_on(struct kib_hca_dev *hdev, int val)
+{
+       struct kib_net  *net;
+
+       /* for health check */
+       list_for_each_entry(net, &hdev->ibh_dev->ibd_nets, ibn_list) {
+               if (val)
+                       CDEBUG(D_NETERROR, "Fatal device error for NI %s\n",
+                                       libcfs_nid2str(net->ibn_ni->ni_nid));
+               atomic_set(&net->ibn_ni->ni_fatal_error_on, val);
+       }
+}
+
+void
+kiblnd_event_handler(struct ib_event_handler *handler, struct ib_event *event)
+{
+       rwlock_t *g_lock = &kiblnd_data.kib_global_lock;
+       struct kib_hca_dev  *hdev;
+       unsigned long flags;
+
+       hdev = container_of(handler, struct kib_hca_dev, ibh_event_handler);
+
+       write_lock_irqsave(g_lock, flags);
+
+       switch (event->event) {
+       case IB_EVENT_DEVICE_FATAL:
+               CDEBUG(D_NET, "IB device fatal\n");
+               hdev->ibh_state = IBLND_DEV_FATAL;
+               kiblnd_set_ni_fatal_on(hdev, 1);
+               break;
+       case IB_EVENT_PORT_ACTIVE:
+               CDEBUG(D_NET, "IB port active\n");
+               if (event->element.port_num == hdev->ibh_port) {
+                       hdev->ibh_state = IBLND_DEV_PORT_ACTIVE;
+                       kiblnd_set_ni_fatal_on(hdev, 0);
+               }
+               break;
+       case IB_EVENT_PORT_ERR:
+               CDEBUG(D_NET, "IB port err\n");
+               if (event->element.port_num == hdev->ibh_port) {
+                       hdev->ibh_state = IBLND_DEV_PORT_DOWN;
+                       kiblnd_set_ni_fatal_on(hdev, 1);
+               }
+               break;
+       default:
+               break;
+       }
+       write_unlock_irqrestore(g_lock, flags);
+}
+
+static int
 kiblnd_hdev_get_attr(struct kib_hca_dev *hdev)
 {
        struct ib_device_attr *dev_attr;
        int rc = 0;
+       int rc2 = 0;
 
        /* It's safe to assume a HCA can handle a page size
         * matching that of the native system */
@@ -2563,12 +2574,20 @@ kiblnd_hdev_get_attr(struct kib_hca_dev *hdev)
 #endif
 
        hdev->ibh_mr_size = dev_attr->max_mr_size;
+       hdev->ibh_max_qp_wr = dev_attr->max_qp_wr;
 
        /* Setup device Memory Registration capabilities */
+#ifdef HAVE_IB_DEVICE_OPS
+       if (hdev->ibh_ibdev->ops.alloc_fmr &&
+           hdev->ibh_ibdev->ops.dealloc_fmr &&
+           hdev->ibh_ibdev->ops.map_phys_fmr &&
+           hdev->ibh_ibdev->ops.unmap_fmr) {
+#else
        if (hdev->ibh_ibdev->alloc_fmr &&
            hdev->ibh_ibdev->dealloc_fmr &&
            hdev->ibh_ibdev->map_phys_fmr &&
            hdev->ibh_ibdev->unmap_fmr) {
+#endif
                LCONSOLE_INFO("Using FMR for registration\n");
                hdev->ibh_dev->ibd_dev_caps |= IBLND_DEV_CAPS_FMR_ENABLED;
        } else if (dev_attr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) {
@@ -2584,9 +2603,11 @@ kiblnd_hdev_get_attr(struct kib_hca_dev *hdev)
                rc = -ENOSYS;
        }
 
-       if (rc == 0 && hdev->ibh_mr_size == ~0ULL)
-               hdev->ibh_mr_shift = 64;
-       else if (rc != 0)
+       rc2 = kiblnd_port_get_attr(hdev);
+       if (rc2 != 0)
+               return rc2;
+
+       if (rc != 0)
                rc = -EINVAL;
 
 #ifndef HAVE_IB_DEVICE_ATTRS
@@ -2618,6 +2639,9 @@ kiblnd_hdev_cleanup_mrs(struct kib_hca_dev *hdev)
 void
 kiblnd_hdev_destroy(struct kib_hca_dev *hdev)
 {
+       if (hdev->ibh_event_handler.device != NULL)
+               ib_unregister_event_handler(&hdev->ibh_event_handler);
+
 #ifdef HAVE_IB_GET_DMA_MR
         kiblnd_hdev_cleanup_mrs(hdev);
 #endif
@@ -2659,7 +2683,7 @@ kiblnd_dummy_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event)
 }
 
 static int
-kiblnd_dev_need_failover(struct kib_dev *dev)
+kiblnd_dev_need_failover(struct kib_dev *dev, struct net *ns)
 {
         struct rdma_cm_id  *cmid;
         struct sockaddr_in  srcaddr;
@@ -2681,8 +2705,8 @@ kiblnd_dev_need_failover(struct kib_dev *dev)
          *
          * a. rdma_bind_addr(), it will conflict with listener cmid
          * b. rdma_resolve_addr() to zero addr */
-        cmid = kiblnd_rdma_create_id(kiblnd_dummy_callback, dev, RDMA_PS_TCP,
-                                     IB_QPT_RC);
+       cmid = kiblnd_rdma_create_id(ns, kiblnd_dummy_callback, dev,
+                                    RDMA_PS_TCP, IB_QPT_RC);
         if (IS_ERR(cmid)) {
                 rc = PTR_ERR(cmid);
                 CERROR("Failed to create cmid for failover: %d\n", rc);
@@ -2711,11 +2735,11 @@ kiblnd_dev_need_failover(struct kib_dev *dev)
 }
 
 int
-kiblnd_dev_failover(struct kib_dev *dev)
+kiblnd_dev_failover(struct kib_dev *dev, struct net *ns)
 {
-       struct list_head    zombie_tpo = LIST_HEAD_INIT(zombie_tpo);
-       struct list_head    zombie_ppo = LIST_HEAD_INIT(zombie_ppo);
-       struct list_head    zombie_fpo = LIST_HEAD_INIT(zombie_fpo);
+       LIST_HEAD(zombie_tpo);
+       LIST_HEAD(zombie_ppo);
+       LIST_HEAD(zombie_fpo);
         struct rdma_cm_id  *cmid  = NULL;
        struct kib_hca_dev *hdev  = NULL;
        struct kib_hca_dev *old;
@@ -2730,7 +2754,7 @@ kiblnd_dev_failover(struct kib_dev *dev)
                  dev->ibd_can_failover ||
                  dev->ibd_hdev == NULL);
 
-        rc = kiblnd_dev_need_failover(dev);
+       rc = kiblnd_dev_need_failover(dev, ns);
         if (rc <= 0)
                 goto out;
 
@@ -2751,8 +2775,8 @@ kiblnd_dev_failover(struct kib_dev *dev)
                 rdma_destroy_id(cmid);
         }
 
-        cmid = kiblnd_rdma_create_id(kiblnd_cm_callback, dev, RDMA_PS_TCP,
-                                     IB_QPT_RC);
+       cmid = kiblnd_rdma_create_id(ns, kiblnd_cm_callback, dev, RDMA_PS_TCP,
+                                    IB_QPT_RC);
         if (IS_ERR(cmid)) {
                 rc = PTR_ERR(cmid);
                 CERROR("Failed to create cmid for failover: %d\n", rc);
@@ -2786,6 +2810,7 @@ kiblnd_dev_failover(struct kib_dev *dev)
         hdev->ibh_dev   = dev;
         hdev->ibh_cmid  = cmid;
         hdev->ibh_ibdev = cmid->device;
+       hdev->ibh_port  = cmid->port_num;
 
 #ifdef HAVE_IB_ALLOC_PD_2ARGS
        pd = ib_alloc_pd(cmid->device, 0);
@@ -2820,6 +2845,10 @@ kiblnd_dev_failover(struct kib_dev *dev)
        }
 #endif
 
+       INIT_IB_EVENT_HANDLER(&hdev->ibh_event_handler,
+                               hdev->ibh_ibdev, kiblnd_event_handler);
+       ib_register_event_handler(&hdev->ibh_event_handler);
+
        write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
 
        old = dev->ibd_hdev;
@@ -2871,80 +2900,6 @@ kiblnd_destroy_dev(struct kib_dev *dev)
         LIBCFS_FREE(dev, sizeof(*dev));
 }
 
-static struct kib_dev *
-kiblnd_create_dev(char *ifname)
-{
-       struct net_device *netdev;
-       struct kib_dev *dev = NULL;
-       int flags;
-       int rc;
-
-       rtnl_lock();
-       for_each_netdev(&init_net, netdev) {
-               struct in_device *in_dev;
-
-               if (strcmp(netdev->name, "lo") == 0) /* skip the loopback IF */
-                       continue;
-
-               flags = dev_get_flags(netdev);
-               if (!(flags & IFF_UP)) {
-                       CERROR("Can't query IPoIB interface %s: it's down\n",
-                              netdev->name);
-                       goto unlock;
-               }
-
-               in_dev = __in_dev_get_rtnl(netdev);
-               if (!in_dev) {
-                       CWARN("Interface %s has no IPv4 status.\n",
-                             netdev->name);
-                       goto unlock;
-               }
-
-               for_ifa(in_dev)
-                       if (strcmp(ifname, ifa->ifa_label) == 0) {
-                               LIBCFS_ALLOC(dev, sizeof(*dev));
-                               if (!dev)
-                                       goto unlock;
-
-                               dev->ibd_can_failover = !!(flags & IFF_MASTER);
-                               dev->ibd_ifip = ntohl(ifa->ifa_local);
-
-                               INIT_LIST_HEAD(&dev->ibd_nets);
-                               INIT_LIST_HEAD(&dev->ibd_list); /* not yet in kib_devs */
-                               INIT_LIST_HEAD(&dev->ibd_fail_list);
-                               break;
-                       }
-               endfor_ifa(in_dev);
-       }
-       rtnl_unlock();
-
-       if (!dev) {
-               CERROR("Can't find any usable interfaces\n");
-               return NULL;
-       }
-
-       if (dev->ibd_ifip == 0) {
-               CERROR("Can't initialize device: no IP address\n");
-               goto free_dev;
-       }
-       strcpy(&dev->ibd_ifname[0], ifname);
-
-       /* initialize the device */
-       rc = kiblnd_dev_failover(dev);
-       if (rc != 0) {
-               CERROR("Can't initialize device: %d\n", rc);
-               goto free_dev;
-       }
-
-       list_add_tail(&dev->ibd_list, &kiblnd_data.kib_devs);
-       return dev;
-unlock:
-       rtnl_unlock();
-free_dev:
-       LIBCFS_FREE(dev, sizeof(*dev));
-       return NULL;
-}
-
 static void
 kiblnd_base_shutdown(void)
 {
@@ -2983,28 +2938,19 @@ kiblnd_base_shutdown(void)
                wake_up_all(&kiblnd_data.kib_connd_waitq);
                wake_up_all(&kiblnd_data.kib_failover_waitq);
 
-               i = 2;
-               while (atomic_read(&kiblnd_data.kib_nthreads) != 0) {
-                       i++;
-                       /* power of 2? */
-                       CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET,
-                              "Waiting for %d threads to terminate\n",
-                              atomic_read(&kiblnd_data.kib_nthreads));
-                       set_current_state(TASK_UNINTERRUPTIBLE);
-                       schedule_timeout(cfs_time_seconds(1));
-               }
-
-                /* fall through */
+               wait_var_event_warning(&kiblnd_data.kib_nthreads,
+                                      !atomic_read(&kiblnd_data.kib_nthreads),
+                                      "Waiting for %d threads to terminate\n",
+                                      atomic_read(&kiblnd_data.kib_nthreads));
+               /* fall through */
 
         case IBLND_INIT_NOTHING:
                 break;
         }
 
-       if (kiblnd_data.kib_peers != NULL) {
-               LIBCFS_FREE(kiblnd_data.kib_peers,
-                           sizeof(struct list_head) *
-                           kiblnd_data.kib_peer_hash_size);
-       }
+       if (kiblnd_data.kib_peers)
+               CFS_FREE_PTR_ARRAY(kiblnd_data.kib_peers,
+                                  kiblnd_data.kib_peer_hash_size);
 
        if (kiblnd_data.kib_scheds != NULL)
                cfs_percpt_free(kiblnd_data.kib_scheds);
@@ -3021,8 +2967,7 @@ kiblnd_shutdown(struct lnet_ni *ni)
 {
        struct kib_net *net = ni->ni_data;
        rwlock_t     *g_lock = &kiblnd_data.kib_global_lock;
-        int               i;
-        unsigned long     flags;
+       unsigned long     flags;
 
         LASSERT(kiblnd_data.kib_init == IBLND_INIT_ALL);
 
@@ -3040,22 +2985,16 @@ kiblnd_shutdown(struct lnet_ni *ni)
         default:
                 LBUG();
 
-        case IBLND_INIT_ALL:
-                /* nuke all existing peers within this net */
-                kiblnd_del_peer(ni, LNET_NID_ANY);
+       case IBLND_INIT_ALL:
+               /* nuke all existing peers within this net */
+               kiblnd_del_peer(ni, LNET_NID_ANY);
 
                /* Wait for all peer_ni state to clean up */
-               i = 2;
-               while (atomic_read(&net->ibn_npeers) != 0) {
-                       i++;
-                       /* power of 2? */
-                       CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET,
-                              "%s: waiting for %d peers to disconnect\n",
-                              libcfs_nid2str(ni->ni_nid),
-                              atomic_read(&net->ibn_npeers));
-                       set_current_state(TASK_UNINTERRUPTIBLE);
-                       schedule_timeout(cfs_time_seconds(1));
-               }
+               wait_var_event_warning(&net->ibn_npeers,
+                                      atomic_read(&net->ibn_npeers) == 0,
+                                      "%s: waiting for %d peers to disconnect\n",
+                                      libcfs_nid2str(ni->ni_nid),
+                                      atomic_read(&net->ibn_npeers));
 
                kiblnd_net_fini_pools(net);
 
@@ -3065,7 +3004,7 @@ kiblnd_shutdown(struct lnet_ni *ni)
                list_del(&net->ibn_list);
                write_unlock_irqrestore(g_lock, flags);
 
-                /* fall through */
+               /* fall through */
 
         case IBLND_INIT_NOTHING:
                LASSERT (atomic_read(&net->ibn_nconns) == 0);
@@ -3088,11 +3027,10 @@ kiblnd_shutdown(struct lnet_ni *ni)
 out:
        if (list_empty(&kiblnd_data.kib_devs))
                 kiblnd_base_shutdown();
-        return;
 }
 
 static int
-kiblnd_base_startup(void)
+kiblnd_base_startup(struct net *ns)
 {
        struct kib_sched_info   *sched;
        int                     rc;
@@ -3100,7 +3038,9 @@ kiblnd_base_startup(void)
 
        LASSERT(kiblnd_data.kib_init == IBLND_INIT_NOTHING);
 
-       try_module_get(THIS_MODULE);
+       if (!try_module_get(THIS_MODULE))
+               goto failed;
+
        memset(&kiblnd_data, 0, sizeof(kiblnd_data)); /* zero pointers, flags etc */
 
        rwlock_init(&kiblnd_data.kib_global_lock);
@@ -3109,9 +3049,8 @@ kiblnd_base_startup(void)
        INIT_LIST_HEAD(&kiblnd_data.kib_failed_devs);
 
        kiblnd_data.kib_peer_hash_size = IBLND_PEER_HASH_SIZE;
-       LIBCFS_ALLOC(kiblnd_data.kib_peers,
-                    sizeof(struct list_head) *
-                    kiblnd_data.kib_peer_hash_size);
+       CFS_ALLOC_PTR_ARRAY(kiblnd_data.kib_peers,
+                           kiblnd_data.kib_peer_hash_size);
        if (kiblnd_data.kib_peers == NULL)
                goto failed;
 
@@ -3165,7 +3104,7 @@ kiblnd_base_startup(void)
         }
 
        if (*kiblnd_tunables.kib_dev_failover != 0)
-               rc = kiblnd_thread_start(kiblnd_failover_thread, NULL,
+               rc = kiblnd_thread_start(kiblnd_failover_thread, ns,
                                         "kiblnd_failover");
 
         if (rc != 0) {
@@ -3225,8 +3164,8 @@ kiblnd_start_schedulers(struct kib_sched_info *sched)
        return rc;
 }
 
-static int
-kiblnd_dev_start_threads(struct kib_dev *dev, int newdev, u32 *cpts, int ncpts)
+static int kiblnd_dev_start_threads(struct kib_dev *dev, bool newdev, u32 *cpts,
+                                   int ncpts)
 {
        int     cpt;
        int     rc;
@@ -3256,8 +3195,8 @@ kiblnd_dev_search(char *ifname)
 {
        struct kib_dev *alias = NULL;
        struct kib_dev *dev;
-       char            *colon;
-       char            *colon2;
+       char            *colon;
+       char            *colon2;
 
        colon = strchr(ifname, ':');
        list_for_each_entry(dev, &kiblnd_data.kib_devs, ibd_list) {
@@ -3287,27 +3226,31 @@ kiblnd_dev_search(char *ifname)
 static int
 kiblnd_startup(struct lnet_ni *ni)
 {
-        char                     *ifname;
+       char *ifname = NULL;
+       struct lnet_inetdev *ifaces = NULL;
        struct kib_dev *ibdev = NULL;
-       struct kib_net *net;
-        unsigned long             flags;
-        int                       rc;
-       int                       newdev;
-       int                       node_id;
+       struct kib_net *net = NULL;
+       unsigned long flags;
+       int rc;
+       int i;
+       bool newdev;
 
-        LASSERT (ni->ni_net->net_lnd == &the_o2iblnd);
+       LASSERT(ni->ni_net->net_lnd == &the_o2iblnd);
 
-        if (kiblnd_data.kib_init == IBLND_INIT_NOTHING) {
-                rc = kiblnd_base_startup();
-                if (rc != 0)
-                        return rc;
-        }
+       if (kiblnd_data.kib_init == IBLND_INIT_NOTHING) {
+               rc = kiblnd_base_startup(ni->ni_net_ns);
+               if (rc != 0)
+                       return rc;
+       }
 
-        LIBCFS_ALLOC(net, sizeof(*net));
-        ni->ni_data = net;
-        if (net == NULL)
-                goto failed;
+       LIBCFS_ALLOC(net, sizeof(*net));
+       ni->ni_data = net;
+       if (net == NULL) {
+               rc = -ENOMEM;
+               goto failed;
+       }
 
+       net->ibn_ni = ni;
        net->ibn_incarnation = ktime_get_real_ns() / NSEC_PER_USEC;
 
        kiblnd_tunables_setup(ni);
@@ -3319,10 +3262,9 @@ kiblnd_startup(struct lnet_ni *ni)
         */
        if (ni->ni_interfaces[0] != NULL) {
                /* Use the IPoIB interface specified in 'networks=' */
-
-               CLASSERT(LNET_INTERFACES_NUM > 1);
                if (ni->ni_interfaces[1] != NULL) {
-                       CERROR("Multiple interfaces not supported\n");
+                       CERROR("ko2iblnd: Multiple interfaces not supported\n");
+                       rc = -EINVAL;
                        goto failed;
                }
 
@@ -3331,63 +3273,102 @@ kiblnd_startup(struct lnet_ni *ni)
                ifname = *kiblnd_tunables.kib_default_ipif;
        }
 
-        if (strlen(ifname) >= sizeof(ibdev->ibd_ifname)) {
-                CERROR("IPoIB interface name too long: %s\n", ifname);
-                goto failed;
-        }
+       if (strlen(ifname) >= sizeof(ibdev->ibd_ifname)) {
+               CERROR("IPoIB interface name too long: %s\n", ifname);
+               rc = -E2BIG;
+               goto failed;
+       }
 
-       ibdev = kiblnd_dev_search(ifname);
+       rc = lnet_inet_enumerate(&ifaces, ni->ni_net_ns);
+       if (rc < 0)
+               goto failed;
+
+       for (i = 0; i < rc; i++) {
+               if (strcmp(ifname, ifaces[i].li_name) == 0)
+                       break;
+       }
+
+       if (i == rc) {
+               CERROR("ko2iblnd: No matching interfaces\n");
+               rc = -ENOENT;
+               goto failed;
+       }
 
+       ibdev = kiblnd_dev_search(ifname);
        newdev = ibdev == NULL;
        /* hmm...create kib_dev even for alias */
-       if (ibdev == NULL || strcmp(&ibdev->ibd_ifname[0], ifname) != 0)
-               ibdev = kiblnd_create_dev(ifname);
+       if (ibdev == NULL || strcmp(&ibdev->ibd_ifname[0], ifname) != 0) {
+               LIBCFS_ALLOC(ibdev, sizeof(*ibdev));
+               if (!ibdev) {
+                       rc = -ENOMEM;
+                       goto failed;
+               }
 
-       if (ibdev == NULL)
-               goto failed;
+               ibdev->ibd_ifip = ifaces[i].li_ipaddr;
+               strlcpy(ibdev->ibd_ifname, ifaces[i].li_name,
+                       sizeof(ibdev->ibd_ifname));
+               ibdev->ibd_can_failover = !!(ifaces[i].li_flags & IFF_MASTER);
+
+               INIT_LIST_HEAD(&ibdev->ibd_nets);
+               INIT_LIST_HEAD(&ibdev->ibd_list); /* not yet in kib_devs */
+               INIT_LIST_HEAD(&ibdev->ibd_fail_list);
+
+               /* initialize the device */
+               rc = kiblnd_dev_failover(ibdev, ni->ni_net_ns);
+               if (rc) {
+                       CERROR("ko2iblnd: Can't initialize device: rc = %d\n",
+                              rc);
+                       goto failed;
+               }
 
-       node_id = dev_to_node(ibdev->ibd_hdev->ibh_ibdev->dma_device);
-       ni->ni_dev_cpt = cfs_cpt_of_node(lnet_cpt_table(), node_id);
+               list_add_tail(&ibdev->ibd_list, &kiblnd_data.kib_devs);
+       }
 
        net->ibn_dev = ibdev;
        ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), ibdev->ibd_ifip);
 
-       rc = kiblnd_dev_start_threads(ibdev, newdev,
-                                     ni->ni_cpts, ni->ni_ncpts);
+       ni->ni_dev_cpt = ifaces[i].li_cpt;
+
+       rc = kiblnd_dev_start_threads(ibdev, newdev, ni->ni_cpts, ni->ni_ncpts);
        if (rc != 0)
                goto failed;
 
        rc = kiblnd_net_init_pools(net, ni, ni->ni_cpts, ni->ni_ncpts);
-        if (rc != 0) {
-                CERROR("Failed to initialize NI pools: %d\n", rc);
-                goto failed;
-        }
+       if (rc != 0) {
+               CERROR("Failed to initialize NI pools: %d\n", rc);
+               goto failed;
+       }
 
        write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
        ibdev->ibd_nnets++;
        list_add_tail(&net->ibn_list, &ibdev->ibd_nets);
+       /* for health check */
+       if (ibdev->ibd_hdev->ibh_state == IBLND_DEV_PORT_DOWN)
+               kiblnd_set_ni_fatal_on(ibdev->ibd_hdev, 1);
        write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
 
-        net->ibn_init = IBLND_INIT_ALL;
+       net->ibn_init = IBLND_INIT_ALL;
 
-        return 0;
+       return 0;
 
 failed:
        if (net != NULL && net->ibn_dev == NULL && ibdev != NULL)
-                kiblnd_destroy_dev(ibdev);
+               kiblnd_destroy_dev(ibdev);
 
-        kiblnd_shutdown(ni);
+       kfree(ifaces);
+       kiblnd_shutdown(ni);
 
-        CDEBUG(D_NET, "kiblnd_startup failed\n");
-        return -ENETDOWN;
+       CDEBUG(D_NET, "Configuration of device %s failed: rc = %d\n",
+              ifname ? ifname : "", rc);
+
+       return -ENETDOWN;
 }
 
-static struct lnet_lnd the_o2iblnd = {
+static const struct lnet_lnd the_o2iblnd = {
        .lnd_type       = O2IBLND,
        .lnd_startup    = kiblnd_startup,
        .lnd_shutdown   = kiblnd_shutdown,
        .lnd_ctl        = kiblnd_ctl,
-       .lnd_query      = kiblnd_query,
        .lnd_send       = kiblnd_send,
        .lnd_recv       = kiblnd_recv,
 };
@@ -3401,13 +3382,13 @@ static int __init ko2iblnd_init(void)
 {
        int rc;
 
-       CLASSERT(sizeof(struct kib_msg) <= IBLND_MSG_SIZE);
-       CLASSERT(offsetof(struct kib_msg,
-                         ibm_u.get.ibgm_rd.rd_frags[IBLND_MAX_RDMA_FRAGS]) <=
-                IBLND_MSG_SIZE);
-       CLASSERT(offsetof(struct kib_msg,
-                         ibm_u.putack.ibpam_rd.rd_frags[IBLND_MAX_RDMA_FRAGS])
-                <= IBLND_MSG_SIZE);
+       BUILD_BUG_ON(sizeof(struct kib_msg) > IBLND_MSG_SIZE);
+       BUILD_BUG_ON(offsetof(struct kib_msg,
+                    ibm_u.get.ibgm_rd.rd_frags[IBLND_MAX_RDMA_FRAGS]) >
+                    IBLND_MSG_SIZE);
+       BUILD_BUG_ON(offsetof(struct kib_msg,
+                    ibm_u.putack.ibpam_rd.rd_frags[IBLND_MAX_RDMA_FRAGS]) >
+                    IBLND_MSG_SIZE);
 
        rc = kiblnd_tunables_init();
        if (rc != 0)