Whamcloud - gitweb
LU-12287 lnet: handling device failure by IB event handler
[fs/lustre-release.git] / lnet / klnds / o2iblnd / o2iblnd.c
index 1da7cd7..a183506 100644 (file)
@@ -39,7 +39,7 @@
 
 #include "o2iblnd.h"
 
-static struct lnet_lnd the_o2iblnd;
+static const struct lnet_lnd the_o2iblnd;
 
 struct kib_data kiblnd_data;
 
@@ -256,8 +256,8 @@ int kiblnd_unpack_msg(struct kib_msg *msg, int nob)
         if (flip) {
                 /* leave magic unflipped as a clue to peer_ni endianness */
                 msg->ibm_version = version;
-                CLASSERT (sizeof(msg->ibm_type) == 1);
-                CLASSERT (sizeof(msg->ibm_credits) == 1);
+               BUILD_BUG_ON(sizeof(msg->ibm_type) != 1);
+               BUILD_BUG_ON(sizeof(msg->ibm_credits) != 1);
                 msg->ibm_nob     = msg_nob;
                 __swab64s(&msg->ibm_srcnid);
                 __swab64s(&msg->ibm_srcstamp);
@@ -480,7 +480,7 @@ kiblnd_del_peer_locked(struct kib_peer_ni *peer_ni)
 static int
 kiblnd_del_peer(struct lnet_ni *ni, lnet_nid_t nid)
 {
-       struct list_head        zombies = LIST_HEAD_INIT(zombies);
+       LIST_HEAD(zombies);
        struct list_head        *ptmp;
        struct list_head        *pnxt;
        struct kib_peer_ni              *peer_ni;
@@ -570,9 +570,9 @@ kiblnd_get_conn_by_idx(struct lnet_ni *ni, int index)
 static void
 kiblnd_debug_rx(struct kib_rx *rx)
 {
-        CDEBUG(D_CONSOLE, "      %p status %d msg_type %x cred %d\n",
-               rx, rx->rx_status, rx->rx_msg->ibm_type,
-               rx->rx_msg->ibm_credits);
+       CDEBUG(D_CONSOLE, "      %p msg_type %x cred %d\n",
+              rx, rx->rx_msg->ibm_type,
+              rx->rx_msg->ibm_credits); /* logs the rx pointer plus its message type and credits */
 }
 
 static void
@@ -674,7 +674,7 @@ kiblnd_setup_mtu_locked(struct rdma_cm_id *cmid)
 static int
 kiblnd_get_completion_vector(struct kib_conn *conn, int cpt)
 {
-       cpumask_t       *mask;
+       cpumask_var_t   *mask;
        int             vectors;
        int             off;
        int             i;
@@ -688,8 +688,8 @@ kiblnd_get_completion_vector(struct kib_conn *conn, int cpt)
 
        /* hash NID to CPU id in this partition... */
        ibp_nid = conn->ibc_peer->ibp_nid;
-       off = do_div(ibp_nid, cpumask_weight(mask));
-       for_each_cpu(i, mask) {
+       off = do_div(ibp_nid, cpumask_weight(*mask));
+       for_each_cpu(i, *mask) {
                if (off-- == 0)
                        return i % vectors;
        }
@@ -734,16 +734,28 @@ static unsigned int kiblnd_send_wrs(struct kib_conn *conn)
         * One WR for the LNet message
         * And ibc_max_frags for the transfer WRs
         */
-       unsigned int ret = 1 + conn->ibc_max_frags;
+       int ret;
+       int multiplier = 1 + conn->ibc_max_frags;
        enum kib_dev_caps dev_caps = conn->ibc_hdev->ibh_dev->ibd_dev_caps;
 
        /* FastReg needs two extra WRs for map and invalidate */
        if (dev_caps & IBLND_DEV_CAPS_FASTREG_ENABLED)
-               ret += 2;
+               multiplier += 2;
 
        /* account for a maximum of ibc_queue_depth in-flight transfers */
-       ret *= conn->ibc_queue_depth;
-       return ret;
+       ret = multiplier * conn->ibc_queue_depth;
+
+       if (ret > conn->ibc_hdev->ibh_max_qp_wr) {
+               CDEBUG(D_NET, "peer_credits %u will result in send work "
+                      "request size %d larger than maximum %d device "
+                      "can handle\n", conn->ibc_queue_depth, ret,
+                      conn->ibc_hdev->ibh_max_qp_wr);
+               conn->ibc_queue_depth =
+                       conn->ibc_hdev->ibh_max_qp_wr / multiplier;
+       }
+
+       /* don't go beyond the maximum the device can handle */
+       return min(ret, conn->ibc_hdev->ibh_max_qp_wr);
 }
 
 struct kib_conn *
@@ -900,20 +912,14 @@ kiblnd_create_conn(struct kib_peer_ni *peer_ni, struct rdma_cm_id *cmid,
        init_qp_attr->qp_type = IB_QPT_RC;
        init_qp_attr->send_cq = cq;
        init_qp_attr->recv_cq = cq;
+       /*
+        * kiblnd_send_wrs() can change the connection's queue depth if
+        * the maximum work requests for the device is maxed out
+        */
+       init_qp_attr->cap.max_send_wr = kiblnd_send_wrs(conn);
+       init_qp_attr->cap.max_recv_wr = IBLND_RECV_WRS(conn);
 
-       conn->ibc_sched = sched;
-
-       do {
-               init_qp_attr->cap.max_send_wr = kiblnd_send_wrs(conn);
-               init_qp_attr->cap.max_recv_wr = IBLND_RECV_WRS(conn);
-
-               rc = rdma_create_qp(cmid, conn->ibc_hdev->ibh_pd, init_qp_attr);
-               if (!rc || conn->ibc_queue_depth < 2)
-                       break;
-
-               conn->ibc_queue_depth--;
-       } while (rc);
-
+       rc = rdma_create_qp(cmid, conn->ibc_hdev->ibh_pd, init_qp_attr);
        if (rc) {
                CERROR("Can't create QP: %d, send_wr: %d, recv_wr: %d, "
                       "send_sge: %d, recv_sge: %d\n",
@@ -924,6 +930,8 @@ kiblnd_create_conn(struct kib_peer_ni *peer_ni, struct rdma_cm_id *cmid,
                goto failed_2;
        }
 
+       conn->ibc_sched = sched;
+
        if (conn->ibc_queue_depth != peer_ni->ibp_queue_depth)
                CWARN("peer %s - queue depth reduced from %u to %u"
                      "  to allow for qp creation\n",
@@ -1002,8 +1010,7 @@ void
 kiblnd_destroy_conn(struct kib_conn *conn)
 {
        struct rdma_cm_id *cmid = conn->ibc_cmid;
-       struct kib_peer_ni        *peer_ni = conn->ibc_peer;
-       int                rc;
+       struct kib_peer_ni *peer_ni = conn->ibc_peer;
 
        LASSERT (!in_interrupt());
        LASSERT (atomic_read(&conn->ibc_refcount) == 0);
@@ -1034,11 +1041,8 @@ kiblnd_destroy_conn(struct kib_conn *conn)
        if (cmid != NULL && cmid->qp != NULL)
                rdma_destroy_qp(cmid);
 
-       if (conn->ibc_cq != NULL) {
-               rc = ib_destroy_cq(conn->ibc_cq);
-               if (rc != 0)
-                       CWARN("Error destroying CQ: %d\n", rc);
-       }
+       if (conn->ibc_cq)
+               ib_destroy_cq(conn->ibc_cq);
 
        kiblnd_txlist_done(&conn->ibc_zombie_txs, -ECONNABORTED,
                           LNET_MSG_STATUS_OK);
@@ -1247,7 +1251,6 @@ kiblnd_query(struct lnet_ni *ni, lnet_nid_t nid, time64_t *when)
        CDEBUG(D_NET, "peer_ni %s %p, alive %lld secs ago\n",
               libcfs_nid2str(nid), peer_ni,
               last_alive ? now - last_alive : -1);
-       return;
 }
 
 static void
@@ -1426,11 +1429,11 @@ kiblnd_map_tx_pool(struct kib_tx_pool *tpo)
 
        dev = net->ibn_dev;
 
-        /* pre-mapped messages are not bigger than 1 page */
-        CLASSERT (IBLND_MSG_SIZE <= PAGE_SIZE);
+       /* pre-mapped messages are not bigger than 1 page */
+       BUILD_BUG_ON(IBLND_MSG_SIZE > PAGE_SIZE);
 
-        /* No fancy arithmetic when we do the buffer calculations */
-        CLASSERT (PAGE_SIZE % IBLND_MSG_SIZE == 0);
+       /* No fancy arithmetic when we do the buffer calculations */
+       BUILD_BUG_ON(PAGE_SIZE % IBLND_MSG_SIZE != 0);
 
         tpo->tpo_hdev = kiblnd_current_hdev(dev);
 
@@ -1699,11 +1702,10 @@ kiblnd_fail_fmr_poolset(struct kib_fmr_poolset *fps, struct list_head *zombies)
                                                      fpo_list);
 
                fpo->fpo_failed = 1;
-               list_del(&fpo->fpo_list);
                if (fpo->fpo_map_count == 0)
-                       list_add(&fpo->fpo_list, zombies);
+                       list_move(&fpo->fpo_list, zombies);
                else
-                       list_add(&fpo->fpo_list, &fps->fps_failed_pool_list);
+                       list_move(&fpo->fpo_list, &fps->fps_failed_pool_list);
        }
 
        spin_unlock(&fps->fps_lock);
@@ -1781,7 +1783,7 @@ kiblnd_map_tx_pages(struct kib_tx *tx, struct kib_rdma_desc *rd)
 void
 kiblnd_fmr_pool_unmap(struct kib_fmr *fmr, int status)
 {
-       struct list_head zombies = LIST_HEAD_INIT(zombies);
+       LIST_HEAD(zombies);
        struct kib_fmr_pool *fpo = fmr->fmr_pool;
        struct kib_fmr_poolset *fps;
        time64_t now = ktime_get_seconds();
@@ -1794,8 +1796,7 @@ kiblnd_fmr_pool_unmap(struct kib_fmr *fmr, int status)
        fps = fpo->fpo_owner;
        if (fpo->fpo_is_fmr) {
                if (fmr->fmr_pfmr) {
-                       rc = ib_fmr_pool_unmap(fmr->fmr_pfmr);
-                       LASSERT(!rc);
+                       ib_fmr_pool_unmap(fmr->fmr_pfmr);
                        fmr->fmr_pfmr = NULL;
                }
 
@@ -1864,8 +1865,8 @@ again:
                                tx_pages_mapped = 1;
                        }
 
-                       pfmr = ib_fmr_pool_map_phys(fpo->fmr.fpo_fmr_pool,
-                                                   pages, npages, iov);
+                       pfmr = kib_fmr_pool_map(fpo->fmr.fpo_fmr_pool,
+                                               pages, npages, iov);
                        if (likely(!IS_ERR(pfmr))) {
                                fmr->fmr_key  = is_rx ? pfmr->fmr->rkey
                                                      : pfmr->fmr->lkey;
@@ -2072,11 +2073,10 @@ kiblnd_fail_poolset(struct kib_poolset *ps, struct list_head *zombies)
                                                 struct kib_pool, po_list);
 
                po->po_failed = 1;
-               list_del(&po->po_list);
                if (po->po_allocated == 0)
-                       list_add(&po->po_list, zombies);
+                       list_move(&po->po_list, zombies);
                else
-                       list_add(&po->po_list, &ps->ps_failed_pool_list);
+                       list_move(&po->po_list, &ps->ps_failed_pool_list);
        }
        spin_unlock(&ps->ps_lock);
 }
@@ -2139,7 +2139,7 @@ kiblnd_pool_is_idle(struct kib_pool *pool, time64_t now)
 void
 kiblnd_pool_free_node(struct kib_pool *pool, struct list_head *node)
 {
-       struct list_head zombies = LIST_HEAD_INIT(zombies);
+       LIST_HEAD(zombies);
        struct kib_poolset *ps = pool->po_owner;
        struct kib_pool *tmp;
        time64_t now = ktime_get_seconds();
@@ -2292,7 +2292,7 @@ kiblnd_destroy_tx_pool(struct kib_pool *pool)
                    pool->po_size * sizeof(struct kib_tx));
 out:
         kiblnd_fini_pool(pool);
-       LIBCFS_FREE(tpo, sizeof(struct kib_tx_pool));
+       CFS_FREE_PTR(tpo);
 }
 
 static int kiblnd_tx_pool_size(struct lnet_ni *ni, int ncpts)
@@ -2328,7 +2328,7 @@ kiblnd_create_tx_pool(struct kib_poolset *ps, int size, struct kib_pool **pp_po)
         npg = (size * IBLND_MSG_SIZE + PAGE_SIZE - 1) / PAGE_SIZE;
        if (kiblnd_alloc_pages(&tpo->tpo_tx_pages, ps->ps_cpt, npg) != 0) {
                CERROR("Can't allocate tx pages: %d\n", npg);
-               LIBCFS_FREE(tpo, sizeof(struct kib_tx_pool));
+               CFS_FREE_PTR(tpo);
                return -ENOMEM;
        }
 
@@ -2535,10 +2535,95 @@ kiblnd_net_init_pools(struct kib_net *net, struct lnet_ni *ni, __u32 *cpts,
 }
 
 static int
+kiblnd_port_get_attr(struct kib_hca_dev *hdev) /* query hdev's IB port and record ACTIVE/DOWN in hdev->ibh_state */
+{
+       struct ib_port_attr *port_attr;
+       int rc; /* result of ib_query_port() */
+       unsigned long flags;
+       rwlock_t *g_lock = &kiblnd_data.kib_global_lock;
+
+       LIBCFS_ALLOC(port_attr, sizeof(*port_attr)); /* scratch buffer for the query, released below */
+       if (port_attr == NULL) {
+               CDEBUG(D_NETERROR, "Out of memory\n");
+               return -ENOMEM;
+       }
+
+       rc = ib_query_port(hdev->ibh_ibdev, hdev->ibh_port, port_attr);
+
+       write_lock_irqsave(g_lock, flags); /* ibh_state is written under the global lock */
+
+       if (rc == 0) /* a failed query leaves ibh_state untouched */
+               hdev->ibh_state = port_attr->state == IB_PORT_ACTIVE
+                                ? IBLND_DEV_PORT_ACTIVE
+                                : IBLND_DEV_PORT_DOWN; /* every non-ACTIVE port state is recorded as DOWN */
+
+       write_unlock_irqrestore(g_lock, flags);
+       LIBCFS_FREE(port_attr, sizeof(*port_attr));
+
+       if (rc != 0) {
+               CDEBUG(D_NETERROR, "Failed to query IB port: %d\n", rc);
+               return rc; /* hand the ib_query_port() error back to the caller */
+       }
+       return 0;
+}
+
+static inline void
+kiblnd_set_ni_fatal_on(struct kib_hca_dev *hdev, int val) /* store val in ni_fatal_error_on of every NI on hdev's device */
+{
+       struct kib_net  *net;
+
+       /* for health check; the callers visible in this patch hold kib_global_lock for writing */
+       list_for_each_entry(net, &hdev->ibh_dev->ibd_nets, ibn_list) {
+               if (val) /* only a non-zero val is logged; clearing the flag is silent */
+                       CDEBUG(D_NETERROR, "Fatal device error for NI %s\n",
+                                       libcfs_nid2str(net->ibn_ni->ni_nid));
+               atomic_set(&net->ibn_ni->ni_fatal_error_on, val);
+       }
+}
+
+void
+kiblnd_event_handler(struct ib_event_handler *handler, struct ib_event *event) /* IB event callback: mirror device/port state into the owning hdev and its NIs */
+{
+       rwlock_t *g_lock = &kiblnd_data.kib_global_lock;
+       struct kib_hca_dev  *hdev;
+       unsigned long flags;
+
+       hdev = container_of(handler, struct kib_hca_dev, ibh_event_handler); /* the handler is embedded in struct kib_hca_dev */
+
+       write_lock_irqsave(g_lock, flags); /* state and NI flags change under the global lock */
+
+       switch (event->event) {
+       case IB_EVENT_DEVICE_FATAL: /* handled without a port check */
+               CDEBUG(D_NET, "IB device fatal\n");
+               hdev->ibh_state = IBLND_DEV_FATAL;
+               kiblnd_set_ni_fatal_on(hdev, 1);
+               break;
+       case IB_EVENT_PORT_ACTIVE:
+               CDEBUG(D_NET, "IB port active\n"); /* logged before the port filter, so for any port */
+               if (event->element.port_num == hdev->ibh_port) { /* act only on this hdev's port */
+                       hdev->ibh_state = IBLND_DEV_PORT_ACTIVE;
+                       kiblnd_set_ni_fatal_on(hdev, 0); /* port is back: clear the fatal flag */
+               }
+               break;
+       case IB_EVENT_PORT_ERR:
+               CDEBUG(D_NET, "IB port err\n"); /* logged before the port filter, so for any port */
+               if (event->element.port_num == hdev->ibh_port) { /* act only on this hdev's port */
+                       hdev->ibh_state = IBLND_DEV_PORT_DOWN;
+                       kiblnd_set_ni_fatal_on(hdev, 1);
+               }
+               break;
+       default: /* every other event type is ignored */
+               break;
+       }
+       write_unlock_irqrestore(g_lock, flags);
+}
+
+static int
 kiblnd_hdev_get_attr(struct kib_hca_dev *hdev)
 {
        struct ib_device_attr *dev_attr;
        int rc = 0;
+       int rc2 = 0;
 
        /* It's safe to assume a HCA can handle a page size
         * matching that of the native system */
@@ -2563,6 +2648,7 @@ kiblnd_hdev_get_attr(struct kib_hca_dev *hdev)
 #endif
 
        hdev->ibh_mr_size = dev_attr->max_mr_size;
+       hdev->ibh_max_qp_wr = dev_attr->max_qp_wr;
 
        /* Setup device Memory Registration capabilities */
 #ifdef HAVE_IB_DEVICE_OPS
@@ -2591,9 +2677,11 @@ kiblnd_hdev_get_attr(struct kib_hca_dev *hdev)
                rc = -ENOSYS;
        }
 
-       if (rc == 0 && hdev->ibh_mr_size == ~0ULL)
-               hdev->ibh_mr_shift = 64;
-       else if (rc != 0)
+       rc2 = kiblnd_port_get_attr(hdev);
+       if (rc2 != 0)
+               return rc2;
+
+       if (rc != 0)
                rc = -EINVAL;
 
 #ifndef HAVE_IB_DEVICE_ATTRS
@@ -2625,6 +2713,9 @@ kiblnd_hdev_cleanup_mrs(struct kib_hca_dev *hdev)
 void
 kiblnd_hdev_destroy(struct kib_hca_dev *hdev)
 {
+       if (hdev->ibh_event_handler.device != NULL)
+               ib_unregister_event_handler(&hdev->ibh_event_handler);
+
 #ifdef HAVE_IB_GET_DMA_MR
         kiblnd_hdev_cleanup_mrs(hdev);
 #endif
@@ -2666,7 +2757,7 @@ kiblnd_dummy_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event)
 }
 
 static int
-kiblnd_dev_need_failover(struct kib_dev *dev)
+kiblnd_dev_need_failover(struct kib_dev *dev, struct net *ns)
 {
         struct rdma_cm_id  *cmid;
         struct sockaddr_in  srcaddr;
@@ -2688,8 +2779,8 @@ kiblnd_dev_need_failover(struct kib_dev *dev)
          *
          * a. rdma_bind_addr(), it will conflict with listener cmid
          * b. rdma_resolve_addr() to zero addr */
-        cmid = kiblnd_rdma_create_id(kiblnd_dummy_callback, dev, RDMA_PS_TCP,
-                                     IB_QPT_RC);
+       cmid = kiblnd_rdma_create_id(ns, kiblnd_dummy_callback, dev,
+                                    RDMA_PS_TCP, IB_QPT_RC);
         if (IS_ERR(cmid)) {
                 rc = PTR_ERR(cmid);
                 CERROR("Failed to create cmid for failover: %d\n", rc);
@@ -2718,11 +2809,11 @@ kiblnd_dev_need_failover(struct kib_dev *dev)
 }
 
 int
-kiblnd_dev_failover(struct kib_dev *dev)
+kiblnd_dev_failover(struct kib_dev *dev, struct net *ns)
 {
-       struct list_head    zombie_tpo = LIST_HEAD_INIT(zombie_tpo);
-       struct list_head    zombie_ppo = LIST_HEAD_INIT(zombie_ppo);
-       struct list_head    zombie_fpo = LIST_HEAD_INIT(zombie_fpo);
+       LIST_HEAD(zombie_tpo);
+       LIST_HEAD(zombie_ppo);
+       LIST_HEAD(zombie_fpo);
         struct rdma_cm_id  *cmid  = NULL;
        struct kib_hca_dev *hdev  = NULL;
        struct kib_hca_dev *old;
@@ -2737,7 +2828,7 @@ kiblnd_dev_failover(struct kib_dev *dev)
                  dev->ibd_can_failover ||
                  dev->ibd_hdev == NULL);
 
-        rc = kiblnd_dev_need_failover(dev);
+       rc = kiblnd_dev_need_failover(dev, ns);
         if (rc <= 0)
                 goto out;
 
@@ -2758,8 +2849,8 @@ kiblnd_dev_failover(struct kib_dev *dev)
                 rdma_destroy_id(cmid);
         }
 
-        cmid = kiblnd_rdma_create_id(kiblnd_cm_callback, dev, RDMA_PS_TCP,
-                                     IB_QPT_RC);
+       cmid = kiblnd_rdma_create_id(ns, kiblnd_cm_callback, dev, RDMA_PS_TCP,
+                                    IB_QPT_RC);
         if (IS_ERR(cmid)) {
                 rc = PTR_ERR(cmid);
                 CERROR("Failed to create cmid for failover: %d\n", rc);
@@ -2793,6 +2884,7 @@ kiblnd_dev_failover(struct kib_dev *dev)
         hdev->ibh_dev   = dev;
         hdev->ibh_cmid  = cmid;
         hdev->ibh_ibdev = cmid->device;
+       hdev->ibh_port  = cmid->port_num;
 
 #ifdef HAVE_IB_ALLOC_PD_2ARGS
        pd = ib_alloc_pd(cmid->device, 0);
@@ -2827,6 +2919,10 @@ kiblnd_dev_failover(struct kib_dev *dev)
        }
 #endif
 
+       INIT_IB_EVENT_HANDLER(&hdev->ibh_event_handler,
+                               hdev->ibh_ibdev, kiblnd_event_handler);
+       ib_register_event_handler(&hdev->ibh_event_handler);
+
        write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
 
        old = dev->ibd_hdev;
@@ -2878,80 +2974,6 @@ kiblnd_destroy_dev(struct kib_dev *dev)
         LIBCFS_FREE(dev, sizeof(*dev));
 }
 
-static struct kib_dev *
-kiblnd_create_dev(char *ifname)
-{
-       struct net_device *netdev;
-       struct kib_dev *dev = NULL;
-       int flags;
-       int rc;
-
-       rtnl_lock();
-       for_each_netdev(&init_net, netdev) {
-               struct in_device *in_dev;
-
-               if (strcmp(netdev->name, "lo") == 0) /* skip the loopback IF */
-                       continue;
-
-               flags = dev_get_flags(netdev);
-               if (!(flags & IFF_UP)) {
-                       CWARN("Can't query IPoIB interface %s: it's down\n",
-                             netdev->name);
-                       continue;
-               }
-
-               in_dev = __in_dev_get_rtnl(netdev);
-               if (!in_dev) {
-                       CWARN("Interface %s has no IPv4 status.\n",
-                             netdev->name);
-                       continue;
-               }
-
-               for_ifa(in_dev)
-                       if (strcmp(ifname, ifa->ifa_label) == 0) {
-                               LIBCFS_ALLOC(dev, sizeof(*dev));
-                               if (!dev)
-                                       goto unlock;
-
-                               dev->ibd_can_failover = !!(flags & IFF_MASTER);
-                               dev->ibd_ifip = ntohl(ifa->ifa_local);
-
-                               INIT_LIST_HEAD(&dev->ibd_nets);
-                               INIT_LIST_HEAD(&dev->ibd_list); /* not yet in kib_devs */
-                               INIT_LIST_HEAD(&dev->ibd_fail_list);
-                               break;
-                       }
-               endfor_ifa(in_dev);
-       }
-       rtnl_unlock();
-
-       if (!dev) {
-               CERROR("Can't find any usable interfaces\n");
-               return NULL;
-       }
-
-       if (dev->ibd_ifip == 0) {
-               CERROR("Can't initialize device: no IP address\n");
-               goto free_dev;
-       }
-       strcpy(&dev->ibd_ifname[0], ifname);
-
-       /* initialize the device */
-       rc = kiblnd_dev_failover(dev);
-       if (rc != 0) {
-               CERROR("Can't initialize device: %d\n", rc);
-               goto free_dev;
-       }
-
-       list_add_tail(&dev->ibd_list, &kiblnd_data.kib_devs);
-       return dev;
-unlock:
-       rtnl_unlock();
-free_dev:
-       LIBCFS_FREE(dev, sizeof(*dev));
-       return NULL;
-}
-
 static void
 kiblnd_base_shutdown(void)
 {
@@ -3095,11 +3117,10 @@ kiblnd_shutdown(struct lnet_ni *ni)
 out:
        if (list_empty(&kiblnd_data.kib_devs))
                 kiblnd_base_shutdown();
-        return;
 }
 
 static int
-kiblnd_base_startup(void)
+kiblnd_base_startup(struct net *ns)
 {
        struct kib_sched_info   *sched;
        int                     rc;
@@ -3107,7 +3128,9 @@ kiblnd_base_startup(void)
 
        LASSERT(kiblnd_data.kib_init == IBLND_INIT_NOTHING);
 
-       try_module_get(THIS_MODULE);
+       if (!try_module_get(THIS_MODULE))
+               goto failed;
+
        memset(&kiblnd_data, 0, sizeof(kiblnd_data)); /* zero pointers, flags etc */
 
        rwlock_init(&kiblnd_data.kib_global_lock);
@@ -3172,7 +3195,7 @@ kiblnd_base_startup(void)
         }
 
        if (*kiblnd_tunables.kib_dev_failover != 0)
-               rc = kiblnd_thread_start(kiblnd_failover_thread, NULL,
+               rc = kiblnd_thread_start(kiblnd_failover_thread, ns,
                                         "kiblnd_failover");
 
         if (rc != 0) {
@@ -3232,8 +3255,8 @@ kiblnd_start_schedulers(struct kib_sched_info *sched)
        return rc;
 }
 
-static int
-kiblnd_dev_start_threads(struct kib_dev *dev, int newdev, u32 *cpts, int ncpts)
+static int kiblnd_dev_start_threads(struct kib_dev *dev, bool newdev, u32 *cpts,
+                                   int ncpts)
 {
        int     cpt;
        int     rc;
@@ -3263,8 +3286,8 @@ kiblnd_dev_search(char *ifname)
 {
        struct kib_dev *alias = NULL;
        struct kib_dev *dev;
-       char            *colon;
-       char            *colon2;
+       char            *colon;
+       char            *colon2;
 
        colon = strchr(ifname, ':');
        list_for_each_entry(dev, &kiblnd_data.kib_devs, ibd_list) {
@@ -3294,27 +3317,31 @@ kiblnd_dev_search(char *ifname)
 static int
 kiblnd_startup(struct lnet_ni *ni)
 {
-        char                     *ifname;
+       char *ifname = NULL;
+       struct lnet_inetdev *ifaces = NULL;
        struct kib_dev *ibdev = NULL;
-       struct kib_net *net;
-        unsigned long             flags;
-        int                       rc;
-       int                       newdev;
-       int                       node_id;
+       struct kib_net *net = NULL;
+       unsigned long flags;
+       int rc;
+       int i;
+       bool newdev;
 
-        LASSERT (ni->ni_net->net_lnd == &the_o2iblnd);
+       LASSERT(ni->ni_net->net_lnd == &the_o2iblnd);
 
-        if (kiblnd_data.kib_init == IBLND_INIT_NOTHING) {
-                rc = kiblnd_base_startup();
-                if (rc != 0)
-                        return rc;
-        }
+       if (kiblnd_data.kib_init == IBLND_INIT_NOTHING) {
+               rc = kiblnd_base_startup(ni->ni_net_ns);
+               if (rc != 0)
+                       return rc;
+       }
 
-        LIBCFS_ALLOC(net, sizeof(*net));
-        ni->ni_data = net;
-        if (net == NULL)
-                goto failed;
+       LIBCFS_ALLOC(net, sizeof(*net));
+       ni->ni_data = net;
+       if (net == NULL) {
+               rc = -ENOMEM;
+               goto failed;
+       }
 
+       net->ibn_ni = ni;
        net->ibn_incarnation = ktime_get_real_ns() / NSEC_PER_USEC;
 
        kiblnd_tunables_setup(ni);
@@ -3326,10 +3353,9 @@ kiblnd_startup(struct lnet_ni *ni)
         */
        if (ni->ni_interfaces[0] != NULL) {
                /* Use the IPoIB interface specified in 'networks=' */
-
-               CLASSERT(LNET_INTERFACES_NUM > 1);
                if (ni->ni_interfaces[1] != NULL) {
-                       CERROR("Multiple interfaces not supported\n");
+                       CERROR("ko2iblnd: Multiple interfaces not supported\n");
+                       rc = -EINVAL;
                        goto failed;
                }
 
@@ -3338,58 +3364,98 @@ kiblnd_startup(struct lnet_ni *ni)
                ifname = *kiblnd_tunables.kib_default_ipif;
        }
 
-        if (strlen(ifname) >= sizeof(ibdev->ibd_ifname)) {
-                CERROR("IPoIB interface name too long: %s\n", ifname);
-                goto failed;
-        }
+       if (strlen(ifname) >= sizeof(ibdev->ibd_ifname)) {
+               CERROR("IPoIB interface name too long: %s\n", ifname);
+               rc = -E2BIG;
+               goto failed;
+       }
 
-       ibdev = kiblnd_dev_search(ifname);
+       rc = lnet_inet_enumerate(&ifaces, ni->ni_net_ns);
+       if (rc < 0)
+               goto failed;
 
+       for (i = 0; i < rc; i++) {
+               if (strcmp(ifname, ifaces[i].li_name) == 0)
+                       break;
+       }
+
+       if (i == rc) {
+               CERROR("ko2iblnd: No matching interfaces\n");
+               rc = -ENOENT;
+               goto failed;
+       }
+
+       ibdev = kiblnd_dev_search(ifname);
        newdev = ibdev == NULL;
        /* hmm...create kib_dev even for alias */
-       if (ibdev == NULL || strcmp(&ibdev->ibd_ifname[0], ifname) != 0)
-               ibdev = kiblnd_create_dev(ifname);
+       if (ibdev == NULL || strcmp(&ibdev->ibd_ifname[0], ifname) != 0) {
+               LIBCFS_ALLOC(ibdev, sizeof(*ibdev));
+               if (!ibdev) {
+                       rc = -ENOMEM;
+                       goto failed;
+               }
 
-       if (ibdev == NULL)
-               goto failed;
+               ibdev->ibd_ifip = ifaces[i].li_ipaddr;
+               strlcpy(ibdev->ibd_ifname, ifaces[i].li_name,
+                       sizeof(ibdev->ibd_ifname));
+               ibdev->ibd_can_failover = !!(ifaces[i].li_flags & IFF_MASTER);
+
+               INIT_LIST_HEAD(&ibdev->ibd_nets);
+               INIT_LIST_HEAD(&ibdev->ibd_list); /* not yet in kib_devs */
+               INIT_LIST_HEAD(&ibdev->ibd_fail_list);
 
-       node_id = dev_to_node(ibdev->ibd_hdev->ibh_ibdev->dma_device);
-       ni->ni_dev_cpt = cfs_cpt_of_node(lnet_cpt_table(), node_id);
+               /* initialize the device */
+               rc = kiblnd_dev_failover(ibdev, ni->ni_net_ns);
+               if (rc) {
+                       CERROR("ko2iblnd: Can't initialize device: rc = %d\n",
+                              rc);
+                       goto failed;
+               }
+
+               list_add_tail(&ibdev->ibd_list, &kiblnd_data.kib_devs);
+       }
 
        net->ibn_dev = ibdev;
        ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), ibdev->ibd_ifip);
 
-       rc = kiblnd_dev_start_threads(ibdev, newdev,
-                                     ni->ni_cpts, ni->ni_ncpts);
+       ni->ni_dev_cpt = ifaces[i].li_cpt;
+
+       rc = kiblnd_dev_start_threads(ibdev, newdev, ni->ni_cpts, ni->ni_ncpts);
        if (rc != 0)
                goto failed;
 
        rc = kiblnd_net_init_pools(net, ni, ni->ni_cpts, ni->ni_ncpts);
-        if (rc != 0) {
-                CERROR("Failed to initialize NI pools: %d\n", rc);
-                goto failed;
-        }
+       if (rc != 0) {
+               CERROR("Failed to initialize NI pools: %d\n", rc);
+               goto failed;
+       }
 
        write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
        ibdev->ibd_nnets++;
        list_add_tail(&net->ibn_list, &ibdev->ibd_nets);
+       /* for health check */
+       if (ibdev->ibd_hdev->ibh_state == IBLND_DEV_PORT_DOWN)
+               kiblnd_set_ni_fatal_on(ibdev->ibd_hdev, 1);
        write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
 
-        net->ibn_init = IBLND_INIT_ALL;
+       net->ibn_init = IBLND_INIT_ALL;
 
-        return 0;
+       return 0;
 
 failed:
        if (net != NULL && net->ibn_dev == NULL && ibdev != NULL)
-                kiblnd_destroy_dev(ibdev);
+               kiblnd_destroy_dev(ibdev);
 
-        kiblnd_shutdown(ni);
+       kfree(ifaces);
+       kiblnd_shutdown(ni);
 
-        CDEBUG(D_NET, "kiblnd_startup failed\n");
-        return -ENETDOWN;
+       CDEBUG(D_NET, "Configuration of device %s failed: rc = %d\n",
+              ifname ? ifname : "", rc);
+
+       return -ENETDOWN;
 }
 
-static struct lnet_lnd the_o2iblnd = {
+static const struct lnet_lnd the_o2iblnd = {
        .lnd_type       = O2IBLND,
        .lnd_startup    = kiblnd_startup,
        .lnd_shutdown   = kiblnd_shutdown,
@@ -3408,13 +3474,13 @@ static int __init ko2iblnd_init(void)
 {
        int rc;
 
-       CLASSERT(sizeof(struct kib_msg) <= IBLND_MSG_SIZE);
-       CLASSERT(offsetof(struct kib_msg,
-                         ibm_u.get.ibgm_rd.rd_frags[IBLND_MAX_RDMA_FRAGS]) <=
-                IBLND_MSG_SIZE);
-       CLASSERT(offsetof(struct kib_msg,
-                         ibm_u.putack.ibpam_rd.rd_frags[IBLND_MAX_RDMA_FRAGS])
-                <= IBLND_MSG_SIZE);
+       BUILD_BUG_ON(sizeof(struct kib_msg) > IBLND_MSG_SIZE);
+       BUILD_BUG_ON(offsetof(struct kib_msg,
+                    ibm_u.get.ibgm_rd.rd_frags[IBLND_MAX_RDMA_FRAGS]) >
+                    IBLND_MSG_SIZE);
+       BUILD_BUG_ON(offsetof(struct kib_msg,
+                    ibm_u.putack.ibpam_rd.rd_frags[IBLND_MAX_RDMA_FRAGS]) >
+                    IBLND_MSG_SIZE);
 
        rc = kiblnd_tunables_init();
        if (rc != 0)