Whamcloud - gitweb
LU-9448 lnet: handle empty CPTs
[fs/lustre-release.git] / lnet / klnds / o2iblnd / o2iblnd.c
index 5a2850a..12dade5 100644 (file)
@@ -524,7 +524,7 @@ kiblnd_del_peer(struct lnet_ni *ni, lnet_nid_t nid)
 
        write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
 
-       kiblnd_txlist_done(ni, &zombies, -EIO);
+       kiblnd_txlist_done(&zombies, -EIO);
 
        return rc;
 }
@@ -698,6 +698,36 @@ kiblnd_get_completion_vector(kib_conn_t *conn, int cpt)
        return 1;
 }
 
+/*
+ * Return the scheduler bound to @cpt. If that scheduler has no threads,
+ * which means that the CPT has no CPUs (e.g. a NUMA node configured with
+ * no associated CPUs), fall back to the first scheduler with threads found
+ * while iterating kiblnd_data.kib_scheds; the scan does not start at @cpt.
+ *
+ * Returns NULL if no scheduler has any threads.
+ */
+static struct kib_sched_info *
+kiblnd_get_scheduler(int cpt)
+{
+       struct kib_sched_info *sched;
+       int i;
+
+       sched = kiblnd_data.kib_scheds[cpt];
+
+       if (sched->ibs_nthreads > 0)
+               return sched; /* the CPT's own scheduler is usable */
+
+       cfs_percpt_for_each(sched, i, kiblnd_data.kib_scheds) {
+               if (sched->ibs_nthreads > 0) {
+                       CDEBUG(D_NET, "scheduler[%d] has no threads. selected scheduler[%d]\n",
+                                       cpt, sched->ibs_cpt);
+                       return sched;
+               }
+       }
+
+       return NULL; /* no scheduler has any threads */
+}
+
 kib_conn_t *
 kiblnd_create_conn(kib_peer_ni_t *peer_ni, struct rdma_cm_id *cmid,
                   int state, int version)
@@ -730,9 +760,18 @@ kiblnd_create_conn(kib_peer_ni_t *peer_ni, struct rdma_cm_id *cmid,
        dev = net->ibn_dev;
 
        cpt = lnet_cpt_of_nid(peer_ni->ibp_nid, peer_ni->ibp_ni);
-       sched = kiblnd_data.kib_scheds[cpt];
+       sched = kiblnd_get_scheduler(cpt);
 
-       LASSERT(sched->ibs_nthreads > 0);
+       if (sched == NULL) {
+               CERROR("no schedulers available. node is unhealthy\n");
+               goto failed_0;
+       }
+
+       /*
+        * The cpt might have changed if we ended up selecting a non cpt
+        * native scheduler. So use the scheduler's cpt instead.
+        */
+       cpt = sched->ibs_cpt;
 
        LIBCFS_CPT_ALLOC(init_qp_attr, lnet_cpt_table(), cpt,
                         sizeof(*init_qp_attr));
@@ -840,16 +879,16 @@ kiblnd_create_conn(kib_peer_ni_t *peer_ni, struct rdma_cm_id *cmid,
                goto failed_2;
        }
 
-        init_qp_attr->event_handler = kiblnd_qp_event;
-        init_qp_attr->qp_context = conn;
+       init_qp_attr->event_handler = kiblnd_qp_event;
+       init_qp_attr->qp_context = conn;
        init_qp_attr->cap.max_send_wr = IBLND_SEND_WRS(conn);
        init_qp_attr->cap.max_recv_wr = IBLND_RECV_WRS(conn);
-        init_qp_attr->cap.max_send_sge = 1;
-        init_qp_attr->cap.max_recv_sge = 1;
-        init_qp_attr->sq_sig_type = IB_SIGNAL_REQ_WR;
-        init_qp_attr->qp_type = IB_QPT_RC;
-        init_qp_attr->send_cq = cq;
-        init_qp_attr->recv_cq = cq;
+       init_qp_attr->cap.max_send_sge = *kiblnd_tunables.kib_wrq_sge;
+       init_qp_attr->cap.max_recv_sge = 1;
+       init_qp_attr->sq_sig_type = IB_SIGNAL_REQ_WR;
+       init_qp_attr->qp_type = IB_QPT_RC;
+       init_qp_attr->send_cq = cq;
+       init_qp_attr->recv_cq = cq;
 
        conn->ibc_sched = sched;
 
@@ -862,9 +901,12 @@ kiblnd_create_conn(kib_peer_ni_t *peer_ni, struct rdma_cm_id *cmid,
        } while (rc);
 
        if (rc) {
-               CERROR("Can't create QP: %d, send_wr: %d, recv_wr: %d\n",
-                       rc, init_qp_attr->cap.max_send_wr,
-                       init_qp_attr->cap.max_recv_wr);
+               CERROR("Can't create QP: %d, send_wr: %d, recv_wr: %d, "
+                      "send_sge: %d, recv_sge: %d\n",
+                      rc, init_qp_attr->cap.max_send_wr,
+                      init_qp_attr->cap.max_recv_wr,
+                      init_qp_attr->cap.max_send_sge,
+                      init_qp_attr->cap.max_recv_sge);
                goto failed_2;
        }
 
@@ -1123,15 +1165,15 @@ kiblnd_ctl(struct lnet_ni *ni, unsigned int cmd, void *arg)
                         break;
                 }
 
-                LASSERT (conn->ibc_cmid != NULL);
-                data->ioc_nid = conn->ibc_peer->ibp_nid;
-                if (conn->ibc_cmid->route.path_rec == NULL)
-                        data->ioc_u32[0] = 0; /* iWarp has no path MTU */
-                else
-                        data->ioc_u32[0] =
-                        ib_mtu_enum_to_int(conn->ibc_cmid->route.path_rec->mtu);
-                kiblnd_conn_decref(conn);
-                break;
+               LASSERT(conn->ibc_cmid != NULL);
+               data->ioc_nid = conn->ibc_peer->ibp_nid;
+               if (conn->ibc_cmid->route.path_rec == NULL)
+                       data->ioc_u32[0] = 0; /* iWarp has no path MTU */
+               else
+                       data->ioc_u32[0] =
+                       ib_mtu_enum_to_int(conn->ibc_cmid->route.path_rec->mtu);
+               kiblnd_conn_decref(conn);
+               break;
         }
         case IOC_LIBCFS_CLOSE_CONNECTION: {
                 rc = kiblnd_close_matching_conns(ni, data->ioc_nid);
@@ -1388,6 +1430,7 @@ kiblnd_map_tx_pool(kib_tx_pool_t *tpo)
        }
 }
 
+#ifdef HAVE_IB_GET_DMA_MR
 struct ib_mr *
 kiblnd_find_rd_dma_mr(struct lnet_ni *ni, kib_rdma_desc_t *rd,
                      int negotiated_nfrags)
@@ -1409,6 +1452,7 @@ kiblnd_find_rd_dma_mr(struct lnet_ni *ni, kib_rdma_desc_t *rd,
 
        return hdev->ibh_mrs;
 }
+#endif
 
 static void
 kiblnd_destroy_fmr_pool(kib_fmr_pool_t *fpo)
@@ -1944,7 +1988,7 @@ again:
                                return 0;
                        }
                        spin_unlock(&fps->fps_lock);
-                       rc = -EBUSY;
+                       rc = -EAGAIN;
                }
 
                spin_lock(&fps->fps_lock);
@@ -2228,7 +2272,8 @@ kiblnd_destroy_tx_pool(kib_pool_t *pool)
                 goto out;
 
         for (i = 0; i < pool->po_size; i++) {
-                kib_tx_t *tx = &tpo->tpo_tx_descs[i];
+               kib_tx_t *tx = &tpo->tpo_tx_descs[i];
+               int       wrq_sge = *kiblnd_tunables.kib_wrq_sge;
 
                list_del(&tx->tx_list);
                 if (tx->tx_pages != NULL)
@@ -2243,10 +2288,10 @@ kiblnd_destroy_tx_pool(kib_pool_t *pool)
                         LIBCFS_FREE(tx->tx_wrq,
                                     (1 + IBLND_MAX_RDMA_FRAGS) *
                                     sizeof(*tx->tx_wrq));
-                if (tx->tx_sge != NULL)
-                        LIBCFS_FREE(tx->tx_sge,
-                                    (1 + IBLND_MAX_RDMA_FRAGS) *
-                                    sizeof(*tx->tx_sge));
+               if (tx->tx_sge != NULL)
+                       LIBCFS_FREE(tx->tx_sge,
+                                   (1 + IBLND_MAX_RDMA_FRAGS) * wrq_sge *
+                                   sizeof(*tx->tx_sge));
                 if (tx->tx_rd != NULL)
                         LIBCFS_FREE(tx->tx_rd,
                                     offsetof(kib_rdma_desc_t,
@@ -2304,7 +2349,8 @@ kiblnd_create_tx_pool(kib_poolset_t *ps, int size, kib_pool_t **pp_po)
         memset(tpo->tpo_tx_descs, 0, size * sizeof(kib_tx_t));
 
         for (i = 0; i < size; i++) {
-                kib_tx_t *tx = &tpo->tpo_tx_descs[i];
+               kib_tx_t *tx = &tpo->tpo_tx_descs[i];
+               int       wrq_sge = *kiblnd_tunables.kib_wrq_sge;
 
                 tx->tx_pool = tpo;
                if (ps->ps_net->ibn_fmr_ps != NULL) {
@@ -2330,7 +2376,7 @@ kiblnd_create_tx_pool(kib_poolset_t *ps, int size, kib_pool_t **pp_po)
                        break;
 
                LIBCFS_CPT_ALLOC(tx->tx_sge, lnet_cpt_table(), ps->ps_cpt,
-                                (1 + IBLND_MAX_RDMA_FRAGS) *
+                                (1 + IBLND_MAX_RDMA_FRAGS) * wrq_sge *
                                 sizeof(*tx->tx_sge));
                if (tx->tx_sge == NULL)
                        break;
@@ -2398,13 +2444,16 @@ kiblnd_net_init_pools(kib_net_t *net, struct lnet_ni *ni, __u32 *cpts,
                      int ncpts)
 {
        struct lnet_ioctl_config_o2iblnd_tunables *tunables;
+#ifdef HAVE_IB_GET_DMA_MR
        unsigned long   flags;
+#endif
        int             cpt;
        int             rc;
        int             i;
 
        tunables = &ni->ni_lnd_tunables.lnd_tun_u.lnd_o2ib;
 
+#ifdef HAVE_IB_GET_DMA_MR
        read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
        if (tunables->lnd_map_on_demand == 0) {
                read_unlock_irqrestore(&kiblnd_data.kib_global_lock,
@@ -2413,6 +2462,7 @@ kiblnd_net_init_pools(kib_net_t *net, struct lnet_ni *ni, __u32 *cpts,
        }
 
        read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+#endif
 
        if (tunables->lnd_fmr_pool_size < *kiblnd_tunables.kib_ntx / 4) {
                CERROR("Can't set fmr pool size (%d) < ntx / 4(%d)\n",
@@ -2451,7 +2501,9 @@ kiblnd_net_init_pools(kib_net_t *net, struct lnet_ni *ni, __u32 *cpts,
        if (i > 0)
                LASSERT(i == ncpts);
 
+#ifdef HAVE_IB_GET_DMA_MR
  create_tx_pool:
+#endif
        net->ibn_tx_ps = cfs_percpt_alloc(lnet_cpt_table(),
                                          sizeof(kib_tx_poolset_t));
        if (net->ibn_tx_ps == NULL) {
@@ -2526,6 +2578,7 @@ kiblnd_hdev_get_attr(kib_hca_dev_t *hdev)
         return -EINVAL;
 }
 
+#ifdef HAVE_IB_GET_DMA_MR
 static void
 kiblnd_hdev_cleanup_mrs(kib_hca_dev_t *hdev)
 {
@@ -2536,11 +2589,14 @@ kiblnd_hdev_cleanup_mrs(kib_hca_dev_t *hdev)
 
        hdev->ibh_mrs = NULL;
 }
+#endif
 
 void
 kiblnd_hdev_destroy(kib_hca_dev_t *hdev)
 {
+#ifdef HAVE_IB_GET_DMA_MR
         kiblnd_hdev_cleanup_mrs(hdev);
+#endif
 
         if (hdev->ibh_pd != NULL)
                 ib_dealloc_pd(hdev->ibh_pd);
@@ -2551,6 +2607,7 @@ kiblnd_hdev_destroy(kib_hca_dev_t *hdev)
         LIBCFS_FREE(hdev, sizeof(*hdev));
 }
 
+#ifdef HAVE_IB_GET_DMA_MR
 static int
 kiblnd_hdev_setup_mrs(kib_hca_dev_t *hdev)
 {
@@ -2574,6 +2631,7 @@ kiblnd_hdev_setup_mrs(kib_hca_dev_t *hdev)
 
        return 0;
 }
+#endif
 
 static int
 kiblnd_dummy_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event)
@@ -2710,12 +2768,16 @@ kiblnd_dev_failover(kib_dev_t *dev)
         hdev->ibh_cmid  = cmid;
         hdev->ibh_ibdev = cmid->device;
 
-        pd = ib_alloc_pd(cmid->device);
-        if (IS_ERR(pd)) {
-                rc = PTR_ERR(pd);
-                CERROR("Can't allocate PD: %d\n", rc);
-                goto out;
-        }
+#ifdef HAVE_IB_ALLOC_PD_2ARGS
+       pd = ib_alloc_pd(cmid->device, 0);
+#else
+       pd = ib_alloc_pd(cmid->device);
+#endif
+       if (IS_ERR(pd)) {
+               rc = PTR_ERR(pd);
+               CERROR("Can't allocate PD: %d\n", rc);
+               goto out;
+       }
 
         hdev->ibh_pd = pd;
 
@@ -2725,11 +2787,19 @@ kiblnd_dev_failover(kib_dev_t *dev)
                 goto out;
         }
 
-        rc = kiblnd_hdev_setup_mrs(hdev);
-        if (rc != 0) {
-                CERROR("Can't setup device: %d\n", rc);
-                goto out;
-        }
+#ifdef HAVE_IB_GET_DMA_MR
+       rc = kiblnd_hdev_setup_mrs(hdev);
+       if (rc != 0) {
+               CERROR("Can't setup device: %d\n", rc);
+               goto out;
+       }
+#else
+       rc = kiblnd_hdev_get_attr(hdev);
+       if (rc != 0) {
+               CERROR("Can't get device attributes: %d\n", rc);
+               goto out;
+       }
+#endif
 
        write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
 
@@ -3202,19 +3272,19 @@ kiblnd_startup(struct lnet_ni *ni)
 
        kiblnd_tunables_setup(ni);
 
-        if (ni->ni_interfaces[0] != NULL) {
-                /* Use the IPoIB interface specified in 'networks=' */
+       if (ni->ni_interfaces[0] != NULL) {
+               /* Use the IPoIB interface specified in 'networks=' */
 
-                CLASSERT (LNET_MAX_INTERFACES > 1);
-                if (ni->ni_interfaces[1] != NULL) {
-                        CERROR("Multiple interfaces not supported\n");
-                        goto failed;
-                }
+               CLASSERT(LNET_NUM_INTERFACES > 1);
+               if (ni->ni_interfaces[1] != NULL) {
+                       CERROR("Multiple interfaces not supported\n");
+                       goto failed;
+               }
 
-                ifname = ni->ni_interfaces[0];
-        } else {
-                ifname = *kiblnd_tunables.kib_default_ipif;
-        }
+               ifname = ni->ni_interfaces[0];
+       } else {
+               ifname = *kiblnd_tunables.kib_default_ipif;
+       }
 
         if (strlen(ifname) >= sizeof(ibdev->ibd_ifname)) {
                 CERROR("IPoIB interface name too long: %s\n", ifname);