Whamcloud - gitweb
LU-9448 lnet: handle empty CPTs
[fs/lustre-release.git] / lnet / klnds / o2iblnd / o2iblnd.c
index a5e7543..12dade5 100644 (file)
@@ -698,6 +698,36 @@ kiblnd_get_completion_vector(kib_conn_t *conn, int cpt)
        return 1;
 }
 
+/*
+ * Return the scheduler bound to @cpt if it has threads. A scheduler
+ * with no threads means its CPT has no CPUs (e.g. a NUMA node that is
+ * configured with no associated CPUs); in that case return the first
+ * scheduler found in kiblnd_data.kib_scheds that does have threads.
+ *
+ * Returns NULL if no scheduler has any threads.
+ */
+static struct kib_sched_info *
+kiblnd_get_scheduler(int cpt)
+{
+       struct kib_sched_info *sched;
+       int i;
+
+       sched = kiblnd_data.kib_scheds[cpt];
+
+       if (sched->ibs_nthreads > 0)
+               return sched;
+
+       cfs_percpt_for_each(sched, i, kiblnd_data.kib_scheds) {
+               if (sched->ibs_nthreads > 0) {
+                       CDEBUG(D_NET, "scheduler[%d] has no threads. selected scheduler[%d]\n",
+                                       cpt, sched->ibs_cpt);
+                       return sched;
+               }
+       }
+
+       return NULL;
+}
+
 kib_conn_t *
 kiblnd_create_conn(kib_peer_ni_t *peer_ni, struct rdma_cm_id *cmid,
                   int state, int version)
@@ -730,9 +760,18 @@ kiblnd_create_conn(kib_peer_ni_t *peer_ni, struct rdma_cm_id *cmid,
        dev = net->ibn_dev;
 
        cpt = lnet_cpt_of_nid(peer_ni->ibp_nid, peer_ni->ibp_ni);
-       sched = kiblnd_data.kib_scheds[cpt];
+       sched = kiblnd_get_scheduler(cpt);
+
+       if (sched == NULL) {
+               CERROR("no schedulers available. node is unhealthy\n");
+               goto failed_0;
+       }
 
-       LASSERT(sched->ibs_nthreads > 0);
+       /*
+        * The cpt might have changed if we ended up selecting a scheduler
+        * that is not native to this cpt. So use the scheduler's cpt instead.
+        */
+       cpt = sched->ibs_cpt;
 
        LIBCFS_CPT_ALLOC(init_qp_attr, lnet_cpt_table(), cpt,
                         sizeof(*init_qp_attr));
@@ -840,16 +879,16 @@ kiblnd_create_conn(kib_peer_ni_t *peer_ni, struct rdma_cm_id *cmid,
                goto failed_2;
        }
 
-        init_qp_attr->event_handler = kiblnd_qp_event;
-        init_qp_attr->qp_context = conn;
+       init_qp_attr->event_handler = kiblnd_qp_event;
+       init_qp_attr->qp_context = conn;
        init_qp_attr->cap.max_send_wr = IBLND_SEND_WRS(conn);
        init_qp_attr->cap.max_recv_wr = IBLND_RECV_WRS(conn);
-        init_qp_attr->cap.max_send_sge = 1;
-        init_qp_attr->cap.max_recv_sge = 1;
-        init_qp_attr->sq_sig_type = IB_SIGNAL_REQ_WR;
-        init_qp_attr->qp_type = IB_QPT_RC;
-        init_qp_attr->send_cq = cq;
-        init_qp_attr->recv_cq = cq;
+       init_qp_attr->cap.max_send_sge = *kiblnd_tunables.kib_wrq_sge;
+       init_qp_attr->cap.max_recv_sge = 1;
+       init_qp_attr->sq_sig_type = IB_SIGNAL_REQ_WR;
+       init_qp_attr->qp_type = IB_QPT_RC;
+       init_qp_attr->send_cq = cq;
+       init_qp_attr->recv_cq = cq;
 
        conn->ibc_sched = sched;
 
@@ -862,9 +901,12 @@ kiblnd_create_conn(kib_peer_ni_t *peer_ni, struct rdma_cm_id *cmid,
        } while (rc);
 
        if (rc) {
-               CERROR("Can't create QP: %d, send_wr: %d, recv_wr: %d\n",
-                       rc, init_qp_attr->cap.max_send_wr,
-                       init_qp_attr->cap.max_recv_wr);
+               CERROR("Can't create QP: %d, send_wr: %d, recv_wr: %d, "
+                      "send_sge: %d, recv_sge: %d\n",
+                      rc, init_qp_attr->cap.max_send_wr,
+                      init_qp_attr->cap.max_recv_wr,
+                      init_qp_attr->cap.max_send_sge,
+                      init_qp_attr->cap.max_recv_sge);
                goto failed_2;
        }
 
@@ -1123,15 +1165,15 @@ kiblnd_ctl(struct lnet_ni *ni, unsigned int cmd, void *arg)
                         break;
                 }
 
-                LASSERT (conn->ibc_cmid != NULL);
-                data->ioc_nid = conn->ibc_peer->ibp_nid;
-                if (conn->ibc_cmid->route.path_rec == NULL)
-                        data->ioc_u32[0] = 0; /* iWarp has no path MTU */
-                else
-                        data->ioc_u32[0] =
-                        ib_mtu_enum_to_int(conn->ibc_cmid->route.path_rec->mtu);
-                kiblnd_conn_decref(conn);
-                break;
+               LASSERT(conn->ibc_cmid != NULL);
+               data->ioc_nid = conn->ibc_peer->ibp_nid;
+               if (conn->ibc_cmid->route.path_rec == NULL)
+                       data->ioc_u32[0] = 0; /* iWarp has no path MTU */
+               else
+                       data->ioc_u32[0] =
+                       ib_mtu_enum_to_int(conn->ibc_cmid->route.path_rec->mtu);
+               kiblnd_conn_decref(conn);
+               break;
         }
         case IOC_LIBCFS_CLOSE_CONNECTION: {
                 rc = kiblnd_close_matching_conns(ni, data->ioc_nid);
@@ -1946,7 +1988,7 @@ again:
                                return 0;
                        }
                        spin_unlock(&fps->fps_lock);
-                       rc = -EBUSY;
+                       rc = -EAGAIN;
                }
 
                spin_lock(&fps->fps_lock);
@@ -2230,7 +2272,8 @@ kiblnd_destroy_tx_pool(kib_pool_t *pool)
                 goto out;
 
         for (i = 0; i < pool->po_size; i++) {
-                kib_tx_t *tx = &tpo->tpo_tx_descs[i];
+               kib_tx_t *tx = &tpo->tpo_tx_descs[i];
+               int       wrq_sge = *kiblnd_tunables.kib_wrq_sge;
 
                list_del(&tx->tx_list);
                 if (tx->tx_pages != NULL)
@@ -2245,10 +2288,10 @@ kiblnd_destroy_tx_pool(kib_pool_t *pool)
                         LIBCFS_FREE(tx->tx_wrq,
                                     (1 + IBLND_MAX_RDMA_FRAGS) *
                                     sizeof(*tx->tx_wrq));
-                if (tx->tx_sge != NULL)
-                        LIBCFS_FREE(tx->tx_sge,
-                                    (1 + IBLND_MAX_RDMA_FRAGS) *
-                                    sizeof(*tx->tx_sge));
+               if (tx->tx_sge != NULL)
+                       LIBCFS_FREE(tx->tx_sge,
+                                   (1 + IBLND_MAX_RDMA_FRAGS) * wrq_sge *
+                                   sizeof(*tx->tx_sge));
                 if (tx->tx_rd != NULL)
                         LIBCFS_FREE(tx->tx_rd,
                                     offsetof(kib_rdma_desc_t,
@@ -2306,7 +2349,8 @@ kiblnd_create_tx_pool(kib_poolset_t *ps, int size, kib_pool_t **pp_po)
         memset(tpo->tpo_tx_descs, 0, size * sizeof(kib_tx_t));
 
         for (i = 0; i < size; i++) {
-                kib_tx_t *tx = &tpo->tpo_tx_descs[i];
+               kib_tx_t *tx = &tpo->tpo_tx_descs[i];
+               int       wrq_sge = *kiblnd_tunables.kib_wrq_sge;
 
                 tx->tx_pool = tpo;
                if (ps->ps_net->ibn_fmr_ps != NULL) {
@@ -2332,7 +2376,7 @@ kiblnd_create_tx_pool(kib_poolset_t *ps, int size, kib_pool_t **pp_po)
                        break;
 
                LIBCFS_CPT_ALLOC(tx->tx_sge, lnet_cpt_table(), ps->ps_cpt,
-                                (1 + IBLND_MAX_RDMA_FRAGS) *
+                                (1 + IBLND_MAX_RDMA_FRAGS) * wrq_sge *
                                 sizeof(*tx->tx_sge));
                if (tx->tx_sge == NULL)
                        break;
@@ -2724,10 +2768,10 @@ kiblnd_dev_failover(kib_dev_t *dev)
         hdev->ibh_cmid  = cmid;
         hdev->ibh_ibdev = cmid->device;
 
-#ifdef HAVE_IB_GET_DMA_MR
-       pd = ib_alloc_pd(cmid->device);
-#else
+#ifdef HAVE_IB_ALLOC_PD_2ARGS
        pd = ib_alloc_pd(cmid->device, 0);
+#else
+       pd = ib_alloc_pd(cmid->device);
 #endif
        if (IS_ERR(pd)) {
                rc = PTR_ERR(pd);
@@ -3228,19 +3272,19 @@ kiblnd_startup(struct lnet_ni *ni)
 
        kiblnd_tunables_setup(ni);
 
-        if (ni->ni_interfaces[0] != NULL) {
-                /* Use the IPoIB interface specified in 'networks=' */
+       if (ni->ni_interfaces[0] != NULL) {
+               /* Use the IPoIB interface specified in 'networks=' */
 
-                CLASSERT (LNET_MAX_INTERFACES > 1);
-                if (ni->ni_interfaces[1] != NULL) {
-                        CERROR("Multiple interfaces not supported\n");
-                        goto failed;
-                }
+               CLASSERT(LNET_NUM_INTERFACES > 1);
+               if (ni->ni_interfaces[1] != NULL) {
+                       CERROR("Multiple interfaces not supported\n");
+                       goto failed;
+               }
 
-                ifname = ni->ni_interfaces[0];
-        } else {
-                ifname = *kiblnd_tunables.kib_default_ipif;
-        }
+               ifname = ni->ni_interfaces[0];
+       } else {
+               ifname = *kiblnd_tunables.kib_default_ipif;
+       }
 
         if (strlen(ifname) >= sizeof(ibdev->ibd_ifname)) {
                 CERROR("IPoIB interface name too long: %s\n", ifname);