Whamcloud - gitweb
LU-12621 o2iblnd: cache max_qp_wr
[fs/lustre-release.git] / lnet / klnds / o2iblnd / o2iblnd.c
index 29e82d7..0d7b50a 100644 (file)
@@ -570,9 +570,9 @@ kiblnd_get_conn_by_idx(struct lnet_ni *ni, int index)
 static void
 kiblnd_debug_rx(struct kib_rx *rx)
 {
-        CDEBUG(D_CONSOLE, "      %p status %d msg_type %x cred %d\n",
-               rx, rx->rx_status, rx->rx_msg->ibm_type,
-               rx->rx_msg->ibm_credits);
+       CDEBUG(D_CONSOLE, "      %p msg_type %x cred %d\n",
+              rx, rx->rx_msg->ibm_type,
+              rx->rx_msg->ibm_credits);
 }
 
 static void
@@ -674,7 +674,7 @@ kiblnd_setup_mtu_locked(struct rdma_cm_id *cmid)
 static int
 kiblnd_get_completion_vector(struct kib_conn *conn, int cpt)
 {
-       cpumask_t       *mask;
+       cpumask_var_t   *mask;
        int             vectors;
        int             off;
        int             i;
@@ -688,8 +688,8 @@ kiblnd_get_completion_vector(struct kib_conn *conn, int cpt)
 
        /* hash NID to CPU id in this partition... */
        ibp_nid = conn->ibc_peer->ibp_nid;
-       off = do_div(ibp_nid, cpumask_weight(mask));
-       for_each_cpu(i, mask) {
+       off = do_div(ibp_nid, cpumask_weight(*mask));
+       for_each_cpu(i, *mask) {
                if (off-- == 0)
                        return i % vectors;
        }
@@ -734,16 +734,28 @@ static unsigned int kiblnd_send_wrs(struct kib_conn *conn)
         * One WR for the LNet message
         * And ibc_max_frags for the transfer WRs
         */
-       unsigned int ret = 1 + conn->ibc_max_frags;
+       int ret;
+       int multiplier = 1 + conn->ibc_max_frags;
        enum kib_dev_caps dev_caps = conn->ibc_hdev->ibh_dev->ibd_dev_caps;
 
        /* FastReg needs two extra WRs for map and invalidate */
        if (dev_caps & IBLND_DEV_CAPS_FASTREG_ENABLED)
-               ret += 2;
+               multiplier += 2;
 
        /* account for a maximum of ibc_queue_depth in-flight transfers */
-       ret *= conn->ibc_queue_depth;
-       return ret;
+       ret = multiplier * conn->ibc_queue_depth;
+
+       if (ret > conn->ibc_hdev->ibh_max_qp_wr) {
+               CDEBUG(D_NET, "peer_credits %u will result in send work "
+                      "request size %d larger than maximum %d device "
+                      "can handle\n", conn->ibc_queue_depth, ret,
+                      conn->ibc_hdev->ibh_max_qp_wr);
+               conn->ibc_queue_depth =
+                       conn->ibc_hdev->ibh_max_qp_wr / multiplier;
+       }
+
+       /* don't go beyond the maximum the device can handle */
+       return min(ret, conn->ibc_hdev->ibh_max_qp_wr);
 }
 
 struct kib_conn *
@@ -900,20 +912,14 @@ kiblnd_create_conn(struct kib_peer_ni *peer_ni, struct rdma_cm_id *cmid,
        init_qp_attr->qp_type = IB_QPT_RC;
        init_qp_attr->send_cq = cq;
        init_qp_attr->recv_cq = cq;
+       /*
+        * kiblnd_send_wrs() can change the connection's queue depth if
+        * the maximum work requests for the device is maxed out
+        */
+       init_qp_attr->cap.max_send_wr = kiblnd_send_wrs(conn);
+       init_qp_attr->cap.max_recv_wr = IBLND_RECV_WRS(conn);
 
-       conn->ibc_sched = sched;
-
-       do {
-               init_qp_attr->cap.max_send_wr = kiblnd_send_wrs(conn);
-               init_qp_attr->cap.max_recv_wr = IBLND_RECV_WRS(conn);
-
-               rc = rdma_create_qp(cmid, conn->ibc_hdev->ibh_pd, init_qp_attr);
-               if (!rc || conn->ibc_queue_depth < 2)
-                       break;
-
-               conn->ibc_queue_depth--;
-       } while (rc);
-
+       rc = rdma_create_qp(cmid, conn->ibc_hdev->ibh_pd, init_qp_attr);
        if (rc) {
                CERROR("Can't create QP: %d, send_wr: %d, recv_wr: %d, "
                       "send_sge: %d, recv_sge: %d\n",
@@ -924,6 +930,8 @@ kiblnd_create_conn(struct kib_peer_ni *peer_ni, struct rdma_cm_id *cmid,
                goto failed_2;
        }
 
+       conn->ibc_sched = sched;
+
        if (conn->ibc_queue_depth != peer_ni->ibp_queue_depth)
                CWARN("peer %s - queue depth reduced from %u to %u"
                      "  to allow for qp creation\n",
@@ -1863,8 +1871,8 @@ again:
                                tx_pages_mapped = 1;
                        }
 
-                       pfmr = ib_fmr_pool_map_phys(fpo->fmr.fpo_fmr_pool,
-                                                   pages, npages, iov);
+                       pfmr = kib_fmr_pool_map(fpo->fmr.fpo_fmr_pool,
+                                               pages, npages, iov);
                        if (likely(!IS_ERR(pfmr))) {
                                fmr->fmr_key  = is_rx ? pfmr->fmr->rkey
                                                      : pfmr->fmr->lkey;
@@ -2562,6 +2570,7 @@ kiblnd_hdev_get_attr(struct kib_hca_dev *hdev)
 #endif
 
        hdev->ibh_mr_size = dev_attr->max_mr_size;
+       hdev->ibh_max_qp_wr = dev_attr->max_qp_wr;
 
        /* Setup device Memory Registration capabilities */
 #ifdef HAVE_IB_DEVICE_OPS
@@ -2590,9 +2599,7 @@ kiblnd_hdev_get_attr(struct kib_hca_dev *hdev)
                rc = -ENOSYS;
        }
 
-       if (rc == 0 && hdev->ibh_mr_size == ~0ULL)
-               hdev->ibh_mr_shift = 64;
-       else if (rc != 0)
+       if (rc != 0)
                rc = -EINVAL;
 
 #ifndef HAVE_IB_DEVICE_ATTRS
@@ -2665,7 +2672,7 @@ kiblnd_dummy_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event)
 }
 
 static int
-kiblnd_dev_need_failover(struct kib_dev *dev)
+kiblnd_dev_need_failover(struct kib_dev *dev, struct net *ns)
 {
         struct rdma_cm_id  *cmid;
         struct sockaddr_in  srcaddr;
@@ -2687,8 +2694,8 @@ kiblnd_dev_need_failover(struct kib_dev *dev)
          *
          * a. rdma_bind_addr(), it will conflict with listener cmid
          * b. rdma_resolve_addr() to zero addr */
-        cmid = kiblnd_rdma_create_id(kiblnd_dummy_callback, dev, RDMA_PS_TCP,
-                                     IB_QPT_RC);
+       cmid = kiblnd_rdma_create_id(ns, kiblnd_dummy_callback, dev,
+                                    RDMA_PS_TCP, IB_QPT_RC);
         if (IS_ERR(cmid)) {
                 rc = PTR_ERR(cmid);
                 CERROR("Failed to create cmid for failover: %d\n", rc);
@@ -2717,7 +2724,7 @@ kiblnd_dev_need_failover(struct kib_dev *dev)
 }
 
 int
-kiblnd_dev_failover(struct kib_dev *dev)
+kiblnd_dev_failover(struct kib_dev *dev, struct net *ns)
 {
        struct list_head    zombie_tpo = LIST_HEAD_INIT(zombie_tpo);
        struct list_head    zombie_ppo = LIST_HEAD_INIT(zombie_ppo);
@@ -2736,7 +2743,7 @@ kiblnd_dev_failover(struct kib_dev *dev)
                  dev->ibd_can_failover ||
                  dev->ibd_hdev == NULL);
 
-        rc = kiblnd_dev_need_failover(dev);
+       rc = kiblnd_dev_need_failover(dev, ns);
         if (rc <= 0)
                 goto out;
 
@@ -2757,8 +2764,8 @@ kiblnd_dev_failover(struct kib_dev *dev)
                 rdma_destroy_id(cmid);
         }
 
-        cmid = kiblnd_rdma_create_id(kiblnd_cm_callback, dev, RDMA_PS_TCP,
-                                     IB_QPT_RC);
+       cmid = kiblnd_rdma_create_id(ns, kiblnd_cm_callback, dev, RDMA_PS_TCP,
+                                    IB_QPT_RC);
         if (IS_ERR(cmid)) {
                 rc = PTR_ERR(cmid);
                 CERROR("Failed to create cmid for failover: %d\n", rc);
@@ -2877,80 +2884,6 @@ kiblnd_destroy_dev(struct kib_dev *dev)
         LIBCFS_FREE(dev, sizeof(*dev));
 }
 
-static struct kib_dev *
-kiblnd_create_dev(char *ifname)
-{
-       struct net_device *netdev;
-       struct kib_dev *dev = NULL;
-       int flags;
-       int rc;
-
-       rtnl_lock();
-       for_each_netdev(&init_net, netdev) {
-               struct in_device *in_dev;
-
-               if (strcmp(netdev->name, "lo") == 0) /* skip the loopback IF */
-                       continue;
-
-               flags = dev_get_flags(netdev);
-               if (!(flags & IFF_UP)) {
-                       CWARN("Can't query IPoIB interface %s: it's down\n",
-                             netdev->name);
-                       continue;
-               }
-
-               in_dev = __in_dev_get_rtnl(netdev);
-               if (!in_dev) {
-                       CWARN("Interface %s has no IPv4 status.\n",
-                             netdev->name);
-                       continue;
-               }
-
-               for_ifa(in_dev)
-                       if (strcmp(ifname, ifa->ifa_label) == 0) {
-                               LIBCFS_ALLOC(dev, sizeof(*dev));
-                               if (!dev)
-                                       goto unlock;
-
-                               dev->ibd_can_failover = !!(flags & IFF_MASTER);
-                               dev->ibd_ifip = ntohl(ifa->ifa_local);
-
-                               INIT_LIST_HEAD(&dev->ibd_nets);
-                               INIT_LIST_HEAD(&dev->ibd_list); /* not yet in kib_devs */
-                               INIT_LIST_HEAD(&dev->ibd_fail_list);
-                               break;
-                       }
-               endfor_ifa(in_dev);
-       }
-       rtnl_unlock();
-
-       if (!dev) {
-               CERROR("Can't find any usable interfaces\n");
-               return NULL;
-       }
-
-       if (dev->ibd_ifip == 0) {
-               CERROR("Can't initialize device: no IP address\n");
-               goto free_dev;
-       }
-       strcpy(&dev->ibd_ifname[0], ifname);
-
-       /* initialize the device */
-       rc = kiblnd_dev_failover(dev);
-       if (rc != 0) {
-               CERROR("Can't initialize device: %d\n", rc);
-               goto free_dev;
-       }
-
-       list_add_tail(&dev->ibd_list, &kiblnd_data.kib_devs);
-       return dev;
-unlock:
-       rtnl_unlock();
-free_dev:
-       LIBCFS_FREE(dev, sizeof(*dev));
-       return NULL;
-}
-
 static void
 kiblnd_base_shutdown(void)
 {
@@ -3098,7 +3031,7 @@ out:
 }
 
 static int
-kiblnd_base_startup(void)
+kiblnd_base_startup(struct net *ns)
 {
        struct kib_sched_info   *sched;
        int                     rc;
@@ -3171,7 +3104,7 @@ kiblnd_base_startup(void)
         }
 
        if (*kiblnd_tunables.kib_dev_failover != 0)
-               rc = kiblnd_thread_start(kiblnd_failover_thread, NULL,
+               rc = kiblnd_thread_start(kiblnd_failover_thread, ns,
                                         "kiblnd_failover");
 
         if (rc != 0) {
@@ -3231,8 +3164,7 @@ kiblnd_start_schedulers(struct kib_sched_info *sched)
        return rc;
 }
 
-static int
-kiblnd_dev_start_threads(struct kib_dev *dev, int newdev, u32 *cpts, int ncpts)
+static int kiblnd_dev_start_threads(struct kib_dev *dev, u32 *cpts, int ncpts)
 {
        int     cpt;
        int     rc;
@@ -3244,7 +3176,7 @@ kiblnd_dev_start_threads(struct kib_dev *dev, int newdev, u32 *cpts, int ncpts)
                cpt = (cpts == NULL) ? i : cpts[i];
                sched = kiblnd_data.kib_scheds[cpt];
 
-               if (!newdev && sched->ibs_nthreads > 0)
+               if (sched->ibs_nthreads > 0)
                        continue;
 
                rc = kiblnd_start_schedulers(kiblnd_data.kib_scheds[cpt]);
@@ -3257,54 +3189,21 @@ kiblnd_dev_start_threads(struct kib_dev *dev, int newdev, u32 *cpts, int ncpts)
        return 0;
 }
 
-static struct kib_dev *
-kiblnd_dev_search(char *ifname)
-{
-       struct kib_dev *alias = NULL;
-       struct kib_dev *dev;
-       char            *colon;
-       char            *colon2;
-
-       colon = strchr(ifname, ':');
-       list_for_each_entry(dev, &kiblnd_data.kib_devs, ibd_list) {
-               if (strcmp(&dev->ibd_ifname[0], ifname) == 0)
-                       return dev;
-
-               if (alias != NULL)
-                       continue;
-
-               colon2 = strchr(dev->ibd_ifname, ':');
-               if (colon != NULL)
-                       *colon = 0;
-               if (colon2 != NULL)
-                       *colon2 = 0;
-
-               if (strcmp(&dev->ibd_ifname[0], ifname) == 0)
-                       alias = dev;
-
-               if (colon != NULL)
-                       *colon = ':';
-               if (colon2 != NULL)
-                       *colon2 = ':';
-       }
-       return alias;
-}
-
 static int
 kiblnd_startup(struct lnet_ni *ni)
 {
         char                     *ifname;
+       struct lnet_inetdev *ifaces = NULL;
        struct kib_dev *ibdev = NULL;
        struct kib_net *net;
         unsigned long             flags;
         int                       rc;
-       int                       newdev;
-       int                       node_id;
+       int i;
 
         LASSERT (ni->ni_net->net_lnd == &the_o2iblnd);
 
         if (kiblnd_data.kib_init == IBLND_INIT_NOTHING) {
-                rc = kiblnd_base_startup();
+               rc = kiblnd_base_startup(ni->ni_net_ns);
                 if (rc != 0)
                         return rc;
         }
@@ -3325,10 +3224,8 @@ kiblnd_startup(struct lnet_ni *ni)
         */
        if (ni->ni_interfaces[0] != NULL) {
                /* Use the IPoIB interface specified in 'networks=' */
-
-               CLASSERT(LNET_INTERFACES_NUM > 1);
                if (ni->ni_interfaces[1] != NULL) {
-                       CERROR("Multiple interfaces not supported\n");
+                       CERROR("ko2iblnd: Multiple interfaces not supported\n");
                        goto failed;
                }
 
@@ -3342,24 +3239,51 @@ kiblnd_startup(struct lnet_ni *ni)
                 goto failed;
         }
 
-       ibdev = kiblnd_dev_search(ifname);
+       rc = lnet_inet_enumerate(&ifaces, ni->ni_net_ns);
+       if (rc < 0)
+               goto failed;
+
+       for (i = 0; i < rc; i++) {
+               if (strcmp(ifname, ifaces[i].li_name) == 0)
+                       break;
+       }
+
+       if (i == rc) {
+               CERROR("ko2iblnd: No matching interfaces\n");
+               rc = -ENOENT;
+               goto failed;
+       }
+
+       LIBCFS_ALLOC(ibdev, sizeof(*ibdev));
+       if (!ibdev) {
+               rc = -ENOMEM;
+               goto failed;
+       }
+
+       ibdev->ibd_ifip = ifaces[i].li_ipaddr;
+       strlcpy(ibdev->ibd_ifname, ifaces[i].li_name,
+               sizeof(ibdev->ibd_ifname));
+       ibdev->ibd_can_failover = !!(ifaces[i].li_flags & IFF_MASTER);
 
-       newdev = ibdev == NULL;
-       /* hmm...create kib_dev even for alias */
-       if (ibdev == NULL || strcmp(&ibdev->ibd_ifname[0], ifname) != 0)
-               ibdev = kiblnd_create_dev(ifname);
+       INIT_LIST_HEAD(&ibdev->ibd_nets);
+       INIT_LIST_HEAD(&ibdev->ibd_list); /* not yet in kib_devs */
+       INIT_LIST_HEAD(&ibdev->ibd_fail_list);
 
-       if (ibdev == NULL)
+       /* initialize the device */
+       rc = kiblnd_dev_failover(ibdev, ni->ni_net_ns);
+       if (rc) {
+               CERROR("ko2iblnd: Can't initialize device: rc = %d\n", rc);
                goto failed;
+       }
 
-       node_id = dev_to_node(ibdev->ibd_hdev->ibh_ibdev->dma_device);
-       ni->ni_dev_cpt = cfs_cpt_of_node(lnet_cpt_table(), node_id);
+       list_add_tail(&ibdev->ibd_list, &kiblnd_data.kib_devs);
 
        net->ibn_dev = ibdev;
        ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), ibdev->ibd_ifip);
 
-       rc = kiblnd_dev_start_threads(ibdev, newdev,
-                                     ni->ni_cpts, ni->ni_ncpts);
+       ni->ni_dev_cpt = ifaces[i].li_cpt;
+
+       rc = kiblnd_dev_start_threads(ibdev, ni->ni_cpts, ni->ni_ncpts);
        if (rc != 0)
                goto failed;
 
@@ -3382,6 +3306,7 @@ failed:
        if (net != NULL && net->ibn_dev == NULL && ibdev != NULL)
                 kiblnd_destroy_dev(ibdev);
 
+       kfree(ifaces);
         kiblnd_shutdown(ni);
 
         CDEBUG(D_NET, "kiblnd_startup failed\n");