LU-56 ksocklnd: CPT affinity socklnd
[fs/lustre-release.git] lnet/klnds/socklnd/socklnd.c
index acf2494..c26de82 100644
@@ -662,37 +662,26 @@ ksocknal_get_conn_by_idx (lnet_ni_t *ni, int index)
 }
 
 ksock_sched_t *
-ksocknal_choose_scheduler_locked (unsigned int irq)
+ksocknal_choose_scheduler_locked(unsigned int cpt)
 {
-        ksock_sched_t    *sched;
-        ksock_irqinfo_t  *info;
-        int               i;
-
-        LASSERT (irq < CFS_NR_IRQS);
-        info = &ksocknal_data.ksnd_irqinfo[irq];
-
-        if (irq != 0 &&                         /* hardware NIC */
-            info->ksni_valid) {                 /* already set up */
-                return (&ksocknal_data.ksnd_schedulers[info->ksni_sched]);
-        }
-
-        /* software NIC (irq == 0) || not associated with a scheduler yet.
-         * Choose the CPU with the fewest connections... */
-        sched = &ksocknal_data.ksnd_schedulers[0];
-        for (i = 1; i < ksocknal_data.ksnd_nschedulers; i++)
-                if (sched->kss_nconns >
-                    ksocknal_data.ksnd_schedulers[i].kss_nconns)
-                        sched = &ksocknal_data.ksnd_schedulers[i];
-
-        if (irq != 0) {                         /* Hardware NIC */
-                info->ksni_valid = 1;
-                info->ksni_sched = (unsigned int)(sched - ksocknal_data.ksnd_schedulers);
-
-                /* no overflow... */
-                LASSERT (info->ksni_sched == (unsigned int)(sched - ksocknal_data.ksnd_schedulers));
-        }
-
-        return (sched);
+       struct ksock_sched_info *info = ksocknal_data.ksnd_sched_info[cpt];
+       ksock_sched_t           *sched;
+       int                     i;
+
+       LASSERT(info->ksi_nthreads > 0);
+
+       sched = &info->ksi_scheds[0];
+       /*
+        * NB: this is safe for now, but info->ksi_nthreads could change
+        * at runtime once dynamic LNet configuration is supported, at
+        * which point this will need to be revisited.
+        */
+       for (i = 1; i < info->ksi_nthreads; i++) {
+               if (sched->kss_nconns > info->ksi_scheds[i].kss_nconns)
+                       sched = &info->ksi_scheds[i];
+       }
+
+       return sched;
 }
 
 int
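
For context: a rough sketch (an assumption, not the authoritative header) of the per-CPT bookkeeping that the rewritten ksocknal_choose_scheduler_locked() above relies on; the real definitions live in socklnd.h and may carry additional fields:

struct ksock_sched_info {
	int		ksi_nthreads_max;	/* max schedulers for this CPT */
	int		ksi_nthreads;		/* schedulers started so far */
	int		ksi_cpt;		/* owning CPU partition */
	ksock_sched_t	*ksi_scheds;		/* array[ksi_nthreads_max] */
};
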
@@ -1040,7 +1029,7 @@ ksocknal_create_conn (lnet_ni_t *ni, ksock_route_t *route,
         ksock_peer_t      *peer2;
         ksock_sched_t     *sched;
         ksock_hello_msg_t *hello;
-        unsigned int       irq;
+       int                cpt;
         ksock_tx_t        *tx;
         ksock_tx_t        *txtmp;
         int                rc;
@@ -1051,8 +1040,6 @@ ksocknal_create_conn (lnet_ni_t *ni, ksock_route_t *route,
 
         LASSERT (active == (type != SOCKLND_CONN_NONE));
 
-        irq = ksocknal_lib_sock_irq (sock);
-
         LIBCFS_ALLOC(conn, sizeof(*conn));
         if (conn == NULL) {
                 rc = -ENOMEM;
@@ -1138,6 +1125,8 @@ ksocknal_create_conn (lnet_ni_t *ni, ksock_route_t *route,
         LASSERT (conn->ksnc_proto != NULL);
         LASSERT (peerid.nid != LNET_NID_ANY);
 
+       cpt = lnet_cpt_of_nid(peerid.nid);
+
         if (active) {
                 ksocknal_peer_addref(peer);
                 cfs_write_lock_bh (global_lock);
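
To illustrate the new cpt = lnet_cpt_of_nid(peerid.nid) step above: the peer NID is mapped onto one of the configured CPU partitions, so every connection to a given peer lands on the same CPT. A minimal sketch of that idea, assuming a trivial modulo hash (LNet's real implementation may differ):

/* Illustration only: map a NID onto one of `ncpts` CPU partitions.
 * The property the code above relies on is simply that the same NID
 * always yields the same CPT. */
static int
example_cpt_of_nid(__u64 nid, int ncpts)
{
	return (int)(nid % ncpts);
}
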
@@ -1273,7 +1262,7 @@ ksocknal_create_conn (lnet_ni_t *ni, ksock_route_t *route,
         peer->ksnp_send_keepalive = 0;
         peer->ksnp_error = 0;
 
-        sched = ksocknal_choose_scheduler_locked (irq);
+       sched = ksocknal_choose_scheduler_locked(cpt);
         sched->kss_nconns++;
         conn->ksnc_scheduler = sched;
 
@@ -1309,14 +1298,12 @@ ksocknal_create_conn (lnet_ni_t *ni, ksock_route_t *route,
          *        socket callbacks.
          */
 
-        ksocknal_lib_bind_irq (irq);
-
-        CDEBUG(D_NET, "New conn %s p %d.x %u.%u.%u.%u -> %u.%u.%u.%u/%d"
-               " incarnation:"LPD64" sched[%d]/%d\n",
-               libcfs_id2str(peerid), conn->ksnc_proto->pro_version,
-               HIPQUAD(conn->ksnc_myipaddr), HIPQUAD(conn->ksnc_ipaddr),
-               conn->ksnc_port, incarnation,
-               (int)(conn->ksnc_scheduler - ksocknal_data.ksnd_schedulers), irq);
+       CDEBUG(D_NET, "New conn %s p %d.x %u.%u.%u.%u -> %u.%u.%u.%u/%d"
+              " incarnation:"LPD64" sched[%d:%d]\n",
+              libcfs_id2str(peerid), conn->ksnc_proto->pro_version,
+              HIPQUAD(conn->ksnc_myipaddr), HIPQUAD(conn->ksnc_ipaddr),
+              conn->ksnc_port, incarnation, cpt,
+              (int)(sched - &sched->kss_info->ksi_scheds[0]));
 
         if (active) {
                 /* additional routes after interface exchange? */
@@ -2196,8 +2183,7 @@ ksocknal_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg)
                 data->ioc_u32[1] = conn->ksnc_port;
                 data->ioc_u32[2] = conn->ksnc_myipaddr;
                 data->ioc_u32[3] = conn->ksnc_type;
-                data->ioc_u32[4] = (__u32)(conn->ksnc_scheduler -
-                                   ksocknal_data.ksnd_schedulers);
+               data->ioc_u32[4] = conn->ksnc_scheduler->kss_info->ksi_cpt;
                 data->ioc_u32[5] = rxmem;
                 data->ioc_u32[6] = conn->ksnc_peer->ksnp_id.pid;
                 ksocknal_conn_decref(conn);
@@ -2236,9 +2222,19 @@ ksocknal_free_buffers (void)
 {
         LASSERT (cfs_atomic_read(&ksocknal_data.ksnd_nactive_txs) == 0);
 
-        if (ksocknal_data.ksnd_schedulers != NULL)
-                LIBCFS_FREE (ksocknal_data.ksnd_schedulers,
-                             sizeof (ksock_sched_t) * ksocknal_data.ksnd_nschedulers);
+       if (ksocknal_data.ksnd_sched_info != NULL) {
+               struct ksock_sched_info *info;
+               int                     i;
+
+               cfs_percpt_for_each(info, i, ksocknal_data.ksnd_sched_info) {
+                       if (info->ksi_scheds != NULL) {
+                               LIBCFS_FREE(info->ksi_scheds,
+                                           info->ksi_nthreads_max *
+                                           sizeof(info->ksi_scheds[0]));
+                       }
+               }
+               cfs_percpt_free(ksocknal_data.ksnd_sched_info);
+       }
 
         LIBCFS_FREE (ksocknal_data.ksnd_peers,
                      sizeof (cfs_list_t) *
@@ -2265,10 +2261,12 @@ ksocknal_free_buffers (void)
 }
 
 void
-ksocknal_base_shutdown (void)
+ksocknal_base_shutdown(void)
 {
-        ksock_sched_t *sched;
-        int            i;
+       struct ksock_sched_info *info;
+       ksock_sched_t           *sched;
+       int                     i;
+       int                     j;
 
         CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n",
                cfs_atomic_read (&libcfs_kmemory));
@@ -2284,33 +2282,50 @@ ksocknal_base_shutdown (void)
                 for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) {
                         LASSERT (cfs_list_empty (&ksocknal_data.ksnd_peers[i]));
                 }
+
+               LASSERT(cfs_list_empty(&ksocknal_data.ksnd_nets));
                 LASSERT (cfs_list_empty (&ksocknal_data.ksnd_enomem_conns));
                 LASSERT (cfs_list_empty (&ksocknal_data.ksnd_zombie_conns));
                 LASSERT (cfs_list_empty (&ksocknal_data.ksnd_connd_connreqs));
                 LASSERT (cfs_list_empty (&ksocknal_data.ksnd_connd_routes));
 
-                if (ksocknal_data.ksnd_schedulers != NULL)
-                        for (i = 0; i < ksocknal_data.ksnd_nschedulers; i++) {
-                                ksock_sched_t *kss =
-                                        &ksocknal_data.ksnd_schedulers[i];
-
-                                LASSERT (cfs_list_empty (&kss->kss_tx_conns));
-                                LASSERT (cfs_list_empty (&kss->kss_rx_conns));
-                                LASSERT (cfs_list_empty (&kss-> \
-                                                         kss_zombie_noop_txs));
-                                LASSERT (kss->kss_nconns == 0);
-                        }
-
-                /* flag threads to terminate; wake and wait for them to die */
-                ksocknal_data.ksnd_shuttingdown = 1;
-                cfs_waitq_broadcast (&ksocknal_data.ksnd_connd_waitq);
-                cfs_waitq_broadcast (&ksocknal_data.ksnd_reaper_waitq);
-
-                if (ksocknal_data.ksnd_schedulers != NULL)
-                        for (i = 0; i < ksocknal_data.ksnd_nschedulers; i++) {
-                                sched = &ksocknal_data.ksnd_schedulers[i];
-                                cfs_waitq_broadcast(&sched->kss_waitq);
-                        }
+               if (ksocknal_data.ksnd_sched_info != NULL) {
+                       cfs_percpt_for_each(info, i,
+                                           ksocknal_data.ksnd_sched_info) {
+                               if (info->ksi_scheds == NULL)
+                                       continue;
+
+                               for (j = 0; j < info->ksi_nthreads_max; j++) {
+                                       sched = &info->ksi_scheds[j];
+                                       LASSERT(cfs_list_empty(
+                                               &sched->kss_tx_conns));
+                                       LASSERT(cfs_list_empty(
+                                               &sched->kss_rx_conns));
+                                       LASSERT(cfs_list_empty(
+                                               &sched->kss_zombie_noop_txs));
+                                       LASSERT(sched->kss_nconns == 0);
+                               }
+                       }
+               }
+
+               /* flag threads to terminate; wake and wait for them to die */
+               ksocknal_data.ksnd_shuttingdown = 1;
+               cfs_waitq_broadcast(&ksocknal_data.ksnd_connd_waitq);
+               cfs_waitq_broadcast(&ksocknal_data.ksnd_reaper_waitq);
+
+               if (ksocknal_data.ksnd_sched_info != NULL) {
+                       cfs_percpt_for_each(info, i,
+                                           ksocknal_data.ksnd_sched_info) {
+                               if (info->ksi_scheds == NULL)
+                                       continue;
+
+                               for (j = 0; j < info->ksi_nthreads_max; j++) {
+                                       sched = &info->ksi_scheds[j];
+                                       cfs_waitq_broadcast(&sched->kss_waitq);
+                               }
+                       }
+               }
 
                 i = 4;
                 cfs_read_lock (&ksocknal_data.ksnd_global_lock);
@@ -2353,10 +2368,11 @@ ksocknal_new_incarnation (void)
 }
 
 int
-ksocknal_base_startup (void)
+ksocknal_base_startup(void)
 {
-        int               rc;
-        int               i;
+       struct ksock_sched_info *info;
+       int                     rc;
+       int                     i;
 
         LASSERT (ksocknal_data.ksnd_init == SOCKNAL_INIT_NOTHING);
         LASSERT (ksocknal_data.ksnd_nnets == 0);
@@ -2374,6 +2390,7 @@ ksocknal_base_startup (void)
                 CFS_INIT_LIST_HEAD(&ksocknal_data.ksnd_peers[i]);
 
         cfs_rwlock_init(&ksocknal_data.ksnd_global_lock);
+       CFS_INIT_LIST_HEAD(&ksocknal_data.ksnd_nets);
 
         cfs_spin_lock_init (&ksocknal_data.ksnd_reaper_lock);
         CFS_INIT_LIST_HEAD (&ksocknal_data.ksnd_enomem_conns);
@@ -2389,37 +2406,48 @@ ksocknal_base_startup (void)
         cfs_spin_lock_init (&ksocknal_data.ksnd_tx_lock);
         CFS_INIT_LIST_HEAD (&ksocknal_data.ksnd_idle_noop_txs);
 
-        /* NB memset above zeros whole of ksocknal_data, including
-         * ksocknal_data.ksnd_irqinfo[all].ksni_valid */
+       /* NB memset above zeros whole of ksocknal_data */
 
-        /* flag lists/ptrs/locks initialised */
-        ksocknal_data.ksnd_init = SOCKNAL_INIT_DATA;
-        PORTAL_MODULE_USE;
+       /* flag lists/ptrs/locks initialised */
+       ksocknal_data.ksnd_init = SOCKNAL_INIT_DATA;
+       PORTAL_MODULE_USE;
 
-        ksocknal_data.ksnd_nschedulers = ksocknal_nsched();
-        LIBCFS_ALLOC(ksocknal_data.ksnd_schedulers,
-                     sizeof(ksock_sched_t) * ksocknal_data.ksnd_nschedulers);
-        if (ksocknal_data.ksnd_schedulers == NULL)
-                goto failed;
+       ksocknal_data.ksnd_sched_info = cfs_percpt_alloc(lnet_cpt_table(),
+                                                        sizeof(*info));
+       if (ksocknal_data.ksnd_sched_info == NULL)
+               goto failed;
 
-        for (i = 0; i < ksocknal_data.ksnd_nschedulers; i++) {
-                ksock_sched_t *kss = &ksocknal_data.ksnd_schedulers[i];
+       cfs_percpt_for_each(info, i, ksocknal_data.ksnd_sched_info) {
+               ksock_sched_t   *sched;
+               int             nthrs;
 
-                cfs_spin_lock_init (&kss->kss_lock);
-                CFS_INIT_LIST_HEAD (&kss->kss_rx_conns);
-                CFS_INIT_LIST_HEAD (&kss->kss_tx_conns);
-                CFS_INIT_LIST_HEAD (&kss->kss_zombie_noop_txs);
-                cfs_waitq_init (&kss->kss_waitq);
-        }
+               nthrs = cfs_cpt_weight(lnet_cpt_table(), i);
+               if (*ksocknal_tunables.ksnd_nscheds > 0) {
+                       nthrs = min(nthrs, *ksocknal_tunables.ksnd_nscheds);
+               } else {
+                       /* cap at half of the CPUs in this partition; assume
+                        * the other half is reserved for upper-layer modules */
+                       nthrs = min(max(SOCKNAL_NSCHEDS, nthrs >> 1), nthrs);
+               }
 
-        for (i = 0; i < ksocknal_data.ksnd_nschedulers; i++) {
-                rc = ksocknal_thread_start (ksocknal_scheduler,
-                                            &ksocknal_data.ksnd_schedulers[i]);
-                if (rc != 0) {
-                        CERROR("Can't spawn socknal scheduler[%d]: %d\n",
-                               i, rc);
-                        goto failed;
-                }
+               info->ksi_nthreads_max = nthrs;
+               info->ksi_cpt = i;
+
+               LIBCFS_CPT_ALLOC(info->ksi_scheds, lnet_cpt_table(), i,
+                                info->ksi_nthreads_max * sizeof(*sched));
+               if (info->ksi_scheds == NULL)
+                       goto failed;
+
+               for (; nthrs > 0; nthrs--) {
+                       sched = &info->ksi_scheds[nthrs - 1];
+
+                       sched->kss_info = info;
+                       cfs_spin_lock_init(&sched->kss_lock);
+                       CFS_INIT_LIST_HEAD(&sched->kss_rx_conns);
+                       CFS_INIT_LIST_HEAD(&sched->kss_tx_conns);
+                       CFS_INIT_LIST_HEAD(&sched->kss_zombie_noop_txs);
+                       cfs_waitq_init(&sched->kss_waitq);
+               }
         }
 
         ksocknal_data.ksnd_connd_starting         = 0;
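
A worked example of the scheduler-count heuristic above, assuming SOCKNAL_NSCHEDS is the small constant from socklnd.h (e.g. 3): an 8-core CPT gets min(max(3, 8 >> 1), 8) = 4 scheduler slots, while a 2-core CPT gets min(max(3, 2 >> 1), 2) = 2, i.e. roughly half the cores but never more than the CPT actually has. As a sketch:

/* Mirror of the default sizing above, for illustration only. */
static int
example_default_nscheds(int ncpus_in_cpt)
{
	return min(max(SOCKNAL_NSCHEDS, ncpus_in_cpt >> 1), ncpus_in_cpt);
}
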
@@ -2565,7 +2593,8 @@ ksocknal_shutdown (lnet_ni_t *ni)
                 LASSERT (net->ksnn_interfaces[i].ksni_nroutes == 0);
         }
 
-        LIBCFS_FREE(net, sizeof(*net));
+       cfs_list_del(&net->ksnn_list);
+       LIBCFS_FREE(net, sizeof(*net));
 
         ksocknal_data.ksnd_nnets--;
         if (ksocknal_data.ksnd_nnets == 0)
@@ -2616,6 +2645,8 @@ ksocknal_enumerate_interfaces(ksock_net_t *net)
 
                 net->ksnn_interfaces[j].ksni_ipaddr = ip;
                 net->ksnn_interfaces[j].ksni_netmask = mask;
+               strncpy(&net->ksnn_interfaces[j].ksni_name[0],
+                       names[i], IFNAMSIZ);
                 j++;
         }
 
@@ -2628,6 +2659,114 @@ ksocknal_enumerate_interfaces(ksock_net_t *net)
 }
 
 int
+ksocknal_search_new_ipif(ksock_net_t *net)
+{
+       int     new_ipif = 0;
+       int     i;
+
+       for (i = 0; i < net->ksnn_ninterfaces; i++) {
+               char            *ifnam = &net->ksnn_interfaces[i].ksni_name[0];
+               char            *colon = strchr(ifnam, ':');
+               int             found  = 0;
+               ksock_net_t     *tmp;
+               int             j;
+
+               if (colon != NULL) /* ignore alias device */
+                       *colon = 0;
+
+               cfs_list_for_each_entry(tmp, &ksocknal_data.ksnd_nets,
+                                       ksnn_list) {
+                       for (j = 0; !found && j < tmp->ksnn_ninterfaces; j++) {
+                               char *ifnam2 = &tmp->ksnn_interfaces[j].\
+                                            ksni_name[0];
+                               char *colon2 = strchr(ifnam2, ':');
+
+                               if (colon2 != NULL)
+                                       *colon2 = 0;
+
+                               found = strcmp(ifnam, ifnam2) == 0;
+                               if (colon2 != NULL)
+                                       *colon2 = ':';
+                       }
+                       if (found)
+                               break;
+               }
+
+               new_ipif += !found;
+               if (colon != NULL)
+                       *colon = ':';
+       }
+
+       return new_ipif;
+}
+
+int
+ksocknal_start_schedulers(struct ksock_sched_info *info)
+{
+       int     nthrs;
+       int     rc = 0;
+       int     i;
+
+       if (info->ksi_nthreads == 0) {
+               if (*ksocknal_tunables.ksnd_nscheds > 0) {
+                       nthrs = info->ksi_nthreads_max;
+               } else {
+                       nthrs = cfs_cpt_weight(lnet_cpt_table(),
+                                              info->ksi_cpt);
+                       nthrs = min(max(SOCKNAL_NSCHEDS, nthrs >> 1), nthrs);
+                       nthrs = min(SOCKNAL_NSCHEDS_HIGH, nthrs);
+               }
+               nthrs = min(nthrs, info->ksi_nthreads_max);
+       } else {
+               LASSERT(info->ksi_nthreads <= info->ksi_nthreads_max);
+               /* start up to two more threads when a new interface appears */
+               nthrs = min(2, info->ksi_nthreads_max - info->ksi_nthreads);
+       }
+
+       for (i = 0; i < nthrs; i++) {
+               long    id;
+
+               id = KSOCK_THREAD_ID(info->ksi_cpt, info->ksi_nthreads + i);
+               rc = ksocknal_thread_start(ksocknal_scheduler, (void *)id);
+               if (rc == 0)
+                       continue;
+
+               CERROR("Can't spawn thread %d for scheduler[%d]: %d\n",
+                      info->ksi_cpt, info->ksi_nthreads + i, rc);
+               break;
+       }
+
+       info->ksi_nthreads += i;
+       return rc;
+}
+
+int
+ksocknal_net_start_threads(ksock_net_t *net, __u32 *cpts, int ncpts)
+{
+       int     newif = ksocknal_search_new_ipif(net);
+       int     rc;
+       int     i;
+
+       LASSERT(ncpts > 0 && ncpts <= cfs_cpt_number(lnet_cpt_table()));
+
+       for (i = 0; i < ncpts; i++) {
+               struct ksock_sched_info *info;
+               int cpt = (cpts == NULL) ? i : cpts[i];
+
+               LASSERT(cpt < cfs_cpt_number(lnet_cpt_table()));
+               info = ksocknal_data.ksnd_sched_info[cpt];
+
+               if (!newif && info->ksi_nthreads > 0)
+                       continue;
+
+               rc = ksocknal_start_schedulers(info);
+               if (rc != 0)
+                       return rc;
+       }
+       return 0;
+}
+
+int
 ksocknal_startup (lnet_ni_t *ni)
 {
         ksock_net_t  *net;
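
The scheduler threads started by ksocknal_start_schedulers() above receive a single long that packs the CPT and the per-CPT thread index via KSOCK_THREAD_ID(). That macro is defined in socklnd.h; the sketch below shows the assumed encoding (high bits for the CPT, low bits for the scheduler index) using hypothetical EXAMPLE_* names:

/* Assumed packing, for illustration; see KSOCK_THREAD_ID() in socklnd.h
 * for the real definition. */
#define EXAMPLE_CPT_SHIFT		16
#define EXAMPLE_THREAD_ID(cpt, sid)	(((cpt) << EXAMPLE_CPT_SHIFT) | (sid))
#define EXAMPLE_THREAD_CPT(id)		((id) >> EXAMPLE_CPT_SHIFT)
#define EXAMPLE_THREAD_SID(id)		((id) & ((1UL << EXAMPLE_CPT_SHIFT) - 1))
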
@@ -2646,7 +2785,6 @@ ksocknal_startup (lnet_ni_t *ni)
         if (net == NULL)
                 goto fail_0;
 
-        memset(net, 0, sizeof(*net));
         cfs_spin_lock_init(&net->ksnn_lock);
         net->ksnn_incarnation = ksocknal_new_incarnation();
         ni->ni_data = net;
@@ -2684,12 +2822,21 @@ ksocknal_startup (lnet_ni_t *ni)
                                        ni->ni_interfaces[i]);
                                 goto fail_1;
                         }
-                }
-                net->ksnn_ninterfaces = i;
-        }
 
-        ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid),
-                                net->ksnn_interfaces[0].ksni_ipaddr);
+                       strncpy(&net->ksnn_interfaces[i].ksni_name[0],
+                               ni->ni_interfaces[i], IFNAMSIZ);
+               }
+               net->ksnn_ninterfaces = i;
+       }
+
+       /* call this before adding the net to ksocknal_data.ksnd_nets */
+       rc = ksocknal_net_start_threads(net, ni->ni_cpts, ni->ni_ncpts);
+       if (rc != 0)
+               goto fail_1;
+
+       ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid),
+                               net->ksnn_interfaces[0].ksni_ipaddr);
+       cfs_list_add(&net->ksnn_list, &ksocknal_data.ksnd_nets);
 
         ksocknal_data.ksnd_nnets++;