LU-56 ksocklnd: CPT affinity socklnd
author		Liang Zhen <liang@whamcloud.com>
		Fri, 11 May 2012 02:51:58 +0000 (10:51 +0800)
committer	Oleg Drokin <green@whamcloud.com>
		Tue, 3 Jul 2012 15:53:49 +0000 (11:53 -0400)
This patch covers a few things:
- implement per-CPT scheduler threads for socklnd
- decrease the overall number of threads on machines with many cores
- create more threads only if there is more than one NIC
- remove the IRQ affinity implementation from socklnd
  IRQ affinity is not very helpful because CPUs on modern machines
  are very powerful. Users can still set up IRQ affinity via
  /proc and the cpu_pattern option of libcfs.
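
To summarize the new scheduling model, the stand-alone sketch below condenses
the two small calculations the patch adds: packing a CPT number and a per-CPT
scheduler index into a single thread id (the KSOCK_THREAD_* macros added to
socklnd.h), and the default per-CPT scheduler-thread count computed in
ksocknal_base_startup(). It is only an approximation for illustration: the
kernel code takes the CPU count from cfs_cpt_weight(lnet_cpt_table(), cpt) and
honours the new ksnd_nscheds tunable, both of which are replaced here by plain
integers.

	/*
	 * Stand-alone sketch of the thread-id packing and the per-CPT
	 * scheduler-thread sizing introduced by this patch.  The macro
	 * definitions mirror the ones added to socklnd.h; the sizing
	 * helper mirrors the estimate in ksocknal_base_startup().
	 */
	#include <stdio.h>

	#define SOCKNAL_NSCHEDS			3	/* one thread per connection type */

	#define KSOCK_CPT_SHIFT			16
	#define KSOCK_THREAD_ID(cpt, sid)	(((cpt) << KSOCK_CPT_SHIFT) | (sid))
	#define KSOCK_THREAD_CPT(id)		((id) >> KSOCK_CPT_SHIFT)
	#define KSOCK_THREAD_SID(id)		((id) & ((1UL << KSOCK_CPT_SHIFT) - 1))

	#define MIN(a, b)			((a) < (b) ? (a) : (b))
	#define MAX(a, b)			((a) > (b) ? (a) : (b))

	/* Default scheduler-thread count for a CPT: at most half of its CPUs
	 * (the other half is left for upper-layer modules), but never fewer
	 * than SOCKNAL_NSCHEDS and never more threads than CPUs. */
	static int sched_threads_for_cpt(int ncpus)
	{
		return MIN(MAX(SOCKNAL_NSCHEDS, ncpus >> 1), ncpus);
	}

	int main(void)
	{
		long id = KSOCK_THREAD_ID(2L, 5L);	/* scheduler 5 on CPT 2 */

		printf("id=%#lx cpt=%ld sid=%ld\n", id,
		       KSOCK_THREAD_CPT(id), (long)KSOCK_THREAD_SID(id));
		printf("16-CPU CPT -> %d scheduler threads\n",
		       sched_threads_for_cpt(16));	/* prints 8 */
		printf("2-CPU CPT  -> %d scheduler threads\n",
		       sched_threads_for_cpt(2));	/* prints 2 */
		return 0;
	}

With KSOCK_CPT_SHIFT set to 16, a single long id can describe up to 65536 CPTs
and 65536 scheduler threads per CPT, far beyond anything LNet configures in
practice.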

Signed-off-by: Liang Zhen <liang@whamcloud.com>
Change-Id: Idfa19037a529fe96cb1432cbd7f55a5dfac89d29
Reviewed-on: http://review.whamcloud.com/2718
Reviewed-by: Lai Siyao <laisiyao@whamcloud.com>
Reviewed-by: Doug Oucharek <doug@whamcloud.com>
Tested-by: Hudson
Tested-by: Maloo <whamcloud.maloo@gmail.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lnet/klnds/socklnd/socklnd.c
lnet/klnds/socklnd/socklnd.h
lnet/klnds/socklnd/socklnd_cb.c
lnet/klnds/socklnd/socklnd_lib-darwin.c
lnet/klnds/socklnd/socklnd_lib-darwin.h
lnet/klnds/socklnd/socklnd_lib-linux.c
lnet/klnds/socklnd/socklnd_lib-linux.h
lnet/klnds/socklnd/socklnd_modparams.c

lnet/klnds/socklnd/socklnd.c
index acf2494..c26de82 100644
@@ -662,37 +662,26 @@ ksocknal_get_conn_by_idx (lnet_ni_t *ni, int index)
 }
 
 ksock_sched_t *
-ksocknal_choose_scheduler_locked (unsigned int irq)
+ksocknal_choose_scheduler_locked(unsigned int cpt)
 {
-        ksock_sched_t    *sched;
-        ksock_irqinfo_t  *info;
-        int               i;
-
-        LASSERT (irq < CFS_NR_IRQS);
-        info = &ksocknal_data.ksnd_irqinfo[irq];
-
-        if (irq != 0 &&                         /* hardware NIC */
-            info->ksni_valid) {                 /* already set up */
-                return (&ksocknal_data.ksnd_schedulers[info->ksni_sched]);
-        }
-
-        /* software NIC (irq == 0) || not associated with a scheduler yet.
-         * Choose the CPU with the fewest connections... */
-        sched = &ksocknal_data.ksnd_schedulers[0];
-        for (i = 1; i < ksocknal_data.ksnd_nschedulers; i++)
-                if (sched->kss_nconns >
-                    ksocknal_data.ksnd_schedulers[i].kss_nconns)
-                        sched = &ksocknal_data.ksnd_schedulers[i];
-
-        if (irq != 0) {                         /* Hardware NIC */
-                info->ksni_valid = 1;
-                info->ksni_sched = (unsigned int)(sched - ksocknal_data.ksnd_schedulers);
-
-                /* no overflow... */
-                LASSERT (info->ksni_sched == (unsigned int)(sched - ksocknal_data.ksnd_schedulers));
-        }
-
-        return (sched);
+       struct ksock_sched_info *info = ksocknal_data.ksnd_sched_info[cpt];
+       ksock_sched_t           *sched;
+       int                     i;
+
+       LASSERT(info->ksi_nthreads > 0);
+
+       sched = &info->ksi_scheds[0];
+       /*
+        * NB: it's safe so far, but info->ksi_nthreads could be changed
+        * at runtime when we have dynamic LNet configuration, then we
+        * need to take care of this.
+        */
+       for (i = 1; i < info->ksi_nthreads; i++) {
+               if (sched->kss_nconns > info->ksi_scheds[i].kss_nconns)
+                       sched = &info->ksi_scheds[i];
+       }
+
+       return sched;
 }
 
 int
@@ -1040,7 +1029,7 @@ ksocknal_create_conn (lnet_ni_t *ni, ksock_route_t *route,
         ksock_peer_t      *peer2;
         ksock_sched_t     *sched;
         ksock_hello_msg_t *hello;
-        unsigned int       irq;
+       int                cpt;
         ksock_tx_t        *tx;
         ksock_tx_t        *txtmp;
         int                rc;
@@ -1051,8 +1040,6 @@ ksocknal_create_conn (lnet_ni_t *ni, ksock_route_t *route,
 
         LASSERT (active == (type != SOCKLND_CONN_NONE));
 
-        irq = ksocknal_lib_sock_irq (sock);
-
         LIBCFS_ALLOC(conn, sizeof(*conn));
         if (conn == NULL) {
                 rc = -ENOMEM;
@@ -1138,6 +1125,8 @@ ksocknal_create_conn (lnet_ni_t *ni, ksock_route_t *route,
         LASSERT (conn->ksnc_proto != NULL);
         LASSERT (peerid.nid != LNET_NID_ANY);
 
+       cpt = lnet_cpt_of_nid(peerid.nid);
+
         if (active) {
                 ksocknal_peer_addref(peer);
                 cfs_write_lock_bh (global_lock);
@@ -1273,7 +1262,7 @@ ksocknal_create_conn (lnet_ni_t *ni, ksock_route_t *route,
         peer->ksnp_send_keepalive = 0;
         peer->ksnp_error = 0;
 
-        sched = ksocknal_choose_scheduler_locked (irq);
+       sched = ksocknal_choose_scheduler_locked(cpt);
         sched->kss_nconns++;
         conn->ksnc_scheduler = sched;
 
@@ -1309,14 +1298,12 @@ ksocknal_create_conn (lnet_ni_t *ni, ksock_route_t *route,
          *        socket callbacks.
          */
 
-        ksocknal_lib_bind_irq (irq);
-
-        CDEBUG(D_NET, "New conn %s p %d.x %u.%u.%u.%u -> %u.%u.%u.%u/%d"
-               " incarnation:"LPD64" sched[%d]/%d\n",
-               libcfs_id2str(peerid), conn->ksnc_proto->pro_version,
-               HIPQUAD(conn->ksnc_myipaddr), HIPQUAD(conn->ksnc_ipaddr),
-               conn->ksnc_port, incarnation,
-               (int)(conn->ksnc_scheduler - ksocknal_data.ksnd_schedulers), irq);
+       CDEBUG(D_NET, "New conn %s p %d.x %u.%u.%u.%u -> %u.%u.%u.%u/%d"
+              " incarnation:"LPD64" sched[%d:%d]\n",
+              libcfs_id2str(peerid), conn->ksnc_proto->pro_version,
+              HIPQUAD(conn->ksnc_myipaddr), HIPQUAD(conn->ksnc_ipaddr),
+              conn->ksnc_port, incarnation, cpt,
+              (int)(sched - &sched->kss_info->ksi_scheds[0]));
 
         if (active) {
                 /* additional routes after interface exchange? */
@@ -2196,8 +2183,7 @@ ksocknal_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg)
                 data->ioc_u32[1] = conn->ksnc_port;
                 data->ioc_u32[2] = conn->ksnc_myipaddr;
                 data->ioc_u32[3] = conn->ksnc_type;
-                data->ioc_u32[4] = (__u32)(conn->ksnc_scheduler -
-                                   ksocknal_data.ksnd_schedulers);
+               data->ioc_u32[4] = conn->ksnc_scheduler->kss_info->ksi_cpt;
                 data->ioc_u32[5] = rxmem;
                 data->ioc_u32[6] = conn->ksnc_peer->ksnp_id.pid;
                 ksocknal_conn_decref(conn);
@@ -2236,9 +2222,19 @@ ksocknal_free_buffers (void)
 {
         LASSERT (cfs_atomic_read(&ksocknal_data.ksnd_nactive_txs) == 0);
 
-        if (ksocknal_data.ksnd_schedulers != NULL)
-                LIBCFS_FREE (ksocknal_data.ksnd_schedulers,
-                             sizeof (ksock_sched_t) * ksocknal_data.ksnd_nschedulers);
+       if (ksocknal_data.ksnd_sched_info != NULL) {
+               struct ksock_sched_info *info;
+               int                     i;
+
+               cfs_percpt_for_each(info, i, ksocknal_data.ksnd_sched_info) {
+                       if (info->ksi_scheds != NULL) {
+                               LIBCFS_FREE(info->ksi_scheds,
+                                           info->ksi_nthreads_max *
+                                           sizeof(info->ksi_scheds[0]));
+                       }
+               }
+               cfs_percpt_free(ksocknal_data.ksnd_sched_info);
+       }
 
         LIBCFS_FREE (ksocknal_data.ksnd_peers,
                      sizeof (cfs_list_t) *
@@ -2265,10 +2261,12 @@ ksocknal_free_buffers (void)
 }
 
 void
-ksocknal_base_shutdown (void)
+ksocknal_base_shutdown(void)
 {
-        ksock_sched_t *sched;
-        int            i;
+       struct ksock_sched_info *info;
+       ksock_sched_t           *sched;
+       int                     i;
+       int                     j;
 
         CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n",
                cfs_atomic_read (&libcfs_kmemory));
@@ -2284,33 +2282,50 @@ ksocknal_base_shutdown (void)
                 for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) {
                         LASSERT (cfs_list_empty (&ksocknal_data.ksnd_peers[i]));
                 }
+
+               LASSERT(cfs_list_empty(&ksocknal_data.ksnd_nets));
                 LASSERT (cfs_list_empty (&ksocknal_data.ksnd_enomem_conns));
                 LASSERT (cfs_list_empty (&ksocknal_data.ksnd_zombie_conns));
                 LASSERT (cfs_list_empty (&ksocknal_data.ksnd_connd_connreqs));
                 LASSERT (cfs_list_empty (&ksocknal_data.ksnd_connd_routes));
 
-                if (ksocknal_data.ksnd_schedulers != NULL)
-                        for (i = 0; i < ksocknal_data.ksnd_nschedulers; i++) {
-                                ksock_sched_t *kss =
-                                        &ksocknal_data.ksnd_schedulers[i];
-
-                                LASSERT (cfs_list_empty (&kss->kss_tx_conns));
-                                LASSERT (cfs_list_empty (&kss->kss_rx_conns));
-                                LASSERT (cfs_list_empty (&kss-> \
-                                                         kss_zombie_noop_txs));
-                                LASSERT (kss->kss_nconns == 0);
-                        }
-
-                /* flag threads to terminate; wake and wait for them to die */
-                ksocknal_data.ksnd_shuttingdown = 1;
-                cfs_waitq_broadcast (&ksocknal_data.ksnd_connd_waitq);
-                cfs_waitq_broadcast (&ksocknal_data.ksnd_reaper_waitq);
-
-                if (ksocknal_data.ksnd_schedulers != NULL)
-                        for (i = 0; i < ksocknal_data.ksnd_nschedulers; i++) {
-                                sched = &ksocknal_data.ksnd_schedulers[i];
-                                cfs_waitq_broadcast(&sched->kss_waitq);
-                        }
+               if (ksocknal_data.ksnd_sched_info != NULL) {
+                       cfs_percpt_for_each(info, i,
+                                           ksocknal_data.ksnd_sched_info) {
+                               if (info->ksi_scheds == NULL)
+                                       continue;
+
+                               for (j = 0; j < info->ksi_nthreads_max; j++) {
+
+                                       sched = &info->ksi_scheds[j];
+                                       LASSERT(cfs_list_empty(&sched->\
+                                                              kss_tx_conns));
+                                       LASSERT(cfs_list_empty(&sched->\
+                                                              kss_rx_conns));
+                                       LASSERT(cfs_list_empty(&sched-> \
+                                                 kss_zombie_noop_txs));
+                                       LASSERT(sched->kss_nconns == 0);
+                               }
+                       }
+               }
+
+               /* flag threads to terminate; wake and wait for them to die */
+               ksocknal_data.ksnd_shuttingdown = 1;
+               cfs_waitq_broadcast(&ksocknal_data.ksnd_connd_waitq);
+               cfs_waitq_broadcast(&ksocknal_data.ksnd_reaper_waitq);
+
+               if (ksocknal_data.ksnd_sched_info != NULL) {
+                       cfs_percpt_for_each(info, i,
+                                           ksocknal_data.ksnd_sched_info) {
+                               if (info->ksi_scheds == NULL)
+                                       continue;
+
+                               for (j = 0; j < info->ksi_nthreads_max; j++) {
+                                       sched = &info->ksi_scheds[j];
+                                       cfs_waitq_broadcast(&sched->kss_waitq);
+                               }
+                       }
+               }
 
                 i = 4;
                 cfs_read_lock (&ksocknal_data.ksnd_global_lock);
@@ -2353,10 +2368,11 @@ ksocknal_new_incarnation (void)
 }
 
 int
-ksocknal_base_startup (void)
+ksocknal_base_startup(void)
 {
-        int               rc;
-        int               i;
+       struct ksock_sched_info *info;
+       int                     rc;
+       int                     i;
 
         LASSERT (ksocknal_data.ksnd_init == SOCKNAL_INIT_NOTHING);
         LASSERT (ksocknal_data.ksnd_nnets == 0);
@@ -2374,6 +2390,7 @@ ksocknal_base_startup (void)
                 CFS_INIT_LIST_HEAD(&ksocknal_data.ksnd_peers[i]);
 
         cfs_rwlock_init(&ksocknal_data.ksnd_global_lock);
+       CFS_INIT_LIST_HEAD(&ksocknal_data.ksnd_nets);
 
         cfs_spin_lock_init (&ksocknal_data.ksnd_reaper_lock);
         CFS_INIT_LIST_HEAD (&ksocknal_data.ksnd_enomem_conns);
@@ -2389,37 +2406,48 @@ ksocknal_base_startup (void)
         cfs_spin_lock_init (&ksocknal_data.ksnd_tx_lock);
         CFS_INIT_LIST_HEAD (&ksocknal_data.ksnd_idle_noop_txs);
 
-        /* NB memset above zeros whole of ksocknal_data, including
-         * ksocknal_data.ksnd_irqinfo[all].ksni_valid */
+       /* NB memset above zeros whole of ksocknal_data */
 
-        /* flag lists/ptrs/locks initialised */
-        ksocknal_data.ksnd_init = SOCKNAL_INIT_DATA;
-        PORTAL_MODULE_USE;
+       /* flag lists/ptrs/locks initialised */
+       ksocknal_data.ksnd_init = SOCKNAL_INIT_DATA;
+       PORTAL_MODULE_USE;
 
-        ksocknal_data.ksnd_nschedulers = ksocknal_nsched();
-        LIBCFS_ALLOC(ksocknal_data.ksnd_schedulers,
-                     sizeof(ksock_sched_t) * ksocknal_data.ksnd_nschedulers);
-        if (ksocknal_data.ksnd_schedulers == NULL)
-                goto failed;
+       ksocknal_data.ksnd_sched_info = cfs_percpt_alloc(lnet_cpt_table(),
+                                                        sizeof(*info));
+       if (ksocknal_data.ksnd_sched_info == NULL)
+               goto failed;
 
-        for (i = 0; i < ksocknal_data.ksnd_nschedulers; i++) {
-                ksock_sched_t *kss = &ksocknal_data.ksnd_schedulers[i];
+       cfs_percpt_for_each(info, i, ksocknal_data.ksnd_sched_info) {
+               ksock_sched_t   *sched;
+               int             nthrs;
 
-                cfs_spin_lock_init (&kss->kss_lock);
-                CFS_INIT_LIST_HEAD (&kss->kss_rx_conns);
-                CFS_INIT_LIST_HEAD (&kss->kss_tx_conns);
-                CFS_INIT_LIST_HEAD (&kss->kss_zombie_noop_txs);
-                cfs_waitq_init (&kss->kss_waitq);
-        }
+               nthrs = cfs_cpt_weight(lnet_cpt_table(), i);
+               if (*ksocknal_tunables.ksnd_nscheds > 0) {
+                       nthrs = min(nthrs, *ksocknal_tunables.ksnd_nscheds);
+               } else {
+                       /* max to half of CPUs, assume another half should be
+                        * reserved for upper layer modules */
+                       nthrs = min(max(SOCKNAL_NSCHEDS, nthrs >> 1), nthrs);
+               }
 
-        for (i = 0; i < ksocknal_data.ksnd_nschedulers; i++) {
-                rc = ksocknal_thread_start (ksocknal_scheduler,
-                                            &ksocknal_data.ksnd_schedulers[i]);
-                if (rc != 0) {
-                        CERROR("Can't spawn socknal scheduler[%d]: %d\n",
-                               i, rc);
-                        goto failed;
-                }
+               info->ksi_nthreads_max = nthrs;
+               info->ksi_cpt = i;
+
+               LIBCFS_CPT_ALLOC(info->ksi_scheds, lnet_cpt_table(), i,
+                                info->ksi_nthreads_max * sizeof(*sched));
+               if (info->ksi_scheds == NULL)
+                       goto failed;
+
+               for (; nthrs > 0; nthrs--) {
+                       sched = &info->ksi_scheds[nthrs - 1];
+
+                       sched->kss_info = info;
+                       cfs_spin_lock_init(&sched->kss_lock);
+                       CFS_INIT_LIST_HEAD(&sched->kss_rx_conns);
+                       CFS_INIT_LIST_HEAD(&sched->kss_tx_conns);
+                       CFS_INIT_LIST_HEAD(&sched->kss_zombie_noop_txs);
+                       cfs_waitq_init(&sched->kss_waitq);
+               }
         }
 
         ksocknal_data.ksnd_connd_starting         = 0;
@@ -2565,7 +2593,8 @@ ksocknal_shutdown (lnet_ni_t *ni)
                 LASSERT (net->ksnn_interfaces[i].ksni_nroutes == 0);
         }
 
-        LIBCFS_FREE(net, sizeof(*net));
+       cfs_list_del(&net->ksnn_list);
+       LIBCFS_FREE(net, sizeof(*net));
 
         ksocknal_data.ksnd_nnets--;
         if (ksocknal_data.ksnd_nnets == 0)
@@ -2616,6 +2645,8 @@ ksocknal_enumerate_interfaces(ksock_net_t *net)
 
                 net->ksnn_interfaces[j].ksni_ipaddr = ip;
                 net->ksnn_interfaces[j].ksni_netmask = mask;
+               strncpy(&net->ksnn_interfaces[j].ksni_name[0],
+                       names[i], IFNAMSIZ);
                 j++;
         }
 
@@ -2628,6 +2659,114 @@ ksocknal_enumerate_interfaces(ksock_net_t *net)
 }
 
 int
+ksocknal_search_new_ipif(ksock_net_t *net)
+{
+       int     new_ipif = 0;
+       int     i;
+
+       for (i = 0; i < net->ksnn_ninterfaces; i++) {
+               char            *ifnam = &net->ksnn_interfaces[i].ksni_name[0];
+               char            *colon = strchr(ifnam, ':');
+               int             found  = 0;
+               ksock_net_t     *tmp;
+               int             j;
+
+               if (colon != NULL) /* ignore alias device */
+                       *colon = 0;
+
+               cfs_list_for_each_entry(tmp, &ksocknal_data.ksnd_nets,
+                                       ksnn_list) {
+                       for (j = 0; !found && j < tmp->ksnn_ninterfaces; j++) {
+                               char *ifnam2 = &tmp->ksnn_interfaces[j].\
+                                            ksni_name[0];
+                               char *colon2 = strchr(ifnam2, ':');
+
+                               if (colon2 != NULL)
+                                       *colon2 = 0;
+
+                               found = strcmp(ifnam, ifnam2) == 0;
+                               if (colon2 != NULL)
+                                       *colon2 = ':';
+                       }
+                       if (found)
+                               break;
+               }
+
+               new_ipif += !found;
+               if (colon != NULL)
+                       *colon = ':';
+       }
+
+       return new_ipif;
+}
+
+int
+ksocknal_start_schedulers(struct ksock_sched_info *info)
+{
+       int     nthrs;
+       int     rc = 0;
+       int     i;
+
+       if (info->ksi_nthreads == 0) {
+               if (*ksocknal_tunables.ksnd_nscheds > 0) {
+                       nthrs = info->ksi_nthreads_max;
+               } else {
+                       nthrs = cfs_cpt_weight(lnet_cpt_table(),
+                                              info->ksi_cpt);
+                       nthrs = min(max(SOCKNAL_NSCHEDS, nthrs >> 1), nthrs);
+                       nthrs = min(SOCKNAL_NSCHEDS_HIGH, nthrs);
+               }
+               nthrs = min(nthrs, info->ksi_nthreads_max);
+       } else {
+               LASSERT(info->ksi_nthreads <= info->ksi_nthreads_max);
+               /* increase two threads if there is new interface */
+               nthrs = min(2, info->ksi_nthreads_max - info->ksi_nthreads);
+       }
+
+       for (i = 0; i < nthrs; i++) {
+               long    id;
+
+               id = KSOCK_THREAD_ID(info->ksi_cpt, info->ksi_nthreads + i);
+               rc = ksocknal_thread_start(ksocknal_scheduler, (void *)id);
+               if (rc == 0)
+                       continue;
+
+               CERROR("Can't spawn thread %d for scheduler[%d]: %d\n",
+                      info->ksi_cpt, info->ksi_nthreads + i, rc);
+               break;
+       }
+
+       info->ksi_nthreads += i;
+       return rc;
+}
+
+int
+ksocknal_net_start_threads(ksock_net_t *net, __u32 *cpts, int ncpts)
+{
+       int     newif = ksocknal_search_new_ipif(net);
+       int     rc;
+       int     i;
+
+       LASSERT(ncpts > 0 && ncpts <= cfs_cpt_number(lnet_cpt_table()));
+
+       for (i = 0; i < ncpts; i++) {
+               struct ksock_sched_info *info;
+               int cpt = (cpts == NULL) ? i : cpts[i];
+
+               LASSERT(cpt < cfs_cpt_number(lnet_cpt_table()));
+               info = ksocknal_data.ksnd_sched_info[cpt];
+
+               if (!newif && info->ksi_nthreads > 0)
+                       continue;
+
+               rc = ksocknal_start_schedulers(info);
+               if (rc != 0)
+                       return rc;
+       }
+       return 0;
+}
+
+int
 ksocknal_startup (lnet_ni_t *ni)
 {
         ksock_net_t  *net;
@@ -2646,7 +2785,6 @@ ksocknal_startup (lnet_ni_t *ni)
         if (net == NULL)
                 goto fail_0;
 
-        memset(net, 0, sizeof(*net));
         cfs_spin_lock_init(&net->ksnn_lock);
         net->ksnn_incarnation = ksocknal_new_incarnation();
         ni->ni_data = net;
@@ -2684,12 +2822,21 @@ ksocknal_startup (lnet_ni_t *ni)
                                        ni->ni_interfaces[i]);
                                 goto fail_1;
                         }
-                }
-                net->ksnn_ninterfaces = i;
-        }
 
-        ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid),
-                                net->ksnn_interfaces[0].ksni_ipaddr);
+                       strncpy(&net->ksnn_interfaces[i].ksni_name[0],
+                               ni->ni_interfaces[i], IFNAMSIZ);
+               }
+               net->ksnn_ninterfaces = i;
+       }
+
+       /* call it before add it to ksocknal_data.ksnd_nets */
+       rc = ksocknal_net_start_threads(net, ni->ni_cpts, ni->ni_ncpts);
+       if (rc != 0)
+               goto fail_1;
+
+       ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid),
+                               net->ksnn_interfaces[0].ksni_ipaddr);
+       cfs_list_add(&net->ksnn_list, &ksocknal_data.ksnd_nets);
 
         ksocknal_data.ksnd_nnets++;
 
lnet/klnds/socklnd/socklnd.h
index 8f98a8f..9384f03 100644
 # define SOCKNAL_RISK_KMAP_DEADLOCK  1
 #endif
 
+struct ksock_sched_info;
+
 typedef struct                                  /* per scheduler state */
 {
-        cfs_spinlock_t    kss_lock;             /* serialise */
-        cfs_list_t        kss_rx_conns;         /* conn waiting to be read */
-        cfs_list_t        kss_tx_conns;         /* conn waiting to be written */
-        cfs_list_t        kss_zombie_noop_txs;  /* zombie noop tx list */
-        cfs_waitq_t       kss_waitq;            /* where scheduler sleeps */
-        int               kss_nconns;           /* # connections assigned to this scheduler */
+       cfs_spinlock_t          kss_lock;       /* serialise */
+       cfs_list_t              kss_rx_conns;   /* conn waiting to be read */
+       /* conn waiting to be written */
+       cfs_list_t              kss_tx_conns;
+       /* zombie noop tx list */
+       cfs_list_t              kss_zombie_noop_txs;
+       cfs_waitq_t             kss_waitq;      /* where scheduler sleeps */
+       /* # connections assigned to this scheduler */
+       int                     kss_nconns;
+       struct ksock_sched_info *kss_info;      /* owner of it */
 #if !SOCKNAL_SINGLE_FRAG_RX
-        struct page      *kss_rx_scratch_pgs[LNET_MAX_IOV];
+       struct page             *kss_rx_scratch_pgs[LNET_MAX_IOV];
 #endif
 #if !SOCKNAL_SINGLE_FRAG_TX || !SOCKNAL_SINGLE_FRAG_RX
-        struct iovec      kss_scratch_iov[LNET_MAX_IOV];
+       struct iovec            kss_scratch_iov[LNET_MAX_IOV];
 #endif
 } ksock_sched_t;
 
-typedef struct
-{
-        unsigned int      ksni_valid:1;         /* been set yet? */
-        unsigned int      ksni_bound:1;         /* bound to a cpu yet? */
-        unsigned int      ksni_sched:6;         /* which scheduler (assumes < 64) */
-} ksock_irqinfo_t;
+struct ksock_sched_info {
+       int                     ksi_nthreads_max; /* max allowed threads */
+       int                     ksi_nthreads;   /* number of threads */
+       int                     ksi_cpt;        /* CPT id */
+       ksock_sched_t           *ksi_scheds;    /* array of schedulers */
+};
+
+#define KSOCK_CPT_SHIFT                        16
+#define KSOCK_THREAD_ID(cpt, sid)      (((cpt) << KSOCK_CPT_SHIFT) | (sid))
+#define KSOCK_THREAD_CPT(id)           ((id) >> KSOCK_CPT_SHIFT)
+#define KSOCK_THREAD_SID(id)           ((id) & ((1UL << KSOCK_CPT_SHIFT) - 1))
 
 typedef struct                                  /* in-use interface */
 {
-        __u32             ksni_ipaddr;          /* interface's IP address */
-        __u32             ksni_netmask;         /* interface's network mask */
-        int               ksni_nroutes;         /* # routes using (active) */
-        int               ksni_npeers;          /* # peers using (passive) */
-        char              ksni_name[16];        /* interface name */
+       __u32           ksni_ipaddr;            /* interface's IP address */
+       __u32           ksni_netmask;           /* interface's network mask */
+       int             ksni_nroutes;           /* # routes using (active) */
+       int             ksni_npeers;            /* # peers using (passive) */
+       char            ksni_name[IFNAMSIZ];    /* interface name */
 } ksock_interface_t;
 
 typedef struct
 {
-        int              *ksnd_timeout;         /* "stuck" socket timeout (seconds) */
+       /* "stuck" socket timeout (seconds) */
+       int              *ksnd_timeout;
+       /* # scheduler threads in each pool while starting */
+       int              *ksnd_nscheds;
         int              *ksnd_nconnds;         /* # connection daemons */
         int              *ksnd_nconnds_max;     /* max # connection daemons */
         int              *ksnd_min_reconnectms; /* first connection retry after (ms)... */
@@ -141,6 +155,7 @@ typedef struct
 {
         __u64             ksnn_incarnation;     /* my epoch */
         cfs_spinlock_t    ksnn_lock;            /* serialise */
+       cfs_list_t        ksnn_list;            /* chain on global list */
         int               ksnn_npeers;          /* # peers */
         int               ksnn_shutdown;        /* shutting down? */
         int               ksnn_ninterfaces;     /* IP interfaces */
@@ -154,17 +169,19 @@ typedef struct
 
 typedef struct
 {
-        int               ksnd_init;           /* initialisation state */
-        int               ksnd_nnets;          /* # networks set up */
-
-        cfs_rwlock_t      ksnd_global_lock;    /* stabilize peer/conn ops */
-        cfs_list_t       *ksnd_peers;          /* hash table of all my known peers */
-        int               ksnd_peer_hash_size; /* size of ksnd_peers */
-
-        int               ksnd_nthreads;       /* # live threads */
-        int               ksnd_shuttingdown;   /* tell threads to exit */
-        int               ksnd_nschedulers;    /* # schedulers */
-        ksock_sched_t    *ksnd_schedulers;     /* their state */
+       int                     ksnd_init;      /* initialisation state */
+       int                     ksnd_nnets;     /* # networks set up */
+       cfs_list_t              ksnd_nets;      /* list of nets */
+       /* stabilize peer/conn ops */
+       cfs_rwlock_t            ksnd_global_lock;
+       /* hash table of all my known peers */
+       cfs_list_t              *ksnd_peers;
+       int                     ksnd_peer_hash_size; /* size of ksnd_peers */
+
+       int                     ksnd_nthreads;  /* # live threads */
+       int                     ksnd_shuttingdown; /* tell threads to exit */
+       /* schedulers information */
+       struct ksock_sched_info **ksnd_sched_info;
 
         cfs_atomic_t      ksnd_nactive_txs;    /* #active txs */
 
@@ -196,8 +213,6 @@ typedef struct
         cfs_list_t        ksnd_idle_noop_txs;  /* list head for freed noop tx */
         cfs_spinlock_t    ksnd_tx_lock;        /* serialise, NOT safe in g_lock */
 
-        ksock_irqinfo_t   ksnd_irqinfo[CFS_NR_IRQS];/* irq->scheduler lookup */
-
 } ksock_nal_data_t;
 
 #define SOCKNAL_INIT_NOTHING    0
@@ -587,9 +602,7 @@ extern void ksocknal_lib_save_callback(cfs_socket_t *sock, ksock_conn_t *conn);
 extern void ksocknal_lib_set_callback(cfs_socket_t *sock,  ksock_conn_t *conn);
 extern void ksocknal_lib_reset_callback(cfs_socket_t *sock, ksock_conn_t *conn);
 extern void ksocknal_lib_push_conn (ksock_conn_t *conn);
-extern void ksocknal_lib_bind_irq (unsigned int irq);
 extern int ksocknal_lib_get_conn_addrs (ksock_conn_t *conn);
-extern unsigned int ksocknal_lib_sock_irq (cfs_socket_t *sock);
 extern int ksocknal_lib_setup_sock (cfs_socket_t *so);
 extern int ksocknal_lib_send_iov (ksock_conn_t *conn, ksock_tx_t *tx);
 extern int ksocknal_lib_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx);
lnet/klnds/socklnd/socklnd_cb.c
index ea951af..72add78 100644
@@ -1388,22 +1388,31 @@ ksocknal_sched_cansleep(ksock_sched_t *sched)
         return (rc);
 }
 
-int ksocknal_scheduler (void *arg)
+int ksocknal_scheduler(void *arg)
 {
-        ksock_sched_t     *sched = (ksock_sched_t *)arg;
-        ksock_conn_t      *conn;
-        ksock_tx_t        *tx;
-        int                rc;
-        int                nloops = 0;
-        int                id = (int)(sched - ksocknal_data.ksnd_schedulers);
-        char               name[16];
-
-        snprintf (name, sizeof (name),"socknal_sd%02d", id);
-        cfs_daemonize (name);
-        cfs_block_allsigs ();
-
-        if (ksocknal_lib_bind_thread_to_cpu(id))
-                CERROR ("Can't set CPU affinity for %s to %d\n", name, id);
+       struct ksock_sched_info *info;
+       ksock_sched_t           *sched;
+       ksock_conn_t            *conn;
+       ksock_tx_t              *tx;
+       int                     rc;
+       int                     nloops = 0;
+       char                    name[20];
+       long                    id = (long)arg;
+
+       info = ksocknal_data.ksnd_sched_info[KSOCK_THREAD_CPT(id)];
+       sched = &info->ksi_scheds[KSOCK_THREAD_SID(id)];
+
+       snprintf(name, sizeof(name), "socknal_sd%02d_%02d",
+                info->ksi_cpt, (int)(sched - &info->ksi_scheds[0]));
+
+       cfs_daemonize(name);
+       cfs_block_allsigs();
+
+       rc = cfs_cpt_bind(lnet_cpt_table(), info->ksi_cpt);
+       if (rc != 0) {
+               CERROR("Can't set CPT affinity for %s to %d: %d\n",
+                      name, info->ksi_cpt, rc);
+       }
 
         cfs_spin_lock_bh (&sched->kss_lock);
 
lnet/klnds/socklnd/socklnd_lib-darwin.c
index 0a95de0..7a01d47 100644
@@ -161,18 +161,6 @@ ksocknal_lib_tunables_fini ()
  */
 #define KSOCKNAL_MAX_BUFFER        (1152*1024)
 
-void
-ksocknal_lib_bind_irq (unsigned int irq)
-{
-        return;
-}
-
-unsigned int
-ksocknal_lib_sock_irq (cfs_socket_t *sock)
-{
-        return 0;
-}
-
 int
 ksocknal_lib_get_conn_addrs (ksock_conn_t *conn)
 { 
lnet/klnds/socklnd/socklnd_lib-darwin.h
index cbeb38c..621d345 100644
 
 #include <libcfs/libcfs.h>
 
-static inline
-int ksocknal_nsched(void)
-{ 
-       /* XXX Liang: fix it */
-       return 1;
-}
-
 #endif
lnet/klnds/socklnd/socklnd_lib-linux.c
index b3bbc13..ae9f5b7 100644
@@ -404,53 +404,6 @@ ksocknal_lib_tunables_fini ()
 }
 #endif /* # if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM */
 
-void
-ksocknal_lib_bind_irq (unsigned int irq)
-{
-#if (defined(CONFIG_SMP) && defined(CPU_AFFINITY))
-        int              bind;
-        int              cpu;
-        char             cmdline[64];
-        ksock_irqinfo_t *info;
-        char            *argv[] = {"/bin/sh",
-                                   "-c",
-                                   cmdline,
-                                   NULL};
-        char            *envp[] = {"HOME=/",
-                                   "PATH=/sbin:/bin:/usr/sbin:/usr/bin",
-                                   NULL};
-
-        LASSERT (irq < NR_IRQS);
-        if (irq == 0)              /* software NIC or affinity disabled */
-                return;
-
-        info = &ksocknal_data.ksnd_irqinfo[irq];
-
-        cfs_write_lock_bh (&ksocknal_data.ksnd_global_lock);
-
-        LASSERT (info->ksni_valid);
-        bind = !info->ksni_bound;
-        info->ksni_bound = 1;
-
-        cfs_write_unlock_bh (&ksocknal_data.ksnd_global_lock);
-
-        if (!bind)                              /* bound already */
-                return;
-
-        cpu = ksocknal_irqsched2cpu(info->ksni_sched);
-        snprintf (cmdline, sizeof (cmdline),
-                  "echo %d > /proc/irq/%u/smp_affinity", 1 << cpu, irq);
-
-        LCONSOLE_INFO("Binding irq %u to CPU %d with cmd: %s\n",
-                      irq, cpu, cmdline);
-
-        /* FIXME: Find a better method of setting IRQ affinity...
-         */
-
-        USERMODEHELPER(argv[0], argv, envp);
-#endif
-}
-
 int
 ksocknal_lib_get_conn_addrs (ksock_conn_t *conn)
 {
@@ -476,32 +429,6 @@ ksocknal_lib_get_conn_addrs (ksock_conn_t *conn)
         return 0;
 }
 
-unsigned int
-ksocknal_lib_sock_irq (struct socket *sock)
-{
-        int                irq = 0;
-#ifdef CPU_AFFINITY
-        struct dst_entry  *dst;
-
-        if (!*ksocknal_tunables.ksnd_irq_affinity)
-                return 0;
-
-        dst = sk_dst_get (sock->sk);
-        if (dst != NULL) {
-                if (dst->dev != NULL) {
-                        irq = dst->dev->irq;
-                        if (irq >= NR_IRQS) {
-                                CERROR ("Unexpected IRQ %x\n", irq);
-                                irq = 0;
-                        }
-                }
-                dst_release (dst);
-        }
-
-#endif
-        return irq;
-}
-
 int
 ksocknal_lib_zc_capable(ksock_conn_t *conn)
 {
@@ -1270,22 +1197,3 @@ ksocknal_lib_memory_pressure(ksock_conn_t *conn)
 
         return rc;
 }
-
-int
-ksocknal_lib_bind_thread_to_cpu(int id)
-{
-#if defined(CONFIG_SMP) && defined(CPU_AFFINITY)
-        id = ksocknal_sched2cpu(id);
-        if (cpu_online(id)) {
-                cpumask_t m = CPU_MASK_NONE;
-                cpu_set(id, m);
-                set_cpus_allowed(current, m);
-                return 0;
-        }
-
-        return -1;
-
-#else
-        return 0;
-#endif
-}
lnet/klnds/socklnd/socklnd_lib-linux.h
index 3076427..65ee770 100644
@@ -52,6 +52,7 @@
 #include <net/sock.h>
 #include <net/tcp.h>
 #include <linux/uio.h>
+#include <linux/if.h>
 
 #include <asm/system.h>
 #include <asm/uaccess.h>
@@ -86,58 +87,8 @@ static inline __u32 ksocknal_csum(__u32 crc, unsigned char const *p, size_t len)
 #define SOCKNAL_WSPACE(sk)       sk_stream_wspace(sk)
 #define SOCKNAL_MIN_WSPACE(sk)   sk_stream_min_wspace(sk)
 
-#ifndef CONFIG_SMP
-static inline
-int ksocknal_nsched(void)
-{
-        return 1;
-}
-#else
-# if !defined(CONFIG_X86) || defined(CONFIG_X86_64) || !defined(CONFIG_X86_HT)
-static inline int
-ksocknal_nsched(void)
-{
-        return num_online_cpus();
-}
-
-static inline int
-ksocknal_sched2cpu(int i)
-{
-        return i;
-}
-
-static inline int
-ksocknal_irqsched2cpu(int i)
-{
-        return i;
-}
-# else
-static inline int
-ksocknal_nsched(void)
-{
-        if (smp_num_siblings == 1)
-                return (num_online_cpus());
-
-        /* We need to know if this assumption is crap */
-        LASSERT (smp_num_siblings == 2);
-        return (num_online_cpus()/2);
-}
-
-static inline int
-ksocknal_sched2cpu(int i)
-{
-        if (smp_num_siblings == 1)
-                return i;
-
-        return (i * 2);
-}
-
-static inline int
-ksocknal_irqsched2cpu(int i)
-{
-        return (ksocknal_sched2cpu(i) + 1);
-}
-# endif
-#endif
+/* assume one thread for each connection type */
+#define SOCKNAL_NSCHEDS                3
+#define SOCKNAL_NSCHEDS_HIGH   (SOCKNAL_NSCHEDS << 1)
 
 #endif
lnet/klnds/socklnd/socklnd_modparams.c
index 68e7fc0..a29e090 100644
@@ -41,6 +41,12 @@ static int peer_timeout = 180;
 CFS_MODULE_PARM(peer_timeout, "i", int, 0444,
                 "Seconds without aliveness news to declare peer dead (<=0 to disable)");
 
+/* Number of daemons in each thread pool which is percpt,
+ * we will estimate reasonable value based on CPUs if it's not set. */
+static unsigned int nscheds;
+CFS_MODULE_PARM(nscheds, "i", int, 0444,
+               "# scheduler daemons in each pool while starting");
+
 static int nconnds = 4;
 CFS_MODULE_PARM(nconnds, "i", int, 0444,
                 "# connection daemons while starting");
@@ -172,6 +178,7 @@ int ksocknal_tunables_init(void)
 
         /* initialize ksocknal_tunables structure */
         ksocknal_tunables.ksnd_timeout            = &sock_timeout;
+       ksocknal_tunables.ksnd_nscheds            = &nscheds;
         ksocknal_tunables.ksnd_nconnds            = &nconnds;
         ksocknal_tunables.ksnd_nconnds_max        = &nconnds_max;
         ksocknal_tunables.ksnd_min_reconnectms    = &min_reconnectms;
@@ -199,6 +206,12 @@ int ksocknal_tunables_init(void)
         ksocknal_tunables.ksnd_zc_recv_min_nfrags = &zc_recv_min_nfrags;
 
 #ifdef CPU_AFFINITY
+       if (enable_irq_affinity) {
+               CWARN("irq_affinity is removed from socklnd because modern "
+                     "computer always has fast CPUs and more cores than "
+                     "# NICs, although you still can set irq_affinity by "
+                     "another way, please check manual for details.\n");
+       }
         ksocknal_tunables.ksnd_irq_affinity       = &enable_irq_affinity;
 #endif