Whamcloud - gitweb
Branch b1_4
author: adilger <adilger>
Fri, 30 Sep 2005 10:26:10 +0000 (10:26 +0000)
committer: adilger <adilger>
Fri, 30 Sep 2005 10:26:10 +0000 (10:26 +0000)
Description: bind OST threads to NUMA nodes to improve performance
Details    : all OST threads are uniformly bound to CPUs on a single NUMA
     node and do their allocations there to localize memory access
b=7342

lustre/ChangeLog
lustre/include/linux/lustre_net.h
lustre/kernel_patches/patches/export_symbol_numa.patch [new file with mode: 0644]
lustre/kernel_patches/series/2.6-rhel4.series
lustre/ost/ost_handler.c
lustre/ptlrpc/service.c

index d4ac642..ce2bb93 100644 (file)
@@ -158,6 +158,12 @@ Bugzilla   : 6163
 Description: lconf did not handle in-kernel recovery with LDAP properly
 Details    : lconf/LustreDB get_refs() is searching the wrong namespace
 
+Severity   : enhancement
+Bugzilla   : 7342
+Description: bind OST threads to NUMA nodes to improve performance
+Details    : all OST threads are uniformly bound to CPUs on a single NUMA
+            node and do their allocations there to localize memory access
+
 ------------------------------------------------------------------------------
 
 08-26-2005  Cluster File Systems, Inc. <info@clusterfs.com>
index eaa71ed..eff0a4f 100644 (file)
@@ -484,10 +484,9 @@ struct ptlrpc_thread {
 
         struct list_head t_link; /* active threads for service, from svc->srv_threads */
 
+        void *t_data;            /* thread-private data (preallocated memory) */
         __u32 t_flags;
 
-        void *t_data; /* thread-private data (preallocated memory) */
-
         unsigned int t_id; /* service thread index, from ptlrpc_start_threads */
         wait_queue_head_t t_ctl_waitq;
 };
@@ -538,7 +537,8 @@ struct ptlrpc_service {
         int              srv_n_active_reqs;     /* # reqs being served */
         int              srv_rqbd_timeout;      /* timeout before re-posting reqs */
         int              srv_watchdog_timeout; /* soft watchdog timeout, in ms */
-        int              srv_num_threads;      /*# of threads to start/started*/
+        int              srv_num_threads;       /* # threads to start/started */
+        unsigned         srv_cpu_affinity:1;    /* bind threads to CPUs */
 
         __u32 srv_req_portal;
         __u32 srv_rep_portal;
diff --git a/lustre/kernel_patches/patches/export_symbol_numa.patch b/lustre/kernel_patches/patches/export_symbol_numa.patch
new file mode 100644 (file)
index 0000000..17ab1e2
--- /dev/null
@@ -0,0 +1,24 @@
+Index: linux-2.6.9-5.0.3.EL/arch/ia64/kernel/smpboot.c
+===================================================================
+--- linux-2.6.9-5.0.3.EL.orig/arch/ia64/kernel/smpboot.c       2004-10-18 15:53:43.000000000 -0600
++++ linux-2.6.9-5.0.3.EL/arch/ia64/kernel/smpboot.c    2005-09-28 16:02:16.000000000 -0600
+@@ -485,6 +485,7 @@
+ EXPORT_SYMBOL(cpu_to_node_map);
+ /* which logical CPUs are on which nodes */
+ cpumask_t node_to_cpu_mask[MAX_NUMNODES] __cacheline_aligned;
++EXPORT_SYMBOL(node_to_cpu_mask);
+ /*
+  * Build cpu to node mapping and initialize the per node cpu masks.
+Index: linux-2.6.9-5.0.3.EL/arch/i386/kernel/smpboot.c
+===================================================================
+--- linux-2.6.9-5.0.3.EL.orig/arch/i386/kernel/smpboot.c       2004-10-18 15:54:08.000000000 -0600
++++ linux-2.6.9-5.0.3.EL/arch/i386/kernel/smpboot.c    2005-09-28 16:28:12.000000000 -0600
+@@ -474,6 +474,7 @@
+ /* which logical CPUs are on which nodes */
+ cpumask_t node_2_cpu_mask[MAX_NUMNODES] =
+                               { [0 ... MAX_NUMNODES-1] = CPU_MASK_NONE };
++EXPORT_SYMBOL(node_2_cpu_mask);
+ /* which node each logical CPU is on */
+ int cpu_2_node[NR_CPUS] = { [0 ... NR_CPUS-1] = 0 };
+ EXPORT_SYMBOL(cpu_2_node);
index e523bb3..2654457 100644 (file)
@@ -15,3 +15,4 @@ remove-suid-2.6-suse.patch
 export-show_task-2.6-vanilla.patch
 sd_iostats-2.6-rhel4.patch 
 fsprivate-2.6.patch
+export_symbol_numa.patch 
index 6ab3662..4053468 100644 (file)
@@ -1440,6 +1440,7 @@ static int ost_setup(struct obd_device *obd, obd_count len, void *buf)
 
         ost->ost_service->srv_init = ost_thread_init;
         ost->ost_service->srv_done = ost_thread_done;
+        ost->ost_service->srv_cpu_affinity = 1;
         rc = ptlrpc_start_threads(obd, ost->ost_service, "ll_ost");
         if (rc)
                 GOTO(out_service, rc = -EINVAL);
index 3668aa8..ffb288e 100644 (file)
@@ -812,11 +812,13 @@ static int ptlrpc_main(void *arg)
         struct ptlrpc_svc_data *data = (struct ptlrpc_svc_data *)arg;
         struct ptlrpc_service  *svc = data->svc;
         struct ptlrpc_thread   *thread = data->thread;
+        struct ptlrpc_reply_state *rs;
         struct lc_watchdog     *watchdog;
         unsigned long           flags;
 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,4)
         struct group_info *ginfo = NULL;
 #endif
+        int rc = 0;
         ENTRY;
 
         lock_kernel();
@@ -833,17 +835,48 @@ static int ptlrpc_main(void *arg)
         THREAD_NAME(current->comm, sizeof(current->comm) - 1, "%s", data->name);
         unlock_kernel();
 
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,9) && CONFIG_NUMA && \
+        LUSTRE_KERNEL_VERSION >= 48
+        /* we need to do this before any per-thread allocation is done so that
+         * we get the per-thread allocations on local node.  bug 7342 */
+        if (svc->srv_cpu_affinity) {
+                int cpu, num_cpu;
+
+                for (cpu = 0, num_cpu = 0; cpu < NR_CPUS; cpu++) {
+                        if (!cpu_online(cpu))
+                                continue;
+                        if (num_cpu == thread->t_id % num_online_cpus())
+                                break;
+                        num_cpu++;
+                }
+                set_cpus_allowed(current, node_to_cpumask(cpu_to_node(cpu)));
+        }
+#endif
+
 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,4)
         ginfo = groups_alloc(0);
         if (!ginfo) {
-                thread->t_flags = SVC_RUNNING;
-                wake_up(&thread->t_ctl_waitq);
-                return (-ENOMEM);
+                rc = -ENOMEM;
+                goto out;
         }
+
         set_current_groups(ginfo);
         put_group_info(ginfo);
 #endif
 
+        if (svc->srv_init != NULL) {
+                rc = svc->srv_init(thread);
+                if (rc)
+                        goto out;
+        }
+
+        /* Alloc reply state structure for this one */
+        OBD_ALLOC_GFP(rs, svc->srv_max_reply_size, GFP_KERNEL);
+        if (!rs) {
+                rc = -ENOMEM;
+                goto out_srv_init;
+        }
+
         /* Record that the thread is running */
         thread->t_flags = SVC_RUNNING;
         /*
@@ -857,7 +890,11 @@ static int ptlrpc_main(void *arg)
 
         spin_lock_irqsave(&svc->srv_lock, flags);
         svc->srv_nthreads++;
+        list_add(&rs->rs_list, &svc->srv_free_rs_list);
         spin_unlock_irqrestore(&svc->srv_lock, flags);
+        wake_up(&svc->srv_free_rs_waitq);
+
+        CDEBUG(D_NET, "service thread %d started\n", thread->t_id);
 
         /* XXX maintain a list of all managed devices: insert here */
 
@@ -908,9 +945,13 @@ static int ptlrpc_main(void *arg)
         /*
          * deconstruct service specific state created by ptlrpc_start_thread()
          */
+        lc_watchdog_delete(watchdog);
+
+out_srv_init:
         if (svc->srv_done != NULL)
                 svc->srv_done(thread);
 
+out:
         spin_lock_irqsave(&svc->srv_lock, flags);
 
         svc->srv_nthreads--;                    /* must know immediately */
@@ -919,10 +960,10 @@ static int ptlrpc_main(void *arg)
 
         spin_unlock_irqrestore(&svc->srv_lock, flags);
 
-        lc_watchdog_delete(watchdog);
+        CDEBUG(D_NET, "service thread %d exiting: rc %d\n", thread->t_id, rc);
+        thread->t_id = rc;
 
-        CDEBUG(D_NET, "service thread exiting, process %d\n", current->pid);
-        return 0;
+        return rc;
 }
 
 static void ptlrpc_stop_thread(struct ptlrpc_service *svc,
@@ -991,7 +1032,6 @@ int ptlrpc_start_thread(struct obd_device *dev, struct ptlrpc_service *svc,
         struct ptlrpc_svc_data d;
         struct ptlrpc_thread *thread;
         unsigned long flags;
-        struct ptlrpc_reply_state *rs;
         int rc;
         ENTRY;
 
@@ -1000,22 +1040,10 @@ int ptlrpc_start_thread(struct obd_device *dev, struct ptlrpc_service *svc,
                 RETURN(-ENOMEM);
         init_waitqueue_head(&thread->t_ctl_waitq);
         thread->t_id = id;
-          
-        if (svc->srv_init != NULL) {
-                rc = svc->srv_init(thread);
-                if (rc != 0)
-                        RETURN(rc);
-        }
 
-        /* Alloc reply state structure for this one */
-        OBD_ALLOC_GFP(rs, svc->srv_max_reply_size, GFP_KERNEL);
-        if (!rs)
-                RETURN(-ENOMEM);
         spin_lock_irqsave(&svc->srv_lock, flags);
-        list_add(&rs->rs_list, &svc->srv_free_rs_list);
         list_add(&thread->t_link, &svc->srv_threads);
         spin_unlock_irqrestore(&svc->srv_lock, flags);
-        wake_up(&svc->srv_free_rs_waitq);
 
         d.dev = dev;
         d.svc = svc;
@@ -1033,15 +1061,14 @@ int ptlrpc_start_thread(struct obd_device *dev, struct ptlrpc_service *svc,
                 list_del(&thread->t_link);
                 spin_unlock_irqrestore(&svc->srv_lock, flags);
 
-                if (svc->srv_done != NULL)
-                        svc->srv_done(thread);
-
                 OBD_FREE(thread, sizeof(*thread));
                 RETURN(rc);
         }
-        l_wait_event(thread->t_ctl_waitq, thread->t_flags & SVC_RUNNING, &lwi);
+        l_wait_event(thread->t_ctl_waitq,
+                     thread->t_flags & (SVC_RUNNING | SVC_STOPPED), &lwi);
 
-        RETURN(0);
+        rc = (thread->t_flags & SVC_STOPPED) ? thread->t_id : 0;
+        RETURN(rc);
 }
 #endif