From: adilger Date: Fri, 30 Sep 2005 10:26:10 +0000 (+0000) Subject: Branch b1_4 X-Git-Tag: v1_7_100~1^103~4^2~260^2~61 X-Git-Url: https://git.whamcloud.com/?a=commitdiff_plain;h=e84022aaa80744b59cf62bec5e82dc27ca8f5e1f;p=fs%2Flustre-release.git Branch b1_4 Description: bind OST threads to NUMA nodes to improve performance Details : all OST threads are uniformly bound to CPUs on a single NUMA node and do their allocations there to localize memory access b=7342 --- diff --git a/lustre/ChangeLog b/lustre/ChangeLog index d4ac642..ce2bb93 100644 --- a/lustre/ChangeLog +++ b/lustre/ChangeLog @@ -158,6 +158,12 @@ Bugzilla : 6163 Description: lconf did not handle in-kernel recovery with LDAP properly Details : lconf/LustreDB get_refs() is searching the wrong namespace +Severity : enhancement +Bugzilla : 7342 +Description: bind OST threads to NUMA nodes to improve performance +Details : all OST threads are uniformly bound to CPUs on a single NUMA + node and do their allocations there to localize memory access + ------------------------------------------------------------------------------ 08-26-2005 Cluster File Systems, Inc. diff --git a/lustre/include/linux/lustre_net.h b/lustre/include/linux/lustre_net.h index eaa71ed..eff0a4f 100644 --- a/lustre/include/linux/lustre_net.h +++ b/lustre/include/linux/lustre_net.h @@ -484,10 +484,9 @@ struct ptlrpc_thread { struct list_head t_link; /* active threads for service, from svc->srv_threads */ + void *t_data; /* thread-private data (preallocated memory) */ __u32 t_flags; - void *t_data; /* thread-private data (preallocated memory) */ - unsigned int t_id; /* service thread index, from ptlrpc_start_threads */ wait_queue_head_t t_ctl_waitq; }; @@ -538,7 +537,8 @@ struct ptlrpc_service { int srv_n_active_reqs; /* # reqs being served */ int srv_rqbd_timeout; /* timeout before re-posting reqs */ int srv_watchdog_timeout; /* soft watchdog timeout, in ms */ - int srv_num_threads; /*# of threads to start/started*/ + int srv_num_threads; /* # threads to start/started */ + unsigned srv_cpu_affinity:1; /* bind threads to CPUs */ __u32 srv_req_portal; __u32 srv_rep_portal; diff --git a/lustre/kernel_patches/patches/export_symbol_numa.patch b/lustre/kernel_patches/patches/export_symbol_numa.patch new file mode 100644 index 0000000..17ab1e2 --- /dev/null +++ b/lustre/kernel_patches/patches/export_symbol_numa.patch @@ -0,0 +1,24 @@ +Index: linux-2.6.9-5.0.3.EL/arch/ia64/kernel/smpboot.c +=================================================================== +--- linux-2.6.9-5.0.3.EL.orig/arch/ia64/kernel/smpboot.c 2004-10-18 15:53:43.000000000 -0600 ++++ linux-2.6.9-5.0.3.EL/arch/ia64/kernel/smpboot.c 2005-09-28 16:02:16.000000000 -0600 +@@ -485,6 +485,7 @@ + EXPORT_SYMBOL(cpu_to_node_map); + /* which logical CPUs are on which nodes */ + cpumask_t node_to_cpu_mask[MAX_NUMNODES] __cacheline_aligned; ++EXPORT_SYMBOL(node_to_cpu_mask); + + /* + * Build cpu to node mapping and initialize the per node cpu masks. +Index: linux-2.6.9-5.0.3.EL/arch/i386/kernel/smpboot.c +=================================================================== +--- linux-2.6.9-5.0.3.EL.orig/arch/i386/kernel/smpboot.c 2004-10-18 15:54:08.000000000 -0600 ++++ linux-2.6.9-5.0.3.EL/arch/i386/kernel/smpboot.c 2005-09-28 16:28:12.000000000 -0600 +@@ -474,6 +474,7 @@ + /* which logical CPUs are on which nodes */ + cpumask_t node_2_cpu_mask[MAX_NUMNODES] = + { [0 ... MAX_NUMNODES-1] = CPU_MASK_NONE }; ++EXPORT_SYMBOL(node_2_cpu_mask); + /* which node each logical CPU is on */ + int cpu_2_node[NR_CPUS] = { [0 ... NR_CPUS-1] = 0 }; + EXPORT_SYMBOL(cpu_2_node); diff --git a/lustre/kernel_patches/series/2.6-rhel4.series b/lustre/kernel_patches/series/2.6-rhel4.series index e523bb3..2654457 100644 --- a/lustre/kernel_patches/series/2.6-rhel4.series +++ b/lustre/kernel_patches/series/2.6-rhel4.series @@ -15,3 +15,4 @@ remove-suid-2.6-suse.patch export-show_task-2.6-vanilla.patch sd_iostats-2.6-rhel4.patch fsprivate-2.6.patch +export_symbol_numa.patch diff --git a/lustre/ost/ost_handler.c b/lustre/ost/ost_handler.c index 6ab36626..4053468 100644 --- a/lustre/ost/ost_handler.c +++ b/lustre/ost/ost_handler.c @@ -1440,6 +1440,7 @@ static int ost_setup(struct obd_device *obd, obd_count len, void *buf) ost->ost_service->srv_init = ost_thread_init; ost->ost_service->srv_done = ost_thread_done; + ost->ost_service->srv_cpu_affinity = 1; rc = ptlrpc_start_threads(obd, ost->ost_service, "ll_ost"); if (rc) GOTO(out_service, rc = -EINVAL); diff --git a/lustre/ptlrpc/service.c b/lustre/ptlrpc/service.c index 3668aa8..ffb288e 100644 --- a/lustre/ptlrpc/service.c +++ b/lustre/ptlrpc/service.c @@ -812,11 +812,13 @@ static int ptlrpc_main(void *arg) struct ptlrpc_svc_data *data = (struct ptlrpc_svc_data *)arg; struct ptlrpc_service *svc = data->svc; struct ptlrpc_thread *thread = data->thread; + struct ptlrpc_reply_state *rs; struct lc_watchdog *watchdog; unsigned long flags; #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,4) struct group_info *ginfo = NULL; #endif + int rc = 0; ENTRY; lock_kernel(); @@ -833,17 +835,48 @@ static int ptlrpc_main(void *arg) THREAD_NAME(current->comm, sizeof(current->comm) - 1, "%s", data->name); unlock_kernel(); +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,9) && CONFIG_NUMA && \ + LUSTRE_KERNEL_VERSION >= 48 + /* we need to do this before any per-thread allocation is done so that + * we get the per-thread allocations on local node. bug 7342 */ + if (svc->srv_cpu_affinity) { + int cpu, num_cpu; + + for (cpu = 0, num_cpu = 0; cpu < NR_CPUS; cpu++) { + if (!cpu_online(cpu)) + continue; + if (num_cpu == thread->t_id % num_online_cpus()) + break; + num_cpu++; + } + set_cpus_allowed(current, node_to_cpumask(cpu_to_node(cpu))); + } +#endif + #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,4) ginfo = groups_alloc(0); if (!ginfo) { - thread->t_flags = SVC_RUNNING; - wake_up(&thread->t_ctl_waitq); - return (-ENOMEM); + rc = -ENOMEM; + goto out; } + set_current_groups(ginfo); put_group_info(ginfo); #endif + if (svc->srv_init != NULL) { + rc = svc->srv_init(thread); + if (rc) + goto out; + } + + /* Alloc reply state structure for this one */ + OBD_ALLOC_GFP(rs, svc->srv_max_reply_size, GFP_KERNEL); + if (!rs) { + rc = -ENOMEM; + goto out_srv_init; + } + /* Record that the thread is running */ thread->t_flags = SVC_RUNNING; /* @@ -857,7 +890,11 @@ static int ptlrpc_main(void *arg) spin_lock_irqsave(&svc->srv_lock, flags); svc->srv_nthreads++; + list_add(&rs->rs_list, &svc->srv_free_rs_list); spin_unlock_irqrestore(&svc->srv_lock, flags); + wake_up(&svc->srv_free_rs_waitq); + + CDEBUG(D_NET, "service thread %d started\n", thread->t_id); /* XXX maintain a list of all managed devices: insert here */ @@ -908,9 +945,13 @@ static int ptlrpc_main(void *arg) /* * deconstruct service specific state created by ptlrpc_start_thread() */ + lc_watchdog_delete(watchdog); + +out_srv_init: if (svc->srv_done != NULL) svc->srv_done(thread); +out: spin_lock_irqsave(&svc->srv_lock, flags); svc->srv_nthreads--; /* must know immediately */ @@ -919,10 +960,10 @@ static int ptlrpc_main(void *arg) spin_unlock_irqrestore(&svc->srv_lock, flags); - lc_watchdog_delete(watchdog); + CDEBUG(D_NET, "service thread %d exiting: rc %d\n", thread->t_id, rc); + thread->t_id = rc; - CDEBUG(D_NET, "service thread exiting, process %d\n", current->pid); - return 0; + return rc; } static void ptlrpc_stop_thread(struct ptlrpc_service *svc, @@ -991,7 +1032,6 @@ int ptlrpc_start_thread(struct obd_device *dev, struct ptlrpc_service *svc, struct ptlrpc_svc_data d; struct ptlrpc_thread *thread; unsigned long flags; - struct ptlrpc_reply_state *rs; int rc; ENTRY; @@ -1000,22 +1040,10 @@ int ptlrpc_start_thread(struct obd_device *dev, struct ptlrpc_service *svc, RETURN(-ENOMEM); init_waitqueue_head(&thread->t_ctl_waitq); thread->t_id = id; - - if (svc->srv_init != NULL) { - rc = svc->srv_init(thread); - if (rc != 0) - RETURN(rc); - } - /* Alloc reply state structure for this one */ - OBD_ALLOC_GFP(rs, svc->srv_max_reply_size, GFP_KERNEL); - if (!rs) - RETURN(-ENOMEM); spin_lock_irqsave(&svc->srv_lock, flags); - list_add(&rs->rs_list, &svc->srv_free_rs_list); list_add(&thread->t_link, &svc->srv_threads); spin_unlock_irqrestore(&svc->srv_lock, flags); - wake_up(&svc->srv_free_rs_waitq); d.dev = dev; d.svc = svc; @@ -1033,15 +1061,14 @@ int ptlrpc_start_thread(struct obd_device *dev, struct ptlrpc_service *svc, list_del(&thread->t_link); spin_unlock_irqrestore(&svc->srv_lock, flags); - if (svc->srv_done != NULL) - svc->srv_done(thread); - OBD_FREE(thread, sizeof(*thread)); RETURN(rc); } - l_wait_event(thread->t_ctl_waitq, thread->t_flags & SVC_RUNNING, &lwi); + l_wait_event(thread->t_ctl_waitq, + thread->t_flags & (SVC_RUNNING | SVC_STOPPED), &lwi); - RETURN(0); + rc = (thread->t_flags & SVC_STOPPED) ? thread->t_id : 0; + RETURN(rc); } #endif