From 617e8e1229637908d4cce6725878dd5668960420 Mon Sep 17 00:00:00 2001 From: Liang Zhen Date: Thu, 5 Apr 2012 23:46:21 +0800 Subject: [PATCH] LU-56 libcfs: implementation of cpu partition MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit This patch is implementation of CPU partition (CPT) and CPT table. CPU partition (CPT) is conceptually similar to the 'cpuset' of Linux. It is designed to provide an abstract layer that can be conveniently used by kernel threads, unlike the 'cpuset' that is most accessible from userspace. CPT table is a set of CPTs: - A CPT table can contain 1-N CPU partitions - CPUs in one CPT shouldn’t overlap with CPUs in CPTs of the same CPT table - A CPT table can cover all CPUs in the system, or a subset of system CPUs. Signed-off-by: Liang Zhen Change-Id: Idd3168f21ea82891d11dffe06cbb386e1505889c Reviewed-on: http://review.whamcloud.com/2461 Tested-by: Hudson Reviewed-by: Doug Oucharek Tested-by: Maloo Reviewed-by: Andreas Dilger Reviewed-by: Oleg Drokin --- libcfs/autoconf/lustre-libcfs.m4 | 137 +++- libcfs/include/libcfs/Makefile.am | 2 +- libcfs/include/libcfs/libcfs.h | 1 + libcfs/include/libcfs/libcfs_cpu.h | 214 ++++++ libcfs/include/libcfs/linux/Makefile.am | 2 +- libcfs/include/libcfs/linux/libcfs.h | 1 + libcfs/include/libcfs/linux/linux-cpu.h | 181 +++++ libcfs/libcfs/Makefile.in | 8 +- libcfs/libcfs/autoMakefile.am | 7 +- libcfs/libcfs/libcfs_cpu.c | 204 ++++++ libcfs/libcfs/linux/Makefile.am | 2 +- libcfs/libcfs/linux/linux-cpu.c | 1117 +++++++++++++++++++++++++++++++ libcfs/libcfs/linux/linux-proc.c | 52 ++ libcfs/libcfs/module.c | 7 +- 14 files changed, 1923 insertions(+), 12 deletions(-) create mode 100644 libcfs/include/libcfs/libcfs_cpu.h create mode 100644 libcfs/include/libcfs/linux/linux-cpu.h create mode 100644 libcfs/libcfs/libcfs_cpu.c create mode 100644 libcfs/libcfs/linux/linux-cpu.c diff --git a/libcfs/autoconf/lustre-libcfs.m4 b/libcfs/autoconf/lustre-libcfs.m4 index b4b89d0..dfd9c03 100644 --- a/libcfs/autoconf/lustre-libcfs.m4 +++ b/libcfs/autoconf/lustre-libcfs.m4 @@ -112,6 +112,116 @@ LB_LINUX_TRY_COMPILE([ ]) ]) +# check cpumask_size (2.6.28) +AC_DEFUN([LIBCFS_CPUMASK_SIZE], +[AC_MSG_CHECKING([whether have cpumask_size()]) +LB_LINUX_TRY_COMPILE([ + #include +],[ + int size = cpumask_size(); +],[ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_CPUMASK_SIZE, 1, [have cpumask_size()]) +],[ + AC_MSG_RESULT(NO) +]) +]) + +# check cpu topology functions +# +# topology_core_cpumask (2.6.29, not on all archs) +# topology_core_siblings (not on all archs) +# topology_thread_cpumask (2.6.29, not on all archs) +# topology_thread_siblings (not on all archs) +# cpumask_of_node/node_to_cpumask (not always exported) +AC_DEFUN([LIBCFS_CPU_TOPOLOGY], +[AC_MSG_CHECKING([whether have topology.h]) +LB_LINUX_TRY_COMPILE([ + #include +],[],[ + AC_DEFINE(HAVE_CPU_TOPOLOGY, 1, [have CPU topology]) + AC_MSG_RESULT(yes) + + AC_MSG_CHECKING([whether have topology_core_cpumask]) + LB_LINUX_TRY_COMPILE([ + #include + ],[ + cpumask_t *mask = topology_core_cpumask(0); + ],[ + AC_DEFINE(HAVE_TOPOLOGY_CORE_CPUMASK, 1, + [have topology_core_cpumask]) + AC_MSG_RESULT(yes) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([whether have topology_core_siblings]) + LB_LINUX_TRY_COMPILE([ + #include + ],[ + cpumask_t mask = topology_core_siblings(0); + ],[ + AC_DEFINE(HAVE_TOPOLOGY_CORE_SIBLINGS, 1, + [have topology_core_siblings]) + AC_MSG_RESULT(yes) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([whether have topology_thread_cpumask]) + 
LB_LINUX_TRY_COMPILE([ + #include + ],[ + cpumask_t *mask = topology_thread_cpumask(0); + ],[ + AC_DEFINE(HAVE_TOPOLOGY_THREAD_CPUMASK, 1, + [have topology_thread_cpumask]) + AC_MSG_RESULT(yes) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([whether have topology_thread_siblings]) + LB_LINUX_TRY_COMPILE([ + #include + ],[ + cpumask_t mask = topology_thread_siblings(0); + ],[ + AC_DEFINE(HAVE_TOPOLOGY_THREAD_SIBLINGS, 1, + [have topology_thread_siblings]) + AC_MSG_RESULT(yes) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([whether have cpumask_of_node]) + LB_LINUX_TRY_COMPILE([ + #include + ],[ + cpumask_t *mask = cpumask_of_node(0); + ],[ + AC_DEFINE(HAVE_CPUMASK_OF_NODE, 1, [have cpumask_of_node]) + AC_MSG_RESULT(yes) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([whether have node_to_cpumask]) + LB_LINUX_TRY_COMPILE([ + #include + ],[ + cpumask_t mask = node_to_cpumask(0); + ],[ + AC_DEFINE(HAVE_NODE_TO_CPUMASK, 1, [have node_to_cpumask]) + AC_MSG_RESULT(yes) + ],[ + AC_MSG_RESULT(no) + ]) +],[ + AC_MSG_RESULT(NO) +]) +]) + # 2.6.20 API change INIT_WORK use 2 args and not # store data inside AC_DEFUN([LIBCFS_3ARGS_INIT_WORK], @@ -454,6 +564,27 @@ AC_DEFUN([LIBCFS_HAVE_OOM_H], ]) # +# check set_mems_allowed +# 2.6.31 adds function set_mems_allowed in cpuset.h +# +AC_DEFUN([LIBCFS_HAVE_SET_MEMS_ALLOWED], +[AC_MSG_CHECKING([whether have set_mems_allowed()]) +LB_LINUX_TRY_COMPILE([ + #include +],[ + nodemask_t mask; + + set_mems_allowed(mask); +],[ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_SET_MEMS_ALLOWED, 1, [have set_mems_allowed()]) +],[ + AC_MSG_RESULT(NO) +]) +]) + + +# # RHEL6/2.6.32 want to have pointer to shrinker self pointer in handler function # AC_DEFUN([LC_SHRINKER_WANT_SHRINK_PTR], @@ -578,7 +709,6 @@ AC_DEFUN([LIBCFS_PROG_LINUX], LIBCFS_CONFIG_PANIC_DUMPLOG LIBCFS_U64_LONG_LONG_LINUX - # 2.6.18 LIBCFS_TASKLIST_LOCK LIBCFS_HAVE_IS_COMPAT_TASK @@ -600,11 +730,16 @@ LIBCFS_FUNC_DUMP_TRACE LIBCFS_SEM_COUNT # 2.6.27 LIBCFS_CRED_WRAPPERS +# 2.6.28 +LIBCFS_CPUMASK_SIZE # 2.6.29 LIBCFS_STRUCT_CRED_IN_TASK +LIBCFS_CPU_TOPOLOGY # 2.6.30 LIBCFS_FUNC_UNSHARE_FS_STRUCT LIBCFS_SOCK_MAP_FD_2ARG +# 2.6.31 +LIBCFS_HAVE_SET_MEMS_ALLOWED # 2.6.32 LIBCFS_STACKTRACE_OPS_HAVE_WALK_STACK LC_SHRINKER_WANT_SHRINK_PTR diff --git a/libcfs/include/libcfs/Makefile.am b/libcfs/include/libcfs/Makefile.am index cd61d27..79f923a 100644 --- a/libcfs/include/libcfs/Makefile.am +++ b/libcfs/include/libcfs/Makefile.am @@ -7,7 +7,7 @@ DIST_SUBDIRS := linux posix util darwin EXTRA_DIST := curproc.h libcfs_private.h libcfs.h list.h lltrace.h \ user-lock.h user-prim.h user-time.h user-mem.h \ user-tcpip.h user-bitops.h bitmap.h \ - libcfs_prim.h libcfs_time.h libcfs_hash.h \ + libcfs_prim.h libcfs_time.h libcfs_hash.h libcfs_cpu.h \ libcfs_debug.h libcfsutil.h libcfs_ioctl.h \ libcfs_pack.h libcfs_unpack.h libcfs_string.h \ libcfs_kernelcomm.h libcfs_workitem.h lucache.h \ diff --git a/libcfs/include/libcfs/libcfs.h b/libcfs/include/libcfs/libcfs.h index c20f34e..f93285f 100644 --- a/libcfs/include/libcfs/libcfs.h +++ b/libcfs/include/libcfs/libcfs.h @@ -302,6 +302,7 @@ void cfs_srand(unsigned int, unsigned int); void cfs_get_random_bytes(void *buf, int size); #include +#include #include #include #include diff --git a/libcfs/include/libcfs/libcfs_cpu.h b/libcfs/include/libcfs/libcfs_cpu.h new file mode 100644 index 0000000..736f2b2 --- /dev/null +++ b/libcfs/include/libcfs/libcfs_cpu.h @@ -0,0 +1,214 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2012, Whamcloud, Inc. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/include/libcfs/libcfs_cpu.h + * + * CPU partition + * . CPU partition is virtual processing unit + * + * . CPU partition can present 1-N cores, or 1-N NUMA nodes, + * in other words, CPU partition is a processors pool. + * + * CPU Partition Table (CPT) + * . a set of CPU partitions + * + * . There are two modes for CPT: CFS_CPU_MODE_NUMA and CFS_CPU_MODE_SMP + * + * . User can specify total number of CPU partitions while creating a + * CPT, ID of CPU partition is always start from 0. + * + * Example: if there are 8 cores on the system, while creating a CPT + * with cpu_npartitions=4: + * core[0, 1] = partition[0], core[2, 3] = partition[1] + * core[4, 5] = partition[2], core[6, 7] = partition[3] + * + * cpu_npartitions=1: + * core[0, 1, ... 7] = partition[0] + * + * . User can also specifiy CPU partitions by string pattern + * + * Examples: cpu_partitions="0[0,1], 1[2,3]" + * cpu_partitions="N 0[0-3], 1[4-8]" + * + * The first charactor "N" means following numbers are numa ID + * + * . NUMA allocators, CPU affinity threads are built over CPU partitions, + * instead of HW CPUs or HW nodes. + * + * . By default, Lustre modules should refer to the global cfs_cpt_table, + * instead of accessing HW CPUs directly, so concurrency of Lustre can be + * configured by cpu_npartitions of the global cfs_cpt_table + * + * . 
If cpu_npartitions=1(all CPUs in one pool), lustre should work the + * same way as 2.2 or earlier verison + * + * Author: liang@whamcloud.com + */ + +#ifndef __LIBCFS_CPU_H__ +#define __LIBCFS_CPU_H__ + +#ifndef HAVE_LIBCFS_CPT + +typedef unsigned long cpumask_t; +typedef unsigned long nodemask_t; + +struct cfs_cpt_table { + /* # of CPU partitions */ + int ctb_nparts; + /* cpu mask */ + cpumask_t ctb_mask; + /* node mask */ + nodemask_t ctb_nodemask; + /* version */ + __u64 ctb_version; +}; + +#endif /* !HAVE_LIBCFS_CPT */ + +/* any CPU partition */ +#define CFS_CPT_ANY (-1) + +extern struct cfs_cpt_table *cfs_cpt_table; + +/** + * destroy a CPU partition table + */ +void cfs_cpt_table_free(struct cfs_cpt_table *cptab); +/** + * create a cfs_cpt_table with \a ncpt number of partitions + */ +struct cfs_cpt_table *cfs_cpt_table_alloc(unsigned int ncpt); +/** + * print string information of cpt-table + */ +int cfs_cpt_table_print(struct cfs_cpt_table *cptab, char *buf, int len); +/** + * return total number of CPU partitions in \a cptab + */ +int +cfs_cpt_number(struct cfs_cpt_table *cptab); +/** + * return number of HW cores or hypter-threadings in a CPU partition \a cpt + */ +int cfs_cpt_weight(struct cfs_cpt_table *cptab, int cpt); +/** + * is there any online CPU in CPU partition \a cpt + */ +int cfs_cpt_online(struct cfs_cpt_table *cptab, int cpt); +/** + * return cpumask of CPU partition \a cpt + */ +cpumask_t *cfs_cpt_cpumask(struct cfs_cpt_table *cptab, int cpt); +/** + * return nodemask of CPU partition \a cpt + */ +nodemask_t *cfs_cpt_nodemask(struct cfs_cpt_table *cptab, int cpt); +/** + * shadow current HW processor ID to CPU-partition ID of \a cptab + */ +int cfs_cpt_current(struct cfs_cpt_table *cptab, int remap); +/** + * shadow HW processor ID \a CPU to CPU-partition ID by \a cptab + */ +int cfs_cpt_of_cpu(struct cfs_cpt_table *cptab, int cpu); +/** + * bind current thread on a CPU-partition \a cpt of \a cptab + */ +int cfs_cpt_bind(struct cfs_cpt_table *cptab, int cpt); +/** + * add \a cpu to CPU partion @cpt of \a cptab, return 1 for success, + * otherwise 0 is returned + */ +int cfs_cpt_set_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu); +/** + * remove \a cpu from CPU partition \a cpt of \a cptab + */ +void cfs_cpt_unset_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu); +/** + * add all cpus in \a mask to CPU partition \a cpt + * return 1 if successfully set all CPUs, otherwise return 0 + */ +int cfs_cpt_set_cpumask(struct cfs_cpt_table *cptab, + int cpt, cpumask_t *mask); +/** + * remove all cpus in \a mask from CPU partition \a cpt + */ +void cfs_cpt_unset_cpumask(struct cfs_cpt_table *cptab, + int cpt, cpumask_t *mask); +/** + * add all cpus in NUMA node \a node to CPU partition \a cpt + * return 1 if successfully set all CPUs, otherwise return 0 + */ +int cfs_cpt_set_node(struct cfs_cpt_table *cptab, int cpt, int node); +/** + * remove all cpus in NUMA node \a node from CPU partition \a cpt + */ +void cfs_cpt_unset_node(struct cfs_cpt_table *cptab, int cpt, int node); + +/** + * add all cpus in node mask \a mask to CPU partition \a cpt + * return 1 if successfully set all CPUs, otherwise return 0 + */ +int cfs_cpt_set_nodemask(struct cfs_cpt_table *cptab, + int cpt, nodemask_t *mask); +/** + * remove all cpus in node mask \a mask from CPU partition \a cpt + */ +void cfs_cpt_unset_nodemask(struct cfs_cpt_table *cptab, + int cpt, nodemask_t *mask); +/** + * unset all cpus for CPU partition \a cpt + */ +void cfs_cpt_clear(struct cfs_cpt_table *cptab, int cpt); +/** 
+ * convert partition id \a cpt to numa node id, if there are more than one + * nodes in this partition, it might return a different node id each time. + */ +int cfs_cpt_spread_node(struct cfs_cpt_table *cptab, int cpt); + +/** + * iterate over all CPU partitions in \a cptab + */ +#define cfs_cpt_for_each(i, cptab) \ + for (i = 0; i < cfs_cpt_number(cptab); i++) + +#ifndef __read_mostly +# define __read_mostly +#endif + +#ifndef ____cacheline_aligned +#define ____cacheline_aligned +#endif + +int cfs_cpu_init(void); +void cfs_cpu_fini(void); + +#endif /* __LIBCFS_CPU_H__ */ diff --git a/libcfs/include/libcfs/linux/Makefile.am b/libcfs/include/libcfs/linux/Makefile.am index 874efed..a7ca7cc 100644 --- a/libcfs/include/libcfs/linux/Makefile.am +++ b/libcfs/include/libcfs/linux/Makefile.am @@ -1,3 +1,3 @@ EXTRA_DIST := kp30.h libcfs.h linux-fs.h linux-lock.h linux-mem.h \ - linux-prim.h linux-time.h linux-tcpip.h lltrace.h \ + linux-prim.h linux-time.h linux-tcpip.h lltrace.h linux-cpu.h \ portals_compat25.h linux-bitops.h linux-types.h diff --git a/libcfs/include/libcfs/linux/libcfs.h b/libcfs/include/libcfs/linux/libcfs.h index b3ab7cb..ce07e80 100644 --- a/libcfs/include/libcfs/linux/libcfs.h +++ b/libcfs/include/libcfs/linux/libcfs.h @@ -45,6 +45,7 @@ #include +#include #include #include #include diff --git a/libcfs/include/libcfs/linux/linux-cpu.h b/libcfs/include/libcfs/linux/linux-cpu.h new file mode 100644 index 0000000..124523f --- /dev/null +++ b/libcfs/include/libcfs/linux/linux-cpu.h @@ -0,0 +1,181 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2012, Whamcloud, Inc. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/include/libcfs/linux/linux-mem.h + * + * Basic library routines. + * + * Author: liang@whamcloud.com + */ + +#ifndef __LIBCFS_LINUX_CPU_H__ +#define __LIBCFS_LINUX_CPU_H__ + +#ifndef __LIBCFS_LIBCFS_H__ +#error Do not #include this file directly. #include instead +#endif + +#ifndef __KERNEL__ +#error This include is only for kernel use. 
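The declarations earlier in libcfs_cpu.h form the public CPT API. For orientation, here is a minimal usage sketch against that API. It assumes the global cfs_cpt_table has already been set up by cfs_cpu_init() (module.c calls it at init time in this patch); the example_* function names are hypothetical and not part of the patch, only the cfs_cpt_* calls are:

#include <libcfs/libcfs.h>

/* hypothetical example: bind the calling thread to CPU partition @cpt
 * and pick a NUMA node from that partition for later allocations */
static int example_bind_to_cpt(int cpt)
{
        int rc;
        int node;

        rc = cfs_cpt_bind(cfs_cpt_table, cpt);
        if (rc != 0)
                return rc;

        node = cfs_cpt_spread_node(cfs_cpt_table, cpt);
        CDEBUG(D_INFO, "bound to CPT %d, spread node %d\n", cpt, node);

        return 0;
}

/* hypothetical example: walk all partitions of the global table */
static void example_dump_cpts(void)
{
        int i;

        cfs_cpt_for_each(i, cfs_cpt_table) {
                CDEBUG(D_INFO, "CPT %d: %d CPUs, online: %d\n",
                       i, cfs_cpt_weight(cfs_cpt_table, i),
                       cfs_cpt_online(cfs_cpt_table, i));
        }
}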
+#endif + +#include +#include +#include +#include + +#ifndef HAVE_CPUMASK_SIZE +#define cpumask_size() sizeof(cpumask_t) +#endif + +#ifdef CONFIG_SMP + +#define HAVE_LIBCFS_CPT + +/** virtual processing unit */ +struct cfs_cpu_partition { + /* CPUs mask for this partition */ + cpumask_t *cpt_cpumask; + /* nodes mask for this partition */ + nodemask_t *cpt_nodemask; + /* spread rotor for NUMA allocator */ + unsigned cpt_spread_rotor; +}; + +/** descriptor for CPU partitions */ +struct cfs_cpt_table { + /* version, reserved for hotplug */ + unsigned ctb_version; + /* spread rotor for NUMA allocator */ + unsigned ctb_spread_rotor; + /* # of CPU partitions */ + unsigned ctb_nparts; + /* partitions tables */ + struct cfs_cpu_partition *ctb_parts; + /* shadow HW CPU to CPU partition ID */ + int *ctb_cpu2cpt; + /* all cpus in this partition table */ + cpumask_t *ctb_cpumask; + /* all nodes in this partition table */ + nodemask_t *ctb_nodemask; +}; + +void cfs_cpu_core_siblings(int cpu, cpumask_t *mask); +void cfs_cpu_ht_siblings(int cpu, cpumask_t *mask); +void cfs_node_to_cpumask(int node, cpumask_t *mask); +int cfs_cpu_core_nsiblings(int cpu); +int cfs_cpu_ht_nsiblings(int cpu); + +/** + * comment out definitions for compatible layer + * #define CFS_CPU_NR NR_CPUS + * + * typedef cpumask_t cfs_cpumask_t; + * + * #define cfs_cpu_current() smp_processor_id() + * #define cfs_cpu_online(i) cpu_online(i) + * #define cfs_cpu_online_num() num_online_cpus() + * #define cfs_cpu_online_for_each(i) for_each_online_cpu(i) + * #define cfs_cpu_possible_num() num_possible_cpus() + * #define cfs_cpu_possible_for_each(i) for_each_possible_cpu(i) + * + * #ifdef CONFIG_CPUMASK_SIZE + * #define cfs_cpu_mask_size() cpumask_size() + * #else + * #define cfs_cpu_mask_size() sizeof(cfs_cpumask_t) + * #endif + * + * #define cfs_cpu_mask_set(i, mask) cpu_set(i, mask) + * #define cfs_cpu_mask_unset(i, mask) cpu_clear(i, mask) + * #define cfs_cpu_mask_isset(i, mask) cpu_isset(i, mask) + * #define cfs_cpu_mask_clear(mask) cpus_clear(mask) + * #define cfs_cpu_mask_empty(mask) cpus_empty(mask) + * #define cfs_cpu_mask_weight(mask) cpus_weight(mask) + * #define cfs_cpu_mask_first(mask) first_cpu(mask) + * #define cfs_cpu_mask_any_online(mask) (any_online_cpu(mask) != NR_CPUS) + * #define cfs_cpu_mask_for_each(i, mask) for_each_cpu_mask(i, mask) + * #define cfs_cpu_mask_bind(t, mask) set_cpus_allowed(t, mask) + * + * #ifdef HAVE_CPUMASK_COPY + * #define cfs_cpu_mask_copy(dst, src) cpumask_copy(dst, src) + * #else + * #define cfs_cpu_mask_copy(dst, src) memcpy(dst, src, sizeof(*src)) + * #endif + * + * static inline void + * cfs_cpu_mask_of_online(cfs_cpumask_t *mask) + * { + * cfs_cpu_mask_copy(mask, &cpu_online_map); + * } + * + * #ifdef CONFIG_NUMA + * + * #define CFS_NODE_NR MAX_NUMNODES + * + * typedef nodemask_t cfs_node_mask_t; + * + * #define cfs_node_of_cpu(cpu) cpu_to_node(cpu) + * #define cfs_node_online(i) node_online(i) + * #define cfs_node_online_num() num_online_nodes() + * #define cfs_node_online_for_each(i) for_each_online_node(i) + * #define cfs_node_possible_num() num_possible_nodes() + * #define cfs_node_possible_for_each(i) for_each_node(i) + * + * static inline void cfs_node_to_cpumask(int node, cfs_cpumask_t *mask) + * { + * #if defined(HAVE_NODE_TO_CPUMASK) + * *mask = node_to_cpumask(node); + * #elif defined(HAVE_CPUMASK_OF_NODE) + * cfs_cpu_mask_copy(mask, cpumask_of_node(node)); + * #else + * # error "Needs node_to_cpumask or cpumask_of_node" + * #endif + * } + * + * #define cfs_node_mask_set(i, mask) 
node_set(i, mask) + * #define cfs_node_mask_unset(i, mask) node_clear(i, mask) + * #define cfs_node_mask_isset(i, mask) node_isset(i, mask) + * #define cfs_node_mask_clear(mask) nodes_reset(mask) + * #define cfs_node_mask_empty(mask) nodes_empty(mask) + * #define cfs_node_mask_weight(mask) nodes_weight(mask) + * #define cfs_node_mask_for_each(i, mask) for_each_node_mask(i, mask) + * #define cfs_node_mask_copy(dst, src) memcpy(dst, src, sizeof(*src)) + * + * static inline void + * cfs_node_mask_of_online(cfs_node_mask_t *mask) + * { + * cfs_node_mask_copy(mask, &node_online_map); + * } + * + * #endif + */ + +#endif /* CONFIG_SMP */ +#endif /* __LIBCFS_LINUX_CPU_H__ */ diff --git a/libcfs/libcfs/Makefile.in b/libcfs/libcfs/Makefile.in index c5026a4..b8273b8 100644 --- a/libcfs/libcfs/Makefile.in +++ b/libcfs/libcfs/Makefile.in @@ -1,7 +1,7 @@ MODULES = libcfs libcfs-linux-objs := linux-tracefile.o linux-debug.o -libcfs-linux-objs += linux-prim.o linux-mem.o +libcfs-linux-objs += linux-prim.o linux-mem.o linux-cpu.o libcfs-linux-objs += linux-fs.o linux-sync.o linux-tcpip.o libcfs-linux-objs += linux-lwt.o linux-proc.o linux-curproc.o libcfs-linux-objs += linux-utils.o linux-module.o @@ -22,9 +22,9 @@ libcfs-linux-objs := $(addprefix linux/,$(libcfs-linux-objs)) endif -libcfs-all-objs := debug.o fail.o nidstrings.o lwt.o module.o tracefile.o watchdog.o \ - libcfs_string.o hash.o kernel_user_comm.o prng.o workitem.o \ - upcall_cache.o +libcfs-all-objs := debug.o fail.o nidstrings.o lwt.o module.o tracefile.o \ + watchdog.o libcfs_string.o hash.o kernel_user_comm.o \ + prng.o workitem.o upcall_cache.o libcfs_cpu.o libcfs-objs := $(libcfs-linux-objs) $(libcfs-all-objs) diff --git a/libcfs/libcfs/autoMakefile.am b/libcfs/libcfs/autoMakefile.am index dbbc1fb..2b41949 100644 --- a/libcfs/libcfs/autoMakefile.am +++ b/libcfs/libcfs/autoMakefile.am @@ -44,7 +44,7 @@ if LIBLUSTRE noinst_LIBRARIES= libcfs.a libcfs_a_SOURCES= posix/posix-debug.c user-prim.c user-lock.c user-tcpip.c \ prng.c user-bitops.c user-mem.c hash.c kernel_user_comm.c \ - workitem.c fail.c + workitem.c fail.c libcfs_cpu.c libcfs_a_CPPFLAGS = $(LLCPPFLAGS) libcfs_a_CFLAGS = $(LLCFLAGS) endif @@ -88,5 +88,6 @@ install-data-hook: $(install_data_hook) MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ linux-*.c linux/*.o darwin/*.o libcfs EXTRA_DIST := $(libcfs-all-objs:%.o=%.c) Info.plist tracefile.h prng.c \ - user-lock.c user-tcpip.c user-bitops.c user-prim.c workitem.c \ - user-mem.c kernel_user_comm.c fail.c linux/linux-tracefile.h + user-lock.c user-tcpip.c user-bitops.c user-prim.c workitem.c \ + user-mem.c kernel_user_comm.c fail.c libcfs_cpu.c \ + linux/linux-tracefile.h diff --git a/libcfs/libcfs/libcfs_cpu.c b/libcfs/libcfs/libcfs_cpu.c new file mode 100644 index 0000000..98c0cb99 --- /dev/null +++ b/libcfs/libcfs/libcfs_cpu.c @@ -0,0 +1,204 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2012, Whamcloud, Inc.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Please see comments in libcfs/include/libcfs/libcfs_cpu.h for introduction
+ *
+ * Author: liang@whamcloud.com
+ */
+
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <libcfs/libcfs.h>
+
+/** Global cpu partition table */
+struct cfs_cpt_table *cfs_cpt_table __read_mostly = NULL;
+EXPORT_SYMBOL(cfs_cpt_table);
+
+#ifndef HAVE_LIBCFS_CPT
+
+#define CFS_CPU_VERSION_MAGIC 0xbabecafe
+
+struct cfs_cpt_table *
+cfs_cpt_table_alloc(unsigned int ncpt)
+{
+        struct cfs_cpt_table *cptab;
+
+        if (ncpt != 1) {
+                CERROR("Can't support cpu partition number %d\n", ncpt);
+                return NULL;
+        }
+
+        LIBCFS_ALLOC(cptab, sizeof(*cptab));
+        if (cptab != NULL) {
+                cptab->ctb_version = CFS_CPU_VERSION_MAGIC;
+                cptab->ctb_nparts = ncpt;
+        }
+
+        return cptab;
+}
+EXPORT_SYMBOL(cfs_cpt_table_alloc);
+
+void
+cfs_cpt_table_free(struct cfs_cpt_table *cptab)
+{
+        LASSERT(cptab->ctb_version == CFS_CPU_VERSION_MAGIC);
+
+        LIBCFS_FREE(cptab, sizeof(*cptab));
+}
+EXPORT_SYMBOL(cfs_cpt_table_free);
+
+int
+cfs_cpt_number(struct cfs_cpt_table *cptab)
+{
+        return 1;
+}
+EXPORT_SYMBOL(cfs_cpt_number);
+
+int
+cfs_cpt_weight(struct cfs_cpt_table *cptab, int cpt)
+{
+        return 1;
+}
+EXPORT_SYMBOL(cfs_cpt_weight);
+
+int
+cfs_cpt_online(struct cfs_cpt_table *cptab, int cpt)
+{
+        return 1;
+}
+EXPORT_SYMBOL(cfs_cpt_online);
+
+int
+cfs_cpt_set_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu)
+{
+        return 1;
+}
+EXPORT_SYMBOL(cfs_cpt_set_cpu);
+
+void
+cfs_cpt_unset_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu)
+{
+}
+EXPORT_SYMBOL(cfs_cpt_unset_cpu);
+
+int
+cfs_cpt_set_cpumask(struct cfs_cpt_table *cptab, int cpt, cpumask_t *mask)
+{
+        return 1;
+}
+EXPORT_SYMBOL(cfs_cpt_set_cpumask);
+
+void
+cfs_cpt_unset_cpumask(struct cfs_cpt_table *cptab, int cpt, cpumask_t *mask)
+{
+}
+EXPORT_SYMBOL(cfs_cpt_unset_cpumask);
+
+int
+cfs_cpt_set_node(struct cfs_cpt_table *cptab, int cpt, int node)
+{
+        return 1;
+}
+EXPORT_SYMBOL(cfs_cpt_set_node);
+
+void
+cfs_cpt_unset_node(struct cfs_cpt_table *cptab, int cpt, int node)
+{
+}
+EXPORT_SYMBOL(cfs_cpt_unset_node);
+
+int
+cfs_cpt_set_nodemask(struct cfs_cpt_table *cptab, int cpt, nodemask_t *mask)
+{
+        return 1;
+}
+EXPORT_SYMBOL(cfs_cpt_set_nodemask);
+
+void
+cfs_cpt_unset_nodemask(struct cfs_cpt_table *cptab, int cpt, nodemask_t *mask)
+{
+}
+EXPORT_SYMBOL(cfs_cpt_unset_nodemask);
+
+void
+cfs_cpt_clear(struct cfs_cpt_table *cptab, int cpt)
+{
+}
+EXPORT_SYMBOL(cfs_cpt_clear);
+
+int
+cfs_cpt_spread_node(struct cfs_cpt_table *cptab, int cpt)
+{
+        return 0;
+}
+EXPORT_SYMBOL(cfs_cpt_spread_node);
+
+int
+cfs_cpt_current(struct cfs_cpt_table *cptab, int remap)
+{
+        return 0;
+}
+EXPORT_SYMBOL(cfs_cpt_current);
+
+int
+cfs_cpt_of_cpu(struct cfs_cpt_table *cptab, int cpu)
+{
+        return 0;
+}
+EXPORT_SYMBOL(cfs_cpt_of_cpu);
+
+int
+cfs_cpt_bind(struct cfs_cpt_table *cptab, int cpt)
+{
+        return 0;
+}
+EXPORT_SYMBOL(cfs_cpt_bind);
+
+void
+cfs_cpu_fini(void)
+{
+        if (cfs_cpt_table != NULL) {
+                cfs_cpt_table_free(cfs_cpt_table);
+                cfs_cpt_table = NULL;
+        }
+}
+
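The stub implementation above covers builds without HAVE_LIBCFS_CPT (uniprocessor kernels, and the userspace library where libcfs_cpu.c is compiled into libcfs.a): every query collapses onto a single partition 0. A small, hypothetical sketch of the invariants a caller can rely on in that configuration, using only the stubs above:

/* hypothetical check of the single-partition invariants of the stubs */
static void example_up_invariants(void)
{
        struct cfs_cpt_table *cptab;

        cptab = cfs_cpt_table_alloc(1);   /* only ncpt == 1 is accepted */
        if (cptab == NULL)
                return;

        LASSERT(cfs_cpt_number(cptab) == 1);
        LASSERT(cfs_cpt_weight(cptab, 0) == 1);
        LASSERT(cfs_cpt_current(cptab, 1) == 0); /* everything maps to CPT 0 */

        cfs_cpt_table_free(cptab);
}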
+int +cfs_cpu_init(void) +{ + cfs_cpt_table = cfs_cpt_table_alloc(1); + + return cfs_cpt_table != NULL ? 0 : -1; +} + +#endif /* HAVE_LIBCFS_CPT */ diff --git a/libcfs/libcfs/linux/Makefile.am b/libcfs/libcfs/linux/Makefile.am index 48c4dfd..4792d73 100644 --- a/libcfs/libcfs/linux/Makefile.am +++ b/libcfs/libcfs/linux/Makefile.am @@ -1,5 +1,5 @@ EXTRA_DIST := linux-debug.c linux-lwt.c linux-prim.c linux-tracefile.c \ linux-fs.c linux-mem.c linux-proc.c linux-utils.c linux-lock.c \ - linux-module.c linux-sync.c linux-curproc.c linux-tcpip.c + linux-module.c linux-sync.c linux-curproc.c linux-tcpip.c linux-cpu.c diff --git a/libcfs/libcfs/linux/linux-cpu.c b/libcfs/libcfs/linux/linux-cpu.c new file mode 100644 index 0000000..896fd7d --- /dev/null +++ b/libcfs/libcfs/linux/linux-cpu.c @@ -0,0 +1,1117 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2012, Whamcloud, Inc. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Author: liang@whamcloud.com + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include +#include +#include + +#ifdef CONFIG_SMP + +/** + * modparam for setting number of partitions + * + * 0 : estimate best value based on cores or NUMA nodes + * 1 : disable multiple partitions + * >1 : specify number of partitions + */ +/* NB: we set it to 1 for now, multiple partitions will be enabled after + * all smp node affinity code landed */ +static int cpu_npartitions = 1; +CFS_MODULE_PARM(cpu_npartitions, "i", int, 0444, "# of CPU partitions"); + +/** + * modparam for setting CPU partitions patterns: + * + * i.e: "0[0,1,2,3] 1[4,5,6,7]", number before bracket is CPU partition ID, + * number in bracket is processor ID (core or HT) + * + * i.e: "N 0[0,1] 1[2,3]" the first character 'n' means numbers in bracket + * are NUMA node ID, number before bracket is CPU partition ID. 
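To make the cpu_npartitions and cpu_pattern parameters described here concrete: loading with cpu_npartitions=2 splits the online CPUs into two automatically chosen partitions, while a pattern such as cpu_pattern="0[0-3] 1[4-7]" pins cores 0-3 into partition 0 and cores 4-7 into partition 1, and cpu_pattern="N 0[0] 1[1]" does the same per NUMA node. These option strings are illustrative; the exact syntax is whatever the CFS_MODULE_PARM declarations here accept, e.g. via "modprobe libcfs cpu_npartitions=2".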
+ * + * NB: If user specified cpu_pattern, cpu_npartitions will be ignored + */ +static char *cpu_pattern = ""; +CFS_MODULE_PARM(cpu_pattern, "s", charp, 0444, "CPU partitions pattern"); + +struct cfs_cpt_data { + /* serialize hotplug etc */ + spinlock_t cpt_lock; + /* reserved for hotplug */ + unsigned long cpt_version; + /* mutex to protect cpt_cpumask */ + struct semaphore cpt_mutex; + /* scratch buffer for set/unset_node */ + cpumask_t *cpt_cpumask; +}; + +static struct cfs_cpt_data cpt_data; + +void +cfs_cpu_core_siblings(int cpu, cpumask_t *mask) +{ +#if defined(HAVE_TOPOLOGY_CORE_CPUMASK) + /* return cpumask of cores in the same socket */ + cpumask_copy(mask, topology_core_cpumask(cpu)); +#elif defined(HAVE_TOPOLOGY_CORE_SIBLINGS) + *mask = topology_core_siblings(cpu); +#else + cpus_clear(*mask); + cpu_set(cpu, *mask); +#endif +} +EXPORT_SYMBOL(cfs_cpu_core_siblings); + +/* return number of cores in the same socket of \a cpu */ +int +cfs_cpu_core_nsiblings(int cpu) +{ + int num; + + down(&cpt_data.cpt_mutex); + + cfs_cpu_core_siblings(cpu, cpt_data.cpt_cpumask); + num = cpus_weight(*cpt_data.cpt_cpumask); + + up(&cpt_data.cpt_mutex); + + return num; +} +EXPORT_SYMBOL(cfs_cpu_core_nsiblings); + +/* return cpumask of HTs in the same core */ +void +cfs_cpu_ht_siblings(int cpu, cpumask_t *mask) +{ +#if defined(HAVE_TOPOLOGY_THREAD_CPUMASK) + cpumask_copy(mask, topology_thread_cpumask(cpu)); +#elif defined(HAVE_TOPOLOGY_THREAD_SIBLINGS) + *mask = topology_thread_siblings(cpu); +#else + cpus_clear(*mask); + cpu_set(cpu, *mask); +#endif +} +EXPORT_SYMBOL(cfs_cpu_ht_siblings); + +/* return number of HTs in the same core of \a cpu */ +int +cfs_cpu_ht_nsiblings(int cpu) +{ + int num; + + down(&cpt_data.cpt_mutex); + + cfs_cpu_ht_siblings(cpu, cpt_data.cpt_cpumask); + num = cpus_weight(*cpt_data.cpt_cpumask); + + up(&cpt_data.cpt_mutex); + + return num; +} +EXPORT_SYMBOL(cfs_cpu_ht_nsiblings); + +void +cfs_node_to_cpumask(int node, cpumask_t *mask) +{ +#if defined(HAVE_CPUMASK_OF_NODE) + cpumask_copy(mask, cpumask_of_node(node)); +#elif defined(HAVE_NODE_TO_CPUMASK) + *mask = node_to_cpumask(node); +#else + cpus_clear(*mask); + for_each_online_cpu(i) { + if (cpu_to_node(i) == node) + cpu_set(i, *mask); + } +#endif +} +EXPORT_SYMBOL(cfs_node_to_cpumask); + +void +cfs_cpt_table_free(struct cfs_cpt_table *cptab) +{ + int i; + + if (cptab->ctb_cpu2cpt != NULL) { + LIBCFS_FREE(cptab->ctb_cpu2cpt, + num_possible_cpus() * + sizeof(cptab->ctb_cpu2cpt[0])); + } + + for (i = 0; cptab->ctb_parts != NULL && i < cptab->ctb_nparts; i++) { + struct cfs_cpu_partition *part = &cptab->ctb_parts[i]; + + if (part->cpt_nodemask != NULL) { + LIBCFS_FREE(part->cpt_nodemask, + sizeof(*part->cpt_nodemask)); + } + + if (part->cpt_cpumask != NULL) + LIBCFS_FREE(part->cpt_cpumask, cpumask_size()); + } + + if (cptab->ctb_parts != NULL) { + LIBCFS_FREE(cptab->ctb_parts, + cptab->ctb_nparts * sizeof(cptab->ctb_parts[0])); + } + + if (cptab->ctb_nodemask != NULL) + LIBCFS_FREE(cptab->ctb_nodemask, sizeof(*cptab->ctb_nodemask)); + if (cptab->ctb_cpumask != NULL) + LIBCFS_FREE(cptab->ctb_cpumask, cpumask_size()); + + LIBCFS_FREE(cptab, sizeof(*cptab)); +} +EXPORT_SYMBOL(cfs_cpt_table_free); + +struct cfs_cpt_table * +cfs_cpt_table_alloc(unsigned int ncpt) +{ + struct cfs_cpt_table *cptab; + int i; + + LIBCFS_ALLOC(cptab, sizeof(*cptab)); + if (cptab == NULL) + return NULL; + + cptab->ctb_nparts = ncpt; + + LIBCFS_ALLOC(cptab->ctb_cpumask, cpumask_size()); + LIBCFS_ALLOC(cptab->ctb_nodemask, sizeof(*cptab->ctb_nodemask)); + + 
if (cptab->ctb_cpumask == NULL || cptab->ctb_nodemask == NULL) + goto failed; + + LIBCFS_ALLOC(cptab->ctb_cpu2cpt, + num_possible_cpus() * sizeof(cptab->ctb_cpu2cpt[0])); + if (cptab->ctb_cpu2cpt == NULL) + goto failed; + + memset(cptab->ctb_cpu2cpt, -1, + num_possible_cpus() * sizeof(cptab->ctb_cpu2cpt[0])); + + LIBCFS_ALLOC(cptab->ctb_parts, ncpt * sizeof(cptab->ctb_parts[0])); + if (cptab->ctb_parts == NULL) + goto failed; + + for (i = 0; i < ncpt; i++) { + struct cfs_cpu_partition *part = &cptab->ctb_parts[i]; + + LIBCFS_ALLOC(part->cpt_cpumask, cpumask_size()); + LIBCFS_ALLOC(part->cpt_nodemask, sizeof(*part->cpt_nodemask)); + if (part->cpt_cpumask == NULL || part->cpt_nodemask == NULL) + goto failed; + } + + spin_lock(&cpt_data.cpt_lock); + /* Reserved for hotplug */ + cptab->ctb_version = cpt_data.cpt_version; + spin_unlock(&cpt_data.cpt_lock); + + return cptab; + + failed: + cfs_cpt_table_free(cptab); + return NULL; +} +EXPORT_SYMBOL(cfs_cpt_table_alloc); + +int +cfs_cpt_table_print(struct cfs_cpt_table *cptab, char *buf, int len) +{ + char *tmp = buf; + int rc = 0; + int i; + int j; + + for (i = 0; i < cptab->ctb_nparts; i++) { + if (len > 0) { + rc = snprintf(tmp, len, "%d\t: ", i); + len -= rc; + } + + if (len <= 0) { + rc = -EFBIG; + goto out; + } + + tmp += rc; + for_each_cpu_mask(j, *cptab->ctb_parts[i].cpt_cpumask) { + rc = snprintf(tmp, len, "%d ", j); + len -= rc; + if (len <= 0) { + rc = -EFBIG; + goto out; + } + tmp += rc; + } + + *tmp = '\n'; + tmp++; + len--; + } + + out: + if (rc < 0) + return rc; + + return tmp - buf; +} +EXPORT_SYMBOL(cfs_cpt_table_print); + +int +cfs_cpt_number(struct cfs_cpt_table *cptab) +{ + return cptab->ctb_nparts; +} +EXPORT_SYMBOL(cfs_cpt_number); + +int +cfs_cpt_weight(struct cfs_cpt_table *cptab, int cpt) +{ + LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts)); + + return cpt == CFS_CPT_ANY ? + cpus_weight(*cptab->ctb_cpumask) : + cpus_weight(*cptab->ctb_parts[cpt].cpt_cpumask); +} +EXPORT_SYMBOL(cfs_cpt_weight); + +int +cfs_cpt_online(struct cfs_cpt_table *cptab, int cpt) +{ + LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts)); + + return cpt == CFS_CPT_ANY ? + any_online_cpu(*cptab->ctb_cpumask) != NR_CPUS : + any_online_cpu(*cptab->ctb_parts[cpt].cpt_cpumask) != NR_CPUS; +} +EXPORT_SYMBOL(cfs_cpt_online); + +cpumask_t * +cfs_cpt_cpumask(struct cfs_cpt_table *cptab, int cpt) +{ + LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts)); + + return cpt == CFS_CPT_ANY ? + cptab->ctb_cpumask : cptab->ctb_parts[cpt].cpt_cpumask; +} +EXPORT_SYMBOL(cfs_cpt_cpumask); + +nodemask_t * +cfs_cpt_nodemask(struct cfs_cpt_table *cptab, int cpt) +{ + LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts)); + + return cpt == CFS_CPT_ANY ? 
+ cptab->ctb_nodemask : cptab->ctb_parts[cpt].cpt_nodemask; +} +EXPORT_SYMBOL(cfs_cpt_nodemask); + +int +cfs_cpt_set_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu) +{ + int node; + + LASSERT(cpt >= 0 && cpt < cptab->ctb_nparts); + + if (cpu < 0 || cpu >= NR_CPUS || !cpu_online(cpu)) { + CDEBUG(D_INFO, "CPU %d is invalid or it's offline\n", cpu); + return 0; + } + + if (cptab->ctb_cpu2cpt[cpu] != -1) { + CDEBUG(D_INFO, "CPU %d is already in partition %d\n", + cpu, cptab->ctb_cpu2cpt[cpu]); + return 0; + } + + cptab->ctb_cpu2cpt[cpu] = cpt; + + LASSERT(!cpu_isset(cpu, *cptab->ctb_cpumask)); + LASSERT(!cpu_isset(cpu, *cptab->ctb_parts[cpt].cpt_cpumask)); + + cpu_set(cpu, *cptab->ctb_cpumask); + cpu_set(cpu, *cptab->ctb_parts[cpt].cpt_cpumask); + + node = cpu_to_node(cpu); + + /* first CPU of @node in this CPT table */ + if (!node_isset(node, *cptab->ctb_nodemask)) + node_set(node, *cptab->ctb_nodemask); + + /* first CPU of @node in this partition */ + if (!node_isset(node, *cptab->ctb_parts[cpt].cpt_nodemask)) + node_set(node, *cptab->ctb_parts[cpt].cpt_nodemask); + + return 1; +} +EXPORT_SYMBOL(cfs_cpt_set_cpu); + +void +cfs_cpt_unset_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu) +{ + int node; + int i; + + LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts)); + + if (cpu < 0 || cpu >= NR_CPUS) { + CDEBUG(D_INFO, "Invalid CPU id %d\n", cpu); + return; + } + + if (cpt == CFS_CPT_ANY) { + /* caller doesn't know the partition ID */ + cpt = cptab->ctb_cpu2cpt[cpu]; + if (cpt < 0) { /* not set in this CPT-table */ + CDEBUG(D_INFO, "Try to unset cpu %d which is " + "not in CPT-table %p\n", cpt, cptab); + return; + } + + } else if (cpt != cptab->ctb_cpu2cpt[cpu]) { + CDEBUG(D_INFO, + "CPU %d is not in cpu-partition %d\n", cpu, cpt); + return; + } + + LASSERT(cpu_isset(cpu, *cptab->ctb_parts[cpt].cpt_cpumask)); + LASSERT(cpu_isset(cpu, *cptab->ctb_cpumask)); + + cpu_clear(cpu, *cptab->ctb_parts[cpt].cpt_cpumask); + cpu_clear(cpu, *cptab->ctb_cpumask); + cptab->ctb_cpu2cpt[cpu] = -1; + + node = cpu_to_node(cpu); + + LASSERT(node_isset(node, *cptab->ctb_parts[cpt].cpt_nodemask)); + LASSERT(node_isset(node, *cptab->ctb_nodemask)); + + for_each_cpu_mask(i, *cptab->ctb_parts[cpt].cpt_cpumask) { + /* this CPT has other CPU belonging to this node? */ + if (cpu_to_node(i) == node) + break; + } + + if (i == NR_CPUS) + node_clear(node, *cptab->ctb_parts[cpt].cpt_nodemask); + + for_each_cpu_mask(i, *cptab->ctb_cpumask) { + /* this CPT-table has other CPU belonging to this node? 
*/ + if (cpu_to_node(i) == node) + break; + } + + if (i == NR_CPUS) + node_clear(node, *cptab->ctb_nodemask); + + return; +} +EXPORT_SYMBOL(cfs_cpt_unset_cpu); + +int +cfs_cpt_set_cpumask(struct cfs_cpt_table *cptab, int cpt, cpumask_t *mask) +{ + int i; + + if (cpus_weight(*mask) == 0 || any_online_cpu(*mask) == NR_CPUS) { + CDEBUG(D_INFO, "No online CPU is found in the CPU mask " + "for CPU partition %d\n", cpt); + return 0; + } + + for_each_cpu_mask(i, *mask) { + if (!cfs_cpt_set_cpu(cptab, cpt, i)) + return 0; + } + + return 1; +} +EXPORT_SYMBOL(cfs_cpt_set_cpumask); + +void +cfs_cpt_unset_cpumask(struct cfs_cpt_table *cptab, int cpt, cpumask_t *mask) +{ + int i; + + for_each_cpu_mask(i, *mask) + cfs_cpt_unset_cpu(cptab, cpt, i); +} +EXPORT_SYMBOL(cfs_cpt_unset_cpumask); + +int +cfs_cpt_set_node(struct cfs_cpt_table *cptab, int cpt, int node) +{ + cpumask_t *mask; + int rc; + + if (node < 0 || node >= MAX_NUMNODES) { + CDEBUG(D_INFO, + "Invalid NUMA id %d for CPU partition %d\n", node, cpt); + return 0; + } + + down(&cpt_data.cpt_mutex); + + mask = cpt_data.cpt_cpumask; + cfs_node_to_cpumask(node, mask); + + rc = cfs_cpt_set_cpumask(cptab, cpt, mask); + + up(&cpt_data.cpt_mutex); + + return rc; +} +EXPORT_SYMBOL(cfs_cpt_set_node); + +void +cfs_cpt_unset_node(struct cfs_cpt_table *cptab, int cpt, int node) +{ + cpumask_t *mask; + + if (node < 0 || node >= MAX_NUMNODES) { + CDEBUG(D_INFO, + "Invalid NUMA id %d for CPU partition %d\n", node, cpt); + return; + } + + down(&cpt_data.cpt_mutex); + + mask = cpt_data.cpt_cpumask; + cfs_node_to_cpumask(node, mask); + + cfs_cpt_unset_cpumask(cptab, cpt, mask); + + up(&cpt_data.cpt_mutex); +} +EXPORT_SYMBOL(cfs_cpt_unset_node); + +int +cfs_cpt_set_nodemask(struct cfs_cpt_table *cptab, int cpt, nodemask_t *mask) +{ + int i; + + for_each_node_mask(i, *mask) { + if (!cfs_cpt_set_node(cptab, cpt, i)) + return 0; + } + + return 1; +} +EXPORT_SYMBOL(cfs_cpt_set_nodemask); + +void +cfs_cpt_unset_nodemask(struct cfs_cpt_table *cptab, int cpt, nodemask_t *mask) +{ + int i; + + for_each_node_mask(i, *mask) + cfs_cpt_unset_node(cptab, cpt, i); +} +EXPORT_SYMBOL(cfs_cpt_unset_nodemask); + +void +cfs_cpt_clear(struct cfs_cpt_table *cptab, int cpt) +{ + int last; + int i; + + if (cpt == CFS_CPT_ANY) { + last = cptab->ctb_nparts - 1; + cpt = 0; + } else { + last = cpt; + } + + for (; cpt <= last; cpt++) { + for_each_cpu_mask(i, *cptab->ctb_parts[cpt].cpt_cpumask) + cfs_cpt_unset_cpu(cptab, cpt, i); + } +} +EXPORT_SYMBOL(cfs_cpt_clear); + +int +cfs_cpt_spread_node(struct cfs_cpt_table *cptab, int cpt) +{ + nodemask_t *mask; + int weight; + int rotor; + int node; + + /* convert CPU partition ID to HW node id */ + + if (cpt < 0 || cpt >= cptab->ctb_nparts) { + mask = cptab->ctb_nodemask; + rotor = cptab->ctb_spread_rotor++; + } else { + mask = cptab->ctb_parts[cpt].cpt_nodemask; + rotor = cptab->ctb_parts[cpt].cpt_spread_rotor++; + } + + weight = nodes_weight(*mask); + LASSERT(weight > 0); + + rotor %= weight; + + for_each_node_mask(node, *mask) { + if (rotor-- == 0) + return node; + } + + LBUG(); + return 0; +} +EXPORT_SYMBOL(cfs_cpt_spread_node); + +int +cfs_cpt_current(struct cfs_cpt_table *cptab, int remap) +{ + int cpu = smp_processor_id(); + int cpt = cptab->ctb_cpu2cpt[cpu]; + + if (cpt < 0) { + if (!remap) + return cpt; + + /* don't return negative value for safety of upper layer, + * instead we shadow the unknown cpu to a valid partition ID */ + cpt = cpu % cptab->ctb_nparts; + } + + return cpt; +} +EXPORT_SYMBOL(cfs_cpt_current); + +int 
+cfs_cpt_of_cpu(struct cfs_cpt_table *cptab, int cpu) +{ + LASSERT(cpu >= 0 && cpu < NR_CPUS); + + return cptab->ctb_cpu2cpt[cpu]; +} +EXPORT_SYMBOL(cfs_cpt_of_cpu); + +int +cfs_cpt_bind(struct cfs_cpt_table *cptab, int cpt) +{ + cpumask_t *cpumask; + nodemask_t *nodemask; + int rc; + int i; + + LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts)); + + if (cpt == CFS_CPT_ANY) { + cpumask = cptab->ctb_cpumask; + nodemask = cptab->ctb_nodemask; + } else { + cpumask = cptab->ctb_parts[cpt].cpt_cpumask; + nodemask = cptab->ctb_parts[cpt].cpt_nodemask; + } + + if (any_online_cpu(*cpumask) == NR_CPUS) { + CERROR("No online CPU found in CPU partition %d, did someone " + "do CPU hotplug on system? You might need to reload " + "Lustre modules to keep system working well.\n", cpt); + return -EINVAL; + } + + for_each_online_cpu(i) { + if (cpu_isset(i, *cpumask)) + continue; + +#ifdef HAVE_SET_CPUS_ALLOWED + rc = set_cpus_allowed(cfs_current(), *cpumask); +#else + rc = set_cpus_allowed_ptr(cfs_current(), cpumask); +#endif +#ifdef HAVE_SET_MEMS_ALLOWED + set_mems_allowed(*nodemask); +#endif + if (rc == 0) + cfs_schedule(); /* switch to allowed CPU */ + + return rc; + } + + /* don't need to set affinity baecause all online CPUs are covered */ + return 0; +} +EXPORT_SYMBOL(cfs_cpt_bind); + +/** + * Choose max to \a number CPUs from \a node and set them in \a cpt. + * We always prefer to choose CPU in the same core/socket. + */ +static int +cfs_cpt_choose_ncpus(struct cfs_cpt_table *cptab, int cpt, + cpumask_t *node, int number) +{ + cpumask_t *socket = NULL; + cpumask_t *core = NULL; + int rc = 0; + int cpu; + + LASSERT(number > 0); + + if (number >= cpus_weight(*node)) { + while (!cpus_empty(*node)) { + cpu = first_cpu(*node); + + rc = cfs_cpt_set_cpu(cptab, cpt, cpu); + if (!rc) + return -EINVAL; + cpu_clear(cpu, *node); + } + return 0; + } + + /* allocate scratch buffer */ + LIBCFS_ALLOC(socket, cpumask_size()); + LIBCFS_ALLOC(core, cpumask_size()); + if (socket == NULL || core == NULL) { + rc = -ENOMEM; + goto out; + } + + while (!cpus_empty(*node)) { + cpu = first_cpu(*node); + + /* get cpumask for cores in the same socket */ + cfs_cpu_core_siblings(cpu, socket); + cpus_and(*socket, *socket, *node); + + LASSERT(!cpus_empty(*socket)); + + while (!cpus_empty(*socket)) { + int i; + + /* get cpumask for hts in the same core */ + cfs_cpu_ht_siblings(cpu, core); + cpus_and(*core, *core, *node); + + LASSERT(!cpus_empty(*core)); + + for_each_cpu_mask(i, *core) { + cpu_clear(i, *socket); + cpu_clear(i, *node); + + rc = cfs_cpt_set_cpu(cptab, cpt, i); + if (!rc) { + rc = -EINVAL; + goto out; + } + + if (--number == 0) + goto out; + } + cpu = first_cpu(*socket); + } + } + + out: + if (socket != NULL) + LIBCFS_FREE(socket, cpumask_size()); + if (core != NULL) + LIBCFS_FREE(core, cpumask_size()); + return rc; +} + +#define CPT_WEIGHT_MIN 4u + +static unsigned int +cfs_cpt_num_estimate(void) +{ + unsigned nnode = num_online_nodes(); + unsigned ncpu = num_online_cpus(); + unsigned ncpt; + + if (ncpu <= CPT_WEIGHT_MIN) { + ncpt = 1; + goto out; + } + + /* generate reasonable number of CPU partitions based on total number + * of CPUs, Preferred N should be power2 and match this condition: + * 2 * (N - 1)^2 < NCPUS <= 2 * N^2 */ + for (ncpt = 2; ncpu > 2 * ncpt * ncpt; ncpt <<= 1) {} + + if (ncpt <= nnode) { /* fat numa system */ + while (nnode > ncpt) + nnode >>= 1; + + } else { /* ncpt > nnode */ + while ((nnode << 1) <= ncpt) + nnode <<= 1; + } + + ncpt = nnode; + + out: +#if (BITS_PER_LONG == 32) 
+ /* config many CPU partitions on 32-bit system could consume + * too much memory */ + ncpt = min(2U, ncpt); +#endif + while (ncpu % ncpt != 0) + ncpt--; /* worst case is 1 */ + + return ncpt; +} + +static struct cfs_cpt_table * +cfs_cpt_table_create(int ncpt) +{ + struct cfs_cpt_table *cptab = NULL; + cpumask_t *mask = NULL; + int cpt = 0; + int num; + int rc; + int i; + + rc = cfs_cpt_num_estimate(); + if (ncpt <= 0) + ncpt = rc; + + if (ncpt > rc) { + CWARN("CPU partition number %d is larger than suggested " + "value(%d), your system may have performance" + "issue or run out of memory while under pressure\n", + ncpt, rc); + } + + if (num_online_cpus() % ncpt != 0) { + CERROR("CPU number %d is not multiple of cpu_npartition %d, " + "please try different cpu_npartitions value or" + "set pattern string by cpu_pattern=STRING\n", + (int)num_online_cpus(), ncpt); + goto failed; + } + + cptab = cfs_cpt_table_alloc(ncpt); + if (cptab == NULL) { + CERROR("Failed to allocate CPU map(%d)\n", ncpt); + goto failed; + } + + num = num_online_cpus() / ncpt; + if (num == 0) { + CERROR("CPU changed while setting CPU partition\n"); + goto failed; + } + + LIBCFS_ALLOC(mask, cpumask_size()); + if (mask == NULL) { + CERROR("Failed to allocate scratch cpumask\n"); + goto failed; + } + + for_each_online_node(i) { + cfs_node_to_cpumask(i, mask); + + while (!cpus_empty(*mask)) { + struct cfs_cpu_partition *part; + int n; + + if (cpt >= ncpt) + goto failed; + + part = &cptab->ctb_parts[cpt]; + + n = num - cpus_weight(*part->cpt_cpumask); + LASSERT(n > 0); + + rc = cfs_cpt_choose_ncpus(cptab, cpt, mask, n); + if (rc < 0) + goto failed; + + LASSERT(num >= cpus_weight(*part->cpt_cpumask)); + if (num == cpus_weight(*part->cpt_cpumask)) + cpt++; + } + } + + if (cpt != ncpt || + num != cpus_weight(*cptab->ctb_parts[ncpt - 1].cpt_cpumask)) { + CERROR("Expect %d(%d) CPU partitions but got %d(%d), " + "CPU hotplug/unplug while setting?\n", + cptab->ctb_nparts, num, cpt, + cpus_weight(*cptab->ctb_parts[ncpt - 1].cpt_cpumask)); + goto failed; + } + + LIBCFS_FREE(mask, cpumask_size()); + + return cptab; + + failed: + CERROR("Failed to setup CPU-partition-table with %d " + "CPU-partitions, online HW nodes: %d, HW cpus: %d.\n", + ncpt, num_online_nodes(), num_online_cpus()); + + if (mask != NULL) + LIBCFS_FREE(mask, cpumask_size()); + + if (cptab != NULL) + cfs_cpt_table_free(cptab); + + return NULL; +} + +static struct cfs_cpt_table * +cfs_cpt_table_create_pattern(char *pattern) +{ + struct cfs_cpt_table *cptab; + char *str = pattern; + int node = 0; + int high; + int ncpt; + int c; + + for (ncpt = 0;; ncpt++) { /* quick scan bracket */ + str = strchr(str, '['); + if (str == NULL) + break; + str++; + } + + str = cfs_trimwhite(pattern); + if (*str == 'n' || *str == 'N') { + pattern = str + 1; + node = 1; + } + + if (ncpt == 0 || + (node && ncpt > num_online_nodes()) || + (!node && ncpt > num_online_cpus())) { + CERROR("Invalid pattern %s, or too many partitions %d\n", + pattern, ncpt); + return NULL; + } + + high = node ? 
MAX_NUMNODES - 1 : NR_CPUS - 1; + + cptab = cfs_cpt_table_alloc(ncpt); + if (cptab == NULL) { + CERROR("Failed to allocate cpu partition table\n"); + return NULL; + } + + for (str = cfs_trimwhite(pattern), c = 0;; c++) { + struct cfs_range_expr *range; + struct cfs_expr_list *el; + char *bracket = strchr(str, '['); + int cpt; + int rc; + int i; + int n; + + if (bracket == NULL) { + if (*str != 0) { + CERROR("Invalid pattern %s\n", str); + goto failed; + } else if (c != ncpt) { + CERROR("expect %d partitions but found %d\n", + ncpt, c); + goto failed; + } + break; + } + + if (sscanf(str, "%u%n", &cpt, &n) < 1) { + CERROR("Invalid cpu pattern %s\n", str); + goto failed; + } + + if (cpt < 0 || cpt >= ncpt) { + CERROR("Invalid partition id %d, total partitions %d\n", + cpt, ncpt); + goto failed; + } + + if (cfs_cpt_weight(cptab, cpt) != 0) { + CERROR("Partition %d has already been set.\n", cpt); + goto failed; + } + + str = cfs_trimwhite(str + n); + if (str != bracket) { + CERROR("Invalid pattern %s\n", str); + goto failed; + } + + bracket = strchr(str, ']'); + if (bracket == NULL) { + CERROR("missing right bracket for cpt %d, %s\n", + cpt, str); + goto failed; + } + + if (cfs_expr_list_parse(str, (bracket - str) + 1, + 0, high, &el) != 0) { + CERROR("Can't parse number range: %s\n", str); + goto failed; + } + + cfs_list_for_each_entry(range, &el->el_exprs, re_link) { + for (i = range->re_lo; i <= range->re_hi; i++) { + if ((i - range->re_lo) % range->re_stride != 0) + continue; + + rc = node ? cfs_cpt_set_node(cptab, cpt, i) : + cfs_cpt_set_cpu(cptab, cpt, i); + if (!rc) { + cfs_expr_list_free(el); + goto failed; + } + } + } + + cfs_expr_list_free(el); + + if (!cfs_cpt_online(cptab, cpt)) { + CERROR("No online CPU is found on partition %d\n", cpt); + goto failed; + } + + str = cfs_trimwhite(bracket + 1); + } + + return cptab; + + failed: + cfs_cpt_table_free(cptab); + return NULL; +} + +#ifdef CONFIG_HOTPLUG_CPU +static int +cfs_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + + switch (action) { + case CPU_DEAD: + case CPU_DEAD_FROZEN: + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + spin_lock(&cpt_data.cpt_lock); + cpt_data.cpt_version++; + spin_unlock(&cpt_data.cpt_lock); + default: + CWARN("Lustre: can't support CPU hotplug well now, " + "performance and stability could be impacted" + "[CPU %u notify: %lx]\n", cpu, action); + } + + return NOTIFY_OK; +} + +static struct notifier_block cfs_cpu_notifier = { + .notifier_call = cfs_cpu_notify, + .priority = 0 +}; + +#endif + +void +cfs_cpu_fini(void) +{ + if (cfs_cpt_table != NULL) + cfs_cpt_table_free(cfs_cpt_table); + +#ifdef CONFIG_HOTPLUG_CPU + unregister_hotcpu_notifier(&cfs_cpu_notifier); +#endif + if (cpt_data.cpt_cpumask != NULL) + LIBCFS_FREE(cpt_data.cpt_cpumask, cpumask_size()); +} + +int +cfs_cpu_init(void) +{ + LASSERT(cfs_cpt_table == NULL); + + memset(&cpt_data, 0, sizeof(cpt_data)); + + LIBCFS_ALLOC(cpt_data.cpt_cpumask, cpumask_size()); + if (cpt_data.cpt_cpumask == NULL) { + CERROR("Failed to allocate scratch buffer\n"); + return -1; + } + + spin_lock_init(&cpt_data.cpt_lock); + sema_init(&cpt_data.cpt_mutex, 1); + +#ifdef CONFIG_HOTPLUG_CPU + register_hotcpu_notifier(&cfs_cpu_notifier); +#endif + + if (*cpu_pattern != 0) { + cfs_cpt_table = cfs_cpt_table_create_pattern(cpu_pattern); + if (cfs_cpt_table == NULL) { + CERROR("Failed to create cptab from pattern %s\n", + cpu_pattern); + goto failed; + } + + } else { + cfs_cpt_table = 
cfs_cpt_table_create(cpu_npartitions); + if (cfs_cpt_table == NULL) { + CERROR("Failed to create ptable with npartitions %d\n", + cpu_npartitions); + goto failed; + } + } + + spin_lock(&cpt_data.cpt_lock); + if (cfs_cpt_table->ctb_version != cpt_data.cpt_version) { + spin_unlock(&cpt_data.cpt_lock); + CERROR("CPU hotplug/unplug during setup\n"); + goto failed; + } + spin_unlock(&cpt_data.cpt_lock); + + LCONSOLE(0, "HW CPU cores: %d, npartitions: %d\n", + num_online_cpus(), cfs_cpt_number(cfs_cpt_table)); + return 0; + + failed: + cfs_cpu_fini(); + return -1; +} + +#endif diff --git a/libcfs/libcfs/linux/linux-proc.c b/libcfs/libcfs/linux/linux-proc.c index c6c81e4..e82b675 100644 --- a/libcfs/libcfs/linux/linux-proc.c +++ b/libcfs/libcfs/linux/linux-proc.c @@ -97,6 +97,7 @@ enum { PSDEV_CONSOLE_BACKOFF, /* delay increase factor */ PSDEV_DEBUG_PATH, /* crashdump log location */ PSDEV_DEBUG_DUMP_PATH, /* crashdump tracelog location */ + PSDEV_CPT_TABLE, /* information about cpu partitions */ PSDEV_LNET_UPCALL, /* User mode upcall script */ PSDEV_LNET_MEMUSED, /* bytes currently PORTAL_ALLOCated */ PSDEV_LNET_CATASTROPHE, /* if we have LBUGged or panic'd */ @@ -121,6 +122,7 @@ enum { #define PSDEV_CONSOLE_BACKOFF CTL_UNNUMBERED #define PSDEV_DEBUG_PATH CTL_UNNUMBERED #define PSDEV_DEBUG_DUMP_PATH CTL_UNNUMBERED +#define PSDEV_CPT_TABLE CTL_UNNUMBERED #define PSDEV_LNET_UPCALL CTL_UNNUMBERED #define PSDEV_LNET_MEMUSED CTL_UNNUMBERED #define PSDEV_LNET_CATASTROPHE CTL_UNNUMBERED @@ -356,6 +358,48 @@ int LL_PROC_PROTO(proc_fail_loc) return rc; } +static int __proc_cpt_table(void *data, int write, + loff_t pos, void *buffer, int nob) +{ + char *buf = NULL; + int len = 4096; + int rc = 0; + + if (write) + return -EPERM; + + LASSERT(cfs_cpt_table != NULL); + + while (1) { + LIBCFS_ALLOC(buf, len); + if (buf == NULL) + return -ENOMEM; + + rc = cfs_cpt_table_print(cfs_cpt_table, buf, len); + if (rc >= 0) + break; + + LIBCFS_FREE(buf, len); + if (rc == -EFBIG) { + len <<= 1; + continue; + } + goto out; + } + + if (pos >= rc) { + rc = 0; + goto out; + } + + rc = cfs_trace_copyout_string(buffer, nob, buf + pos, NULL); + out: + if (buf != NULL) + LIBCFS_FREE(buf, len); + return rc; +} +DECLARE_PROC_HANDLER(proc_cpt_table) + static cfs_sysctl_table_t lnet_table[] = { /* * NB No .strategy entries have been provided since sysctl(8) prefers @@ -424,6 +468,14 @@ static cfs_sysctl_table_t lnet_table[] = { .proc_handler = &proc_dostring, }, + { + .ctl_name = PSDEV_CPT_TABLE, + .procname = "cpu_partition_table", + .maxlen = 128, + .mode = 0444, + .proc_handler = &proc_cpt_table, + }, + { INIT_CTL_NAME(PSDEV_LNET_UPCALL) .procname = "upcall", diff --git a/libcfs/libcfs/module.c b/libcfs/libcfs/module.c index 2529b38..757723e 100644 --- a/libcfs/libcfs/module.c +++ b/libcfs/libcfs/module.c @@ -392,6 +392,10 @@ static int init_libcfs_module(void) return (rc); } + rc = cfs_cpu_init(); + if (rc != 0) + goto cleanup_debug; + #if LWT_SUPPORT rc = lwt_init(); if (rc != 0) { @@ -427,8 +431,8 @@ static int init_libcfs_module(void) cleanup_lwt: #if LWT_SUPPORT lwt_fini(); - cleanup_debug: #endif + cleanup_debug: libcfs_debug_cleanup(); return rc; } @@ -450,6 +454,7 @@ static void exit_libcfs_module(void) #if LWT_SUPPORT lwt_fini(); #endif + cfs_cpu_fini(); if (cfs_atomic_read(&libcfs_kmemory) != 0) CERROR("Portals memory leaked: %d bytes\n", -- 1.8.3.1
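Two reading aids for the patch above. First, cfs_cpt_table_print() is exposed through the new cpu_partition_table proc entry registered in linux-proc.c and emits one line per partition of the form "<cpt>\t: <cpu> <cpu> ...", so the chosen layout can be inspected after module load. Second, the partition-count heuristic in cfs_cpt_num_estimate() is easiest to follow with numbers; the sketch below mirrors its logic (smallest power-of-two N >= 2 with NCPUS <= 2*N^2, reconciled with the NUMA node count, then trimmed until it divides the CPU count) as standalone userspace C so the results are easy to check. It is an illustration only: estimate_ncpt() is not part of the patch, and the 32-bit clamp (ncpt = min(2U, ncpt)) is omitted.

#include <stdio.h>

#define CPT_WEIGHT_MIN 4u

/* mirrors the heuristic of cfs_cpt_num_estimate() for illustration */
static unsigned int estimate_ncpt(unsigned int ncpu, unsigned int nnode)
{
        unsigned int ncpt;

        if (ncpu <= CPT_WEIGHT_MIN)
                return 1;

        /* smallest power of two N >= 2 such that ncpu <= 2 * N^2 */
        for (ncpt = 2; ncpu > 2 * ncpt * ncpt; ncpt <<= 1)
                ;

        if (ncpt <= nnode) {            /* fat NUMA system */
                while (nnode > ncpt)
                        nnode >>= 1;
        } else {                        /* ncpt > nnode */
                while ((nnode << 1) <= ncpt)
                        nnode <<= 1;
        }
        ncpt = nnode;

        while (ncpu % ncpt != 0)        /* worst case is 1 */
                ncpt--;

        return ncpt;
}

int main(void)
{
        /* 16 cores on 2 nodes -> 4 partitions; 8 cores on 4 nodes -> 2 */
        printf("%u\n", estimate_ncpt(16, 2));   /* prints 4 */
        printf("%u\n", estimate_ncpt(8, 4));    /* prints 2 */
        return 0;
}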