*
* . User can also specify CPU partitions by string pattern
*
- * Examples: cpu_partitions="0[0,1], 1[2,3]"
- * cpu_partitions="N 0[0-3], 1[4-8]"
+ * Examples: cpu_pattern="0[0,1] 1[2,3]"
+ * cpu_pattern="N 0[0-3] 1[4-8]"
+ * cpu_pattern="C[0-3]"
*
- * The first character "N" means following numbers are numa ID
+ * The first character "N" means following numbers are NUMA ID.
+ *
+ * The first character "C" means the relative cores are excluded from each
+ * NUMA node. This allows reserving cores on each node for non-Lustre tasks,
+ * such as HA/monitors.
*
* . NUMA allocators, CPU affinity threads are built over CPU partitions,
* instead of HW CPUs or HW nodes.
*
* . By default, Lustre modules should refer to the global cfs_cpt_tab,
* instead of accessing HW CPUs directly, so concurrency of Lustre can be
- * configured by cpu_npartitions of the global cfs_cpt_tab
+ * configured by cpu_npartitions of the global cfs_cpt_tab. This is
+ * equivalent to specifying cpu_pattern="N"
*
- * . If cpu_npartitions=1(all CPUs in one pool), lustre should work the
+ * . If cpu_npartitions=1 (all CPUs in one pool), Lustre should work the
* same way as 2.2 or earlier versions
*
* Author: liang@whamcloud.com
int cfs_cpt_bind(struct cfs_cpt_table *cptab, int cpt);
/**
* add \a cpu to CPU partition @cpt of \a cptab, return 1 for success,
- * otherwise 0 is returned
+ * otherwise return 0
*/
int cfs_cpt_set_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu);
/**
*/
void cfs_cpt_unset_node(struct cfs_cpt_table *cptab, int cpt, int node);
/**
+ * add the CPUs of every online NUMA node whose node-relative core number
+ * is within [\a include_lo, \a include_hi] to CPU partition \a cpt,
+ * return 1 if successfully set selected node cores, otherwise return 0
+ */
+int cfs_cpt_set_node_core(struct cfs_cpt_table *cptab, int cpt,
+ int include_lo, int include_hi);
+/**
+ * remove the CPUs of every online NUMA node whose node-relative core number
+ * is within [\a exclude_lo, \a exclude_hi] from CPU partition \a cpt
+ */
+void cfs_cpt_unset_node_core(struct cfs_cpt_table *cptab, int cpt,
+ int exclude_lo, int exclude_hi);
+/**
* add all cpus in node mask \a mask to CPU partition \a cpt
* return 1 if successfully set all CPUs, otherwise return 0
*/
/**
* modparam for setting CPU partitions patterns:
*
- * i.e: "0[0,1,2,3] 1[4,5,6,7]", number before bracket is CPU partition ID,
+ * i.e: "0[0-3] 1[4,5,7]", number before bracket is CPU partition ID,
* number in bracket is processor ID (core or HT)
*
- * i.e: "N 0[0,1] 1[2,3]" the first character 'N' means numbers in bracket
+ * i.e: "N 0[0,1] 1[2,3]" the first character 'N' means numbers in bracket
* are NUMA node ID, number before bracket is CPU partition ID.
*
- * i.e: "N", shortcut expression to create CPT from NUMA & CPU topology
+ * i.e: "N C[0-1]" or "C[0-1]", the character 'C' means numbers in bracket are
+ * relative core numbers to exclude for each NUMA node, all other cores
+ * are included. As per the example, the first two cores of each NUMA node
+ * will be excluded, all other cores on all nodes are included.
+ *
+ * i.e: "N", shortcut expression to create CPT from NUMA & CPU topology
+ * This is the default behavior if the cpu_pattern and cpu_npartitions
+ * are not specified.
*
* NB: If user specified cpu_pattern, cpu_npartitions will be ignored
*/
int cfs_cpt_online(struct cfs_cpt_table *cptab, int cpt)
{
- LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));
+ LASSERTF(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts),
+ "cpt=%d, nparts=%d\n", cpt, cptab->ctb_nparts);
return cpt == CFS_CPT_ANY ?
cpumask_any_and(cptab->ctb_cpumask,
int cfs_cpt_set_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu)
{
- LASSERT(cpt >= 0 && cpt < cptab->ctb_nparts);
+ LASSERTF(cpt >= 0 && cpt < cptab->ctb_nparts, "cpt=%d, nparts=%d\n",
+ cpt, cptab->ctb_nparts);
if (cpu < 0 || cpu >= nr_cpu_ids || !cpu_online(cpu)) {
CDEBUG(D_INFO, "CPU %d is invalid or it's offline\n", cpu);
mask = cpumask_of_node(node);
- for_each_cpu(cpu, mask)
- cfs_cpt_add_cpu(cptab, cpt, cpu);
+ if (!cpumask_empty(mask))
+ for_each_cpu(cpu, mask) {
+ CDEBUG(D_INFO,
+ "set_node() cpu=%d cpt=%d\n", cpu, cpt);
+ cfs_cpt_add_cpu(cptab, cpt, cpu);
+ }
cfs_cpt_add_node(cptab, cpt, node);
mask = cpumask_of_node(node);
- for_each_cpu(cpu, mask)
- cfs_cpt_del_cpu(cptab, cpt, cpu);
+ if (!cpumask_empty(mask))
+ for_each_cpu(cpu, mask)
+ cfs_cpt_del_cpu(cptab, cpt, cpu);
cfs_cpt_del_node(cptab, cpt, node);
}
EXPORT_SYMBOL(cfs_cpt_unset_node);
+/**
+ * Add a range of node-relative cores from every online NUMA node to
+ * CPU partition \a cpt of \a cptab.
+ *
+ * For each online node that has CPUs, the first CPU visited in the node's
+ * cpumask is treated as relative core 0 of that node, and every CPU whose
+ * ID lies within [first + \a include_lo, first + \a include_hi] is added.
+ *
+ * NB: the range is compared against CPU IDs, so a node whose CPU IDs are
+ * not contiguous may contribute fewer CPUs than the range width.
+ *
+ * \param[in] cptab		CPU partition table to modify
+ * \param[in] cpt		partition the selected CPUs are added to
+ * \param[in] include_lo	lowest node-relative core number to add
+ * \param[in] include_hi	highest node-relative core number to add
+ *
+ * \retval 1	always
+ */
+int cfs_cpt_set_node_core(struct cfs_cpt_table *cptab, int cpt,
+			  int include_lo, int include_hi)
+{
+	const cpumask_t *mask;
+	int node, cpu;
+	int offset;
+
+	for_each_online_node(node) {
+		mask = cpumask_of_node(node);
+		if (cpumask_empty(mask))
+			continue;
+
+		/* restart relative numbering at this node's first CPU,
+		 * otherwise the range only ever matches the first node
+		 */
+		offset = -1;
+		for_each_cpu(cpu, mask) {
+			if (offset < 0)
+				offset = cpu;
+			if (include_lo + offset <= cpu &&
+			    include_hi + offset >= cpu)
+				cfs_cpt_add_cpu(cptab, cpt, cpu);
+		}
+	}
+
+	return 1;
+}
+EXPORT_SYMBOL(cfs_cpt_set_node_core);
+
+/**
+ * Remove a range of node-relative cores of every online NUMA node from
+ * CPU partition \a cpt of \a cptab.
+ *
+ * For each online node that has CPUs, the first CPU visited in the node's
+ * cpumask is treated as relative core 0 of that node, and every CPU whose
+ * ID lies within [first + \a exclude_lo, first + \a exclude_hi] is passed
+ * to cfs_cpt_del_cpu().
+ *
+ * NB: the range is compared against CPU IDs, so a node whose CPU IDs are
+ * not contiguous may lose fewer CPUs than the range width.
+ *
+ * \param[in] cptab		CPU partition table to modify
+ * \param[in] cpt		partition the selected CPUs are removed from
+ * \param[in] exclude_lo	lowest node-relative core number to remove
+ * \param[in] exclude_hi	highest node-relative core number to remove
+ */
+void cfs_cpt_unset_node_core(struct cfs_cpt_table *cptab, int cpt,
+			     int exclude_lo, int exclude_hi)
+{
+	const cpumask_t *mask;
+	int node, cpu;
+	int offset;
+
+	for_each_online_node(node) {
+		mask = cpumask_of_node(node);
+		if (cpumask_empty(mask))
+			continue;
+
+		/* restart relative numbering at this node's first CPU,
+		 * otherwise cores are only excluded on the first node
+		 */
+		offset = -1;
+		for_each_cpu(cpu, mask) {
+			if (offset < 0)
+				offset = cpu;
+			if (exclude_lo + offset <= cpu &&
+			    exclude_hi + offset >= cpu)
+				cfs_cpt_del_cpu(cptab, cpt, cpu);
+		}
+	}
+}
+EXPORT_SYMBOL(cfs_cpt_unset_node_core);
+
int cfs_cpt_set_nodemask(struct cfs_cpt_table *cptab, int cpt,
const nodemask_t *mask)
{
char *pattern_dup;
char *bracket;
char *str;
+ bool exclude = false;
int node = 0;
int ncpt = 0;
int cpt = 0;
- int high;
+ int high = 0;
int rc;
int c;
int i;
str++; /* skip 'N' char */
node = 1; /* NUMA pattern */
if (*str == '\0') {
- node = -1;
- for_each_online_node(i) {
- if (!cpumask_empty(cpumask_of_node(i)))
- ncpt++;
- }
- if (ncpt == 1) { /* single NUMA node */
+ if (cpu_npartitions) {
kfree(pattern_dup);
return cfs_cpt_table_create(cpu_npartitions);
}
+ node = -1;
+ for_each_online_node(i)
+ if (!cpumask_empty(cpumask_of_node(i)))
+ ncpt++;
}
+ str = strim(str);
}
- if (!ncpt) { /* scanning bracket which is mark of partition */
+ if (*str == 'c' || *str == 'C') {
+ str++; /* skip 'C' char */
+ exclude = true;
+ node = -1; /* initialize all nodes to be set */
+ for_each_online_node(i)
+ if (!cpumask_empty(cpumask_of_node(i)))
+ ncpt++;
+ } else if (!ncpt) { /* scan for bracket at start of partition */
bracket = str;
while ((bracket = strchr(bracket, '['))) {
bracket++;
ncpt++;
}
- }
-
- if (!ncpt ||
- (node && ncpt > num_online_nodes()) ||
- (!node && ncpt > num_online_cpus())) {
- CERROR("Invalid pattern '%s', or too many partitions %d\n",
- pattern_dup, ncpt);
- rc = -EINVAL;
- goto err_free_str;
+ if ((!ncpt && !exclude) ||
+ (node && ncpt > num_online_nodes()) ||
+ (!node && ncpt > num_online_cpus())) {
+ CERROR("Invalid pattern '%s', or too many partitions %d\n",
+ pattern_dup, ncpt);
+ rc = -EINVAL;
+ goto err_free_str;
+ }
}
cptab = cfs_cpt_table_alloc(ncpt);
rc = -EINVAL;
goto err_free_table;
}
+
+ if (exclude) {
+ c = 0;
+ for_each_cpu(rc, cpumask_of_node(i))
+ c++;
+ if (high == 0 || c < high)
+ high = c;
+ }
+ }
+ if (!exclude) {
+ kfree(pattern_dup);
+ return cptab;
}
- kfree(pattern_dup);
- return cptab;
}
- high = node ? nr_node_ids - 1 : nr_cpu_ids - 1;
+ if (!exclude)
+ high = node ? nr_node_ids - 1 : nr_cpu_ids - 1;
for (str = strim(str), c = 0; /* until break */; c++) {
struct cfs_range_expr *range;
bracket = strchr(str, '[');
if (!bracket) {
if (*str) {
- CERROR("Invalid pattern '%s'\n", str);
+ CERROR("Invalid pattern '%s'\n",
+ str);
rc = -EINVAL;
goto err_free_table;
- } else if (c != ncpt) {
+ } else if (!exclude && c != ncpt) {
CERROR("Expect %d partitions but found %d\n",
ncpt, c);
rc = -EINVAL;
break;
}
- if (sscanf(str, "%d%n", &cpt, &n) < 1) {
+ if (!exclude && sscanf(str, "%d%n", &cpt, &n) < 1) {
CERROR("Invalid CPU pattern '%s'\n", str);
rc = -EINVAL;
goto err_free_table;
}
- if (cpt < 0 || cpt >= ncpt) {
+ if (!exclude && (cpt < 0 || cpt >= ncpt)) {
CERROR("Invalid partition id %d, total partitions %d\n",
cpt, ncpt);
rc = -EINVAL;
goto err_free_table;
}
- if (cfs_cpt_weight(cptab, cpt)) {
+ if (!exclude && cfs_cpt_weight(cptab, cpt)) {
CERROR("Partition %d has already been set.\n", cpt);
rc = -EPERM;
goto err_free_table;
}
- str = strim(str + n);
- if (str != bracket) {
- CERROR("Invalid pattern '%s'\n", str);
+ str = exclude ? bracket : strim(str + n); /* jump to next '[' */
+ if (!exclude && str != bracket) {
+ CERROR("Invalid pattern '%s' does not start with '['\n",
+ str);
rc = -EINVAL;
goto err_free_table;
}
}
list_for_each_entry(range, &el->el_exprs, re_link) {
+ if (exclude && node) {
+ for (cpt = 0; cpt < ncpt; cpt++) {
+ cfs_cpt_unset_node_core(cptab, cpt,
+ range->re_lo,
+ range->re_hi);
+ if (!cfs_cpt_online(cptab, cpt)) {
+ CERROR("All cores are excluded on partition %d\n",
+ cpt);
+ rc = -ENODEV;
+ goto err_free_table;
+ }
+ }
+ continue;
+ }
+
for (i = range->re_lo; i <= range->re_hi; i++) {
if ((i - range->re_lo) % range->re_stride)
continue;
- rc = node ? cfs_cpt_set_node(cptab, cpt, i)
- : cfs_cpt_set_cpu(cptab, cpt, i);
+ rc = node ?
+ cfs_cpt_set_node(cptab, cpt, i)
+ : cfs_cpt_set_cpu(cptab, cpt, i);
+
if (!rc) {
cfs_expr_list_free(el);
rc = -EINVAL;
cfs_expr_list_free(el);
- if (!cfs_cpt_online(cptab, cpt)) {
+ if (!exclude && !cfs_cpt_online(cptab, cpt)) {
CERROR("No online CPU is found on partition %d\n", cpt);
rc = -ENODEV;
goto err_free_table;
local nostdevname=$(ostdevname $nost)
setupall
+ stack_trap "reformat_and_config"
test_mkdir -i 1 -c1 $DIR/$tdir || error "can't mkdir"
log "ADD OST$nost"
}
run_test 154 "expand .. on rename after MDT backup restore"
+# Stop all Lustre services and unload the modules so libcfs can be loaded
+# again with different options.  If any arguments are given they are the
+# saved libcfs module options, which are put back into MODOPTS_LIBCFS.
+cleanup_200() {
+	# callers pass the saved options unquoted, so a value holding several
+	# space-separated options arrives as several arguments: rejoin them
+	local modopts="$*"
+
+	stopall
+	$LUSTRE_RMMOD
+	[[ -z "$modopts" ]] || MODOPTS_LIBCFS="$modopts"
+}
+
+# Load libcfs with cpu_npartitions set to the CPU count reported by lscpu
+# and check that cpu_partition_table has one line per partition.
+test_200a() {
+	cleanup_200
+
+	local saved_modopts=$MODOPTS_LIBCFS
+	local ncpus=$(lscpu | awk '/^CPU.s.:/ {print $NF}')
+
+	stack_trap "cleanup_200 $saved_modopts"
+	MODOPTS_LIBCFS="cpu_npartitions=$ncpus"
+
+	load_modules_local libcfs
+	# show the table in the test log
+	$LCTL get_param -n cpu_partition_table
+
+	# compare against the value the module reports back
+	local result=$($LCTL get_param -n cpu_partition_table | wc -l)
+	local expected=$(cat /sys/module/libcfs/parameters/cpu_npartitions)
+
+	if ! (( $result == $expected )); then
+		error "CPU partitions not $expected, found: $result"
+	fi
+}
+run_test 200a "check CPU partitions"
+
+# Check cpu_pattern with explicit CPU lists: first place every CPU listed by
+# lscpu into partition 0, then (with at least 4 CPUs) only CPUs 1-2.
+test_200b() {
+	cleanup_200
+
+	local cpus=$(lscpu | awk '/^CPU.s.:/ {print $NF}')
+	local old_modopts=$MODOPTS_LIBCFS
+	stack_trap "cleanup_200 $old_modopts"
+
+	local pattern="0[$(lscpu | awk '/CPU.s. list:/ {print $NF}')]"
+	MODOPTS_LIBCFS="cpu_pattern=\"$pattern\""
+
+	load_modules_local libcfs
+	grep . /sys/module/libcfs/parameters/cpu*
+	$LCTL get_param -n cpu_partition_table
+	local expected=$cpus
+	local table=$($LCTL get_param -n cpu_partition_table)
+	# ignore partition num and ':'
+	local actual=$(( $(awk '{print NF; exit}' <<< "$table") - 2 ))
+
+	(( expected == actual )) || {
+		echo -e "layout wrong:\n$table"
+		error "partition 0 is missing CPUs from pattern: '$pattern'"
+	}
+
+	# $table must be quoted: unquoted, its newlines are collapsed and
+	# wc always reports a single line
+	(( $(echo "$table" | wc -l) == 1 )) || {
+		echo -e "layout wrong\n$table"
+		error "layout has too many partitions from pattern: '$pattern'"
+	}
+
+	(( cpus >= 4 )) || skip "need at least 4 cpu cores"
+	cleanup
+
+	pattern="0[1-2]"
+	MODOPTS_LIBCFS="cpu_pattern=\"$pattern\""
+
+	load_modules_local libcfs
+	$LCTL get_param -n cpu_partition_table
+	expected="0 : 1 2"
+	table=$($LCTL get_param -n cpu_partition_table)
+
+	# quote the right-hand side so it is compared literally, not as a glob
+	[[ "$table" == "$expected" ]] ||
+		error "CPU pattern not $expected, found: $table"
+}
+run_test 200b "set CPU pattern using core selection"
+
+# Check NUMA based cpu_pattern: "N" must create one partition per NUMA node
+# reported by lscpu, and "N 0[0]" must produce the same table as a pattern
+# listing the CPUs of NUMA node 0 explicitly.
+test_200c() {
+	cleanup_200
+
+	local nodes=$(lscpu | awk '/NUMA node.s.:/ {print $NF}')
+
+	local old_modopts=$MODOPTS_LIBCFS
+	stack_trap "cleanup_200 $old_modopts"
+
+	local pattern="N"
+	MODOPTS_LIBCFS="cpu_pattern=\"$pattern\""
+
+	load_modules_local libcfs
+	grep . /sys/module/libcfs/parameters/cpu*
+	$LCTL get_param -n cpu_partition_table
+	local expected=$nodes
+	local table=$($LCTL get_param -n cpu_partition_table)
+	# $table must be quoted: unquoted, its newlines are collapsed and
+	# wc always reports a single line
+	local actual=$(echo "$table" | wc -l)
+
+	(( actual == expected )) ||
+		error "CPU partitions not $expected, found: $actual"
+
+	cleanup
+
+	# reference layout: the CPUs of NUMA node 0 given as a CPU list
+	pattern="0[$(lscpu | awk '/^NUMA node0 CPU.s.:/ {print $NF}')]"
+	MODOPTS_LIBCFS="cpu_pattern=\"$pattern\""
+
+	load_modules_local libcfs
+	expected=$($LCTL get_param -n cpu_partition_table)
+
+	cleanup
+
+	pattern="N 0[0]"
+	MODOPTS_LIBCFS="cpu_pattern=\"$pattern\""
+
+	load_modules_local libcfs
+	$LCTL get_param -n cpu_partition_table
+	table=$($LCTL get_param -n cpu_partition_table)
+
+	# quote the right-hand side so it is compared literally, not as a glob
+	[[ "$table" == "$expected" ]] ||
+		error "CPU pattern not $expected, found: $table"
+}
+run_test 200c "set CPU pattern using NUMA node layout"
+
+# Check relative core exclusion: with "N C[0]" every partition of the "N"
+# layout must lose exactly one CPU, and that CPU must be the first one
+# listed for the partition in the "N" layout.
+test_200e() {
+	cleanup_200
+
+	local nodes=$(lscpu | awk '/NUMA node.s.:/ {print $NF}')
+
+	local old_modopts=$MODOPTS_LIBCFS
+	stack_trap "cleanup_200 $old_modopts"
+
+	local pattern="N"
+	MODOPTS_LIBCFS="cpu_pattern=\"$pattern\""
+
+	load_modules_local libcfs
+	echo "full_table:"
+	$LCTL get_param -n cpu_partition_table
+	local full_table=$($LCTL get_param -n cpu_partition_table)
+	# the tables are quoted in here-strings so their line structure is
+	# kept regardless of the bash version
+	(( $(awk '/0.:/ {print NF - 3; exit}' <<< "$full_table") > 0 )) ||
+		skip "need at least 2 cores in each CPT to exclude one"
+
+	cleanup
+
+	pattern="N C[0]"
+	MODOPTS_LIBCFS="cpu_pattern=\"$pattern\""
+
+	load_modules_local libcfs
+	echo "table:"
+	grep . /sys/module/libcfs/parameters/cpu*
+	$LCTL get_param -n cpu_partition_table
+	local table=$($LCTL get_param -n cpu_partition_table)
+
+	local expected
+	local actual
+	local excluded
+	local partition
+	local i
+
+	for (( i = 0; i < nodes; i++ )); do
+		# CPU count of the row: NF minus partition num and ':',
+		# minus one more in the full table for the excluded core
+		expected=$(awk '/'$i'.:/ {print NF - 3; exit}' \
+			<<< "$full_table")
+		actual=$(awk '/'$i'.:/ {print NF - 2; exit}' <<< "$table")
+
+		(( actual == expected )) ||
+			error "CPU count not $expected, found: $actual"
+
+		# first CPU of the row before and after the exclusion
+		excluded=$(awk '/'$i'.:/ {print $3; exit}' <<< "$full_table")
+		partition=$(awk '/'$i'.:/ {print $3; exit}' <<< "$table")
+
+		# compare whole CPU numbers; a substring match would treat
+		# e.g. CPU 1 as still present when the row starts with 10
+		[[ "$partition" != "$excluded" ]] || {
+			echo -e "layout wrong:\n$table"
+			error "excluded the wrong CPU with pattern: $pattern"
+		}
+	done
+}
+run_test 200e "set CPU pattern using relative core exclusion"
+
#
# (This was sanity/802a)
#