From: Frederick Dilger Date: Wed, 26 Jun 2024 22:16:56 +0000 (-0600) Subject: LU-17501 libcfs: allow CPT exclude list for cores X-Git-Tag: 2.16.51~120 X-Git-Url: https://git.whamcloud.com/?a=commitdiff_plain;h=111f5836ec;p=fs%2Flustre-release.git LU-17501 libcfs: allow CPT exclude list for cores Allow a relative CPU core exclude list to be specified when constructing the CPT distribution instead of having to specify all of the CPU cores except the ones that should be avoided. The CPU core numbers are the relative core numbers to each NUMA node, such that for each node, the Mth to Nth cores are excluded. The exclude list is set by specifying 'C' in /etc/modprobe.d/lustre.conf for the CPT number, with either a comma-separated list or a range of CPU cores: options libcfs cpu_pattern="N C[0-3]" or: options libcfs cpu_pattern="C[0,1,2,3]" Both options are equivalent. This would exclude cores 0-3 on each NUMA node from use by LNet and Lustre. It isn't possible to specify both include and exclude lists at the same time, but it doesn't really make sense to do this anyway. Signed-off-by: Frederick Dilger Change-Id: Ief5f36e0bbc49865317cee199c91c9b4a350c418 Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/55544 Reviewed-by: Andreas Dilger Reviewed-by: Andrew Perepechko Reviewed-by: Serguei Smirnov Reviewed-by: Oleg Drokin Tested-by: jenkins Tested-by: Maloo --- diff --git a/lnet/include/lnet/lib-cpt.h b/lnet/include/lnet/lib-cpt.h index 0763285..4b40760 100644 --- a/lnet/include/lnet/lib-cpt.h +++ b/lnet/include/lnet/lib-cpt.h @@ -31,19 +31,25 @@ * * . User can also specify CPU partitions by string pattern * - * Examples: cpu_partitions="0[0,1], 1[2,3]" - * cpu_partitions="N 0[0-3], 1[4-8]" + * Examples: cpu_pattern="0[0,1] 1[2,3]" + * cpu_pattern="N 0[0-3] 1[4-8]" + * cpu_pattern="C[0-3]" * - * The first character "N" means following numbers are numa ID + * The first character "N" means following numbers are NUMA ID. 
+ * + * The first character "C" means the relative cores are excluded from each + * NUMA node. This allows reserving cores on each node for non-Lustre tasks, + * such as HA/monitors. * * . NUMA allocators, CPU affinity threads are built over CPU partitions, * instead of HW CPUs or HW nodes. * * . By default, Lustre modules should refer to the global cfs_cpt_tab, * instead of accessing HW CPUs directly, so concurrency of Lustre can be - * configured by cpu_npartitions of the global cfs_cpt_tab + * configured by cpu_npartitions of the global cfs_cpt_tab. This is + * equivalent to specifying cpu_pattern="N" * - * . If cpu_npartitions=1(all CPUs in one pool), lustre should work the + * . If cpu_npartitions=1 (all CPUs in one pool), Lustre should work the * same way as 2.2 or earlier versions * * Author: liang@whamcloud.com @@ -127,7 +133,7 @@ unsigned int cfs_cpt_distance(struct cfs_cpt_table *cptab, int cpt1, int cpt2); int cfs_cpt_bind(struct cfs_cpt_table *cptab, int cpt); /** * add \a cpu to CPU partition @cpt of \a cptab, return 1 for success, - * otherwise 0 is returned + * otherwise return 0 */ int cfs_cpt_set_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu); /** @@ -155,6 +161,19 @@ int cfs_cpt_set_node(struct cfs_cpt_table *cptab, int cpt, int node); */ void cfs_cpt_unset_node(struct cfs_cpt_table *cptab, int cpt, int node); /** + * add the cpus within relative core range [\a include_lo, \a include_hi] + * of the online NUMA nodes to CPU partition \a cpt, return 1 if + * successfully set the selected cores, otherwise return 0 + */ +int cfs_cpt_set_node_core(struct cfs_cpt_table *cptab, int cpt, + int include_lo, int include_hi); +/** + * remove the cpus within relative core range [\a exclude_lo, \a exclude_hi] + * of the online NUMA nodes from CPU partition \a cpt + */ +void cfs_cpt_unset_node_core(struct cfs_cpt_table *cptab, int cpt, + int exclude_lo, int exclude_hi); +/** * add all cpus in node mask \a mask to CPU partition \a cpt * return 1 if successfully set all CPUs, otherwise return 0 */ diff --git a/lnet/lnet/lib-cpt.c 
b/lnet/lnet/lib-cpt.c index 4724b65..5e845cf 100644 --- a/lnet/lnet/lib-cpt.c +++ b/lnet/lnet/lib-cpt.c @@ -73,13 +73,20 @@ MODULE_PARM_DESC(cpu_npartitions, "# of CPU partitions"); /** * modparam for setting CPU partitions patterns: * - * i.e: "0[0,1,2,3] 1[4,5,6,7]", number before bracket is CPU partition ID, + * i.e: "0[0-3] 1[4,5,7]", number before bracket is CPU partition ID, * number in bracket is processor ID (core or HT) * - * i.e: "N 0[0,1] 1[2,3]" the first character 'N' means numbers in bracket + * i.e: "N 0[0,1] 1[2,3]" the first character 'N' means numbers in bracket * are NUMA node ID, number before bracket is CPU partition ID. * - * i.e: "N", shortcut expression to create CPT from NUMA & CPU topology + * i.e: "N C[0-1]" or "C[0-1]", the character 'C' means numbers in bracket are + * relative core numbers to exclude for each NUMA node, all other cores + * are included. As per the example, the first two cores of each NUMA node + * will be excluded, all other cores on all nodes are included. + * + * i.e: "N", shortcut expression to create CPT from NUMA & CPU topology + * This is the default behavior if the cpu_pattern and cpu_npartitions + * are not specified. * * NB: If user specified cpu_pattern, cpu_npartitions will be ignored */ @@ -303,7 +310,8 @@ EXPORT_SYMBOL(cfs_cpt_weight); int cfs_cpt_online(struct cfs_cpt_table *cptab, int cpt) { - LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts)); + LASSERTF(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts), + "cpt=%d, nparts=%d\n", cpt, cptab->ctb_nparts); return cpt == CFS_CPT_ANY ? 
cpumask_any_and(cptab->ctb_cpumask, @@ -469,7 +477,8 @@ static void cfs_cpt_del_node(struct cfs_cpt_table *cptab, int cpt, int node) int cfs_cpt_set_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu) { - LASSERT(cpt >= 0 && cpt < cptab->ctb_nparts); + LASSERTF(cpt >= 0 && cpt < cptab->ctb_nparts, "cpt=%d, nparts=%d\n", + cpt, cptab->ctb_nparts); if (cpu < 0 || cpu >= nr_cpu_ids || !cpu_online(cpu)) { CDEBUG(D_INFO, "CPU %d is invalid or it's offline\n", cpu); @@ -580,8 +589,12 @@ int cfs_cpt_set_node(struct cfs_cpt_table *cptab, int cpt, int node) mask = cpumask_of_node(node); - for_each_cpu(cpu, mask) - cfs_cpt_add_cpu(cptab, cpt, cpu); + if (!cpumask_empty(mask)) + for_each_cpu(cpu, mask) { + CDEBUG(D_INFO, + "set_node() cpu=%d cpt=%d\n", cpu, cpt); + cfs_cpt_add_cpu(cptab, cpt, cpu); + } cfs_cpt_add_node(cptab, cpt, node); @@ -602,13 +615,62 @@ void cfs_cpt_unset_node(struct cfs_cpt_table *cptab, int cpt, int node) mask = cpumask_of_node(node); - for_each_cpu(cpu, mask) - cfs_cpt_del_cpu(cptab, cpt, cpu); + if (!cpumask_empty(mask)) + for_each_cpu(cpu, mask) + cfs_cpt_del_cpu(cptab, cpt, cpu); cfs_cpt_del_node(cptab, cpt, node); } EXPORT_SYMBOL(cfs_cpt_unset_node); +int cfs_cpt_set_node_core(struct cfs_cpt_table *cptab, int cpt, + int include_lo, int include_hi) +{ + const cpumask_t *mask; + int node, cpu; + int offset = -1; + + for_each_online_node(node) { + mask = cpumask_of_node(node); + if (cpumask_empty(mask)) + continue; + + for_each_cpu(cpu, mask) { + if (offset < 0) + offset = cpu; + if (include_lo + offset <= cpu && + include_hi + offset >= cpu) + cfs_cpt_add_cpu(cptab, cpt, cpu); + } + } + + return 1; +} +EXPORT_SYMBOL(cfs_cpt_set_node_core); + +void cfs_cpt_unset_node_core(struct cfs_cpt_table *cptab, int cpt, + int exclude_lo, int exclude_hi) +{ + const cpumask_t *mask; + int node, cpu; + int offset = -1; + + for_each_online_node(node) { + mask = cpumask_of_node(node); + if (cpumask_empty(mask)) + continue; + + for_each_cpu(cpu, mask) { + if (offset 
< 0) + offset = cpu; + if (exclude_lo + offset <= cpu && + exclude_hi + offset >= cpu) + cfs_cpt_del_cpu(cptab, cpt, cpu); + } + } +} +EXPORT_SYMBOL(cfs_cpt_unset_node_core); + int cfs_cpt_set_nodemask(struct cfs_cpt_table *cptab, int cpt, const nodemask_t *mask) { @@ -931,10 +993,11 @@ static struct cfs_cpt_table *cfs_cpt_table_create_pattern(const char *pattern) char *pattern_dup; char *bracket; char *str; + bool exclude = false; int node = 0; int ncpt = 0; int cpt = 0; - int high; + int high = 0; int rc; int c; int i; @@ -950,33 +1013,39 @@ static struct cfs_cpt_table *cfs_cpt_table_create_pattern(const char *pattern) str++; /* skip 'N' char */ node = 1; /* NUMA pattern */ if (*str == '\0') { - node = -1; - for_each_online_node(i) { - if (!cpumask_empty(cpumask_of_node(i))) - ncpt++; - } - if (ncpt == 1) { /* single NUMA node */ + if (cpu_npartitions) { kfree(pattern_dup); return cfs_cpt_table_create(cpu_npartitions); } + node = -1; + for_each_online_node(i) + if (!cpumask_empty(cpumask_of_node(i))) + ncpt++; } + str = strim(str); } - if (!ncpt) { /* scanning bracket which is mark of partition */ + if (*str == 'c' || *str == 'C') { + str++; /* skip 'C' char */ + exclude = true; + node = -1; /* initialize all nodes to be set */ + for_each_online_node(i) + if (!cpumask_empty(cpumask_of_node(i))) + ncpt++; + } else if (!ncpt) { /* scan for bracket at start of partition */ bracket = str; while ((bracket = strchr(bracket, '['))) { bracket++; ncpt++; } - } - - if (!ncpt || - (node && ncpt > num_online_nodes()) || - (!node && ncpt > num_online_cpus())) { - CERROR("Invalid pattern '%s', or too many partitions %d\n", - pattern_dup, ncpt); - rc = -EINVAL; - goto err_free_str; + if ((!ncpt && !exclude) || + (node && ncpt > num_online_nodes()) || + (!node && ncpt > num_online_cpus())) { + CERROR("Invalid pattern '%s', or too many partitions %d\n", + pattern_dup, ncpt); + rc = -EINVAL; + goto err_free_str; + } } cptab = cfs_cpt_table_alloc(ncpt); @@ -996,12 +1065,23 @@ 
static struct cfs_cpt_table *cfs_cpt_table_create_pattern(const char *pattern) rc = -EINVAL; goto err_free_table; } + + if (exclude) { + c = 0; + for_each_cpu(rc, cpumask_of_node(i)) + c++; + if (high == 0 || c < high) + high = c; + } + } + if (!exclude) { + kfree(pattern_dup); + return cptab; } - kfree(pattern_dup); - return cptab; } - high = node ? nr_node_ids - 1 : nr_cpu_ids - 1; + if (!exclude) + high = node ? nr_node_ids - 1 : nr_cpu_ids - 1; for (str = strim(str), c = 0; /* until break */; c++) { struct cfs_range_expr *range; @@ -1011,10 +1091,11 @@ static struct cfs_cpt_table *cfs_cpt_table_create_pattern(const char *pattern) bracket = strchr(str, '['); if (!bracket) { if (*str) { - CERROR("Invalid pattern '%s'\n", str); + CERROR("Invalid pattern '%s'\n", + str); rc = -EINVAL; goto err_free_table; - } else if (c != ncpt) { + } else if (!exclude && c != ncpt) { CERROR("Expect %d partitions but found %d\n", ncpt, c); rc = -EINVAL; @@ -1023,28 +1104,29 @@ static struct cfs_cpt_table *cfs_cpt_table_create_pattern(const char *pattern) break; } - if (sscanf(str, "%d%n", &cpt, &n) < 1) { + if (!exclude && sscanf(str, "%d%n", &cpt, &n) < 1) { CERROR("Invalid CPU pattern '%s'\n", str); rc = -EINVAL; goto err_free_table; } - if (cpt < 0 || cpt >= ncpt) { + if (!exclude && (cpt < 0 || cpt >= ncpt)) { CERROR("Invalid partition id %d, total partitions %d\n", cpt, ncpt); rc = -EINVAL; goto err_free_table; } - if (cfs_cpt_weight(cptab, cpt)) { + if (!exclude && cfs_cpt_weight(cptab, cpt)) { CERROR("Partition %d has already been set.\n", cpt); rc = -EPERM; goto err_free_table; } - str = strim(str + n); - if (str != bracket) { - CERROR("Invalid pattern '%s'\n", str); + str = exclude ? 
bracket : strim(str + n); /* jump to next '[' */ + if (!exclude && str != bracket) { + CERROR("Invalid pattern '%s' does not start with '['\n", + str); rc = -EINVAL; goto err_free_table; } @@ -1066,12 +1148,29 @@ static struct cfs_cpt_table *cfs_cpt_table_create_pattern(const char *pattern) } list_for_each_entry(range, &el->el_exprs, re_link) { + if (exclude && node) { + for (cpt = 0; cpt < ncpt; cpt++) { + cfs_cpt_unset_node_core(cptab, cpt, + range->re_lo, + range->re_hi); + if (!cfs_cpt_online(cptab, cpt)) { + CERROR("All cores are excluded on partition %d\n", + cpt); + rc = -ENODEV; + goto err_free_table; + } + } + continue; + } + for (i = range->re_lo; i <= range->re_hi; i++) { if ((i - range->re_lo) % range->re_stride) continue; - rc = node ? cfs_cpt_set_node(cptab, cpt, i) - : cfs_cpt_set_cpu(cptab, cpt, i); + rc = node ? + cfs_cpt_set_node(cptab, cpt, i) + : cfs_cpt_set_cpu(cptab, cpt, i); + if (!rc) { cfs_expr_list_free(el); rc = -EINVAL; @@ -1082,7 +1181,7 @@ static struct cfs_cpt_table *cfs_cpt_table_create_pattern(const char *pattern) cfs_expr_list_free(el); - if (!cfs_cpt_online(cptab, cpt)) { + if (!exclude && !cfs_cpt_online(cptab, cpt)) { CERROR("No online CPU is found on partition %d\n", cpt); rc = -ENODEV; goto err_free_table; diff --git a/lustre/tests/conf-sanity.sh b/lustre/tests/conf-sanity.sh index 2cc72ae..e23b75b 100755 --- a/lustre/tests/conf-sanity.sh +++ b/lustre/tests/conf-sanity.sh @@ -11706,6 +11706,7 @@ test_152() { local nostdevname=$(ostdevname $nost) setupall + stack_trap "reformat_and_config" test_mkdir -i 1 -c1 $DIR/$tdir || error "can't mkdir" log "ADD OST$nost" @@ -11996,6 +11997,175 @@ test_154() { } run_test 154 "expand .. 
on rename after MDT backup restore" +cleanup_200() { + local modopts=$1 + stopall + $LUSTRE_RMMOD + [[ -z $modopts ]] || MODOPTS_LIBCFS=$modopts +} + +test_200a() { + cleanup_200 + + local cpus=$(lscpu | awk '/^CPU.s.:/ {print $NF}') + local old_modopts=$MODOPTS_LIBCFS + stack_trap "cleanup_200 $old_modopts" + + MODOPTS_LIBCFS="cpu_npartitions=$cpus" + + load_modules_local libcfs + $LCTL get_param -n cpu_partition_table + + local expected=$(cat /sys/module/libcfs/parameters/cpu_npartitions) + local result=$($LCTL get_param -n cpu_partition_table | wc -l) + + (( $result == $expected )) || + error "CPU partitions not $expected, found: $result" +} +run_test 200a "check CPU partitions" + +test_200b() { + cleanup_200 + + local cpus=$(lscpu | awk '/^CPU.s.:/ {print $NF}') + local nodes=$(lscpu | awk '/NUMA node.s.:/ {print $NF}') + local old_modopts=$MODOPTS_LIBCFS + stack_trap "cleanup_200 $old_modopts" + + local pattern="0[$(lscpu | awk '/CPU.s. list:/ {print $NF}')]" + MODOPTS_LIBCFS="cpu_pattern=\"$pattern\"" + + load_modules_local libcfs + grep . 
/sys/module/libcfs/parameters/cpu* + $LCTL get_param -n cpu_partition_table + local expected=cpus + local table=$($LCTL get_param -n cpu_partition_table) + # ignore partition num and ':' + local actual=$(( $(awk '{print NF; exit}' <<< $table) - 2 )) + + (( expected == actual )) || { + echo -e "layout wrong:\n$table" + error "partition 0 is missing CPUs from pattern: '$pattern'" + } + + (( $(echo $table | wc -l) == 1 )) || { + echo -e "layout wrong\n$table" + error "layout has too many partitions from pattern: '$pattern'" + } + + (( cpus >= 4 )) || skip "need at least 4 cpu cores" + cleanup + + pattern="0[1-2]" + MODOPTS_LIBCFS="cpu_pattern=\"$pattern\"" + + load_modules_local libcfs + $LCTL get_param -n cpu_partition_table + expected="0 : 1 2" + table=$($LCTL get_param -n cpu_partition_table) + + [[ $table == $expected ]] || + error "CPU pattern not $expected, found: $table" +} +run_test 200b "set CPU pattern using core selection" + +test_200c() { + cleanup_200 + + local cpus=$(lscpu | awk '/^CPU.s.:/ {print $NF}') + local nodes=$(lscpu | awk '/NUMA node.s.:/ {print $NF}') + + local old_modopts=$MODOPTS_LIBCFS + stack_trap "cleanup_200 $old_modopts" + + local pattern="N" + MODOPTS_LIBCFS="cpu_pattern=\"$pattern\"" + + load_modules_local libcfs + grep . 
/sys/module/libcfs/parameters/cpu* + $LCTL get_param -n cpu_partition_table + local expected=$nodes + local table=$($LCTL get_param -n cpu_partition_table) + local actual=$(echo $table | wc -l) + + (( actual == expected )) || + error "CPU partitions not $expected, found: $actual" + + cleanup + + pattern="0[$(lscpu | awk '/^NUMA node0 CPU.s.:/ {print $NF}')]" + MODOPTS_LIBCFS="cpu_pattern=\"$pattern\"" + + load_modules_local libcfs + expected=$($LCTL get_param -n cpu_partition_table) + + cleanup + + pattern="N 0[0]" + MODOPTS_LIBCFS="cpu_pattern=\"$pattern\"" + + load_modules_local libcfs + $LCTL get_param -n cpu_partition_table + local table=$($LCTL get_param -n cpu_partition_table) + + [[ $table == $expected ]] || + error "CPU pattern not $expected, found: $table" +} +run_test 200c "set CPU pattern using NUMA node layout" + +test_200e() { + cleanup_200 + + local cpus=$(lscpu | awk '/^CPU.s.:/ {print $NF}') + local nodes=$(lscpu | awk '/NUMA node.s.:/ {print $NF}') + + local old_modopts=$MODOPTS_LIBCFS + stack_trap "cleanup_200 $old_modopts" + + pattern="N" + MODOPTS_LIBCFS="cpu_pattern=\"$pattern\"" + + load_modules_local libcfs + echo "full_table:" + $LCTL get_param -n cpu_partition_table + local full_table=$($LCTL get_param -n cpu_partition_table) + (( $(awk '/0.:/ {print NF - 3; exit}' <<< $full_table) > 0 )) || + skip "need at least 2 cores in each CPT to exclude one" + + cleanup + + pattern="N C[0]" + MODOPTS_LIBCFS="cpu_pattern=\"$pattern\"" + + load_modules_local libcfs + echo "table:" + grep . 
/sys/module/libcfs/parameters/cpu* + $LCTL get_param -n cpu_partition_table + table=$($LCTL get_param -n cpu_partition_table) + + local expected + local actual + local excluded + local partition + + for (( i = 0; i < nodes; i++ )); do + expected=$(awk '/'$i'.:/ {print NF - 3; exit}' <<< $full_table) + actual=$(awk '/'$i'.:/ {print NF - 2; exit}' <<< $table) + + (( actual == expected )) || + error "CPU count not $expected, found: $actual" + + excluded=$(awk '/'$i'.:/ {print $3; exit}' <<< $full_table) + partition=$(awk '/'$i'.:/ {print $3; exit}' <<< $table) + + ! [[ "$partition" =~ "$excluded" ]] || { + echo -e "layout wrong:\n$table" + error "excluded the wrong CPU with pattern: $pattern" + } + done +} +run_test 200e "set CPU pattern using relative core exclusion" + # # (This was sanity/802a) # diff --git a/lustre/tests/test-framework.sh b/lustre/tests/test-framework.sh index 61c8274..4599e51 100755 --- a/lustre/tests/test-framework.sh +++ b/lustre/tests/test-framework.sh @@ -1021,7 +1021,9 @@ load_lnet() { # system with 2 or 4 cores local saved_opts="$MODOPTS_LIBCFS" - if [ $ncpus -le 4 ] && [ $ncpus -gt 1 ]; then + echo "MODOPTS_LIBCFS=$MODOPTS_LIBCFS" + if ! [[ "$MODOPTS_LIBCFS" =~ "cpu_" ]] && + (( $ncpus <= 4 && $ncpus > 1 )); then # force to enable multiple CPU partitions echo "Force libcfs to create 2 CPU partitions" MODOPTS_LIBCFS="cpu_npartitions=2 $MODOPTS_LIBCFS"