From 42bf19a573a5c967e54302cc08c7b51effac3dd9 Mon Sep 17 00:00:00 2001 From: Dmitry Eremin Date: Fri, 21 Oct 2016 15:32:28 +0300 Subject: [PATCH] LU-8703 libcfs: make tolerant to offline CPUs and empty NUMA nodes Rework the CPU partition code to make it more tolerant of offline CPUs and empty NUMA nodes. For example, on KNL: available: 8 nodes (0-7) node 0 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 node 0 size: 24472 MB node 0 free: 12409 MB node 1 cpus: 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 node 1 size: 24576 MB node 1 free: 20388 MB node 2 cpus: 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 node 2 size: 24576 MB node 2 free: 20621 MB node 3 cpus: 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 node 3 size: 24576 MB node 3 free: 21183 MB node 4 cpus: node 4 size: 4096 MB node 4 free: 3982 MB node 5 cpus: node 5 size: 4096 MB node 5 free: 3982 MB node 6 cpus: node 6 size: 4096 MB node 6 free: 3982 MB node 7 cpus: node 7 size: 4096 MB node 7 free: 3981 MB node distances: node 0 1 2 3 4 5 6 7 0: 10 21 21 21 31 41 41 41 1: 21 10 21 21 41 31 41 41 2: 21 21 10 21 41 41 31 41 3: 21 21 21 10 41 41 41 31 4: 31 41 41 41 10 41 41 41 5: 41 31 41 41 41 10 41 41 6: 41 41 31 41 41 41 
10 41 7: 41 41 41 31 41 41 41 10 Contain the fix for LU-8492 ptlrpc: Correctly calculate hrp->hrp_nthrs Fix an error code return which was introduced in commit def25e9c7eff57cdaf8a6ee5e8e7db005bab6525 Change-Id: I7f64a20ee009a88e836f592ce044400f07ffbcdd Signed-off-by: Dmitry Eremin Reviewed-on: https://review.whamcloud.com/23222 Reviewed-by: Amir Shehata Reviewed-by: James Simmons Tested-by: Jenkins Tested-by: Maloo Reviewed-by: Oleg Drokin --- libcfs/include/libcfs/linux/linux-cpu.h | 2 + libcfs/libcfs/linux/linux-cpu.c | 268 +++++++++++++++----------------- lnet/lnet/lib-msg.c | 5 +- lustre/ptlrpc/service.c | 22 ++- 4 files changed, 137 insertions(+), 160 deletions(-) diff --git a/libcfs/include/libcfs/linux/linux-cpu.h b/libcfs/include/libcfs/linux/linux-cpu.h index de5e97d..a46e252 100644 --- a/libcfs/include/libcfs/linux/linux-cpu.h +++ b/libcfs/include/libcfs/linux/linux-cpu.h @@ -66,6 +66,8 @@ struct cfs_cpu_partition { unsigned *cpt_distance; /* spread rotor for NUMA allocator */ int cpt_spread_rotor; + /* NUMA node if cpt_nodemask is empty */ + int cpt_node; }; /** descriptor for CPU partitions */ diff --git a/libcfs/libcfs/linux/linux-cpu.c b/libcfs/libcfs/linux/linux-cpu.c index a1cb867..a58bc49 100644 --- a/libcfs/libcfs/linux/linux-cpu.c +++ b/libcfs/libcfs/linux/linux-cpu.c @@ -178,27 +178,27 @@ EXPORT_SYMBOL(cfs_cpt_table_alloc); int cfs_cpt_table_print(struct cfs_cpt_table *cptab, char *buf, int len) { - char *tmp = buf; - int rc = -EFBIG; - int i; - int j; + char *tmp = buf; + int rc; + int i; + int j; for (i = 0; i < cptab->ctb_nparts; i++) { if (len <= 0) - goto out; + goto err; rc = snprintf(tmp, len, "%d\t:", i); len -= rc; if (len <= 0) - goto out; + goto err; tmp += rc; for_each_cpu(j, cptab->ctb_parts[i].cpt_cpumask) { rc = snprintf(tmp, len, " %d", j); len -= rc; if (len <= 0) - goto out; + goto err; tmp += rc; } @@ -206,31 +206,30 @@ int cfs_cpt_table_print(struct cfs_cpt_table *cptab, char *buf, int len) tmp++; len--; } - rc = 0; - out: - 
if (rc < 0) - return rc; return tmp - buf; + +err: + return -E2BIG; } EXPORT_SYMBOL(cfs_cpt_table_print); int cfs_cpt_distance_print(struct cfs_cpt_table *cptab, char *buf, int len) { - char *tmp = buf; - int rc = -EFBIG; - int i; - int j; + char *tmp = buf; + int rc; + int i; + int j; for (i = 0; i < cptab->ctb_nparts; i++) { if (len <= 0) - goto out; + goto err; rc = snprintf(tmp, len, "%d\t:", i); len -= rc; if (len <= 0) - goto out; + goto err; tmp += rc; for (j = 0; j < cptab->ctb_nparts; j++) { @@ -238,7 +237,7 @@ int cfs_cpt_distance_print(struct cfs_cpt_table *cptab, char *buf, int len) j, cptab->ctb_parts[i].cpt_distance[j]); len -= rc; if (len <= 0) - goto out; + goto err; tmp += rc; } @@ -246,12 +245,11 @@ int cfs_cpt_distance_print(struct cfs_cpt_table *cptab, char *buf, int len) tmp++; len--; } - rc = 0; - out: - if (rc < 0) - return rc; return tmp - buf; + +err: + return -E2BIG; } EXPORT_SYMBOL(cfs_cpt_distance_print); @@ -445,8 +443,15 @@ int cfs_cpt_set_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu) return 0; } - LASSERT(!cpumask_test_cpu(cpu, cptab->ctb_cpumask)); - LASSERT(!cpumask_test_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask)); + if (cpumask_test_cpu(cpu, cptab->ctb_cpumask)) { + CDEBUG(D_INFO, "CPU %d is already in cpumask\n", cpu); + return 0; + } + if (cpumask_test_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask)) { + CDEBUG(D_INFO, "CPU %d is already in partition %d cpumask\n", + cpu, cptab->ctb_cpu2cpt[cpu]); + return 0; + } cfs_cpt_add_cpu(cptab, cpt, cpu); cfs_cpt_add_node(cptab, cpt, cpu_to_node(cpu)); @@ -513,15 +518,17 @@ void cfs_cpt_unset_cpumask(struct cfs_cpt_table *cptab, int cpt, { int cpu; - for_each_cpu(cpu, mask) - cfs_cpt_unset_cpu(cptab, cpt, cpu); + for_each_cpu(cpu, mask) { + cfs_cpt_del_cpu(cptab, cpt, cpu); + cfs_cpt_del_node(cptab, cpt, cpu_to_node(cpu)); + } } EXPORT_SYMBOL(cfs_cpt_unset_cpumask); int cfs_cpt_set_node(struct cfs_cpt_table *cptab, int cpt, int node) { const cpumask_t *mask; - int cpu; + int cpu; if 
(node < 0 || node >= nr_node_ids) { CDEBUG(D_INFO, @@ -563,12 +570,10 @@ EXPORT_SYMBOL(cfs_cpt_unset_node); int cfs_cpt_set_nodemask(struct cfs_cpt_table *cptab, int cpt, const nodemask_t *mask) { - int i; + int node; - for_each_node_mask(i, *mask) { - if (!cfs_cpt_set_node(cptab, cpt, i)) - return 0; - } + for_each_node_mask(node, *mask) + cfs_cpt_set_node(cptab, cpt, node); return 1; } @@ -577,42 +582,42 @@ EXPORT_SYMBOL(cfs_cpt_set_nodemask); void cfs_cpt_unset_nodemask(struct cfs_cpt_table *cptab, int cpt, const nodemask_t *mask) { - int i; + int node; - for_each_node_mask(i, *mask) - cfs_cpt_unset_node(cptab, cpt, i); + for_each_node_mask(node, *mask) + cfs_cpt_unset_node(cptab, cpt, node); } EXPORT_SYMBOL(cfs_cpt_unset_nodemask); int cfs_cpt_spread_node(struct cfs_cpt_table *cptab, int cpt) { - nodemask_t *mask; - int weight; - int rotor; - int node; + nodemask_t *mask; + int weight; + int rotor; + int node = 0; /* convert CPU partition ID to HW node id */ if (cpt < 0 || cpt >= cptab->ctb_nparts) { - mask = cptab->ctb_nodemask; + mask = cptab->ctb_nodemask; rotor = cptab->ctb_spread_rotor++; } else { - mask = cptab->ctb_parts[cpt].cpt_nodemask; + mask = cptab->ctb_parts[cpt].cpt_nodemask; rotor = cptab->ctb_parts[cpt].cpt_spread_rotor++; + node = cptab->ctb_parts[cpt].cpt_node; } weight = nodes_weight(*mask); - LASSERT(weight > 0); - - rotor %= weight; + if (weight > 0) { + rotor %= weight; - for_each_node_mask(node, *mask) { - if (rotor-- == 0) - return node; + for_each_node_mask(node, *mask) { + if (rotor-- == 0) + return node; + } } - LBUG(); - return 0; + return node; } EXPORT_SYMBOL(cfs_cpt_spread_node); @@ -653,10 +658,10 @@ EXPORT_SYMBOL(cfs_cpt_of_node); int cfs_cpt_bind(struct cfs_cpt_table *cptab, int cpt) { - cpumask_t *cpumask; - nodemask_t *nodemask; - int rc; - int i; + nodemask_t *nodemask; + cpumask_t *cpumask; + int cpu; + int rc; LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts)); @@ -675,8 +680,8 @@ int cfs_cpt_bind(struct 
cfs_cpt_table *cptab, int cpt) return -EINVAL; } - for_each_online_cpu(i) { - if (cpumask_test_cpu(i, cpumask)) + for_each_online_cpu(cpu) { + if (cpumask_test_cpu(cpu, cpumask)) continue; rc = set_cpus_allowed_ptr(current, cpumask); @@ -697,55 +702,55 @@ EXPORT_SYMBOL(cfs_cpt_bind); * We always prefer to choose CPU in the same core/socket. */ static int cfs_cpt_choose_ncpus(struct cfs_cpt_table *cptab, int cpt, - cpumask_t *node, int number) + cpumask_t *node_mask, int number) { - cpumask_t *socket = NULL; - cpumask_t *core = NULL; + cpumask_t *socket_mask = NULL; + cpumask_t *core_mask = NULL; int rc = 0; int cpu; int i; LASSERT(number > 0); - if (number >= cpumask_weight(node)) { - while (!cpumask_empty(node)) { - cpu = cpumask_first(node); + if (number >= cpumask_weight(node_mask)) { + while (!cpumask_empty(node_mask)) { + cpu = cpumask_first(node_mask); + cpumask_clear_cpu(cpu, node_mask); + + if (!cpu_online(cpu)) + continue; rc = cfs_cpt_set_cpu(cptab, cpt, cpu); if (!rc) return -EINVAL; - cpumask_clear_cpu(cpu, node); } return 0; } /* allocate scratch buffer */ - LIBCFS_ALLOC(socket, cpumask_size()); - LIBCFS_ALLOC(core, cpumask_size()); - if (socket == NULL || core == NULL) { + LIBCFS_ALLOC(socket_mask, cpumask_size()); + LIBCFS_ALLOC(core_mask, cpumask_size()); + if (socket_mask == NULL || core_mask == NULL) { rc = -ENOMEM; goto out; } - while (!cpumask_empty(node)) { - cpu = cpumask_first(node); + while (!cpumask_empty(node_mask)) { + cpu = cpumask_first(node_mask); /* get cpumask for cores in the same socket */ - cpumask_copy(socket, topology_core_cpumask(cpu)); - cpumask_and(socket, socket, node); - - LASSERT(!cpumask_empty(socket)); - - while (!cpumask_empty(socket)) { + cpumask_and(socket_mask, topology_core_cpumask(cpu), node_mask); + while (!cpumask_empty(socket_mask)) { /* get cpumask for hts in the same core */ - cpumask_copy(core, topology_sibling_cpumask(cpu)); - cpumask_and(core, core, node); + cpumask_and(core_mask, + 
topology_sibling_cpumask(cpu), node_mask); - LASSERT(!cpumask_empty(core)); + for_each_cpu(i, core_mask) { + cpumask_clear_cpu(i, socket_mask); + cpumask_clear_cpu(i, node_mask); - for_each_cpu(i, core) { - cpumask_clear_cpu(i, socket); - cpumask_clear_cpu(i, node); + if (!cpu_online(i)) + continue; rc = cfs_cpt_set_cpu(cptab, cpt, i); if (!rc) { @@ -756,15 +761,15 @@ static int cfs_cpt_choose_ncpus(struct cfs_cpt_table *cptab, int cpt, if (--number == 0) goto out; } - cpu = cpumask_first(socket); + cpu = cpumask_first(socket_mask); } } out: - if (socket != NULL) - LIBCFS_FREE(socket, cpumask_size()); - if (core != NULL) - LIBCFS_FREE(core, cpumask_size()); + if (core_mask != NULL) + LIBCFS_FREE(core_mask, cpumask_size()); + if (socket_mask != NULL) + LIBCFS_FREE(socket_mask, cpumask_size()); return rc; } @@ -784,7 +789,8 @@ static int cfs_cpt_num_estimate(void) /* generate reasonable number of CPU partitions based on total number * of CPUs, Preferred N should be power2 and match this condition: * 2 * (N - 1)^2 < NCPUS <= 2 * N^2 */ - for (ncpt = 2; ncpu > 2 * ncpt * ncpt; ncpt <<= 1) {} + for (ncpt = 2; ncpu > 2 * ncpt * ncpt; ncpt <<= 1) + ; if (ncpt <= nnode) { /* fat numa system */ while (nnode > ncpt) @@ -812,29 +818,22 @@ out: static struct cfs_cpt_table *cfs_cpt_table_create(int ncpt) { struct cfs_cpt_table *cptab = NULL; - cpumask_t *mask = NULL; + cpumask_t *node_mask = NULL; int cpt = 0; + int node; int num; - int rc; - int i; + int rem; + int rc = 0; - rc = cfs_cpt_num_estimate(); + num = cfs_cpt_num_estimate(); if (ncpt <= 0) - ncpt = rc; + ncpt = num; - if (ncpt > num_online_cpus() || ncpt > 4 * rc) { + if (ncpt > num_online_cpus() || ncpt > 4 * num) { CWARN("CPU partition number %d is larger than suggested " - "value (%d), your system may have performance" + "value (%d), your system may have performance " "issue or run out of memory while under pressure\n", - ncpt, rc); - } - - if (num_online_cpus() % ncpt != 0) { - CERROR("CPU number %d is not 
multiple of cpu_npartition %d, " - "please try different cpu_npartitions value or" - "set pattern string by cpu_pattern=STRING\n", - (int)num_online_cpus(), ncpt); - goto failed; + ncpt, num); } cptab = cfs_cpt_table_alloc(ncpt); @@ -843,67 +842,44 @@ static struct cfs_cpt_table *cfs_cpt_table_create(int ncpt) goto failed; } - num = num_online_cpus() / ncpt; - if (num == 0) { - CERROR("CPU changed while setting CPU partition\n"); - goto failed; - } - - LIBCFS_ALLOC(mask, cpumask_size()); - if (mask == NULL) { + LIBCFS_ALLOC(node_mask, cpumask_size()); + if (node_mask == NULL) { CERROR("Failed to allocate scratch cpumask\n"); goto failed; } - for_each_online_node(i) { - cpumask_copy(mask, cpumask_of_node(i)); - - while (!cpumask_empty(mask)) { - struct cfs_cpu_partition *part; - int n; - - /* Each emulated NUMA node has all allowed CPUs in - * the mask. - * End loop when all partitions have assigned CPUs. - */ - if (cpt == ncpt) - break; - - part = &cptab->ctb_parts[cpt]; + num = num_online_cpus() / ncpt; + rem = num_online_cpus() % ncpt; + for_each_online_node(node) { + cpumask_copy(node_mask, cpumask_of_node(node)); - n = num - cpumask_weight(part->cpt_cpumask); - LASSERT(n > 0); + while (cpt < ncpt && !cpumask_empty(node_mask)) { + struct cfs_cpu_partition *part = &cptab->ctb_parts[cpt]; + int ncpu = cpumask_weight(part->cpt_cpumask); - rc = cfs_cpt_choose_ncpus(cptab, cpt, mask, n); + rc = cfs_cpt_choose_ncpus(cptab, cpt, node_mask, + num - ncpu); if (rc < 0) goto failed; - LASSERT(num >= cpumask_weight(part->cpt_cpumask)); - if (num == cpumask_weight(part->cpt_cpumask)) + ncpu = cpumask_weight(part->cpt_cpumask); + if (ncpu == num + !!(rem > 0)) { cpt++; + rem--; + } } } - if (cpt != ncpt || - num != cpumask_weight(cptab->ctb_parts[ncpt - 1].cpt_cpumask)) { - CERROR("Expect %d(%d) CPU partitions but got %d(%d), " - "CPU hotplug/unplug while setting?\n", - cptab->ctb_nparts, num, cpt, - cpumask_weight(cptab->ctb_parts[ncpt - 1].cpt_cpumask)); - goto failed; - } 
- - LIBCFS_FREE(mask, cpumask_size()); - + LIBCFS_FREE(node_mask, cpumask_size()); return cptab; - failed: - CERROR("Failed to setup CPU-partition-table with %d " - "CPU-partitions, online HW nodes: %d, HW cpus: %d.\n", - ncpt, num_online_nodes(), num_online_cpus()); +failed: + CERROR("Failed (rc=%d) to setup CPU partition table with %d " + "partitions, online HW NUMA nodes: %d, HW CPU cores: %d.\n", + rc, ncpt, num_online_nodes(), num_online_cpus()); - if (mask != NULL) - LIBCFS_FREE(mask, cpumask_size()); + if (node_mask != NULL) + LIBCFS_FREE(node_mask, cpumask_size()); if (cptab != NULL) cfs_cpt_table_free(cptab); diff --git a/lnet/lnet/lib-msg.c b/lnet/lnet/lib-msg.c index 9dd1d1b..1b90855 100644 --- a/lnet/lnet/lib-msg.c +++ b/lnet/lnet/lib-msg.c @@ -573,16 +573,17 @@ lnet_msg_container_cleanup(struct lnet_msg_container *container) int lnet_msg_container_setup(struct lnet_msg_container *container, int cpt) { - int rc; + int rc = 0; container->msc_init = 1; INIT_LIST_HEAD(&container->msc_active); INIT_LIST_HEAD(&container->msc_finalizing); - rc = 0; /* number of CPUs */ container->msc_nfinalizers = cfs_cpt_weight(lnet_cpt_table(), cpt); + if (container->msc_nfinalizers == 0) + container->msc_nfinalizers = 1; LIBCFS_CPT_ALLOC(container->msc_finalizers, lnet_cpt_table(), cpt, container->msc_nfinalizers * diff --git a/lustre/ptlrpc/service.c b/lustre/ptlrpc/service.c index e65bcae..04a9147 100644 --- a/lustre/ptlrpc/service.c +++ b/lustre/ptlrpc/service.c @@ -567,7 +567,6 @@ ptlrpc_server_nthreads_check(struct ptlrpc_service *svc, } weight = cfs_cpt_weight(svc->srv_cptable, 0); - LASSERT(weight > 0); for (; factor > 0 && weight > 0; factor--, weight -= fade) nthrs += min(weight, fade) * factor; @@ -2943,8 +2942,8 @@ int ptlrpc_hr_init(void) struct ptlrpc_hr_partition *hrp; struct ptlrpc_hr_thread *hrt; int rc; + int cpt; int i; - int j; int weight; ENTRY; @@ -2960,27 +2959,26 @@ int ptlrpc_hr_init(void) weight = 
cpumask_weight(topology_sibling_cpumask(smp_processor_id())); - cfs_percpt_for_each(hrp, i, ptlrpc_hr.hr_partitions) { - hrp->hrp_cpt = i; + cfs_percpt_for_each(hrp, cpt, ptlrpc_hr.hr_partitions) { + hrp->hrp_cpt = cpt; atomic_set(&hrp->hrp_nstarted, 0); atomic_set(&hrp->hrp_nstopped, 0); - hrp->hrp_nthrs = cfs_cpt_weight(ptlrpc_hr.hr_cpt_table, i); - + hrp->hrp_nthrs = cfs_cpt_weight(ptlrpc_hr.hr_cpt_table, cpt); hrp->hrp_nthrs /= weight; if (hrp->hrp_nthrs == 0) hrp->hrp_nthrs = 1; - OBD_CPT_ALLOC(hrp->hrp_thrs, ptlrpc_hr.hr_cpt_table, i, + OBD_CPT_ALLOC(hrp->hrp_thrs, ptlrpc_hr.hr_cpt_table, cpt, hrp->hrp_nthrs * sizeof(*hrt)); if (hrp->hrp_thrs == NULL) GOTO(out, rc = -ENOMEM); - for (j = 0; j < hrp->hrp_nthrs; j++) { - hrt = &hrp->hrp_thrs[j]; + for (i = 0; i < hrp->hrp_nthrs; i++) { + hrt = &hrp->hrp_thrs[i]; - hrt->hrt_id = j; + hrt->hrt_id = i; hrt->hrt_partition = hrp; init_waitqueue_head(&hrt->hrt_waitq); spin_lock_init(&hrt->hrt_lock); @@ -2998,14 +2996,14 @@ out: void ptlrpc_hr_fini(void) { struct ptlrpc_hr_partition *hrp; - int i; + int cpt; if (ptlrpc_hr.hr_partitions == NULL) return; ptlrpc_stop_hr_threads(); - cfs_percpt_for_each(hrp, i, ptlrpc_hr.hr_partitions) { + cfs_percpt_for_each(hrp, cpt, ptlrpc_hr.hr_partitions) { if (hrp->hrp_thrs != NULL) { OBD_FREE(hrp->hrp_thrs, hrp->hrp_nthrs * sizeof(hrp->hrp_thrs[0])); -- 1.8.3.1