/*
 * GPL HEADER START
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 only,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License version 2 for more details (a copy is included
 * in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU General Public License
 * version 2 along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA
 *
 * GPL HEADER END
 */
/*
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 *
 * Copyright (c) 2012, 2016, Intel Corporation.
 */
/*
 * This file is part of Lustre, http://www.lustre.org/
 * Lustre is a trademark of Sun Microsystems, Inc.
 *
 * Author: liang@whamcloud.com
 */

#define DEBUG_SUBSYSTEM S_LNET

#include <linux/cpu.h>
#include <linux/sched.h>
#include <libcfs/libcfs.h>

/**
 * modparam for setting number of partitions
 *
 *  0 : estimate best value based on cores or NUMA nodes
 *  1 : disable multiple partitions
 * >1 : specify number of partitions
 */
static int cpu_npartitions;
module_param(cpu_npartitions, int, 0444);
MODULE_PARM_DESC(cpu_npartitions, "# of CPU partitions");

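/*
 * Illustrative usage (not part of this file): the parameter would normally
 * be set at module load time, e.g. in a modprobe configuration:
 *
 *	options libcfs cpu_npartitions=4
 */
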
/**
 * modparam for setting CPU partitions patterns:
 *
 * e.g. "0[0,1,2,3] 1[4,5,6,7]": the number before each bracket is a CPU
 *      partition ID, the numbers inside the brackets are processor IDs
 *      (cores or HTs)
 *
 * e.g. "N 0[0,1] 1[2,3]": the leading character 'N' means the numbers in
 *      brackets are NUMA node IDs; the number before each bracket is still
 *      a CPU partition ID.
 *
 * e.g. "N": shortcut expression to create CPTs from the NUMA & CPU topology
 *
 * NB: if the user specifies cpu_pattern, cpu_npartitions is ignored
 */
static char *cpu_pattern = "N";
module_param(cpu_pattern, charp, 0444);
MODULE_PARM_DESC(cpu_pattern, "CPU partitions pattern");

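/*
 * Illustrative usage (hypothetical values): two partitions of four HTs each
 * on an eight-CPU node could be requested at load time with:
 *
 *	options libcfs cpu_pattern="0[0,1,2,3] 1[4,5,6,7]"
 */
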
void cfs_cpt_table_free(struct cfs_cpt_table *cptab)
{
	int i;

	if (cptab->ctb_cpu2cpt != NULL) {
		LIBCFS_FREE(cptab->ctb_cpu2cpt,
			    nr_cpu_ids * sizeof(cptab->ctb_cpu2cpt[0]));
	}

	if (cptab->ctb_node2cpt != NULL) {
		LIBCFS_FREE(cptab->ctb_node2cpt,
			    nr_node_ids * sizeof(cptab->ctb_node2cpt[0]));
	}

	for (i = 0; cptab->ctb_parts != NULL && i < cptab->ctb_nparts; i++) {
		struct cfs_cpu_partition *part = &cptab->ctb_parts[i];

		if (part->cpt_nodemask != NULL) {
			LIBCFS_FREE(part->cpt_nodemask,
				    sizeof(*part->cpt_nodemask));
		}

		if (part->cpt_cpumask != NULL)
			LIBCFS_FREE(part->cpt_cpumask, cpumask_size());

		if (part->cpt_distance != NULL) {
			LIBCFS_FREE(part->cpt_distance,
				    cptab->ctb_nparts *
				    sizeof(part->cpt_distance[0]));
		}
	}

	if (cptab->ctb_parts != NULL) {
		LIBCFS_FREE(cptab->ctb_parts,
			    cptab->ctb_nparts * sizeof(cptab->ctb_parts[0]));
	}

	if (cptab->ctb_nodemask != NULL)
		LIBCFS_FREE(cptab->ctb_nodemask, sizeof(*cptab->ctb_nodemask));
	if (cptab->ctb_cpumask != NULL)
		LIBCFS_FREE(cptab->ctb_cpumask, cpumask_size());

	LIBCFS_FREE(cptab, sizeof(*cptab));
}
EXPORT_SYMBOL(cfs_cpt_table_free);

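/*
 * Allocate a CPU partition table with \a ncpt empty partitions; every mask
 * starts out empty and the cpu2cpt/node2cpt lookup arrays start out as -1
 * ("not in any partition").
 */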
struct cfs_cpt_table *cfs_cpt_table_alloc(int ncpt)
{
	struct cfs_cpt_table *cptab;
	int i;

	LIBCFS_ALLOC(cptab, sizeof(*cptab));
	if (cptab == NULL)
		return NULL;

	cptab->ctb_nparts = ncpt;

	LIBCFS_ALLOC(cptab->ctb_cpumask, cpumask_size());
	LIBCFS_ALLOC(cptab->ctb_nodemask, sizeof(*cptab->ctb_nodemask));
	if (cptab->ctb_cpumask == NULL || cptab->ctb_nodemask == NULL)
		goto failed;

	LIBCFS_ALLOC(cptab->ctb_cpu2cpt,
		     nr_cpu_ids * sizeof(cptab->ctb_cpu2cpt[0]));
	if (cptab->ctb_cpu2cpt == NULL)
		goto failed;

	memset(cptab->ctb_cpu2cpt, -1,
	       nr_cpu_ids * sizeof(cptab->ctb_cpu2cpt[0]));

	LIBCFS_ALLOC(cptab->ctb_node2cpt,
		     nr_node_ids * sizeof(cptab->ctb_node2cpt[0]));
	if (cptab->ctb_node2cpt == NULL)
		goto failed;

	memset(cptab->ctb_node2cpt, -1,
	       nr_node_ids * sizeof(cptab->ctb_node2cpt[0]));

	LIBCFS_ALLOC(cptab->ctb_parts, ncpt * sizeof(cptab->ctb_parts[0]));
	if (cptab->ctb_parts == NULL)
		goto failed;

	for (i = 0; i < ncpt; i++) {
		struct cfs_cpu_partition *part = &cptab->ctb_parts[i];

		LIBCFS_ALLOC(part->cpt_cpumask, cpumask_size());
		if (!part->cpt_cpumask)
			goto failed;

		LIBCFS_ALLOC(part->cpt_nodemask, sizeof(*part->cpt_nodemask));
		if (!part->cpt_nodemask)
			goto failed;

		LIBCFS_ALLOC(part->cpt_distance,
			     cptab->ctb_nparts *
			     sizeof(part->cpt_distance[0]));
		if (!part->cpt_distance)
			goto failed;
	}

	return cptab;

failed:
	cfs_cpt_table_free(cptab);
	return NULL;
}
EXPORT_SYMBOL(cfs_cpt_table_alloc);

int cfs_cpt_table_print(struct cfs_cpt_table *cptab, char *buf, int len)
{
	char *tmp = buf;
	int rc;
	int i;
	int j;

	for (i = 0; i < cptab->ctb_nparts; i++) {
		if (len <= 0)
			goto err;

		rc = snprintf(tmp, len, "%d\t:", i);
		len -= rc;
		if (len <= 0)
			goto err;

		tmp += rc;
		for_each_cpu(j, cptab->ctb_parts[i].cpt_cpumask) {
			rc = snprintf(tmp, len, " %d", j);
			len -= rc;
			if (len <= 0)
				goto err;
			tmp += rc;
		}

		*tmp = '\n';
		tmp++;
		len--;
	}

	return tmp - buf;
err:
	return -E2BIG;
}
EXPORT_SYMBOL(cfs_cpt_table_print);

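/*
 * Example output (hypothetical two-partition table with four CPUs each):
 *
 *	0	: 0 1 2 3
 *	1	: 4 5 6 7
 */
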
int cfs_cpt_distance_print(struct cfs_cpt_table *cptab, char *buf, int len)
{
	char *tmp = buf;
	int rc;
	int i;
	int j;

	for (i = 0; i < cptab->ctb_nparts; i++) {
		if (len <= 0)
			goto err;

		rc = snprintf(tmp, len, "%d\t:", i);
		len -= rc;
		if (len <= 0)
			goto err;

		tmp += rc;
		for (j = 0; j < cptab->ctb_nparts; j++) {
			rc = snprintf(tmp, len, " %d:%d",
				      j, cptab->ctb_parts[i].cpt_distance[j]);
			len -= rc;
			if (len <= 0)
				goto err;
			tmp += rc;
		}

		*tmp = '\n';
		tmp++;
		len--;
	}

	return tmp - buf;
err:
	return -E2BIG;
}
EXPORT_SYMBOL(cfs_cpt_distance_print);

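/*
 * Example output (hypothetical two-partition table; node_distance() values
 * of 10 local / 20 remote are typical):
 *
 *	0	: 0:10 1:20
 *	1	: 0:20 1:10
 */
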
int cfs_cpt_number(struct cfs_cpt_table *cptab)
{
	return cptab->ctb_nparts;
}
EXPORT_SYMBOL(cfs_cpt_number);

int cfs_cpt_weight(struct cfs_cpt_table *cptab, int cpt)
{
	LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));

	return cpt == CFS_CPT_ANY ?
	       cpumask_weight(cptab->ctb_cpumask) :
	       cpumask_weight(cptab->ctb_parts[cpt].cpt_cpumask);
}
EXPORT_SYMBOL(cfs_cpt_weight);

int cfs_cpt_online(struct cfs_cpt_table *cptab, int cpt)
{
	LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));

	return cpt == CFS_CPT_ANY ?
	       cpumask_any_and(cptab->ctb_cpumask,
			       cpu_online_mask) < nr_cpu_ids :
	       cpumask_any_and(cptab->ctb_parts[cpt].cpt_cpumask,
			       cpu_online_mask) < nr_cpu_ids;
}
EXPORT_SYMBOL(cfs_cpt_online);

cpumask_t *cfs_cpt_cpumask(struct cfs_cpt_table *cptab, int cpt)
{
	LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));

	return cpt == CFS_CPT_ANY ?
	       cptab->ctb_cpumask : cptab->ctb_parts[cpt].cpt_cpumask;
}
EXPORT_SYMBOL(cfs_cpt_cpumask);

nodemask_t *cfs_cpt_nodemask(struct cfs_cpt_table *cptab, int cpt)
{
	LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));

	return cpt == CFS_CPT_ANY ?
	       cptab->ctb_nodemask : cptab->ctb_parts[cpt].cpt_nodemask;
}
EXPORT_SYMBOL(cfs_cpt_nodemask);

unsigned cfs_cpt_distance(struct cfs_cpt_table *cptab, int cpt1, int cpt2)
{
	LASSERT(cpt1 == CFS_CPT_ANY || (cpt1 >= 0 && cpt1 < cptab->ctb_nparts));
	LASSERT(cpt2 == CFS_CPT_ANY || (cpt2 >= 0 && cpt2 < cptab->ctb_nparts));

	if (cpt1 == CFS_CPT_ANY || cpt2 == CFS_CPT_ANY)
		return cptab->ctb_distance;

	return cptab->ctb_parts[cpt1].cpt_distance[cpt2];
}
EXPORT_SYMBOL(cfs_cpt_distance);

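/*
 * Note: with CFS_CPT_ANY for either argument, the table-wide maximum
 * (ctb_distance) is returned, i.e. the worst-case NUMA distance between
 * any two nodes covered by the table.
 */
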
/**
 * Calculate the maximum NUMA distance between all nodes in the
 * from_mask and all nodes in the to_mask.
 */
static unsigned cfs_cpt_distance_calculate(nodemask_t *from_mask,
					   nodemask_t *to_mask)
{
	unsigned maximum = 0;
	unsigned distance;
	int from;
	int to;

	for_each_node_mask(from, *from_mask) {
		for_each_node_mask(to, *to_mask) {
			distance = node_distance(from, to);
			if (maximum < distance)
				maximum = distance;
		}
	}
	return maximum;
}

static void cfs_cpt_add_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu)
{
	cptab->ctb_cpu2cpt[cpu] = cpt;

	cpumask_set_cpu(cpu, cptab->ctb_cpumask);
	cpumask_set_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask);
}

static void cfs_cpt_del_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu)
{
	cpumask_clear_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask);
	cpumask_clear_cpu(cpu, cptab->ctb_cpumask);

	cptab->ctb_cpu2cpt[cpu] = -1;
}

static void cfs_cpt_add_node(struct cfs_cpt_table *cptab, int cpt, int node)
{
	struct cfs_cpu_partition *part;
	struct cfs_cpu_partition *part2;
	int cpt2;

	if (!node_isset(node, *cptab->ctb_nodemask)) {
		/* first time node is added to the CPT table */
		node_set(node, *cptab->ctb_nodemask);
		cptab->ctb_node2cpt[node] = cpt;
		cptab->ctb_distance = cfs_cpt_distance_calculate(
							cptab->ctb_nodemask,
							cptab->ctb_nodemask);
	}

	part = &cptab->ctb_parts[cpt];
	if (!node_isset(node, *part->cpt_nodemask)) {
		/* first time node is added to this CPT */
		node_set(node, *part->cpt_nodemask);
		for (cpt2 = 0; cpt2 < cptab->ctb_nparts; cpt2++) {
			part2 = &cptab->ctb_parts[cpt2];
			part->cpt_distance[cpt2] = cfs_cpt_distance_calculate(
							part->cpt_nodemask,
							part2->cpt_nodemask);
			part2->cpt_distance[cpt] = cfs_cpt_distance_calculate(
							part2->cpt_nodemask,
							part->cpt_nodemask);
		}
	}
}

static void cfs_cpt_del_node(struct cfs_cpt_table *cptab, int cpt, int node)
{
	struct cfs_cpu_partition *part;
	struct cfs_cpu_partition *part2;
	int cpu;
	int cpt2;

	part = &cptab->ctb_parts[cpt];

	for_each_cpu(cpu, part->cpt_cpumask) {
		/* this CPT has other CPU belonging to this node? */
		if (cpu_to_node(cpu) == node)
			break;
	}

	if (cpu >= nr_cpu_ids && node_isset(node, *part->cpt_nodemask)) {
		/* No more CPUs in the node for this CPT. */
		node_clear(node, *part->cpt_nodemask);
		for (cpt2 = 0; cpt2 < cptab->ctb_nparts; cpt2++) {
			part2 = &cptab->ctb_parts[cpt2];
			if (node_isset(node, *part2->cpt_nodemask))
				cptab->ctb_node2cpt[node] = cpt2;
			part->cpt_distance[cpt2] = cfs_cpt_distance_calculate(
							part->cpt_nodemask,
							part2->cpt_nodemask);
			part2->cpt_distance[cpt] = cfs_cpt_distance_calculate(
							part2->cpt_nodemask,
							part->cpt_nodemask);
		}
	}

	for_each_cpu(cpu, cptab->ctb_cpumask) {
		/* this CPT-table has other CPUs belonging to this node? */
		if (cpu_to_node(cpu) == node)
			break;
	}

	if (cpu >= nr_cpu_ids && node_isset(node, *cptab->ctb_nodemask)) {
		/* No more CPUs in the table for this node. */
		node_clear(node, *cptab->ctb_nodemask);
		cptab->ctb_node2cpt[node] = -1;
		cptab->ctb_distance =
			cfs_cpt_distance_calculate(cptab->ctb_nodemask,
						   cptab->ctb_nodemask);
	}
}

int cfs_cpt_set_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu)
{
	LASSERT(cpt >= 0 && cpt < cptab->ctb_nparts);

	if (cpu < 0 || cpu >= nr_cpu_ids || !cpu_online(cpu)) {
		CDEBUG(D_INFO, "CPU %d is invalid or it's offline\n", cpu);
		return 0;
	}

	if (cptab->ctb_cpu2cpt[cpu] != -1) {
		CDEBUG(D_INFO, "CPU %d is already in partition %d\n",
		       cpu, cptab->ctb_cpu2cpt[cpu]);
		return 0;
	}

	if (cpumask_test_cpu(cpu, cptab->ctb_cpumask)) {
		CDEBUG(D_INFO, "CPU %d is already in cpumask\n", cpu);
		return 0;
	}

	if (cpumask_test_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask)) {
		CDEBUG(D_INFO, "CPU %d is already in partition %d cpumask\n",
		       cpu, cpt);
		return 0;
	}

	cfs_cpt_add_cpu(cptab, cpt, cpu);
	cfs_cpt_add_node(cptab, cpt, cpu_to_node(cpu));

	return 1;
}
EXPORT_SYMBOL(cfs_cpt_set_cpu);

void cfs_cpt_unset_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu)
{
	LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));

	if (cpu < 0 || cpu >= nr_cpu_ids) {
		CDEBUG(D_INFO, "Invalid CPU id %d\n", cpu);
		return;
	}

	if (cpt == CFS_CPT_ANY) {
		/* caller doesn't know the partition ID */
		cpt = cptab->ctb_cpu2cpt[cpu];
		if (cpt < 0) { /* not set in this CPT-table */
			CDEBUG(D_INFO, "Try to unset cpu %d which is "
			       "not in CPT-table %p\n", cpu, cptab);
			return;
		}
	} else if (cpt != cptab->ctb_cpu2cpt[cpu]) {
		CDEBUG(D_INFO, "CPU %d is not in CPU partition %d\n", cpu, cpt);
		return;
	}

	LASSERT(cpumask_test_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask));
	LASSERT(cpumask_test_cpu(cpu, cptab->ctb_cpumask));

	cfs_cpt_del_cpu(cptab, cpt, cpu);
	cfs_cpt_del_node(cptab, cpt, cpu_to_node(cpu));
}
EXPORT_SYMBOL(cfs_cpt_unset_cpu);

int cfs_cpt_set_cpumask(struct cfs_cpt_table *cptab, int cpt,
			const cpumask_t *mask)
{
	int cpu;

	if (cpumask_weight(mask) == 0 ||
	    cpumask_any_and(mask, cpu_online_mask) >= nr_cpu_ids) {
		CDEBUG(D_INFO, "No online CPU is found in the CPU mask "
		       "for CPU partition %d\n", cpt);
		return 0;
	}

	for_each_cpu(cpu, mask) {
		cfs_cpt_add_cpu(cptab, cpt, cpu);
		cfs_cpt_add_node(cptab, cpt, cpu_to_node(cpu));
	}

	return 1;
}
EXPORT_SYMBOL(cfs_cpt_set_cpumask);

void cfs_cpt_unset_cpumask(struct cfs_cpt_table *cptab, int cpt,
			   const cpumask_t *mask)
{
	int cpu;

	for_each_cpu(cpu, mask) {
		cfs_cpt_del_cpu(cptab, cpt, cpu);
		cfs_cpt_del_node(cptab, cpt, cpu_to_node(cpu));
	}
}
EXPORT_SYMBOL(cfs_cpt_unset_cpumask);

int cfs_cpt_set_node(struct cfs_cpt_table *cptab, int cpt, int node)
{
	const cpumask_t *mask;
	int cpu;

	if (node < 0 || node >= nr_node_ids) {
		CDEBUG(D_INFO,
		       "Invalid NUMA id %d for CPU partition %d\n", node, cpt);
		return 0;
	}

	mask = cpumask_of_node(node);

	for_each_cpu(cpu, mask)
		cfs_cpt_add_cpu(cptab, cpt, cpu);

	cfs_cpt_add_node(cptab, cpt, node);

	return 1;
}
EXPORT_SYMBOL(cfs_cpt_set_node);

void cfs_cpt_unset_node(struct cfs_cpt_table *cptab, int cpt, int node)
{
	const cpumask_t *mask;
	int cpu;

	if (node < 0 || node >= nr_node_ids) {
		CDEBUG(D_INFO,
		       "Invalid NUMA id %d for CPU partition %d\n", node, cpt);
		return;
	}

	mask = cpumask_of_node(node);

	for_each_cpu(cpu, mask)
		cfs_cpt_del_cpu(cptab, cpt, cpu);

	cfs_cpt_del_node(cptab, cpt, node);
}
EXPORT_SYMBOL(cfs_cpt_unset_node);

int cfs_cpt_set_nodemask(struct cfs_cpt_table *cptab, int cpt,
			 const nodemask_t *mask)
{
	int node;

	for_each_node_mask(node, *mask)
		cfs_cpt_set_node(cptab, cpt, node);

	return 1;
}
EXPORT_SYMBOL(cfs_cpt_set_nodemask);

void cfs_cpt_unset_nodemask(struct cfs_cpt_table *cptab, int cpt,
			    const nodemask_t *mask)
{
	int node;

	for_each_node_mask(node, *mask)
		cfs_cpt_unset_node(cptab, cpt, node);
}
EXPORT_SYMBOL(cfs_cpt_unset_nodemask);

int cfs_cpt_spread_node(struct cfs_cpt_table *cptab, int cpt)
{
	nodemask_t *mask;
	int weight;
	unsigned int rotor;
	int node = 0;

	/* convert CPU partition ID to HW node id */

	if (cpt < 0 || cpt >= cptab->ctb_nparts) {
		mask = cptab->ctb_nodemask;
		rotor = cptab->ctb_spread_rotor++;
	} else {
		mask = cptab->ctb_parts[cpt].cpt_nodemask;
		rotor = cptab->ctb_parts[cpt].cpt_spread_rotor++;
		node = cptab->ctb_parts[cpt].cpt_node;
	}

	weight = nodes_weight(*mask);
	if (weight > 0) {
		rotor %= weight;

		for_each_node_mask(node, *mask) {
			if (rotor-- == 0)
				return node;
		}
	}

	return node;
}
EXPORT_SYMBOL(cfs_cpt_spread_node);

int cfs_cpt_current(struct cfs_cpt_table *cptab, int remap)
{
	int cpu = smp_processor_id();
	int cpt = cptab->ctb_cpu2cpt[cpu];

	if (cpt < 0 && remap) {
		/* don't return negative value for safety of upper layer,
		 * instead we shadow the unknown cpu to a valid partition ID */
		cpt = cpu % cptab->ctb_nparts;
	}

	return cpt;
}
EXPORT_SYMBOL(cfs_cpt_current);

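/*
 * Typical use (illustrative): pick the partition serving the current CPU,
 * remapping unknown CPUs to a valid partition rather than failing:
 *
 *	int cpt = cfs_cpt_current(cfs_cpt_table, 1);
 */
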
int cfs_cpt_of_cpu(struct cfs_cpt_table *cptab, int cpu)
{
	LASSERT(cpu >= 0 && cpu < nr_cpu_ids);

	return cptab->ctb_cpu2cpt[cpu];
}
EXPORT_SYMBOL(cfs_cpt_of_cpu);

int cfs_cpt_of_node(struct cfs_cpt_table *cptab, int node)
{
	if (node < 0 || node >= nr_node_ids)
		return CFS_CPT_ANY;

	return cptab->ctb_node2cpt[node];
}
EXPORT_SYMBOL(cfs_cpt_of_node);

int cfs_cpt_bind(struct cfs_cpt_table *cptab, int cpt)
{
	nodemask_t *nodemask;
	cpumask_t *cpumask;
	int cpu;
	int rc;

	LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));

	if (cpt == CFS_CPT_ANY) {
		cpumask = cptab->ctb_cpumask;
		nodemask = cptab->ctb_nodemask;
	} else {
		cpumask = cptab->ctb_parts[cpt].cpt_cpumask;
		nodemask = cptab->ctb_parts[cpt].cpt_nodemask;
	}

	if (!cpumask_intersects(cpumask, cpu_online_mask)) {
		CDEBUG(D_INFO, "No online CPU found in CPU partition %d, did "
		       "someone do CPU hotplug on system? You might need to "
		       "reload Lustre modules to keep system working well.\n",
		       cpt);
		return -ENODEV;
	}

	for_each_online_cpu(cpu) {
		if (cpumask_test_cpu(cpu, cpumask))
			continue;

		rc = set_cpus_allowed_ptr(current, cpumask);
		set_mems_allowed(*nodemask);
		if (rc == 0)
			schedule(); /* switch to allowed CPU */

		return rc;
	}

	/* don't need to set affinity because all online CPUs are covered */
	return 0;
}
EXPORT_SYMBOL(cfs_cpt_bind);

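/*
 * Typical use (illustrative): a service thread dedicated to partition
 * \a cpt binds itself before entering its main loop:
 *
 *	rc = cfs_cpt_bind(cfs_cpt_table, cpt);
 */
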
/**
 * Choose at most \a number CPUs from \a node_mask and set them in \a cpt.
 * We always prefer to choose CPUs in the same core/socket.
 */
static int cfs_cpt_choose_ncpus(struct cfs_cpt_table *cptab, int cpt,
				cpumask_t *node_mask, int number)
{
	cpumask_t *socket_mask = NULL;
	cpumask_t *core_mask = NULL;
	int rc = 0;
	int cpu;
	int i;

	LASSERT(number > 0);

	if (number >= cpumask_weight(node_mask)) {
		while (!cpumask_empty(node_mask)) {
			cpu = cpumask_first(node_mask);
			cpumask_clear_cpu(cpu, node_mask);

			if (!cpu_online(cpu))
				continue;

			rc = cfs_cpt_set_cpu(cptab, cpt, cpu);
			if (!rc)
				return -EINVAL;
		}
		return 0;
	}

	/* allocate scratch buffer */
	LIBCFS_ALLOC(socket_mask, cpumask_size());
	LIBCFS_ALLOC(core_mask, cpumask_size());
	if (socket_mask == NULL || core_mask == NULL) {
		rc = -ENOMEM;
		goto out;
	}

	while (!cpumask_empty(node_mask)) {
		cpu = cpumask_first(node_mask);

		/* get cpumask for cores in the same socket */
		cpumask_and(socket_mask, topology_core_cpumask(cpu), node_mask);
		while (!cpumask_empty(socket_mask)) {
			/* get cpumask for hts in the same core */
			cpumask_and(core_mask,
				    topology_sibling_cpumask(cpu), node_mask);

			for_each_cpu(i, core_mask) {
				cpumask_clear_cpu(i, socket_mask);
				cpumask_clear_cpu(i, node_mask);

				if (!cpu_online(i))
					continue;

				rc = cfs_cpt_set_cpu(cptab, cpt, i);
				if (!rc) {
					rc = -EINVAL;
					goto out;
				}

				if (--number == 0)
					goto out;
			}
			cpu = cpumask_first(socket_mask);
		}
	}

out:
	if (core_mask != NULL)
		LIBCFS_FREE(core_mask, cpumask_size());
	if (socket_mask != NULL)
		LIBCFS_FREE(socket_mask, cpumask_size());
	return rc;
}

#define CPT_WEIGHT_MIN	4

static int cfs_cpt_num_estimate(void)
{
	int nthr = cpumask_weight(topology_sibling_cpumask(smp_processor_id()));
	int ncpu = num_online_cpus();
	int ncpt = 1;

	if (ncpu > CPT_WEIGHT_MIN)
		for (ncpt = 2; ncpu > 2 * nthr * ncpt; ncpt++)
			; /* nothing */

#if (BITS_PER_LONG == 32)
	/* config many CPU partitions on 32-bit system could consume
	 * too much memory */
	ncpt = min(2, ncpt);
#endif
	while (ncpu % ncpt != 0)
		ncpt--; /* worst case is 1 */

	return ncpt;
}

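/*
 * Worked example (hypothetical box): with 32 online HTs and 2 HTs per core
 * (nthr == 2), the loop stops at the first ncpt where 32 <= 4 * ncpt, i.e.
 * ncpt == 8, and 32 % 8 == 0 keeps it there: eight partitions of 4 HTs.
 */
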
static struct cfs_cpt_table *cfs_cpt_table_create(int ncpt)
{
	struct cfs_cpt_table *cptab = NULL;
	cpumask_t *node_mask = NULL;
	int cpt = 0;
	int node;
	int num;
	int rem;
	int rc = 0;

	/* use the estimate if the caller didn't specify a partition count */
	num = cfs_cpt_num_estimate();
	if (ncpt <= 0)
		ncpt = num;

	if (ncpt > num_online_cpus() || ncpt > 4 * num) {
		CWARN("CPU partition number %d is larger than suggested "
		      "value (%d), your system may have performance "
		      "issue or run out of memory while under pressure\n",
		      ncpt, num);
	}

	cptab = cfs_cpt_table_alloc(ncpt);
	if (cptab == NULL) {
		CERROR("Failed to allocate CPU map(%d)\n", ncpt);
		rc = -ENOMEM;
		goto failed;
	}

	LIBCFS_ALLOC(node_mask, cpumask_size());
	if (node_mask == NULL) {
		CERROR("Failed to allocate scratch cpumask\n");
		rc = -ENOMEM;
		goto failed;
	}

	num = num_online_cpus() / ncpt;
	rem = num_online_cpus() % ncpt;
	for_each_online_node(node) {
		cpumask_copy(node_mask, cpumask_of_node(node));

		while (cpt < ncpt && !cpumask_empty(node_mask)) {
			struct cfs_cpu_partition *part = &cptab->ctb_parts[cpt];
			int ncpu = cpumask_weight(part->cpt_cpumask);

			rc = cfs_cpt_choose_ncpus(cptab, cpt, node_mask,
						  (rem > 0) + num - ncpu);
			if (rc < 0) {
				rc = -EINVAL;
				goto failed;
			}

			ncpu = cpumask_weight(part->cpt_cpumask);
			if (ncpu == num + !!(rem > 0)) {
				cpt++;
				rem--;
			}
		}
	}

	LIBCFS_FREE(node_mask, cpumask_size());

	return cptab;

failed:
	CERROR("Failed (rc=%d) to setup CPU partition table with %d "
	       "partitions, online HW NUMA nodes: %d, HW CPU cores: %d.\n",
	       rc, ncpt, num_online_nodes(), num_online_cpus());

	if (node_mask != NULL)
		LIBCFS_FREE(node_mask, cpumask_size());

	if (cptab != NULL)
		cfs_cpt_table_free(cptab);

	return ERR_PTR(rc);
}

static struct cfs_cpt_table *cfs_cpt_table_create_pattern(const char *pattern)
{
	struct cfs_cpt_table *cptab;
	char *pattern_dup;
	char *bracket;
	char *str;
	int node = 0;
	int ncpt = 0;
	int cpt = 0;
	int high;
	int rc;
	int c;
	int i;

	pattern_dup = kstrdup(pattern, GFP_KERNEL);
	if (pattern_dup == NULL) {
		CERROR("Failed to duplicate pattern '%s'\n", pattern);
		return ERR_PTR(-ENOMEM);
	}

	str = cfs_trimwhite(pattern_dup);
	if (*str == 'n' || *str == 'N') {
		str++; /* skip 'N' char */
		node = 1; /* NUMA pattern */
		if (*str == '\0') {
			node = -1;
			for_each_online_node(i) {
				if (!cpumask_empty(cpumask_of_node(i)))
					ncpt++;
			}
		}
	}

	if (ncpt == 0) { /* scanning bracket which is mark of partition */
		bracket = str;
		while ((bracket = strchr(bracket, '['))) {
			bracket++;
			ncpt++;
		}
	}

	if (ncpt == 0 ||
	    (node && ncpt > num_online_nodes()) ||
	    (!node && ncpt > num_online_cpus())) {
		CERROR("Invalid pattern '%s', or too many partitions %d\n",
		       pattern_dup, ncpt);
		rc = -EINVAL;
		goto err_free_str;
	}

	cptab = cfs_cpt_table_alloc(ncpt);
	if (cptab == NULL) {
		CERROR("Failed to allocate CPU partition table\n");
		rc = -ENOMEM;
		goto err_free_str;
	}

	if (node < 0) { /* shortcut to create CPT from NUMA & CPU topology */
		for_each_online_node(i) {
			if (cpumask_empty(cpumask_of_node(i)))
				continue;

			rc = cfs_cpt_set_node(cptab, cpt++, i);
			if (!rc) {
				rc = -EINVAL;
				goto err_free_table;
			}
		}
		kfree(pattern_dup);
		return cptab;
	}

	high = node ? nr_node_ids - 1 : nr_cpu_ids - 1;

	for (str = cfs_trimwhite(str), c = 0; /* until break */; c++) {
		struct cfs_range_expr *range;
		struct cfs_expr_list *el;
		int n;

		bracket = strchr(str, '[');
		if (bracket == NULL) {
			if (*str != 0) {
				CERROR("Invalid pattern '%s'\n", str);
				rc = -EINVAL;
				goto err_free_table;
			} else if (c != ncpt) {
				CERROR("Expect %d partitions but found %d\n",
				       ncpt, c);
				rc = -EINVAL;
				goto err_free_table;
			}
			break;
		}

		if (sscanf(str, "%d%n", &cpt, &n) < 1) {
			CERROR("Invalid CPU pattern '%s'\n", str);
			rc = -EINVAL;
			goto err_free_table;
		}

		if (cpt < 0 || cpt >= ncpt) {
			CERROR("Invalid partition id %d, total partitions %d\n",
			       cpt, ncpt);
			rc = -EINVAL;
			goto err_free_table;
		}

		if (cfs_cpt_weight(cptab, cpt) != 0) {
			CERROR("Partition %d has already been set.\n", cpt);
			rc = -EPERM;
			goto err_free_table;
		}

		str = cfs_trimwhite(str + n);
		if (str != bracket) {
			CERROR("Invalid pattern '%s'\n", str);
			rc = -EINVAL;
			goto err_free_table;
		}

		bracket = strchr(str, ']');
		if (bracket == NULL) {
			CERROR("Missing right bracket for partition "
			       "%d in '%s'\n", cpt, str);
			rc = -EINVAL;
			goto err_free_table;
		}

		rc = cfs_expr_list_parse(str, (bracket - str) + 1, 0, high,
					 &el);
		if (rc) {
			CERROR("Can't parse number range in '%s'\n", str);
			rc = -ERANGE;
			goto err_free_table;
		}

		list_for_each_entry(range, &el->el_exprs, re_link) {
			for (i = range->re_lo; i <= range->re_hi; i++) {
				if ((i - range->re_lo) % range->re_stride != 0)
					continue;

				rc = node ? cfs_cpt_set_node(cptab, cpt, i)
					  : cfs_cpt_set_cpu(cptab, cpt, i);
				if (!rc) {
					cfs_expr_list_free(el);
					rc = -EINVAL;
					goto err_free_table;
				}
			}
		}

		cfs_expr_list_free(el);

		if (!cfs_cpt_online(cptab, cpt)) {
			CERROR("No online CPU is found on partition %d\n", cpt);
			rc = -ENODEV;
			goto err_free_table;
		}

		str = cfs_trimwhite(bracket + 1);
	}

	kfree(pattern_dup);
	return cptab;

err_free_table:
	cfs_cpt_table_free(cptab);
err_free_str:
	kfree(pattern_dup);
	return ERR_PTR(rc);
}

#ifdef CONFIG_HOTPLUG_CPU
#ifdef HAVE_HOTPLUG_STATE_MACHINE
static enum cpuhp_state lustre_cpu_online;

static int cfs_cpu_online(unsigned int cpu)
{
	return 0;
}
#endif

static int cfs_cpu_dead(unsigned int cpu)
{
	bool warn;

	/* if all HTs in a core are offline, it may break affinity */
	warn = cpumask_any_and(topology_sibling_cpumask(cpu),
			       cpu_online_mask) >= nr_cpu_ids;
	CDEBUG(warn ? D_WARNING : D_INFO,
	       "Lustre: can't support CPU plug-out well now, performance and stability could be impacted [CPU %u]\n",
	       cpu);
	return 0;
}

#ifndef HAVE_HOTPLUG_STATE_MACHINE
static int cfs_cpu_notify(struct notifier_block *self, unsigned long action,
			  void *hcpu)
{
	int cpu = (unsigned long)hcpu;

	switch (action) {
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
	case CPU_ONLINE:
	case CPU_ONLINE_FROZEN:
	default:
		if (action != CPU_DEAD && action != CPU_DEAD_FROZEN) {
			CDEBUG(D_INFO, "CPU changed [cpu %u action %lx]\n",
			       cpu, action);
			break;
		}

		cfs_cpu_dead(cpu);
	}

	return NOTIFY_OK;
}

static struct notifier_block cfs_cpu_notifier = {
	.notifier_call = cfs_cpu_notify,
	.priority = 0
};
#endif /* !HAVE_HOTPLUG_STATE_MACHINE */
#endif /* CONFIG_HOTPLUG_CPU */

void cfs_cpu_fini(void)
{
	if (!IS_ERR_OR_NULL(cfs_cpt_table))
		cfs_cpt_table_free(cfs_cpt_table);

#ifdef CONFIG_HOTPLUG_CPU
#ifdef HAVE_HOTPLUG_STATE_MACHINE
	if (lustre_cpu_online > 0)
		cpuhp_remove_state_nocalls(lustre_cpu_online);
	cpuhp_remove_state_nocalls(CPUHP_LUSTRE_CFS_DEAD);
#else
	unregister_hotcpu_notifier(&cfs_cpu_notifier);
#endif /* !HAVE_HOTPLUG_STATE_MACHINE */
#endif /* CONFIG_HOTPLUG_CPU */
}

int cfs_cpu_init(void)
{
	int ret;

	LASSERT(!cfs_cpt_table);

#ifdef CONFIG_HOTPLUG_CPU
#ifdef HAVE_HOTPLUG_STATE_MACHINE
	ret = cpuhp_setup_state_nocalls(CPUHP_LUSTRE_CFS_DEAD,
					"fs/lustre/cfe:dead", NULL,
					cfs_cpu_dead);
	if (ret < 0)
		goto failed;
	ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
					"fs/lustre/cfe:online",
					cfs_cpu_online, NULL);
	if (ret < 0)
		goto failed;
	lustre_cpu_online = ret;
#else
	register_hotcpu_notifier(&cfs_cpu_notifier);
#endif /* !HAVE_HOTPLUG_STATE_MACHINE */
#endif /* CONFIG_HOTPLUG_CPU */

	if (*cpu_pattern != 0) {
		cfs_cpt_table = cfs_cpt_table_create_pattern(cpu_pattern);
		if (IS_ERR(cfs_cpt_table)) {
			CERROR("Failed to create cptab from pattern '%s'\n",
			       cpu_pattern);
			ret = PTR_ERR(cfs_cpt_table);
			goto failed;
		}
	} else {
		cfs_cpt_table = cfs_cpt_table_create(cpu_npartitions);
		if (IS_ERR(cfs_cpt_table)) {
			CERROR("Failed to create cptab with npartitions %d\n",
			       cpu_npartitions);
			ret = PTR_ERR(cfs_cpt_table);
			goto failed;
		}
	}

	LCONSOLE(0, "HW NUMA nodes: %d, HW CPU cores: %d, npartitions: %d\n",
		 num_online_nodes(), num_online_cpus(),
		 cfs_cpt_number(cfs_cpt_table));
	return 0;

failed:
	cfs_cpu_fini();
	return ret;
}