// SPDX-License-Identifier: GPL-2.0

/* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2017, Intel Corporation.
 */

/* This file is part of Lustre, http://www.lustre.org/
 *
 * Please see comments in include/lnet/lib-cpt.h for introduction
 *
 * Author: liang@whamcloud.com
 */
#define DEBUG_SUBSYSTEM S_LNET

#include <linux/cpu.h>
#include <linux/sched.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <libcfs/libcfs_string.h>
#include <libcfs/libcfs.h>
#include <lnet/lib-cpt.h>
25 /** virtual processing unit */
26 struct cfs_cpu_partition {
27 /* CPUs mask for this partition */
28 cpumask_var_t cpt_cpumask;
29 /* nodes mask for this partition */
30 nodemask_t *cpt_nodemask;
31 /* NUMA distance between CPTs */
32 unsigned int *cpt_distance;
33 /* spread rotor for NUMA allocator */
34 unsigned int cpt_spread_rotor;
35 /* NUMA node if cpt_nodemask is empty */
39 /** descriptor for CPU partitions */
40 struct cfs_cpt_table {
41 /* spread rotor for NUMA allocator */
42 unsigned int ctb_spread_rotor;
43 /* maximum NUMA distance between all nodes in table */
44 unsigned int ctb_distance;
45 /* # of CPU partitions */
47 /* partitions tables */
48 struct cfs_cpu_partition *ctb_parts;
49 /* shadow HW CPU to CPU partition ID */
51 /* all cpus in this partition table */
52 cpumask_var_t ctb_cpumask;
53 /* shadow HW node to CPU partition ID */
55 /* all nodes in this partition table */
56 nodemask_t *ctb_nodemask;
59 /** Global CPU partition table */
60 struct cfs_cpt_table *cfs_cpt_tab __read_mostly;
61 EXPORT_SYMBOL(cfs_cpt_tab);
64 * modparam for setting number of partitions
66 * 0 : estimate best value based on cores or NUMA nodes
67 * 1 : disable multiple partitions
68 * >1 : specify number of partitions
70 module_param(cpu_npartitions, int, 0444);
71 MODULE_PARM_DESC(cpu_npartitions, "# of CPU partitions");
74 * modparam for setting CPU partitions patterns:
76 * i.e: "0[0,1,2,3] 1[4,5,6,7]", number before bracket is CPU partition ID,
77 * number in bracket is processor ID (core or HT)
79 * i.e: "N 0[0,1] 1[2,3]" the first character 'N' means numbers in bracket
80 * are NUMA node ID, number before bracket is CPU partition ID.
82 * i.e: "N", shortcut expression to create CPT from NUMA & CPU topology
84 * NB: If user specified cpu_pattern, cpu_npartitions will be ignored
86 module_param(cpu_pattern, charp, 0444);
87 MODULE_PARM_DESC(cpu_pattern, "CPU partitions pattern");
89 struct cfs_cpt_table *cfs_cpt_table_alloc(int ncpt)
91 struct cfs_cpt_table *cptab;
94 LIBCFS_ALLOC(cptab, sizeof(*cptab));
98 cptab->ctb_nparts = ncpt;
100 if (!zalloc_cpumask_var(&cptab->ctb_cpumask, GFP_NOFS))
101 goto failed_alloc_cpumask;
103 LIBCFS_ALLOC(cptab->ctb_nodemask, sizeof(*cptab->ctb_nodemask));
104 if (!cptab->ctb_nodemask)
105 goto failed_alloc_nodemask;
107 CFS_ALLOC_PTR_ARRAY(cptab->ctb_cpu2cpt, nr_cpu_ids);
108 if (!cptab->ctb_cpu2cpt)
109 goto failed_alloc_cpu2cpt;
111 memset(cptab->ctb_cpu2cpt, -1,
112 nr_cpu_ids * sizeof(cptab->ctb_cpu2cpt[0]));
114 CFS_ALLOC_PTR_ARRAY(cptab->ctb_node2cpt, nr_node_ids);
115 if (!cptab->ctb_node2cpt)
116 goto failed_alloc_node2cpt;
118 memset(cptab->ctb_node2cpt, -1,
119 nr_node_ids * sizeof(cptab->ctb_node2cpt[0]));
121 CFS_ALLOC_PTR_ARRAY(cptab->ctb_parts, ncpt);
122 if (!cptab->ctb_parts)
123 goto failed_alloc_ctb_parts;
125 memset(cptab->ctb_parts, -1, ncpt * sizeof(cptab->ctb_parts[0]));
127 for (i = 0; i < ncpt; i++) {
128 struct cfs_cpu_partition *part = &cptab->ctb_parts[i];
130 if (!zalloc_cpumask_var(&part->cpt_cpumask, GFP_NOFS))
131 goto failed_setting_ctb_parts;
133 LIBCFS_ALLOC(part->cpt_nodemask, sizeof(*part->cpt_nodemask));
134 if (!part->cpt_nodemask)
135 goto failed_setting_ctb_parts;
137 CFS_ALLOC_PTR_ARRAY(part->cpt_distance, cptab->ctb_nparts);
138 if (!part->cpt_distance)
139 goto failed_setting_ctb_parts;
141 memset(part->cpt_distance, -1,
142 cptab->ctb_nparts * sizeof(part->cpt_distance[0]));
147 failed_setting_ctb_parts:
149 struct cfs_cpu_partition *part = &cptab->ctb_parts[i];
151 if (part->cpt_nodemask) {
152 LIBCFS_FREE(part->cpt_nodemask,
153 sizeof(*part->cpt_nodemask));
156 free_cpumask_var(part->cpt_cpumask);
158 if (part->cpt_distance) {
159 CFS_FREE_PTR_ARRAY(part->cpt_distance,
164 if (cptab->ctb_parts)
165 CFS_FREE_PTR_ARRAY(cptab->ctb_parts, cptab->ctb_nparts);
167 failed_alloc_ctb_parts:
168 if (cptab->ctb_node2cpt)
169 CFS_FREE_PTR_ARRAY(cptab->ctb_node2cpt, nr_node_ids);
171 failed_alloc_node2cpt:
172 if (cptab->ctb_cpu2cpt)
173 CFS_FREE_PTR_ARRAY(cptab->ctb_cpu2cpt, nr_cpu_ids);
175 failed_alloc_cpu2cpt:
176 if (cptab->ctb_nodemask)
177 LIBCFS_FREE(cptab->ctb_nodemask, sizeof(*cptab->ctb_nodemask));
178 failed_alloc_nodemask:
179 free_cpumask_var(cptab->ctb_cpumask);
180 failed_alloc_cpumask:
181 LIBCFS_FREE(cptab, sizeof(*cptab));
184 EXPORT_SYMBOL(cfs_cpt_table_alloc);
186 void cfs_cpt_table_free(struct cfs_cpt_table *cptab)
190 if (cptab->ctb_cpu2cpt)
191 CFS_FREE_PTR_ARRAY(cptab->ctb_cpu2cpt, nr_cpu_ids);
193 if (cptab->ctb_node2cpt)
194 CFS_FREE_PTR_ARRAY(cptab->ctb_node2cpt, nr_node_ids);
196 for (i = 0; cptab->ctb_parts && i < cptab->ctb_nparts; i++) {
197 struct cfs_cpu_partition *part = &cptab->ctb_parts[i];
199 if (part->cpt_nodemask) {
200 LIBCFS_FREE(part->cpt_nodemask,
201 sizeof(*part->cpt_nodemask));
204 free_cpumask_var(part->cpt_cpumask);
206 if (part->cpt_distance)
207 CFS_FREE_PTR_ARRAY(part->cpt_distance,
211 if (cptab->ctb_parts)
212 CFS_FREE_PTR_ARRAY(cptab->ctb_parts, cptab->ctb_nparts);
214 if (cptab->ctb_nodemask)
215 LIBCFS_FREE(cptab->ctb_nodemask, sizeof(*cptab->ctb_nodemask));
216 free_cpumask_var(cptab->ctb_cpumask);
218 LIBCFS_FREE(cptab, sizeof(*cptab));
220 EXPORT_SYMBOL(cfs_cpt_table_free);
222 int cfs_cpt_table_print(struct cfs_cpt_table *cptab, char *buf, int len)
229 for (i = 0; i < cptab->ctb_nparts; i++) {
233 rc = snprintf(tmp, len, "%d\t:", i);
240 for_each_cpu(j, cptab->ctb_parts[i].cpt_cpumask) {
241 rc = snprintf(tmp, len, " %d", j);
257 EXPORT_SYMBOL(cfs_cpt_table_print);
259 int cfs_cpt_distance_print(struct cfs_cpt_table *cptab, char *buf, int len)
266 for (i = 0; i < cptab->ctb_nparts; i++) {
270 rc = snprintf(tmp, len, "%d\t:", i);
277 for (j = 0; j < cptab->ctb_nparts; j++) {
278 rc = snprintf(tmp, len, " %d:%d", j,
279 cptab->ctb_parts[i].cpt_distance[j]);
295 EXPORT_SYMBOL(cfs_cpt_distance_print);
297 int cfs_cpt_number(struct cfs_cpt_table *cptab)
299 return cptab->ctb_nparts;
301 EXPORT_SYMBOL(cfs_cpt_number);
303 int cfs_cpt_weight(struct cfs_cpt_table *cptab, int cpt)
305 LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));
307 return cpt == CFS_CPT_ANY ?
308 cpumask_weight(cptab->ctb_cpumask) :
309 cpumask_weight(cptab->ctb_parts[cpt].cpt_cpumask);
311 EXPORT_SYMBOL(cfs_cpt_weight);
313 int cfs_cpt_online(struct cfs_cpt_table *cptab, int cpt)
315 LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));
317 return cpt == CFS_CPT_ANY ?
318 cpumask_any_and(cptab->ctb_cpumask,
319 cpu_online_mask) < nr_cpu_ids :
320 cpumask_any_and(cptab->ctb_parts[cpt].cpt_cpumask,
321 cpu_online_mask) < nr_cpu_ids;
323 EXPORT_SYMBOL(cfs_cpt_online);
325 cpumask_var_t *cfs_cpt_cpumask(struct cfs_cpt_table *cptab, int cpt)
327 LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));
329 return cpt == CFS_CPT_ANY ?
330 &cptab->ctb_cpumask : &cptab->ctb_parts[cpt].cpt_cpumask;
332 EXPORT_SYMBOL(cfs_cpt_cpumask);
334 nodemask_t *cfs_cpt_nodemask(struct cfs_cpt_table *cptab, int cpt)
336 LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));
338 return cpt == CFS_CPT_ANY ?
339 cptab->ctb_nodemask : cptab->ctb_parts[cpt].cpt_nodemask;
341 EXPORT_SYMBOL(cfs_cpt_nodemask);
343 unsigned int cfs_cpt_distance(struct cfs_cpt_table *cptab, int cpt1, int cpt2)
345 LASSERT(cpt1 == CFS_CPT_ANY || (cpt1 >= 0 && cpt1 < cptab->ctb_nparts));
346 LASSERT(cpt2 == CFS_CPT_ANY || (cpt2 >= 0 && cpt2 < cptab->ctb_nparts));
348 if (cpt1 == CFS_CPT_ANY || cpt2 == CFS_CPT_ANY)
349 return cptab->ctb_distance;
351 return cptab->ctb_parts[cpt1].cpt_distance[cpt2];
353 EXPORT_SYMBOL(cfs_cpt_distance);
355 /* Calculate the maximum NUMA distance between all nodes in the
356 * from_mask and all nodes in the to_mask.
358 static unsigned int cfs_cpt_distance_calculate(nodemask_t *from_mask,
361 unsigned int maximum;
362 unsigned int distance;
367 for_each_node_mask(from, *from_mask) {
368 for_each_node_mask(to, *to_mask) {
369 distance = node_distance(from, to);
370 if (maximum < distance)
377 static void cfs_cpt_add_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu)
379 cptab->ctb_cpu2cpt[cpu] = cpt;
381 cpumask_set_cpu(cpu, cptab->ctb_cpumask);
382 cpumask_set_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask);
385 static void cfs_cpt_del_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu)
387 cpumask_clear_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask);
388 cpumask_clear_cpu(cpu, cptab->ctb_cpumask);
390 cptab->ctb_cpu2cpt[cpu] = -1;
393 static void cfs_cpt_add_node(struct cfs_cpt_table *cptab, int cpt, int node)
395 struct cfs_cpu_partition *part;
397 if (!node_isset(node, *cptab->ctb_nodemask)) {
400 /* first time node is added to the CPT table */
401 node_set(node, *cptab->ctb_nodemask);
402 cptab->ctb_node2cpt[node] = cpt;
404 dist = cfs_cpt_distance_calculate(cptab->ctb_nodemask,
405 cptab->ctb_nodemask);
406 cptab->ctb_distance = dist;
409 part = &cptab->ctb_parts[cpt];
410 if (!node_isset(node, *part->cpt_nodemask)) {
413 /* first time node is added to this CPT */
414 node_set(node, *part->cpt_nodemask);
415 for (cpt2 = 0; cpt2 < cptab->ctb_nparts; cpt2++) {
416 struct cfs_cpu_partition *part2;
419 part2 = &cptab->ctb_parts[cpt2];
420 dist = cfs_cpt_distance_calculate(part->cpt_nodemask,
421 part2->cpt_nodemask);
422 part->cpt_distance[cpt2] = dist;
423 dist = cfs_cpt_distance_calculate(part2->cpt_nodemask,
425 part2->cpt_distance[cpt] = dist;
430 static void cfs_cpt_del_node(struct cfs_cpt_table *cptab, int cpt, int node)
432 struct cfs_cpu_partition *part = &cptab->ctb_parts[cpt];
435 for_each_cpu(cpu, part->cpt_cpumask) {
436 /* this CPT has other CPU belonging to this node? */
437 if (cpu_to_node(cpu) == node)
441 if (cpu >= nr_cpu_ids && node_isset(node, *part->cpt_nodemask)) {
444 /* No more CPUs in the node for this CPT. */
445 node_clear(node, *part->cpt_nodemask);
446 for (cpt2 = 0; cpt2 < cptab->ctb_nparts; cpt2++) {
447 struct cfs_cpu_partition *part2;
450 part2 = &cptab->ctb_parts[cpt2];
451 if (node_isset(node, *part2->cpt_nodemask))
452 cptab->ctb_node2cpt[node] = cpt2;
454 dist = cfs_cpt_distance_calculate(part->cpt_nodemask,
455 part2->cpt_nodemask);
456 part->cpt_distance[cpt2] = dist;
457 dist = cfs_cpt_distance_calculate(part2->cpt_nodemask,
459 part2->cpt_distance[cpt] = dist;
463 for_each_cpu(cpu, cptab->ctb_cpumask) {
464 /* this CPT-table has other CPUs belonging to this node? */
465 if (cpu_to_node(cpu) == node)
469 if (cpu >= nr_cpu_ids && node_isset(node, *cptab->ctb_nodemask)) {
470 /* No more CPUs in the table for this node. */
471 node_clear(node, *cptab->ctb_nodemask);
472 cptab->ctb_node2cpt[node] = -1;
473 cptab->ctb_distance =
474 cfs_cpt_distance_calculate(cptab->ctb_nodemask,
475 cptab->ctb_nodemask);
479 int cfs_cpt_set_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu)
481 LASSERT(cpt >= 0 && cpt < cptab->ctb_nparts);
483 if (cpu < 0 || cpu >= nr_cpu_ids || !cpu_online(cpu)) {
484 CDEBUG(D_INFO, "CPU %d is invalid or it's offline\n", cpu);
488 if (cptab->ctb_cpu2cpt[cpu] != -1) {
489 CDEBUG(D_INFO, "CPU %d is already in partition %d\n",
490 cpu, cptab->ctb_cpu2cpt[cpu]);
494 if (cpumask_test_cpu(cpu, cptab->ctb_cpumask)) {
495 CDEBUG(D_INFO, "CPU %d is already in cpumask\n", cpu);
499 if (cpumask_test_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask)) {
500 CDEBUG(D_INFO, "CPU %d is already in partition %d cpumask\n",
501 cpu, cptab->ctb_cpu2cpt[cpu]);
505 cfs_cpt_add_cpu(cptab, cpt, cpu);
506 cfs_cpt_add_node(cptab, cpt, cpu_to_node(cpu));
510 EXPORT_SYMBOL(cfs_cpt_set_cpu);
512 void cfs_cpt_unset_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu)
514 LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));
516 if (cpu < 0 || cpu >= nr_cpu_ids) {
517 CDEBUG(D_INFO, "Invalid CPU id %d\n", cpu);
521 if (cpt == CFS_CPT_ANY) {
522 /* caller doesn't know the partition ID */
523 cpt = cptab->ctb_cpu2cpt[cpu];
524 if (cpt < 0) { /* not set in this CPT-table */
526 "Try to unset cpu %d which is not in CPT-table %p\n",
531 } else if (cpt != cptab->ctb_cpu2cpt[cpu]) {
533 "CPU %d is not in CPU partition %d\n", cpu, cpt);
537 LASSERT(cpumask_test_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask));
538 LASSERT(cpumask_test_cpu(cpu, cptab->ctb_cpumask));
540 cfs_cpt_del_cpu(cptab, cpt, cpu);
541 cfs_cpt_del_node(cptab, cpt, cpu_to_node(cpu));
543 EXPORT_SYMBOL(cfs_cpt_unset_cpu);
545 int cfs_cpt_set_cpumask(struct cfs_cpt_table *cptab, int cpt,
546 const cpumask_t *mask)
550 if (!cpumask_weight(mask) ||
551 cpumask_any_and(mask, cpu_online_mask) >= nr_cpu_ids) {
553 "No online CPU is found in the CPU mask for CPU partition %d\n",
558 for_each_cpu(cpu, mask) {
559 cfs_cpt_add_cpu(cptab, cpt, cpu);
560 cfs_cpt_add_node(cptab, cpt, cpu_to_node(cpu));
565 EXPORT_SYMBOL(cfs_cpt_set_cpumask);
567 void cfs_cpt_unset_cpumask(struct cfs_cpt_table *cptab, int cpt,
568 const cpumask_t *mask)
572 for_each_cpu(cpu, mask) {
573 cfs_cpt_del_cpu(cptab, cpt, cpu);
574 cfs_cpt_del_node(cptab, cpt, cpu_to_node(cpu));
577 EXPORT_SYMBOL(cfs_cpt_unset_cpumask);
579 int cfs_cpt_set_node(struct cfs_cpt_table *cptab, int cpt, int node)
581 const cpumask_t *mask;
584 if (node < 0 || node >= nr_node_ids) {
586 "Invalid NUMA id %d for CPU partition %d\n", node, cpt);
590 mask = cpumask_of_node(node);
592 for_each_cpu(cpu, mask)
593 cfs_cpt_add_cpu(cptab, cpt, cpu);
595 cfs_cpt_add_node(cptab, cpt, node);
599 EXPORT_SYMBOL(cfs_cpt_set_node);
601 void cfs_cpt_unset_node(struct cfs_cpt_table *cptab, int cpt, int node)
603 const cpumask_t *mask;
606 if (node < 0 || node >= nr_node_ids) {
608 "Invalid NUMA id %d for CPU partition %d\n", node, cpt);
612 mask = cpumask_of_node(node);
614 for_each_cpu(cpu, mask)
615 cfs_cpt_del_cpu(cptab, cpt, cpu);
617 cfs_cpt_del_node(cptab, cpt, node);
619 EXPORT_SYMBOL(cfs_cpt_unset_node);
621 int cfs_cpt_set_nodemask(struct cfs_cpt_table *cptab, int cpt,
622 const nodemask_t *mask)
626 for_each_node_mask(node, *mask)
627 cfs_cpt_set_node(cptab, cpt, node);
631 EXPORT_SYMBOL(cfs_cpt_set_nodemask);
633 void cfs_cpt_unset_nodemask(struct cfs_cpt_table *cptab, int cpt,
634 const nodemask_t *mask)
638 for_each_node_mask(node, *mask)
639 cfs_cpt_unset_node(cptab, cpt, node);
641 EXPORT_SYMBOL(cfs_cpt_unset_nodemask);
643 int cfs_cpt_spread_node(struct cfs_cpt_table *cptab, int cpt)
650 /* convert CPU partition ID to HW node id */
652 if (cpt < 0 || cpt >= cptab->ctb_nparts) {
653 mask = cptab->ctb_nodemask;
654 rotor = cptab->ctb_spread_rotor++;
656 mask = cptab->ctb_parts[cpt].cpt_nodemask;
657 rotor = cptab->ctb_parts[cpt].cpt_spread_rotor++;
658 node = cptab->ctb_parts[cpt].cpt_node;
661 weight = nodes_weight(*mask);
665 for_each_node_mask(node, *mask) {
673 EXPORT_SYMBOL(cfs_cpt_spread_node);
675 int cfs_cpt_current(struct cfs_cpt_table *cptab, int remap)
681 cpu = smp_processor_id();
682 cpt = cptab->ctb_cpu2cpt[cpu];
684 if (cpt < 0 && remap) {
685 /* don't return negative value for safety of upper layer,
686 * instead we shadow the unknown cpu to a valid partition ID
688 cpt = cpu % cptab->ctb_nparts;
693 EXPORT_SYMBOL(cfs_cpt_current);
695 int cfs_cpt_of_cpu(struct cfs_cpt_table *cptab, int cpu)
697 LASSERT(cpu >= 0 && cpu < nr_cpu_ids);
699 return cptab->ctb_cpu2cpt[cpu];
701 EXPORT_SYMBOL(cfs_cpt_of_cpu);
703 int cfs_cpt_of_node(struct cfs_cpt_table *cptab, int node)
705 if (node < 0 || node > nr_node_ids)
708 return cptab->ctb_node2cpt[node];
710 EXPORT_SYMBOL(cfs_cpt_of_node);
712 int cfs_cpt_bind(struct cfs_cpt_table *cptab, int cpt)
714 nodemask_t *nodemask;
719 LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));
721 if (cpt == CFS_CPT_ANY) {
722 cpumask = cptab->ctb_cpumask;
723 nodemask = cptab->ctb_nodemask;
725 cpumask = cptab->ctb_parts[cpt].cpt_cpumask;
726 nodemask = cptab->ctb_parts[cpt].cpt_nodemask;
729 if (!cpumask_intersects(cpumask, cpu_online_mask)) {
731 "No online CPU found in CPU partition %d, did someone do CPU hotplug on system? You might need to reload Lustre modules to keep system working well.\n",
736 for_each_online_cpu(cpu) {
737 if (cpumask_test_cpu(cpu, cpumask))
740 rc = set_cpus_allowed_ptr(current, cpumask);
741 set_mems_allowed(*nodemask);
743 schedule(); /* switch to allowed CPU */
748 /* don't need to set affinity because all online CPUs are covered */
751 EXPORT_SYMBOL(cfs_cpt_bind);
754 * Choose max to \a number CPUs from \a node and set them in \a cpt.
755 * We always prefer to choose CPU in the same core/socket.
757 static int cfs_cpt_choose_ncpus(struct cfs_cpt_table *cptab, int cpt,
758 cpumask_t *node_mask, int number)
760 cpumask_var_t socket_mask;
761 cpumask_var_t core_mask;
768 if (number >= cpumask_weight(node_mask)) {
769 while (!cpumask_empty(node_mask)) {
770 cpu = cpumask_first(node_mask);
771 cpumask_clear_cpu(cpu, node_mask);
773 if (!cpu_online(cpu))
776 rc = cfs_cpt_set_cpu(cptab, cpt, cpu);
783 /* Allocate scratch buffers
784 * As we cannot initialize a cpumask_var_t, we need
785 * to alloc both before we can risk trying to free either
787 if (!zalloc_cpumask_var(&socket_mask, GFP_NOFS))
789 if (!zalloc_cpumask_var(&core_mask, GFP_NOFS))
794 while (!cpumask_empty(node_mask)) {
795 cpu = cpumask_first(node_mask);
797 /* get cpumask for cores in the same socket */
798 cpumask_and(socket_mask, topology_core_cpumask(cpu), node_mask);
799 while (!cpumask_empty(socket_mask)) {
800 /* get cpumask for hts in the same core */
801 cpumask_and(core_mask, topology_sibling_cpumask(cpu),
804 for_each_cpu(i, core_mask) {
805 cpumask_clear_cpu(i, socket_mask);
806 cpumask_clear_cpu(i, node_mask);
811 rc = cfs_cpt_set_cpu(cptab, cpt, i);
820 cpu = cpumask_first(socket_mask);
825 free_cpumask_var(socket_mask);
826 free_cpumask_var(core_mask);
#define CPT_WEIGHT_MIN 4u

/* Estimate a reasonable number of CPU partitions from the online CPU
 * count and SMT sibling count; result always divides num_online_cpus().
 */
static unsigned int cfs_cpt_num_estimate(void)
{
	unsigned int nthr;
	unsigned int ncpu = num_online_cpus();
	unsigned int ncpt = 1;

	preempt_disable();
	nthr = cpumask_weight(topology_sibling_cpumask(smp_processor_id()));
	preempt_enable();

	if (ncpu > CPT_WEIGHT_MIN)
		for (ncpt = 2; ncpu > 2 * nthr * ncpt; ncpt++)
			; /* nothing */

#if (BITS_PER_LONG == 32)
	/* config many CPU partitions on 32-bit system could consume
	 * too much memory
	 */
	ncpt = min(2U, ncpt);
#endif
	/* shrink until ncpt evenly divides the online CPU count */
	while (ncpu % ncpt)
		ncpt--; /* worst case is 1 */

	return ncpt;
}
858 static struct cfs_cpt_table *cfs_cpt_table_create(int ncpt)
860 struct cfs_cpt_table *cptab = NULL;
861 cpumask_var_t node_mask;
868 num = cfs_cpt_num_estimate();
872 if (ncpt > num_online_cpus()) {
874 CERROR("libcfs: CPU partition count %d > cores %d: rc = %d\n",
875 ncpt, num_online_cpus(), rc);
879 if (ncpt > 4 * num) {
880 CWARN("CPU partition number %d is larger than suggested value (%d), your system may have performance issue or run out of memory while under pressure\n",
884 cptab = cfs_cpt_table_alloc(ncpt);
886 CERROR("Failed to allocate CPU map(%d)\n", ncpt);
891 if (!zalloc_cpumask_var(&node_mask, GFP_NOFS)) {
892 CERROR("Failed to allocate scratch cpumask\n");
897 num = num_online_cpus() / ncpt;
898 rem = num_online_cpus() % ncpt;
899 for_each_online_node(node) {
900 cpumask_copy(node_mask, cpumask_of_node(node));
902 while (cpt < ncpt && !cpumask_empty(node_mask)) {
903 struct cfs_cpu_partition *part = &cptab->ctb_parts[cpt];
904 int ncpu = cpumask_weight(part->cpt_cpumask);
906 rc = cfs_cpt_choose_ncpus(cptab, cpt, node_mask,
907 (rem > 0) + num - ncpu);
913 ncpu = cpumask_weight(part->cpt_cpumask);
914 if (ncpu == num + !!(rem > 0)) {
921 free_cpumask_var(node_mask);
926 free_cpumask_var(node_mask);
928 CERROR("Failed (rc = %d) to setup CPU partition table with %d partitions, online HW NUMA nodes: %d, HW CPU cores: %d.\n",
929 rc, ncpt, num_online_nodes(), num_online_cpus());
932 cfs_cpt_table_free(cptab);
937 static struct cfs_cpt_table *cfs_cpt_table_create_pattern(const char *pattern)
939 struct cfs_cpt_table *cptab;
951 pattern_dup = kstrdup(pattern, GFP_KERNEL);
953 CERROR("Failed to duplicate pattern '%s'\n", pattern);
954 return ERR_PTR(-ENOMEM);
957 str = strim(pattern_dup);
958 if (*str == 'n' || *str == 'N') {
959 str++; /* skip 'N' char */
960 node = 1; /* NUMA pattern */
963 for_each_online_node(i) {
964 if (!cpumask_empty(cpumask_of_node(i)))
967 if (ncpt == 1) { /* single NUMA node */
969 return cfs_cpt_table_create(cpu_npartitions);
974 if (!ncpt) { /* scanning bracket which is mark of partition */
976 while ((bracket = strchr(bracket, '['))) {
983 (node && ncpt > num_online_nodes()) ||
984 (!node && ncpt > num_online_cpus())) {
985 CERROR("Invalid pattern '%s', or too many partitions %d\n",
991 cptab = cfs_cpt_table_alloc(ncpt);
993 CERROR("Failed to allocate CPU partition table\n");
998 if (node < 0) { /* shortcut to create CPT from NUMA & CPU topology */
999 for_each_online_node(i) {
1000 if (cpumask_empty(cpumask_of_node(i)))
1003 rc = cfs_cpt_set_node(cptab, cpt++, i);
1006 goto err_free_table;
1013 high = node ? nr_node_ids - 1 : nr_cpu_ids - 1;
1015 for (str = strim(str), c = 0; /* until break */; c++) {
1016 struct cfs_range_expr *range;
1017 struct cfs_expr_list *el;
1020 bracket = strchr(str, '[');
1023 CERROR("Invalid pattern '%s'\n", str);
1025 goto err_free_table;
1026 } else if (c != ncpt) {
1027 CERROR("Expect %d partitions but found %d\n",
1030 goto err_free_table;
1035 if (sscanf(str, "%d%n", &cpt, &n) < 1) {
1036 CERROR("Invalid CPU pattern '%s'\n", str);
1038 goto err_free_table;
1041 if (cpt < 0 || cpt >= ncpt) {
1042 CERROR("Invalid partition id %d, total partitions %d\n",
1045 goto err_free_table;
1048 if (cfs_cpt_weight(cptab, cpt)) {
1049 CERROR("Partition %d has already been set.\n", cpt);
1051 goto err_free_table;
1054 str = strim(str + n);
1055 if (str != bracket) {
1056 CERROR("Invalid pattern '%s'\n", str);
1058 goto err_free_table;
1061 bracket = strchr(str, ']');
1063 CERROR("Missing right bracket for partition %d in '%s'\n",
1066 goto err_free_table;
1069 rc = cfs_expr_list_parse(str, (bracket - str) + 1, 0, high,
1072 CERROR("Can't parse number range in '%s'\n", str);
1074 goto err_free_table;
1077 list_for_each_entry(range, &el->el_exprs, re_link) {
1078 for (i = range->re_lo; i <= range->re_hi; i++) {
1079 if ((i - range->re_lo) % range->re_stride)
1082 rc = node ? cfs_cpt_set_node(cptab, cpt, i)
1083 : cfs_cpt_set_cpu(cptab, cpt, i);
1085 cfs_expr_list_free(el);
1087 goto err_free_table;
1092 cfs_expr_list_free(el);
1094 if (!cfs_cpt_online(cptab, cpt)) {
1095 CERROR("No online CPU is found on partition %d\n", cpt);
1097 goto err_free_table;
1100 str = strim(bracket + 1);
1107 cfs_cpt_table_free(cptab);
/* header placed in front of a per-CPT pointer array; callers get
 * &va_ptrs[0] and we recover the header with container_of()
 */
struct cfs_var_array {
	unsigned int		va_count;	/* # of buffers */
	unsigned int		va_size;	/* size of each var */
	struct cfs_cpt_table	*va_cptab;	/* cpu partition table */
	void			*va_ptrs[0];	/* buffer addresses */
};
1120 /* free per-cpu data, see more detail in cfs_percpt_free */
1122 cfs_percpt_free(void *vars)
1124 struct cfs_var_array *arr;
1127 arr = container_of(vars, struct cfs_var_array, va_ptrs[0]);
1129 for (i = 0; i < arr->va_count; i++) {
1130 if (arr->va_ptrs[i])
1131 LIBCFS_FREE(arr->va_ptrs[i], arr->va_size);
1134 LIBCFS_FREE(arr, offsetof(struct cfs_var_array,
1135 va_ptrs[arr->va_count]));
1137 EXPORT_SYMBOL(cfs_percpt_free);
1139 /* allocate per cpu-partition variables, returned value is an array of pointers,
1140 * variable can be indexed by CPU partition ID, i.e:
1142 * arr = cfs_percpt_alloc(cfs_cpu_pt, size);
1143 * then caller can access memory block for CPU 0 by arr[0],
1144 * memory block for CPU 1 by arr[1]...
1145 * memory block for CPU N by arr[N]...
1147 * cacheline aligned.
1150 cfs_percpt_alloc(struct cfs_cpt_table *cptab, unsigned int size)
1152 struct cfs_var_array *arr;
1156 count = cfs_cpt_number(cptab);
1158 LIBCFS_ALLOC(arr, offsetof(struct cfs_var_array, va_ptrs[count]));
1162 size = L1_CACHE_ALIGN(size);
1163 arr->va_size = size;
1164 arr->va_count = count;
1165 arr->va_cptab = cptab;
1167 for (i = 0; i < count; i++) {
1168 LIBCFS_CPT_ALLOC(arr->va_ptrs[i], cptab, i, size);
1169 if (!arr->va_ptrs[i]) {
1170 cfs_percpt_free((void *)&arr->va_ptrs[0]);
1175 return (void *)&arr->va_ptrs[0];
1177 EXPORT_SYMBOL(cfs_percpt_alloc);
1179 /* return number of CPUs (or number of elements in per-cpu data)
1180 * according to cptab of @vars
1183 cfs_percpt_number(void *vars)
1185 struct cfs_var_array *arr;
1187 arr = container_of(vars, struct cfs_var_array, va_ptrs[0]);
1189 return arr->va_count;
1191 EXPORT_SYMBOL(cfs_percpt_number);
1193 #ifdef CONFIG_HOTPLUG_CPU
1194 #ifdef HAVE_HOTPLUG_STATE_MACHINE
1195 static enum cpuhp_state lustre_cpu_online;
1197 static int cfs_cpu_online(unsigned int cpu)
1203 static int cfs_cpu_dead(unsigned int cpu)
1207 /* if all HTs in a core are offline, it may break affinity */
1208 warn = cpumask_any_and(topology_sibling_cpumask(cpu),
1209 cpu_online_mask) >= nr_cpu_ids;
1210 CDEBUG(warn ? D_WARNING : D_INFO,
1211 "Lustre: can't support CPU plug-out well now, performance and stability could be impacted [CPU %u]\n",
1216 #ifndef HAVE_HOTPLUG_STATE_MACHINE
1217 static int cfs_cpu_notify(struct notifier_block *self, unsigned long action,
1220 int cpu = (unsigned long)hcpu;
1224 case CPU_DEAD_FROZEN:
1226 case CPU_ONLINE_FROZEN:
1228 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN) {
1229 CDEBUG(D_INFO, "CPU changed [cpu %u action %lx]\n",
1240 static struct notifier_block cfs_cpu_notifier = {
1241 .notifier_call = cfs_cpu_notify,
1244 #endif /* !HAVE_HOTPLUG_STATE_MACHINE */
1245 #endif /* CONFIG_HOTPLUG_CPU */
1247 void cfs_cpu_fini(void)
1249 if (!IS_ERR_OR_NULL(cfs_cpt_tab))
1250 cfs_cpt_table_free(cfs_cpt_tab);
1252 #ifdef CONFIG_HOTPLUG_CPU
1253 #ifdef HAVE_HOTPLUG_STATE_MACHINE
1254 if (lustre_cpu_online > 0)
1255 cpuhp_remove_state_nocalls(lustre_cpu_online);
1256 cpuhp_remove_state_nocalls(CPUHP_BP_PREPARE_DYN);
1258 unregister_hotcpu_notifier(&cfs_cpu_notifier);
1259 #endif /* !HAVE_HOTPLUG_STATE_MACHINE */
1260 #endif /* CONFIG_HOTPLUG_CPU */
1263 int cfs_cpu_init(void)
1267 LASSERT(!cfs_cpt_tab);
1269 #ifdef CONFIG_HOTPLUG_CPU
1270 #ifdef HAVE_HOTPLUG_STATE_MACHINE
1271 ret = cpuhp_setup_state_nocalls(CPUHP_BP_PREPARE_DYN,
1272 "fs/lustre/cfe:dead", NULL,
1275 goto failed_cpu_dead;
1277 ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
1278 "fs/lustre/cfe:online",
1279 cfs_cpu_online, NULL);
1281 goto failed_cpu_online;
1283 lustre_cpu_online = ret;
1285 register_hotcpu_notifier(&cfs_cpu_notifier);
1286 #endif /* !HAVE_HOTPLUG_STATE_MACHINE */
1287 #endif /* CONFIG_HOTPLUG_CPU */
1291 cfs_cpt_tab = cfs_cpt_table_create_pattern(cpu_pattern);
1292 if (IS_ERR(cfs_cpt_tab)) {
1293 ret = PTR_ERR(cfs_cpt_tab);
1294 pr_err("libcfs: failed to create cptab from pattern '%s': rc = %d\n",
1296 goto failed_alloc_table;
1299 cfs_cpt_tab = cfs_cpt_table_create(cpu_npartitions);
1300 if (IS_ERR(cfs_cpt_tab)) {
1301 ret = PTR_ERR(cfs_cpt_tab);
1302 pr_err("libcfs: failed to create cptab with npartitions=%d: rc = %d\n",
1303 cpu_npartitions, ret);
1304 goto failed_alloc_table;
1310 pr_notice("libcfs: HW NUMA nodes: %d, HW CPU cores: %d, npartitions: %d\n",
1311 num_online_nodes(), num_online_cpus(),
1312 cfs_cpt_number(cfs_cpt_tab));
1318 if (!IS_ERR_OR_NULL(cfs_cpt_tab))
1319 cfs_cpt_table_free(cfs_cpt_tab);
1321 #ifdef CONFIG_HOTPLUG_CPU
1322 #ifdef HAVE_HOTPLUG_STATE_MACHINE
1323 if (lustre_cpu_online > 0)
1324 cpuhp_remove_state_nocalls(lustre_cpu_online);
1326 cpuhp_remove_state_nocalls(CPUHP_AP_ONLINE_DYN);
1329 unregister_hotcpu_notifier(&cfs_cpu_notifier);
1330 #endif /* !HAVE_HOTPLUG_STATE_MACHINE */
1331 #endif /* CONFIG_HOTPLUG_CPU */