LU-8703 libcfs: make tolerant to offline CPUs and empty NUMA nodes

[fs/lustre-release.git] / libcfs / libcfs / linux / linux-cpu.c
diff --git a/libcfs/libcfs/linux/linux-cpu.c b/libcfs/libcfs/linux/linux-cpu.c

index a1cb867..a58bc49 100644 (file)
--- a/libcfs/libcfs/linux/linux-cpu.c
+++ b/libcfs/libcfs/linux/linux-cpu.c
@@ -178,27 +178,27 @@ EXPORT_SYMBOL(cfs_cpt_table_alloc);
  
  int cfs_cpt_table_print(struct cfs_cpt_table *cptab, char *buf, int len)
  {
-       char    *tmp = buf;
-       int     rc = -EFBIG;
-       int     i;
-       int     j;
+       char *tmp = buf;
+       int rc;
+       int i;
+       int j;
  
         for (i = 0; i < cptab->ctb_nparts; i++) {
                 if (len <= 0)
-                       goto out;
+                       goto err;
  
                 rc = snprintf(tmp, len, "%d\t:", i);
                 len -= rc;
  
                 if (len <= 0)
-                       goto out;
+                       goto err;
  
                 tmp += rc;
                 for_each_cpu(j, cptab->ctb_parts[i].cpt_cpumask) {
                         rc = snprintf(tmp, len, " %d", j);
                         len -= rc;
                         if (len <= 0)
-                               goto out;
+                               goto err;
                         tmp += rc;
                 }
  
@@ -206,31 +206,30 @@ int cfs_cpt_table_print(struct cfs_cpt_table *cptab, char *buf, int len)
                 tmp++;
                 len--;
         }
-       rc = 0;
- out:
-       if (rc < 0)
-               return rc;
  
         return tmp - buf;
+
+err:
+       return -E2BIG;
  }
  EXPORT_SYMBOL(cfs_cpt_table_print);
  
  int cfs_cpt_distance_print(struct cfs_cpt_table *cptab, char *buf, int len)
  {
-       char    *tmp = buf;
-       int     rc = -EFBIG;
-       int     i;
-       int     j;
+       char *tmp = buf;
+       int rc;
+       int i;
+       int j;
  
         for (i = 0; i < cptab->ctb_nparts; i++) {
                 if (len <= 0)
-                       goto out;
+                       goto err;
  
                 rc = snprintf(tmp, len, "%d\t:", i);
                 len -= rc;
  
                 if (len <= 0)
-                       goto out;
+                       goto err;
  
                 tmp += rc;
                 for (j = 0; j < cptab->ctb_nparts; j++) {
@@ -238,7 +237,7 @@ int cfs_cpt_distance_print(struct cfs_cpt_table *cptab, char *buf, int len)
                                 j, cptab->ctb_parts[i].cpt_distance[j]);
                         len -= rc;
                         if (len <= 0)
-                               goto out;
+                               goto err;
                         tmp += rc;
                 }
  
@@ -246,12 +245,11 @@ int cfs_cpt_distance_print(struct cfs_cpt_table *cptab, char *buf, int len)
                 tmp++;
                 len--;
         }
-       rc = 0;
- out:
-       if (rc < 0)
-               return rc;
  
         return tmp - buf;
+
+err:
+       return -E2BIG;
  }
  EXPORT_SYMBOL(cfs_cpt_distance_print);
  
@@ -445,8 +443,15 @@ int cfs_cpt_set_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu)
                 return 0;
         }
  
-       LASSERT(!cpumask_test_cpu(cpu, cptab->ctb_cpumask));
-       LASSERT(!cpumask_test_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask));
+       if (cpumask_test_cpu(cpu, cptab->ctb_cpumask)) {
+               CDEBUG(D_INFO, "CPU %d is already in cpumask\n", cpu);
+               return 0;
+       }
+       if (cpumask_test_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask)) {
+               CDEBUG(D_INFO, "CPU %d is already in partition %d cpumask\n",
+                               cpu, cptab->ctb_cpu2cpt[cpu]);
+               return 0;
+       }
  
         cfs_cpt_add_cpu(cptab, cpt, cpu);
         cfs_cpt_add_node(cptab, cpt, cpu_to_node(cpu));
@@ -513,15 +518,17 @@ void cfs_cpt_unset_cpumask(struct cfs_cpt_table *cptab, int cpt,
  {
         int cpu;
  
-       for_each_cpu(cpu, mask)
-               cfs_cpt_unset_cpu(cptab, cpt, cpu);
+       for_each_cpu(cpu, mask) {
+               cfs_cpt_del_cpu(cptab, cpt, cpu);
+               cfs_cpt_del_node(cptab, cpt, cpu_to_node(cpu));
+       }
  }
  EXPORT_SYMBOL(cfs_cpt_unset_cpumask);
  
  int cfs_cpt_set_node(struct cfs_cpt_table *cptab, int cpt, int node)
  {
         const cpumask_t *mask;
-       int             cpu;
+       int cpu;
  
         if (node < 0 || node >= nr_node_ids) {
                 CDEBUG(D_INFO,
@@ -563,12 +570,10 @@ EXPORT_SYMBOL(cfs_cpt_unset_node);
  int cfs_cpt_set_nodemask(struct cfs_cpt_table *cptab, int cpt,
                          const nodemask_t *mask)
  {
-       int     i;
+       int node;
  
-       for_each_node_mask(i, *mask) {
-               if (!cfs_cpt_set_node(cptab, cpt, i))
-                       return 0;
-       }
+       for_each_node_mask(node, *mask)
+               cfs_cpt_set_node(cptab, cpt, node);
  
         return 1;
  }
@@ -577,42 +582,42 @@ EXPORT_SYMBOL(cfs_cpt_set_nodemask);
  void cfs_cpt_unset_nodemask(struct cfs_cpt_table *cptab, int cpt,
                             const nodemask_t *mask)
  {
-       int     i;
+       int node;
  
-       for_each_node_mask(i, *mask)
-               cfs_cpt_unset_node(cptab, cpt, i);
+       for_each_node_mask(node, *mask)
+               cfs_cpt_unset_node(cptab, cpt, node);
  }
  EXPORT_SYMBOL(cfs_cpt_unset_nodemask);
  
  int cfs_cpt_spread_node(struct cfs_cpt_table *cptab, int cpt)
  {
-       nodemask_t      *mask;
-       int             weight;
-       int             rotor;
-       int             node;
+       nodemask_t *mask;
+       int weight;
+       int rotor;
+       int node = 0;
  
         /* convert CPU partition ID to HW node id */
  
         if (cpt < 0 || cpt >= cptab->ctb_nparts) {
-               mask = cptab->ctb_nodemask;
+               mask  = cptab->ctb_nodemask;
                 rotor = cptab->ctb_spread_rotor++;
         } else {
-               mask = cptab->ctb_parts[cpt].cpt_nodemask;
+               mask  = cptab->ctb_parts[cpt].cpt_nodemask;
                 rotor = cptab->ctb_parts[cpt].cpt_spread_rotor++;
+               node  = cptab->ctb_parts[cpt].cpt_node;
         }
  
         weight = nodes_weight(*mask);
-       LASSERT(weight > 0);
-
-       rotor %= weight;
+       if (weight > 0) {
+               rotor %= weight;
  
-       for_each_node_mask(node, *mask) {
-               if (rotor-- == 0)
-                       return node;
+               for_each_node_mask(node, *mask) {
+                       if (rotor-- == 0)
+                               return node;
+               }
         }
  
-       LBUG();
-       return 0;
+       return node;
  }
  EXPORT_SYMBOL(cfs_cpt_spread_node);
  
@@ -653,10 +658,10 @@ EXPORT_SYMBOL(cfs_cpt_of_node);
  
  int cfs_cpt_bind(struct cfs_cpt_table *cptab, int cpt)
  {
-       cpumask_t       *cpumask;
-       nodemask_t      *nodemask;
-       int             rc;
-       int             i;
+       nodemask_t *nodemask;
+       cpumask_t *cpumask;
+       int cpu;
+       int rc;
  
         LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));
  
@@ -675,8 +680,8 @@ int cfs_cpt_bind(struct cfs_cpt_table *cptab, int cpt)
                 return -EINVAL;
         }
  
-       for_each_online_cpu(i) {
-               if (cpumask_test_cpu(i, cpumask))
+       for_each_online_cpu(cpu) {
+               if (cpumask_test_cpu(cpu, cpumask))
                         continue;
  
                 rc = set_cpus_allowed_ptr(current, cpumask);
@@ -697,55 +702,55 @@ EXPORT_SYMBOL(cfs_cpt_bind);
   * We always prefer to choose CPU in the same core/socket.
   */
  static int cfs_cpt_choose_ncpus(struct cfs_cpt_table *cptab, int cpt,
-                               cpumask_t *node, int number)
+                               cpumask_t *node_mask, int number)
  {
-       cpumask_t *socket = NULL;
-       cpumask_t *core = NULL;
+       cpumask_t *socket_mask = NULL;
+       cpumask_t *core_mask = NULL;
         int rc = 0;
         int cpu;
         int i;
  
         LASSERT(number > 0);
  
-       if (number >= cpumask_weight(node)) {
-               while (!cpumask_empty(node)) {
-                       cpu = cpumask_first(node);
+       if (number >= cpumask_weight(node_mask)) {
+               while (!cpumask_empty(node_mask)) {
+                       cpu = cpumask_first(node_mask);
+                       cpumask_clear_cpu(cpu, node_mask);
+
+                       if (!cpu_online(cpu))
+                               continue;
  
                         rc = cfs_cpt_set_cpu(cptab, cpt, cpu);
                         if (!rc)
                                 return -EINVAL;
-                       cpumask_clear_cpu(cpu, node);
                 }
                 return 0;
         }
  
         /* allocate scratch buffer */
-       LIBCFS_ALLOC(socket, cpumask_size());
-       LIBCFS_ALLOC(core, cpumask_size());
-       if (socket == NULL || core == NULL) {
+       LIBCFS_ALLOC(socket_mask, cpumask_size());
+       LIBCFS_ALLOC(core_mask, cpumask_size());
+       if (socket_mask == NULL || core_mask == NULL) {
                 rc = -ENOMEM;
                 goto out;
         }
  
-       while (!cpumask_empty(node)) {
-               cpu = cpumask_first(node);
+       while (!cpumask_empty(node_mask)) {
+               cpu = cpumask_first(node_mask);
  
                 /* get cpumask for cores in the same socket */
-               cpumask_copy(socket, topology_core_cpumask(cpu));
-               cpumask_and(socket, socket, node);
-
-               LASSERT(!cpumask_empty(socket));
-
-               while (!cpumask_empty(socket)) {
+               cpumask_and(socket_mask, topology_core_cpumask(cpu), node_mask);
+               while (!cpumask_empty(socket_mask)) {
                         /* get cpumask for hts in the same core */
-                       cpumask_copy(core, topology_sibling_cpumask(cpu));
-                       cpumask_and(core, core, node);
+                       cpumask_and(core_mask,
+                                   topology_sibling_cpumask(cpu), node_mask);
  
-                       LASSERT(!cpumask_empty(core));
+                       for_each_cpu(i, core_mask) {
+                               cpumask_clear_cpu(i, socket_mask);
+                               cpumask_clear_cpu(i, node_mask);
  
-                       for_each_cpu(i, core) {
-                               cpumask_clear_cpu(i, socket);
-                               cpumask_clear_cpu(i, node);
+                               if (!cpu_online(i))
+                                       continue;
  
                                 rc = cfs_cpt_set_cpu(cptab, cpt, i);
                                 if (!rc) {
@@ -756,15 +761,15 @@ static int cfs_cpt_choose_ncpus(struct cfs_cpt_table *cptab, int cpt,
                                 if (--number == 0)
                                         goto out;
                         }
-                       cpu = cpumask_first(socket);
+                       cpu = cpumask_first(socket_mask);
                 }
         }
  
  out:
-       if (socket != NULL)
-               LIBCFS_FREE(socket, cpumask_size());
-       if (core != NULL)
-               LIBCFS_FREE(core, cpumask_size());
+       if (core_mask != NULL)
+               LIBCFS_FREE(core_mask, cpumask_size());
+       if (socket_mask != NULL)
+               LIBCFS_FREE(socket_mask, cpumask_size());
         return rc;
  }
  
@@ -784,7 +789,8 @@ static int cfs_cpt_num_estimate(void)
         /* generate reasonable number of CPU partitions based on total number
          * of CPUs, Preferred N should be power2 and match this condition:
          * 2 * (N - 1)^2 < NCPUS <= 2 * N^2 */
-       for (ncpt = 2; ncpu > 2 * ncpt * ncpt; ncpt <<= 1) {}
+       for (ncpt = 2; ncpu > 2 * ncpt * ncpt; ncpt <<= 1)
+               ;
  
         if (ncpt <= nnode) { /* fat numa system */
                 while (nnode > ncpt)
@@ -812,29 +818,22 @@ out:
  static struct cfs_cpt_table *cfs_cpt_table_create(int ncpt)
  {
         struct cfs_cpt_table *cptab = NULL;
-       cpumask_t *mask = NULL;
+       cpumask_t *node_mask = NULL;
         int cpt = 0;
+       int node;
         int num;
-       int rc;
-       int i;
+       int rem;
+       int rc = 0;
  
-       rc = cfs_cpt_num_estimate();
+       num = cfs_cpt_num_estimate();
         if (ncpt <= 0)
-               ncpt = rc;
+               ncpt = num;
  
-       if (ncpt > num_online_cpus() || ncpt > 4 * rc) {
+       if (ncpt > num_online_cpus() || ncpt > 4 * num) {
                 CWARN("CPU partition number %d is larger than suggested "
-                     "value (%d), your system may have performance"
+                     "value (%d), your system may have performance "
                       "issue or run out of memory while under pressure\n",
-                     ncpt, rc);
-       }
-
-       if (num_online_cpus() % ncpt != 0) {
-               CERROR("CPU number %d is not multiple of cpu_npartition %d, "
-                      "please try different cpu_npartitions value or"
-                      "set pattern string by cpu_pattern=STRING\n",
-                      (int)num_online_cpus(), ncpt);
-               goto failed;
+                     ncpt, num);
         }
  
         cptab = cfs_cpt_table_alloc(ncpt);
@@ -843,67 +842,44 @@ static struct cfs_cpt_table *cfs_cpt_table_create(int ncpt)
                 goto failed;
         }
  
-       num = num_online_cpus() / ncpt;
-       if (num == 0) {
-               CERROR("CPU changed while setting CPU partition\n");
-               goto failed;
-       }
-
-       LIBCFS_ALLOC(mask, cpumask_size());
-       if (mask == NULL) {
+       LIBCFS_ALLOC(node_mask, cpumask_size());
+       if (node_mask == NULL) {
                 CERROR("Failed to allocate scratch cpumask\n");
                 goto failed;
         }
  
-       for_each_online_node(i) {
-               cpumask_copy(mask, cpumask_of_node(i));
-
-               while (!cpumask_empty(mask)) {
-                       struct cfs_cpu_partition *part;
-                       int n;
-
-                       /* Each emulated NUMA node has all allowed CPUs in
-                        * the mask.
-                        * End loop when all partitions have assigned CPUs.
-                        */
-                       if (cpt == ncpt)
-                               break;
-
-                       part = &cptab->ctb_parts[cpt];
+       num = num_online_cpus() / ncpt;
+       rem = num_online_cpus() % ncpt;
+       for_each_online_node(node) {
+               cpumask_copy(node_mask, cpumask_of_node(node));
  
-                       n = num - cpumask_weight(part->cpt_cpumask);
-                       LASSERT(n > 0);
+               while (cpt < ncpt && !cpumask_empty(node_mask)) {
+                       struct cfs_cpu_partition *part = &cptab->ctb_parts[cpt];
+                       int ncpu = cpumask_weight(part->cpt_cpumask);
  
-                       rc = cfs_cpt_choose_ncpus(cptab, cpt, mask, n);
+                       rc = cfs_cpt_choose_ncpus(cptab, cpt, node_mask,
+                                                 num - ncpu);
                         if (rc < 0)
                                 goto failed;
  
-                       LASSERT(num >= cpumask_weight(part->cpt_cpumask));
-                       if (num == cpumask_weight(part->cpt_cpumask))
+                       ncpu = cpumask_weight(part->cpt_cpumask);
+                       if (ncpu == num + !!(rem > 0)) {
                                 cpt++;
+                               rem--;
+                       }
                 }
         }
  
-       if (cpt != ncpt ||
-           num != cpumask_weight(cptab->ctb_parts[ncpt - 1].cpt_cpumask)) {
-               CERROR("Expect %d(%d) CPU partitions but got %d(%d), "
-                      "CPU hotplug/unplug while setting?\n",
-                      cptab->ctb_nparts, num, cpt,
-                      cpumask_weight(cptab->ctb_parts[ncpt - 1].cpt_cpumask));
-               goto failed;
-       }
-
-       LIBCFS_FREE(mask, cpumask_size());
-
+       LIBCFS_FREE(node_mask, cpumask_size());
         return cptab;
  
- failed:
-       CERROR("Failed to setup CPU-partition-table with %d "
-              "CPU-partitions, online HW nodes: %d, HW cpus: %d.\n",
-              ncpt, num_online_nodes(), num_online_cpus());
+failed:
+       CERROR("Failed (rc=%d) to setup CPU partition table with %d "
+               "partitions, online HW NUMA nodes: %d, HW CPU cores: %d.\n",
+               rc, ncpt, num_online_nodes(), num_online_cpus());
  
-       if (mask != NULL)
-               LIBCFS_FREE(mask, cpumask_size());
+       if (node_mask != NULL)
+               LIBCFS_FREE(node_mask, cpumask_size());
  
         if (cptab != NULL)
                 cfs_cpt_table_free(cptab);