*/
int cfs_cpt_table_print(struct cfs_cpt_table *cptab, char *buf, int len);
/**
+ * print distance information of cpt-table
+ */
+int cfs_cpt_distance_print(struct cfs_cpt_table *cptab, char *buf, int len);
+/**
* return total number of CPU partitions in \a cptab
*/
int
*/
int cfs_cpt_of_cpu(struct cfs_cpt_table *cptab, int cpu);
/**
+ * shadow HW node ID \a node to CPU-partition ID by \a cptab
+ */
+int cfs_cpt_of_node(struct cfs_cpt_table *cptab, int node);
+/**
+ * NUMA distance between \a cpt1 and \a cpt2 in \a cptab
+ */
+unsigned cfs_cpt_distance(struct cfs_cpt_table *cptab, int cpt1, int cpt2);
+/**
* bind current thread on a CPU-partition \a cpt of \a cptab
*/
int cfs_cpt_bind(struct cfs_cpt_table *cptab, int cpt);
#define IOC_LIBCFS_ADD_LOCAL_NI _IOWR(IOC_LIBCFS_TYPE, 95, IOCTL_CONFIG_SIZE)
#define IOC_LIBCFS_DEL_LOCAL_NI _IOWR(IOC_LIBCFS_TYPE, 96, IOCTL_CONFIG_SIZE)
#define IOC_LIBCFS_GET_LOCAL_NI _IOWR(IOC_LIBCFS_TYPE, 97, IOCTL_CONFIG_SIZE)
-#define IOC_LIBCFS_DBG _IOWR(IOC_LIBCFS_TYPE, 98, IOCTL_CONFIG_SIZE)
-#define IOC_LIBCFS_MAX_NR 98
+#define IOC_LIBCFS_SET_NUMA_RANGE _IOWR(IOC_LIBCFS_TYPE, 98, IOCTL_CONFIG_SIZE)
+#define IOC_LIBCFS_GET_NUMA_RANGE _IOWR(IOC_LIBCFS_TYPE, 99, IOCTL_CONFIG_SIZE)
+#define IOC_LIBCFS_DBG _IOWR(IOC_LIBCFS_TYPE, 100, IOCTL_CONFIG_SIZE)
+#define IOC_LIBCFS_MAX_NR 100
extern int libcfs_ioctl_data_adjust(struct libcfs_ioctl_data *data);
cpumask_t *cpt_cpumask;
/* nodes mask for this partition */
nodemask_t *cpt_nodemask;
+ /* NUMA distance between CPTs */
+ unsigned *cpt_distance;
/* spread rotor for NUMA allocator */
unsigned cpt_spread_rotor;
};
struct cfs_cpt_table {
/* spread rotor for NUMA allocator */
unsigned ctb_spread_rotor;
+ /* maximum NUMA distance between all nodes in table */
+ unsigned ctb_distance;
/* # of CPU partitions */
unsigned ctb_nparts;
/* partitions tables */
int *ctb_cpu2cpt;
/* all cpus in this partition table */
cpumask_t *ctb_cpumask;
+ /* shadow HW node to CPU partition ID */
+ int *ctb_node2cpt;
/* all nodes in this partition table */
nodemask_t *ctb_nodemask;
};
#define CFS_CPU_VERSION_MAGIC 0xbabecafe
+#define CFS_CPT_DISTANCE 1 /* Arbitrary positive value */
+
struct cfs_cpt_table *
cfs_cpt_table_alloc(unsigned int ncpt)
{
LIBCFS_ALLOC(cptab, sizeof(*cptab));
if (cptab != NULL) {
cptab->ctb_version = CFS_CPU_VERSION_MAGIC;
+ cpu_set(0, cptab->ctb_cpumask);
node_set(0, cptab->ctb_nodemask);
cptab->ctb_nparts = ncpt;
}
EXPORT_SYMBOL(cfs_cpt_table_print);
int
+/*
+ * Print the (trivial) distance map of a UP cpt-table into \a buf:
+ * a single line "0\t: 0:<CFS_CPT_DISTANCE>".  Returns the number of
+ * bytes written, or -EFBIG when \a len is too small.
+ */
+cfs_cpt_distance_print(struct cfs_cpt_table *cptab, char *buf, int len)
+{
+	int rc = 0;
+
+	/* three conversions need three arguments: cpt index, peer cpt
+	 * index and the distance (the original passed only two) */
+	rc = snprintf(buf, len, "%d\t: %d:%d\n", 0, 0, CFS_CPT_DISTANCE);
+	len -= rc;
+	if (len <= 0)
+		return -EFBIG;
+
+	return rc;
+}
+EXPORT_SYMBOL(cfs_cpt_distance_print);
+
+int
cfs_cpt_number(struct cfs_cpt_table *cptab)
{
return 1;
}
EXPORT_SYMBOL(cfs_cpt_online);
+/*
+ * Return the cpumask covering CPU partition \a cpt.
+ * UP build: a single partition, so hand back the table-wide mask.
+ * NOTE(review): this returns &cptab->ctb_mask while cfs_cpt_table_alloc()
+ * above uses cptab->ctb_cpumask -- confirm which field name the UP
+ * struct cfs_cpt_table actually declares; one of the two looks wrong.
+ */
+cpumask_t *
+cfs_cpt_cpumask(struct cfs_cpt_table *cptab, int cpt)
+{
+	return &cptab->ctb_mask;
+}
+EXPORT_SYMBOL(cfs_cpt_cpumask);
+
+/*
+ * Return the nodemask covering CPU partition \a cpt.
+ * UP build: a single partition, so hand back the table-wide nodemask.
+ */
nodemask_t *
cfs_cpt_nodemask(struct cfs_cpt_table *cptab, int cpt)
{
	return &cptab->ctb_nodemask;
}
-EXPORT_SYMBOL(cfs_cpt_cpumask);
+EXPORT_SYMBOL(cfs_cpt_nodemask);
+
+/*
+ * NUMA distance between partitions \a cpt1 and \a cpt2.
+ * UP build: only one partition exists, so every distance collapses to
+ * the arbitrary positive constant CFS_CPT_DISTANCE.
+ */
+unsigned
+cfs_cpt_distance(struct cfs_cpt_table *cptab, int cpt1, int cpt2)
+{
+	return CFS_CPT_DISTANCE;
+}
+EXPORT_SYMBOL(cfs_cpt_distance);
int
cfs_cpt_set_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu)
EXPORT_SYMBOL(cfs_cpt_of_cpu);
+/*
+ * Map HW NUMA node \a node to a CPU-partition ID.
+ * UP build: everything lives in partition 0.
+ */
int
+cfs_cpt_of_node(struct cfs_cpt_table *cptab, int node)
+{
+	return 0;
+}
+EXPORT_SYMBOL(cfs_cpt_of_node);
+
+int
cfs_cpt_bind(struct cfs_cpt_table *cptab, int cpt)
{
return 0;
if (cptab->ctb_cpu2cpt != NULL) {
LIBCFS_FREE(cptab->ctb_cpu2cpt,
- num_possible_cpus() *
- sizeof(cptab->ctb_cpu2cpt[0]));
+ nr_cpu_ids * sizeof(cptab->ctb_cpu2cpt[0]));
+ }
+
+ if (cptab->ctb_node2cpt != NULL) {
+ LIBCFS_FREE(cptab->ctb_node2cpt,
+ nr_node_ids * sizeof(cptab->ctb_node2cpt[0]));
}
for (i = 0; cptab->ctb_parts != NULL && i < cptab->ctb_nparts; i++) {
if (part->cpt_cpumask != NULL)
LIBCFS_FREE(part->cpt_cpumask, cpumask_size());
+
+ if (part->cpt_distance) {
+ LIBCFS_FREE(part->cpt_distance,
+ cptab->ctb_nparts *
+ sizeof(part->cpt_distance[0]));
+ }
}
if (cptab->ctb_parts != NULL) {
goto failed;
LIBCFS_ALLOC(cptab->ctb_cpu2cpt,
- num_possible_cpus() * sizeof(cptab->ctb_cpu2cpt[0]));
+ nr_cpu_ids * sizeof(cptab->ctb_cpu2cpt[0]));
if (cptab->ctb_cpu2cpt == NULL)
goto failed;
memset(cptab->ctb_cpu2cpt, -1,
- num_possible_cpus() * sizeof(cptab->ctb_cpu2cpt[0]));
+ nr_cpu_ids * sizeof(cptab->ctb_cpu2cpt[0]));
+
+ LIBCFS_ALLOC(cptab->ctb_node2cpt,
+ nr_node_ids * sizeof(cptab->ctb_node2cpt[0]));
+ if (cptab->ctb_node2cpt == NULL)
+ goto failed;
+
+ memset(cptab->ctb_node2cpt, -1,
+ nr_node_ids * sizeof(cptab->ctb_node2cpt[0]));
LIBCFS_ALLOC(cptab->ctb_parts, ncpt * sizeof(cptab->ctb_parts[0]));
if (cptab->ctb_parts == NULL)
struct cfs_cpu_partition *part = &cptab->ctb_parts[i];
LIBCFS_ALLOC(part->cpt_cpumask, cpumask_size());
+ if (!part->cpt_cpumask)
+ goto failed;
+
LIBCFS_ALLOC(part->cpt_nodemask, sizeof(*part->cpt_nodemask));
- if (part->cpt_cpumask == NULL || part->cpt_nodemask == NULL)
+ if (!part->cpt_nodemask)
+ goto failed;
+
+ LIBCFS_ALLOC(part->cpt_distance,
+ cptab->ctb_nparts * sizeof(part->cpt_distance[0]));
+ if (!part->cpt_distance)
goto failed;
}
cfs_cpt_table_print(struct cfs_cpt_table *cptab, char *buf, int len)
{
char *tmp = buf;
- int rc = 0;
+ int rc = -EFBIG;
int i;
int j;
for (i = 0; i < cptab->ctb_nparts; i++) {
- if (len > 0) {
- rc = snprintf(tmp, len, "%d\t: ", i);
- len -= rc;
- }
+ if (len <= 0)
+ goto out;
- if (len <= 0) {
- rc = -EFBIG;
+ rc = snprintf(tmp, len, "%d\t:", i);
+ len -= rc;
+
+ if (len <= 0)
goto out;
- }
tmp += rc;
for_each_cpu(j, cptab->ctb_parts[i].cpt_cpumask) {
- rc = snprintf(tmp, len, "%d ", j);
+ rc = snprintf(tmp, len, " %d", j);
len -= rc;
- if (len <= 0) {
- rc = -EFBIG;
+ if (len <= 0)
goto out;
- }
tmp += rc;
}
tmp++;
len--;
}
-
-out:
+ rc = 0;
+ out:
if (rc < 0)
return rc;
EXPORT_SYMBOL(cfs_cpt_table_print);
+/*
+ * Render the NUMA distance matrix of \a cptab into \a buf (at most
+ * \a len bytes), one line per partition in the form
+ * "<cpt>\t: <cpt2>:<distance> ...".
+ * Returns the number of bytes written, or -EFBIG if the buffer is
+ * too small (rc stays negative if any snprintf exhausts \a len).
+ */
int
+cfs_cpt_distance_print(struct cfs_cpt_table *cptab, char *buf, int len)
+{
+	char *tmp = buf;
+	int rc = -EFBIG;
+	int i;
+	int j;
+
+	for (i = 0; i < cptab->ctb_nparts; i++) {
+		if (len <= 0)
+			goto out;
+
+		rc = snprintf(tmp, len, "%d\t:", i);
+		len -= rc;
+
+		/* len <= 0 means snprintf truncated: report -EFBIG below */
+		if (len <= 0)
+			goto out;
+
+		tmp += rc;
+		for (j = 0; j < cptab->ctb_parts.cpt_distance[j]);
+			rc = snprintf(tmp, len, " %d:%d",
+				      j, cptab->ctb_parts[i].cpt_distance[j]);
+			len -= rc;
+			if (len <= 0)
+				goto out;
+			tmp += rc;
+		}
+
+		/* len >= 1 is guaranteed here by the checks above */
+		*tmp = '\n';
+		tmp++;
+		len--;
+	}
+	rc = 0;
+ out:
+	if (rc < 0)
+		return rc;
+
+	return tmp - buf;
+}
+EXPORT_SYMBOL(cfs_cpt_distance_print);
+
+int
cfs_cpt_number(struct cfs_cpt_table *cptab)
{
return cptab->ctb_nparts;
}
EXPORT_SYMBOL(cfs_cpt_nodemask);
+/*
+ * NUMA distance between partitions \a cpt1 and \a cpt2 of \a cptab.
+ * If either side is CFS_CPT_ANY, return the table-wide maximum
+ * distance (ctb_distance) instead of a pairwise value.
+ */
+unsigned
+cfs_cpt_distance(struct cfs_cpt_table *cptab, int cpt1, int cpt2)
+{
+	LASSERT(cpt1 == CFS_CPT_ANY || (cpt1 >= 0 && cpt1 < cptab->ctb_nparts));
+	LASSERT(cpt2 == CFS_CPT_ANY || (cpt2 >= 0 && cpt2 < cptab->ctb_nparts));
+
+	if (cpt1 == CFS_CPT_ANY || cpt2 == CFS_CPT_ANY)
+		return cptab->ctb_distance;
+
+	return cptab->ctb_parts[cpt1].cpt_distance[cpt2];
+}
+EXPORT_SYMBOL(cfs_cpt_distance);
+
+/*
+ * Calculate the maximum NUMA distance between all nodes in the
+ * from_mask and all nodes in the to_mask.
+ * Returns 0 when either mask is empty (no pair to measure).
+ */
+static unsigned
+cfs_cpt_distance_calculate(nodemask_t *from_mask, nodemask_t *to_mask)
+{
+	unsigned maximum;
+	unsigned distance;
+	int to;
+	int from;
+
+	maximum = 0;
+	for_each_node_mask(from, *from_mask) {
+		for_each_node_mask(to, *to_mask) {
+			distance = node_distance(from, to);
+			if (maximum < distance)
+				maximum = distance;
+		}
+	}
+	return maximum;
+}
+
+/*
+ * Record that \a cpu belongs to partition \a cpt: update the cpu->cpt
+ * reverse map plus the global and per-partition cpumasks.
+ */
+static void cfs_cpt_add_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu)
+{
+	cptab->ctb_cpu2cpt[cpu] = cpt;
+
+	cpumask_set_cpu(cpu, cptab->ctb_cpumask);
+	cpumask_set_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask);
+}
+
+/*
+ * Remove \a cpu from partition \a cpt: clear it from both cpumasks
+ * and invalidate the cpu->cpt reverse-map entry (-1 == unassigned).
+ */
+static void cfs_cpt_del_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu)
+{
+	cpumask_clear_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask);
+	cpumask_clear_cpu(cpu, cptab->ctb_cpumask);
+
+	cptab->ctb_cpu2cpt[cpu] = -1;
+}
+
+/*
+ * Account NUMA node \a node as part of partition \a cpt, maintaining
+ * the node->cpt reverse map, the table-wide maximum distance and the
+ * pairwise cpt distance matrix.
+ */
+static void cfs_cpt_add_node(struct cfs_cpt_table *cptab, int cpt, int node)
+{
+	int cpt2;
+	struct cfs_cpu_partition *part;
+	struct cfs_cpu_partition *part2;
+
+	if (!node_isset(node, *cptab->ctb_nodemask)) {
+		/* first time node is added to the CPT table */
+		node_set(node, *cptab->ctb_nodemask);
+		cptab->ctb_node2cpt[node] = cpt;
+		/* table-wide max distance may have grown */
+		cptab->ctb_distance = cfs_cpt_distance_calculate(
+			cptab->ctb_nodemask,
+			cptab->ctb_nodemask);
+	}
+
+	part = &cptab->ctb_parts[cpt];
+	if (!node_isset(node, *part->cpt_nodemask)) {
+		/* first time node is added to this CPT */
+		node_set(node, *part->cpt_nodemask);
+		/* refresh both directions of the distance matrix row/column
+		 * for this cpt; symmetry of node_distance() is not assumed */
+		for (cpt2 = 0; cpt2 < cptab->ctb_nparts; cpt2++) {
+			part2 = &cptab->ctb_parts[cpt2];
+			part->cpt_distance[cpt2] = cfs_cpt_distance_calculate(
+				part->cpt_nodemask,
+				part2->cpt_nodemask);
+			part2->cpt_distance[cpt] = cfs_cpt_distance_calculate(
+				part2->cpt_nodemask,
+				part->cpt_nodemask);
+		}
+	}
+}
+
+/*
+ * Drop NUMA node \a node from partition \a cpt if no CPU of that node
+ * remains in the partition, updating the distance matrix, the
+ * node->cpt reverse map and the table-wide nodemask/distance.
+ */
+static void cfs_cpt_del_node(struct cfs_cpt_table *cptab, int cpt, int node)
+{
+	int cpu;
+	int cpt2;
+	struct cfs_cpu_partition *part;
+	struct cfs_cpu_partition *part2;
+
+	part = &cptab->ctb_parts[cpt];
+
+	for_each_cpu(cpu, part->cpt_cpumask) {
+		/* this CPT has other CPU belonging to this node? */
+		if (cpu_to_node(cpu) == node)
+			break;
+	}
+
+	/* cpu >= nr_cpu_ids means the loop above found no remaining CPU */
+	if (cpu >= nr_cpu_ids && node_isset(node, *part->cpt_nodemask)) {
+		/* No more CPUs in the node for this CPT. */
+		node_clear(node, *part->cpt_nodemask);
+		for (cpt2 = 0; cpt2 < cptab->ctb_nparts; cpt2++) {
+			part2 = &cptab->ctb_parts[cpt2];
+			/* re-point node to some other partition still
+			 * holding it (last match wins) */
+			if (node_isset(node, *part2->cpt_nodemask))
+				cptab->ctb_node2cpt[node] = cpt2;
+			part->cpt_distance[cpt2] = cfs_cpt_distance_calculate(
+				part->cpt_nodemask,
+				part2->cpt_nodemask);
+			part2->cpt_distance[cpt] = cfs_cpt_distance_calculate(
+				part2->cpt_nodemask,
+				part->cpt_nodemask);
+		}
+	}
+
+	for_each_cpu(cpu, cptab->ctb_cpumask) {
+		/* this CPT-table has other CPUs belonging to this node? */
+		if (cpu_to_node(cpu) == node)
+			break;
+	}
+
+	if (cpu >= nr_cpu_ids && node_isset(node, *cptab->ctb_nodemask)) {
+		/* No more CPUs in the table for this node. */
+		node_clear(node, *cptab->ctb_nodemask);
+		cptab->ctb_node2cpt[node] = -1;
+		cptab->ctb_distance =
+			cfs_cpt_distance_calculate(cptab->ctb_nodemask,
+						   cptab->ctb_nodemask);
+	}
+}
+
int
cfs_cpt_set_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu)
{
- int node;
-
LASSERT(cpt >= 0 && cpt < cptab->ctb_nparts);
if (cpu < 0 || cpu >= nr_cpu_ids || !cpu_online(cpu)) {
return 0;
}
- cptab->ctb_cpu2cpt[cpu] = cpt;
-
LASSERT(!cpumask_test_cpu(cpu, cptab->ctb_cpumask));
LASSERT(!cpumask_test_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask));
- cpumask_set_cpu(cpu, cptab->ctb_cpumask);
- cpumask_set_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask);
-
- node = cpu_to_node(cpu);
-
- /* first CPU of @node in this CPT table */
- if (!node_isset(node, *cptab->ctb_nodemask))
- node_set(node, *cptab->ctb_nodemask);
-
- /* first CPU of @node in this partition */
- if (!node_isset(node, *cptab->ctb_parts[cpt].cpt_nodemask))
- node_set(node, *cptab->ctb_parts[cpt].cpt_nodemask);
+ cfs_cpt_add_cpu(cptab, cpt, cpu);
+ cfs_cpt_add_node(cptab, cpt, cpu_to_node(cpu));
return 1;
}
void
cfs_cpt_unset_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu)
{
- int node;
- int i;
-
LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));
if (cpu < 0 || cpu >= nr_cpu_ids) {
LASSERT(cpumask_test_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask));
LASSERT(cpumask_test_cpu(cpu, cptab->ctb_cpumask));
- cpumask_clear_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask);
- cpumask_clear_cpu(cpu, cptab->ctb_cpumask);
- cptab->ctb_cpu2cpt[cpu] = -1;
-
- node = cpu_to_node(cpu);
-
- LASSERT(node_isset(node, *cptab->ctb_parts[cpt].cpt_nodemask));
- LASSERT(node_isset(node, *cptab->ctb_nodemask));
-
- for_each_cpu(i, cptab->ctb_parts[cpt].cpt_cpumask) {
- /* this CPT has other CPU belonging to this node? */
- if (cpu_to_node(i) == node)
- break;
- }
-
- if (i >= nr_cpu_ids)
- node_clear(node, *cptab->ctb_parts[cpt].cpt_nodemask);
-
- for_each_cpu(i, cptab->ctb_cpumask) {
- /* this CPT-table has other CPU belonging to this node? */
- if (cpu_to_node(i) == node)
- break;
- }
-
- if (i >= nr_cpu_ids)
- node_clear(node, *cptab->ctb_nodemask);
+ cfs_cpt_del_cpu(cptab, cpt, cpu);
+ cfs_cpt_del_node(cptab, cpt, cpu_to_node(cpu));
}
EXPORT_SYMBOL(cfs_cpt_unset_cpu);
}
for_each_cpu(cpu, mask) {
- if (!cfs_cpt_set_cpu(cptab, cpt, cpu))
- return 0;
+ cfs_cpt_add_cpu(cptab, cpt, cpu);
+ cfs_cpt_add_node(cptab, cpt, cpu_to_node(cpu));
}
return 1;
cfs_cpt_set_node(struct cfs_cpt_table *cptab, int cpt, int node)
{
const cpumask_t *mask;
- int rc;
+ int cpu;
- if (node < 0 || node >= MAX_NUMNODES) {
+ if (node < 0 || node >= nr_node_ids) {
CDEBUG(D_INFO,
"Invalid NUMA id %d for CPU partition %d\n", node, cpt);
return 0;
}
mask = cpumask_of_node(node);
- rc = cfs_cpt_set_cpumask(cptab, cpt, mask);
- return rc;
+ for_each_cpu(cpu, mask)
+ cfs_cpt_add_cpu(cptab, cpt, cpu);
+
+ cfs_cpt_add_node(cptab, cpt, node);
+
+ return 1;
}
EXPORT_SYMBOL(cfs_cpt_set_node);
cfs_cpt_unset_node(struct cfs_cpt_table *cptab, int cpt, int node)
{
const cpumask_t *mask;
+ int cpu;
- if (node < 0 || node >= MAX_NUMNODES) {
+ if (node < 0 || node >= nr_node_ids) {
CDEBUG(D_INFO,
"Invalid NUMA id %d for CPU partition %d\n", node, cpt);
return;
}
mask = cpumask_of_node(node);
- cfs_cpt_unset_cpumask(cptab, cpt, mask);
+ for_each_cpu(cpu, mask)
+ cfs_cpt_del_cpu(cptab, cpt, cpu);
+
+ cfs_cpt_del_node(cptab, cpt, node);
}
EXPORT_SYMBOL(cfs_cpt_unset_node);
EXPORT_SYMBOL(cfs_cpt_of_cpu);
+/*
+ * Map HW NUMA node \a node to its CPU-partition ID, or CFS_CPT_ANY
+ * when the node ID is invalid or currently owned by no partition.
+ */
int
+cfs_cpt_of_node(struct cfs_cpt_table *cptab, int node)
+{
+	/* ctb_node2cpt[] is allocated with nr_node_ids entries, so the
+	 * valid index range is [0, nr_node_ids): reject with >=, not >,
+	 * or node == nr_node_ids reads one past the end of the array. */
+	if (node < 0 || node >= nr_node_ids)
+		return CFS_CPT_ANY;
+
+	return cptab->ctb_node2cpt[node];
+}
+EXPORT_SYMBOL(cfs_cpt_of_node);
+
+int
cfs_cpt_bind(struct cfs_cpt_table *cptab, int cpt)
{
cpumask_t *cpumask;
return cptab;
}
- high = node ? MAX_NUMNODES - 1 : nr_cpu_ids - 1;
+ high = node ? nr_node_ids - 1 : nr_cpu_ids - 1;
for (str = cfs_trimwhite(pattern), c = 0;; c++) {
struct cfs_range_expr *range;
__proc_cpt_table);
}
+/*
+ * Read-only proc handler for "cpu_partition_distance": render the CPT
+ * distance table, doubling the temporary buffer until
+ * cfs_cpt_distance_print() fits, then copy the slice at \a pos out to
+ * the user buffer.
+ */
+static int __proc_cpt_distance(void *data, int write,
+			       loff_t pos, void __user *buffer, int nob)
+{
+	char *buf = NULL;
+	int len = 4096;
+	int rc = 0;
+
+	if (write)
+		return -EPERM;	/* read-only file */
+
+	LASSERT(cfs_cpt_table != NULL);
+
+	while (1) {
+		LIBCFS_ALLOC(buf, len);
+		if (buf == NULL)
+			return -ENOMEM;
+
+		rc = cfs_cpt_distance_print(cfs_cpt_table, buf, len);
+		if (rc >= 0)
+			break;
+
+		if (rc == -EFBIG) {
+			/* buffer too small: double and retry */
+			LIBCFS_FREE(buf, len);
+			len <<= 1;
+			continue;
+		}
+		goto out;
+	}
+
+	/* rc is now the rendered length; nothing left past EOF */
+	if (pos >= rc) {
+		rc = 0;
+		goto out;
+	}
+
+	rc = cfs_trace_copyout_string(buffer, nob, buf + pos, NULL);
+ out:
+	if (buf != NULL)
+		LIBCFS_FREE(buf, len);
+	return rc;
+}
+
+/*
+ * ctl_table hook: dispatch through lprocfs_call_handler() to
+ * __proc_cpt_distance().
+ */
+static int
+proc_cpt_distance(struct ctl_table *table, int write, void __user *buffer,
+		  size_t *lenp, loff_t *ppos)
+{
+	return lprocfs_call_handler(table->data, write, ppos, buffer, lenp,
+				    __proc_cpt_distance);
+}
+
static struct ctl_table lnet_table[] = {
/*
* NB No .strategy entries have been provided since sysctl(8) prefers
},
{
INIT_CTL_NAME
+ .procname = "cpu_partition_distance",
+ .maxlen = 128,
+ .mode = 0444,
+ .proc_handler = &proc_cpt_distance,
+ },
+ {
+ INIT_CTL_NAME
.procname = "debug_log_upcall",
.data = lnet_debug_log_upcall,
.maxlen = sizeof(lnet_debug_log_upcall),
__u32 lic_status;
__u32 lic_tcp_bonding;
__u32 lic_idx;
+ __s32 lic_dev_cpt;
char lic_bulk[0];
};
char prcfg_bulk[0];
};
+/* payload of IOC_LIBCFS_SET_NUMA_RANGE / IOC_LIBCFS_GET_NUMA_RANGE */
+struct lnet_ioctl_numa_range {
+	struct libcfs_ioctl_hdr nr_hdr;
+	/* relative NUMA breadth considered during Multi-Rail NI selection */
+	__u32 nr_range;
+};
+
struct lnet_ioctl_lnet_stats {
struct libcfs_ioctl_hdr st_hdr;
struct lnet_counters st_cntrs;
bool lnet_ni_unique_net(struct list_head *nilist, char *iface);
void lnet_incr_dlc_seq(void);
__u32 lnet_get_dlc_seq_locked(void);
+inline __u32 lnet_get_numa_range(void);
struct lnet_peer_ni *lnet_get_next_peer_ni_locked(struct lnet_peer *peer,
struct lnet_peer_net *peer_net,
/* lnd tunables set explicitly */
bool ni_lnd_tunables_set;
+ /* physical device CPT */
+ int dev_cpt;
+
/* sequence number used to round robin over nis within a net */
__u32 ni_seq;
unsigned long flags;
int rc;
int newdev;
+ int node_id;
LASSERT (ni->ni_net->net_lnd == &the_o2iblnd);
newdev = ibdev == NULL;
/* hmm...create kib_dev even for alias */
if (ibdev == NULL || strcmp(&ibdev->ibd_ifname[0], ifname) != 0)
- ibdev = kiblnd_create_dev(ifname);
+ ibdev = kiblnd_create_dev(ifname);
- if (ibdev == NULL)
- goto failed;
+ if (ibdev == NULL)
+ goto failed;
+
+ node_id = dev_to_node(ibdev->ibd_hdev->ibh_ibdev->dma_device);
+ ni->dev_cpt = cfs_cpt_of_node(lnet_cpt_table(), node_id);
- net->ibn_dev = ibdev;
- ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), ibdev->ibd_ifip);
+ net->ibn_dev = ibdev;
+ ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), ibdev->ibd_ifip);
rc = kiblnd_dev_start_threads(ibdev, newdev,
ni->ni_cpts, ni->ni_ncpts);
* Author: Eric Barton <eric@bartonsoftware.com>
*/
+#include <linux/pci.h>
#include "socklnd.h"
static lnd_t the_ksocklnd;
int
ksocknal_startup (lnet_ni_t *ni)
{
- ksock_net_t *net;
- int rc;
- int i;
+ ksock_net_t *net;
+ int rc;
+ int i;
+ struct net_device *net_dev;
+ int node_id;
LASSERT (ni->ni_net->net_lnd == &the_ksocklnd);
ni->ni_net->net_tunables_set = true;
}
+
if (ni->ni_interfaces[0] == NULL) {
rc = ksocknal_enumerate_interfaces(net);
if (rc <= 0)
strlcpy(net->ksnn_interfaces[i].ksni_name,
ni->ni_interfaces[i],
sizeof(net->ksnn_interfaces[i].ksni_name));
+
}
net->ksnn_ninterfaces = i;
}
+ net_dev = dev_get_by_name(&init_net,
+ net->ksnn_interfaces[0].ksni_name);
+ if (net_dev != NULL) {
+ node_id = dev_to_node(&net_dev->dev);
+ ni->dev_cpt = cfs_cpt_of_node(lnet_cpt_table(), node_id);
+ dev_put(net_dev);
+ } else {
+ ni->dev_cpt = CFS_CPT_ANY;
+ }
+
/* call it before add it to ksocknal_data.ksnd_nets */
rc = ksocknal_net_start_threads(net, ni->ni_cpts, ni->ni_ncpts);
if (rc != 0)
MODULE_PARM_DESC(use_tcp_bonding,
"Set to 1 to use socklnd bonding. 0 to use Multi-Rail");
+static __u32 lnet_numa_range = 0;
+module_param(lnet_numa_range, int, 0444);
+MODULE_PARM_DESC(lnet_numa_range,
+ "NUMA range to consider during Multi-Rail selection");
+
/*
* This sequence number keeps track of how many times DLC was used to
* update the configuration. It is incremented on any DLC update and
cfg_ni->lic_nid = ni->ni_nid;
cfg_ni->lic_status = ni->ni_status->ns_status;
cfg_ni->lic_tcp_bonding = use_tcp_bonding;
+ cfg_ni->lic_dev_cpt = ni->dev_cpt;
memcpy(&tun->lt_cmn, &ni->ni_net->net_tunables, sizeof(tun->lt_cmn));
return atomic_read(&lnet_dlc_seq_no);
}
+/*
+ * Accessor for the lnet_numa_range module parameter.
+ * NOTE(review): non-static "inline" definition paired with an extern
+ * prototype in the header -- relies on kernel (GNU) inline semantics;
+ * confirm this links on all supported toolchains.
+ */
+inline __u32 lnet_get_numa_range(void)
+{
+	return lnet_numa_range;
+}
+
/**
* LNet ioctl handler.
*
mutex_unlock(&the_lnet.ln_api_mutex);
return rc;
+ case IOC_LIBCFS_SET_NUMA_RANGE: {
+ struct lnet_ioctl_numa_range *numa;
+ numa = arg;
+ if (numa->nr_hdr.ioc_len != sizeof(*numa))
+ return -EINVAL;
+ lnet_numa_range = numa->nr_range;
+ return 0;
+ }
+
+ case IOC_LIBCFS_GET_NUMA_RANGE: {
+ struct lnet_ioctl_numa_range *numa;
+ numa = arg;
+ if (numa->nr_hdr.ioc_len != sizeof(*numa))
+ return -EINVAL;
+ numa->nr_range = lnet_numa_range;
+ return 0;
+ }
+
case IOC_LIBCFS_GET_BUF: {
struct lnet_ioctl_pool_cfg *pool_cfg;
size_t total = sizeof(*config) + sizeof(*pool_cfg);
int best_credits = 0;
__u32 seq, seq2;
int best_lpni_credits = INT_MIN;
+ int md_cpt = 0;
+ int shortest_distance = INT_MAX;
+ int distance = 0;
+ bool found_ir = false;
again:
/*
routing = false;
local_net = NULL;
best_ni = NULL;
+ shortest_distance = INT_MAX;
+ found_ir = false;
if (the_lnet.ln_shutdown) {
lnet_net_unlock(cpt);
return -ESHUTDOWN;
}
+ if (msg->msg_md != NULL)
+ /* get the cpt of the MD, used during NUMA based selection */
+ md_cpt = lnet_cpt_of_cookie(msg->msg_md->md_lh.lh_cookie);
+ else
+ md_cpt = CFS_CPT_ANY;
+
/*
* initialize the variables which could be reused if we go to
* again
continue;
/*
- * Second jab at determining best_ni
- * if we get here then the peer we're trying to send
- * to is on a directly connected network, and we'll
- * need to pick the local_ni on that network to send
- * from
+ * Iterate through the NIs in this local Net and select
+ * the NI to send from. The selection is determined by
+ * these 3 criterion in the following priority:
+ * 1. NUMA
+ * 2. NI available credits
+ * 3. Round Robin
*/
while ((ni = lnet_get_next_ni_locked(local_net, ni))) {
if (!lnet_is_ni_healthy_locked(ni))
continue;
- /* TODO: compare NUMA distance */
- if (ni->ni_tx_queues[cpt]->tq_credits <=
- best_credits) {
+
+ /*
+ * calculate the distance from the cpt on which
+ * the message memory is allocated to the CPT of
+ * the NI's physical device
+ */
+ distance = cfs_cpt_distance(lnet_cpt_table(),
+ md_cpt,
+ ni->dev_cpt);
+
+ /*
+ * If we already have a closer NI within the NUMA
+ * range provided, then there is no need to
+ * consider the current NI. Move on to the next
+ * one.
+ */
+ if (distance > shortest_distance &&
+ distance > lnet_get_numa_range())
+ continue;
+
+ if (distance < shortest_distance &&
+ distance > lnet_get_numa_range()) {
/*
- * all we want is to read tq_credits
- * value as an approximation of how
- * busy the NI is. No need to grab a lock
+ * The current NI is the closest one that we
+ * have found, even though it's not in the
+ * NUMA range specified. This occurs if
+ * the NUMA range is less than the least
+ * of the distances in the system.
+ * In effect NUMA range consideration is
+ * turned off.
*/
- continue;
- } else if (best_ni) {
- if ((best_ni)->ni_seq - ni->ni_seq <= 0)
+ shortest_distance = distance;
+ } else if ((distance <= shortest_distance &&
+ distance < lnet_get_numa_range()) ||
+ distance == shortest_distance) {
+ /*
+ * This NI is either within range or it's
+ * equidistant. In both of these cases we
+ * would want to select the NI based on
+ * its available credits first, and then
+ * via Round Robin.
+ */
+ if (distance <= shortest_distance &&
+ distance < lnet_get_numa_range()) {
+ /*
+ * If this is the first NI that's
+ * within range, then set the
+ * shortest distance to the range
+ * specified by the user. In
+ * effect we're saying that all
+ * NIs that fall within this NUMA
+ * range shall be dealt with as
+ * having equal NUMA weight. Which
+ * will mean that we should select
+ * through that set by their
+ * available credits first
+ * followed by Round Robin.
+ *
+ * And since this is the first NI
+ * in the range, let's just set it
+ * as our best_ni for now. The
+ * following NIs found in the
+ * range will be dealt with as
+ * mentioned previously.
+ */
+ shortest_distance = lnet_get_numa_range();
+ if (!found_ir) {
+ found_ir = true;
+ goto set_ni;
+ }
+ }
+ /*
+ * This NI is NUMA equidistant let's
+ * select using credits followed by Round
+ * Robin.
+ */
+ if (ni->ni_tx_queues[cpt]->tq_credits <
+ best_credits) {
continue;
- (best_ni)->ni_seq = ni->ni_seq + 1;
+ } else if (ni->ni_tx_queues[cpt]->tq_credits ==
+ best_credits) {
+ if (best_ni) {
+ if (best_ni->ni_seq <= ni->ni_seq)
+ continue;
+ }
+ }
}
-
+set_ni:
best_ni = ni;
best_credits = ni->ni_tx_queues[cpt]->tq_credits;
}
}
+ /*
+ * Now that we selected the NI to use increment its sequence
+ * number so the Round Robin algorithm will detect that it has
+ * been used and pick the next NI.
+ */
+ best_ni->ni_seq++;
if (!best_ni) {
lnet_net_unlock(cpt);
best_lpni = NULL;
while ((lpni = lnet_get_next_peer_ni_locked(peer, peer_net, lpni))) {
/*
- * if this peer ni is not healty just skip it, no point in
+ * if this peer ni is not healthy just skip it, no point in
* examining it further
*/
if (!lnet_is_peer_ni_healthy_locked(lpni))
continue;
ni_is_pref = lnet_peer_is_ni_pref_locked(lpni, best_ni);
+ /* if this is a preferred peer use it */
if (!preferred && ni_is_pref) {
preferred = true;
} else if (preferred && !ni_is_pref) {
+ /*
+ * this is not the preferred peer so let's ignore
+ * it.
+ */
continue;
- } if (lpni->lpni_txcredits <= best_lpni_credits)
+ } if (lpni->lpni_txcredits < best_lpni_credits)
+ /*
+ * We already have a peer that has more credits
+ * available than this one. No need to consider
+ * this peer further.
+ */
continue;
- else if (best_lpni) {
- if (best_lpni->lpni_seq - lpni->lpni_seq <= 0)
- continue;
- best_lpni->lpni_seq = lpni->lpni_seq + 1;
+ else if (lpni->lpni_txcredits == best_lpni_credits) {
+ /*
+ * The best peer found so far and the current peer
+ * have the same number of available credits let's
+ * make sure to select between them using Round
+ * Robin
+ */
+ if (best_lpni) {
+ if (best_lpni->lpni_seq <= lpni->lpni_seq)
+ continue;
+ }
}
best_lpni = lpni;
best_lpni_credits = lpni->lpni_txcredits;
}
+ /*
+ * Increment sequence number of the peer selected so that we can
+ * pick the next one in Round Robin.
+ */
+ best_lpni->lpni_seq++;
+
/* if we still can't find a peer ni then we can't reach it */
if (!best_lpni) {
__u32 net_id = peer_net->lpn_net_id;
lnet_net_unlock(cpt);
LCONSOLE_WARN("no peer_ni found on peer net %s\n",
libcfs_net2str(net_id));
- goto again;
+ return -EHOSTUNREACH;
}
send:
== NULL)
goto out;
+ if (cYAML_create_number(item, "dev cpt",
+ ni_data->lic_dev_cpt) == NULL)
+ goto out;
+
/* out put the CPTs in the format: "[x,x,x,...]" */
limit = str_buf + str_buf_len - 3;
pos += snprintf(pos, limit - pos, "\"[");
return rc;
}
+/*
+ * Push the NUMA range down to the kernel via
+ * IOC_LIBCFS_SET_NUMA_RANGE.  \a range must be >= 0; errors are
+ * reported through \a err_rc (cYAML tree, freed by the caller).
+ */
+int lustre_lnet_config_numa_range(int range, int seq_no, struct cYAML **err_rc)
+{
+	struct lnet_ioctl_numa_range data;
+	int rc = LUSTRE_CFG_RC_NO_ERR;
+	char err_str[LNET_MAX_STR_LEN];
+
+	snprintf(err_str, sizeof(err_str), "\"success\"");
+
+	if (range < 0) {
+		snprintf(err_str,
+			 sizeof(err_str),
+			 "\"range must be >= 0\"");
+		rc = LUSTRE_CFG_RC_OUT_OF_RANGE_PARAM;
+		goto out;
+	}
+
+	LIBCFS_IOC_INIT_V2(data, nr_hdr);
+	data.nr_range = range;
+
+	rc = l_ioctl(LNET_DEV_ID, IOC_LIBCFS_SET_NUMA_RANGE, &data);
+	if (rc != 0) {
+		rc = -errno;
+		/* message fixed: this path configures the NUMA range,
+		 * not the routing buffers */
+		snprintf(err_str,
+			 sizeof(err_str),
+			 "\"cannot configure numa_range: %s\"",
+			 strerror(errno));
+		goto out;
+	}
+
+out:
+	cYAML_build_error(rc, seq_no, ADD_CMD, "numa_range", err_str, err_rc);
+
+	return rc;
+}
+
int lustre_lnet_config_buffers(int tiny, int small, int large, int seq_no,
struct cYAML **err_rc)
{
return rc;
}
+/*
+ * Fetch the current NUMA range via IOC_LIBCFS_GET_NUMA_RANGE and
+ * report it as a cYAML tree ("numa: { range: N }").  The tree is
+ * printed to stdout when \a show_rc is NULL, otherwise merged into
+ * *show_rc for the caller.
+ */
+int lustre_lnet_show_numa_range(int seq_no, struct cYAML **show_rc,
+				struct cYAML **err_rc)
+{
+	struct lnet_ioctl_numa_range data;
+	int rc = LUSTRE_CFG_RC_OUT_OF_MEM;
+	int l_errno;
+	char err_str[LNET_MAX_STR_LEN];
+	struct cYAML *root = NULL, *range = NULL;
+
+	snprintf(err_str, sizeof(err_str), "\"out of memory\"");
+
+	LIBCFS_IOC_INIT_V2(data, nr_hdr);
+
+	rc = l_ioctl(LNET_DEV_ID, IOC_LIBCFS_GET_NUMA_RANGE, &data);
+	if (rc != 0) {
+		/* save errno before any call can clobber it */
+		l_errno = errno;
+		snprintf(err_str,
+			 sizeof(err_str),
+			 "\"cannot get numa range: %s\"",
+			 strerror(l_errno));
+		rc = -l_errno;
+		goto out;
+	}
+
+	root = cYAML_create_object(NULL, NULL);
+	if (root == NULL)
+		goto out;
+
+	range = cYAML_create_object(root, "numa");
+	if (range == NULL)
+		goto out;
+
+	if (cYAML_create_number(range, "range",
+				data.nr_range) == NULL)
+		goto out;
+
+	if (show_rc == NULL)
+		cYAML_print_tree(root);
+
+	snprintf(err_str, sizeof(err_str), "\"success\"");
+out:
+	if (show_rc == NULL || rc != LUSTRE_CFG_RC_NO_ERR) {
+		cYAML_free_tree(root);
+	} else if (show_rc != NULL && *show_rc != NULL) {
+		/* graft our subtree onto the caller's existing tree */
+		cYAML_insert_sibling((*show_rc)->cy_child,
+					root->cy_child);
+		free(root);
+	} else {
+		*show_rc = root;
+	}
+
+	cYAML_build_error(rc, seq_no, SHOW_CMD, "numa", err_str, err_rc);
+
+	return rc;
+}
+
int lustre_lnet_show_stats(int seq_no, struct cYAML **show_rc,
struct cYAML **err_rc)
{
struct cYAML **err_rc);
/*
+ * lustre_lnet_config_numa_range
+ * Set the NUMA range which impacts the NIs to be selected
+ * during sending. If the NUMA range is large the NUMA
+ * distance between the message memory and the NI becomes
+ * less significant. The NUMA range is a relative number
+ * with no other meaning besides allowing a wider breadth
+ * for picking an NI to send from.
+ *
+ * range - numa range value.
+ * seq_no - sequence number of the request
+ * err_rc - [OUT] struct cYAML tree describing the error. Freed by
+ * caller
+ */
+int lustre_lnet_config_numa_range(int range, int seq_no,
+ struct cYAML **err_rc);
+
+/*
+ * lustre_lnet_show_numa_range
+ * Get the currently set NUMA range
+ *
+ * seq_no - sequence number of the request
+ * show_rc - [OUT] struct cYAML tree containing NUMA range info
+ * err_rc - [OUT] struct cYAML tree describing the error. Freed by
+ * caller
+ */
+int lustre_lnet_show_numa_range(int seq_no, struct cYAML **show_rc,
+ struct cYAML **err_rc);
+
+/*
* lustre_lnet_config_buffers
* Send down an IOCTL to configure routing buffer sizes. A value of 0 means
* default that particular buffer to default size. A value of -1 means
* James Simmons <jsimmons@infradead.org>
*/
+#include <limits.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
static int jt_show_routing(int argc, char **argv);
static int jt_show_stats(int argc, char **argv);
static int jt_show_peer(int argc, char **argv);
+static int jt_show_numa(int argc, char **argv);
static int jt_set_tiny(int argc, char **argv);
static int jt_set_small(int argc, char **argv);
static int jt_set_large(int argc, char **argv);
+static int jt_set_numa(int argc, char **argv);
static int jt_add_peer_nid(int argc, char **argv);
static int jt_del_peer_nid(int argc, char **argv);
/*static int jt_show_peer(int argc, char **argv);*/
{ 0, 0, 0, NULL }
};
+command_t numa_cmds[] = {
+ {"show", jt_show_numa, 0, "show NUMA range\n"},
+ { 0, 0, 0, NULL }
+};
+
command_t set_cmds[] = {
{"tiny_buffers", jt_set_tiny, 0, "set tiny routing buffers\n"
"\tVALUE must be greater than 0\n"},
{"routing", jt_set_routing, 0, "enable/disable routing\n"
"\t0 - disable routing\n"
"\t1 - enable routing\n"},
+ {"numa_range", jt_set_numa, 0, "set NUMA range for NI selection\n"
+ "\tVALUE must be at least 0\n"},
{ 0, 0, 0, NULL }
};
return rc;
}
+/*
+ * "lnetctl set numa_range VALUE": parse VALUE and push it to the
+ * kernel through lustre_lnet_config_numa_range().
+ */
+static int jt_set_numa(int argc, char **argv)
+{
+	long int value;
+	int rc;
+	struct cYAML *err_rc = NULL;
+
+	if (handle_help(set_cmds, "set", "numa_range", argc, argv) == 0)
+		return 0;
+
+	rc = parse_long(argv[1], &value);
+	if (rc != 0) {
+		cYAML_build_error(-1, -1, "parser", "set",
+				  "cannot parse numa_range value", &err_rc);
+		cYAML_print_tree2file(stderr, err_rc);
+		cYAML_free_tree(err_rc);
+		return -1;
+	}
+
+	/* was mistakenly calling lustre_lnet_config_buffers(), which
+	 * applied the NUMA value as the tiny-buffer count */
+	rc = lustre_lnet_config_numa_range(value, -1, &err_rc);
+	if (rc != LUSTRE_CFG_RC_NO_ERR)
+		cYAML_print_tree2file(stderr, err_rc);
+
+	cYAML_free_tree(err_rc);
+
+	return rc;
+}
+
static int jt_set_tiny(int argc, char **argv)
{
long int value;
return rc;
}
+/*
+ * "lnetctl numa show": query and print the current NUMA range.
+ */
+static int jt_show_numa(int argc, char **argv)
+{
+	int rc;
+	struct cYAML *show_rc = NULL, *err_rc = NULL;
+
+	if (handle_help(numa_cmds, "numa", "show", argc, argv) == 0)
+		return 0;
+
+	rc = lustre_lnet_show_numa_range(-1, &show_rc, &err_rc);
+
+	if (rc != LUSTRE_CFG_RC_NO_ERR)
+		cYAML_print_tree2file(stderr, err_rc);
+	else if (show_rc)
+		cYAML_print_tree(show_rc);
+
+	cYAML_free_tree(err_rc);
+	cYAML_free_tree(show_rc);
+
+	return rc;
+}
+
static inline int jt_lnet(int argc, char **argv)
{
if (argc < 2)
return Parser_execarg(argc - 1, &argv[1], stats_cmds);
}
+/*
+ * Top-level dispatcher for the "numa" command group; forwards
+ * sub-commands to numa_cmds[].
+ */
+static inline int jt_numa(int argc, char **argv)
+{
+	if (argc < 2)
+		return CMD_HELP;
+
+	if (argc == 2 &&
+	    handle_help(numa_cmds, "numa", NULL, argc, argv) == 0)
+		return 0;
+
+	return Parser_execarg(argc - 1, &argv[1], numa_cmds);
+}
+
static inline int jt_peers(int argc, char **argv)
{
if (argc < 2)
"--help} FILE.yaml"},
{"export", jt_export, 0, "export {--help} FILE.yaml"},
{"stats", jt_stats, 0, "stats {show | help}"},
+ {"numa", jt_numa, 0, "numa {show | help}"},
{"peer", jt_peers, 0, "peer {add | del | show | help}"},
{"help", Parser_help, 0, "help"},
{"exit", Parser_quit, 0, "quit"},