*/
int cfs_cpt_table_print(struct cfs_cpt_table *cptab, char *buf, int len);
/**
+ * print distance information of cpt-table
+ */
+int cfs_cpt_distance_print(struct cfs_cpt_table *cptab, char *buf, int len);
+/**
* return total number of CPU partitions in \a cptab
*/
int
*/
int cfs_cpt_of_cpu(struct cfs_cpt_table *cptab, int cpu);
/**
+ * shadow HW node ID \a node to CPU-partition ID by \a cptab
+ */
+int cfs_cpt_of_node(struct cfs_cpt_table *cptab, int node);
+/**
+ * NUMA distance between \a cpt1 and \a cpt2 in \a cptab
+ */
+unsigned cfs_cpt_distance(struct cfs_cpt_table *cptab, int cpt1, int cpt2);
+/**
* bind current thread on a CPU-partition \a cpt of \a cptab
*/
int cfs_cpt_bind(struct cfs_cpt_table *cptab, int cpt);
#define IOC_LIBCFS_ADD_LOCAL_NI _IOWR(IOC_LIBCFS_TYPE, 95, IOCTL_CONFIG_SIZE)
#define IOC_LIBCFS_DEL_LOCAL_NI _IOWR(IOC_LIBCFS_TYPE, 96, IOCTL_CONFIG_SIZE)
#define IOC_LIBCFS_GET_LOCAL_NI _IOWR(IOC_LIBCFS_TYPE, 97, IOCTL_CONFIG_SIZE)
-#define IOC_LIBCFS_DBG _IOWR(IOC_LIBCFS_TYPE, 98, IOCTL_CONFIG_SIZE)
-#define IOC_LIBCFS_MAX_NR 98
+#define IOC_LIBCFS_SET_NUMA_RANGE _IOWR(IOC_LIBCFS_TYPE, 98, IOCTL_CONFIG_SIZE)
+#define IOC_LIBCFS_GET_NUMA_RANGE _IOWR(IOC_LIBCFS_TYPE, 99, IOCTL_CONFIG_SIZE)
+#define IOC_LIBCFS_DBG _IOWR(IOC_LIBCFS_TYPE, 100, IOCTL_CONFIG_SIZE)
+#define IOC_LIBCFS_MAX_NR 100
extern int libcfs_ioctl_data_adjust(struct libcfs_ioctl_data *data);
cpumask_t *cpt_cpumask;
/* nodes mask for this partition */
nodemask_t *cpt_nodemask;
+ /* NUMA distance between CPTs */
+ unsigned *cpt_distance;
/* spread rotor for NUMA allocator */
unsigned cpt_spread_rotor;
};
struct cfs_cpt_table {
/* spread rotor for NUMA allocator */
unsigned ctb_spread_rotor;
+ /* maximum NUMA distance between all nodes in table */
+ unsigned ctb_distance;
/* # of CPU partitions */
unsigned ctb_nparts;
/* partitions tables */
int *ctb_cpu2cpt;
/* all cpus in this partition table */
cpumask_t *ctb_cpumask;
+ /* shadow HW node to CPU partition ID */
+ int *ctb_node2cpt;
/* all nodes in this partition table */
nodemask_t *ctb_nodemask;
};
#define CFS_CPU_VERSION_MAGIC 0xbabecafe
+#define CFS_CPT_DISTANCE 1 /* Arbitrary positive value */
+
struct cfs_cpt_table *
cfs_cpt_table_alloc(unsigned int ncpt)
{
LIBCFS_ALLOC(cptab, sizeof(*cptab));
if (cptab != NULL) {
cptab->ctb_version = CFS_CPU_VERSION_MAGIC;
+ cpu_set(0, cptab->ctb_cpumask);
node_set(0, cptab->ctb_nodemask);
cptab->ctb_nparts = ncpt;
}
EXPORT_SYMBOL(cfs_cpt_table_print);
int
+/*
+ * Print the (trivial) distance map of a UP cpt-table into \a buf:
+ * a single line "0\t: 0:<CFS_CPT_DISTANCE>".  Returns the number of
+ * bytes written, or -EFBIG when \a len is too small.
+ */
+cfs_cpt_distance_print(struct cfs_cpt_table *cptab, char *buf, int len)
+{
+	int rc = 0;
+
+	/* three conversions need three arguments: cpt index, peer cpt
+	 * index and the distance (the original passed only two) */
+	rc = snprintf(buf, len, "%d\t: %d:%d\n", 0, 0, CFS_CPT_DISTANCE);
+	len -= rc;
+	if (len <= 0)
+		return -EFBIG;
+
+	return rc;
+}
+EXPORT_SYMBOL(cfs_cpt_distance_print);
+
+int
cfs_cpt_number(struct cfs_cpt_table *cptab)
{
return 1;
}
EXPORT_SYMBOL(cfs_cpt_online);
+/*
+ * Return the cpumask covering CPU partition \a cpt.
+ * UP build: a single partition, so hand back the table-wide mask.
+ * NOTE(review): this returns &cptab->ctb_mask while cfs_cpt_table_alloc()
+ * above uses cptab->ctb_cpumask -- confirm which field name the UP
+ * struct cfs_cpt_table actually declares; one of the two looks wrong.
+ */
+cpumask_t *
+cfs_cpt_cpumask(struct cfs_cpt_table *cptab, int cpt)
+{
+	return &cptab->ctb_mask;
+}
+EXPORT_SYMBOL(cfs_cpt_cpumask);
+
+/*
+ * Return the nodemask covering CPU partition \a cpt.
+ * UP build: a single partition, so hand back the table-wide nodemask.
+ */
nodemask_t *
cfs_cpt_nodemask(struct cfs_cpt_table *cptab, int cpt)
{
	return &cptab->ctb_nodemask;
}
-EXPORT_SYMBOL(cfs_cpt_cpumask);
+EXPORT_SYMBOL(cfs_cpt_nodemask);
+
+/*
+ * NUMA distance between partitions \a cpt1 and \a cpt2.
+ * UP build: only one partition exists, so every distance collapses to
+ * the arbitrary positive constant CFS_CPT_DISTANCE.
+ */
+unsigned
+cfs_cpt_distance(struct cfs_cpt_table *cptab, int cpt1, int cpt2)
+{
+	return CFS_CPT_DISTANCE;
+}
+EXPORT_SYMBOL(cfs_cpt_distance);
int
cfs_cpt_set_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu)
EXPORT_SYMBOL(cfs_cpt_of_cpu);
+/*
+ * Map HW NUMA node \a node to a CPU-partition ID.
+ * UP build: everything lives in partition 0.
+ */
int
+cfs_cpt_of_node(struct cfs_cpt_table *cptab, int node)
+{
+	return 0;
+}
+EXPORT_SYMBOL(cfs_cpt_of_node);
+
+int
cfs_cpt_bind(struct cfs_cpt_table *cptab, int cpt)
{
return 0;
if (cptab->ctb_cpu2cpt != NULL) {
LIBCFS_FREE(cptab->ctb_cpu2cpt,
- num_possible_cpus() *
- sizeof(cptab->ctb_cpu2cpt[0]));
+ nr_cpu_ids * sizeof(cptab->ctb_cpu2cpt[0]));
+ }
+
+ if (cptab->ctb_node2cpt != NULL) {
+ LIBCFS_FREE(cptab->ctb_node2cpt,
+ nr_node_ids * sizeof(cptab->ctb_node2cpt[0]));
}
for (i = 0; cptab->ctb_parts != NULL && i < cptab->ctb_nparts; i++) {
if (part->cpt_cpumask != NULL)
LIBCFS_FREE(part->cpt_cpumask, cpumask_size());
+
+ if (part->cpt_distance) {
+ LIBCFS_FREE(part->cpt_distance,
+ cptab->ctb_nparts *
+ sizeof(part->cpt_distance[0]));
+ }
}
if (cptab->ctb_parts != NULL) {
goto failed;
LIBCFS_ALLOC(cptab->ctb_cpu2cpt,
- num_possible_cpus() * sizeof(cptab->ctb_cpu2cpt[0]));
+ nr_cpu_ids * sizeof(cptab->ctb_cpu2cpt[0]));
if (cptab->ctb_cpu2cpt == NULL)
goto failed;
memset(cptab->ctb_cpu2cpt, -1,
- num_possible_cpus() * sizeof(cptab->ctb_cpu2cpt[0]));
+ nr_cpu_ids * sizeof(cptab->ctb_cpu2cpt[0]));
+
+ LIBCFS_ALLOC(cptab->ctb_node2cpt,
+ nr_node_ids * sizeof(cptab->ctb_node2cpt[0]));
+ if (cptab->ctb_node2cpt == NULL)
+ goto failed;
+
+ memset(cptab->ctb_node2cpt, -1,
+ nr_node_ids * sizeof(cptab->ctb_node2cpt[0]));
LIBCFS_ALLOC(cptab->ctb_parts, ncpt * sizeof(cptab->ctb_parts[0]));
if (cptab->ctb_parts == NULL)
struct cfs_cpu_partition *part = &cptab->ctb_parts[i];
LIBCFS_ALLOC(part->cpt_cpumask, cpumask_size());
+ if (!part->cpt_cpumask)
+ goto failed;
+
LIBCFS_ALLOC(part->cpt_nodemask, sizeof(*part->cpt_nodemask));
- if (part->cpt_cpumask == NULL || part->cpt_nodemask == NULL)
+ if (!part->cpt_nodemask)
+ goto failed;
+
+ LIBCFS_ALLOC(part->cpt_distance,
+ cptab->ctb_nparts * sizeof(part->cpt_distance[0]));
+ if (!part->cpt_distance)
goto failed;
}
cfs_cpt_table_print(struct cfs_cpt_table *cptab, char *buf, int len)
{
char *tmp = buf;
- int rc = 0;
+ int rc = -EFBIG;
int i;
int j;
for (i = 0; i < cptab->ctb_nparts; i++) {
- if (len > 0) {
- rc = snprintf(tmp, len, "%d\t: ", i);
- len -= rc;
- }
+ if (len <= 0)
+ goto out;
- if (len <= 0) {
- rc = -EFBIG;
+ rc = snprintf(tmp, len, "%d\t:", i);
+ len -= rc;
+
+ if (len <= 0)
goto out;
- }
tmp += rc;
for_each_cpu(j, cptab->ctb_parts[i].cpt_cpumask) {
- rc = snprintf(tmp, len, "%d ", j);
+ rc = snprintf(tmp, len, " %d", j);
len -= rc;
- if (len <= 0) {
- rc = -EFBIG;
+ if (len <= 0)
goto out;
- }
tmp += rc;
}
tmp++;
len--;
}
-
-out:
+ rc = 0;
+ out:
if (rc < 0)
return rc;
EXPORT_SYMBOL(cfs_cpt_table_print);
+/*
+ * Render the NUMA distance matrix of \a cptab into \a buf (at most
+ * \a len bytes), one line per partition in the form
+ * "<cpt>\t: <cpt2>:<distance> ...".
+ * Returns the number of bytes written, or -EFBIG if the buffer is
+ * too small (rc stays negative if any snprintf exhausts \a len).
+ */
int
+cfs_cpt_distance_print(struct cfs_cpt_table *cptab, char *buf, int len)
+{
+	char *tmp = buf;
+	int rc = -EFBIG;
+	int i;
+	int j;
+
+	for (i = 0; i < cptab->ctb_nparts; i++) {
+		if (len <= 0)
+			goto out;
+
+		rc = snprintf(tmp, len, "%d\t:", i);
+		len -= rc;
+
+		/* len <= 0 means snprintf truncated: report -EFBIG below */
+		if (len <= 0)
+			goto out;
+
+		tmp += rc;
+		for (j = 0; j < cptab->ctb_parts.cpt_distance[j]);
+			rc = snprintf(tmp, len, " %d:%d",
+				      j, cptab->ctb_parts[i].cpt_distance[j]);
+			len -= rc;
+			if (len <= 0)
+				goto out;
+			tmp += rc;
+		}
+
+		/* len >= 1 is guaranteed here by the checks above */
+		*tmp = '\n';
+		tmp++;
+		len--;
+	}
+	rc = 0;
+ out:
+	if (rc < 0)
+		return rc;
+
+	return tmp - buf;
+}
+EXPORT_SYMBOL(cfs_cpt_distance_print);
+
+int
cfs_cpt_number(struct cfs_cpt_table *cptab)
{
return cptab->ctb_nparts;
}
EXPORT_SYMBOL(cfs_cpt_nodemask);
+/*
+ * NUMA distance between partitions \a cpt1 and \a cpt2 of \a cptab.
+ * If either side is CFS_CPT_ANY, return the table-wide maximum
+ * distance (ctb_distance) instead of a pairwise value.
+ */
+unsigned
+cfs_cpt_distance(struct cfs_cpt_table *cptab, int cpt1, int cpt2)
+{
+	LASSERT(cpt1 == CFS_CPT_ANY || (cpt1 >= 0 && cpt1 < cptab->ctb_nparts));
+	LASSERT(cpt2 == CFS_CPT_ANY || (cpt2 >= 0 && cpt2 < cptab->ctb_nparts));
+
+	if (cpt1 == CFS_CPT_ANY || cpt2 == CFS_CPT_ANY)
+		return cptab->ctb_distance;
+
+	return cptab->ctb_parts[cpt1].cpt_distance[cpt2];
+}
+EXPORT_SYMBOL(cfs_cpt_distance);
+
+/*
+ * Calculate the maximum NUMA distance between all nodes in the
+ * from_mask and all nodes in the to_mask.
+ * Returns 0 when either mask is empty (no pair to measure).
+ */
+static unsigned
+cfs_cpt_distance_calculate(nodemask_t *from_mask, nodemask_t *to_mask)
+{
+	unsigned maximum;
+	unsigned distance;
+	int to;
+	int from;
+
+	maximum = 0;
+	for_each_node_mask(from, *from_mask) {
+		for_each_node_mask(to, *to_mask) {
+			distance = node_distance(from, to);
+			if (maximum < distance)
+				maximum = distance;
+		}
+	}
+	return maximum;
+}
+
+/*
+ * Record that \a cpu belongs to partition \a cpt: update the cpu->cpt
+ * reverse map plus the global and per-partition cpumasks.
+ */
+static void cfs_cpt_add_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu)
+{
+	cptab->ctb_cpu2cpt[cpu] = cpt;
+
+	cpumask_set_cpu(cpu, cptab->ctb_cpumask);
+	cpumask_set_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask);
+}
+
+/*
+ * Remove \a cpu from partition \a cpt: clear it from both cpumasks
+ * and invalidate the cpu->cpt reverse-map entry (-1 == unassigned).
+ */
+static void cfs_cpt_del_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu)
+{
+	cpumask_clear_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask);
+	cpumask_clear_cpu(cpu, cptab->ctb_cpumask);
+
+	cptab->ctb_cpu2cpt[cpu] = -1;
+}
+
+/*
+ * Account NUMA node \a node as part of partition \a cpt, maintaining
+ * the node->cpt reverse map, the table-wide maximum distance and the
+ * pairwise cpt distance matrix.
+ */
+static void cfs_cpt_add_node(struct cfs_cpt_table *cptab, int cpt, int node)
+{
+	int cpt2;
+	struct cfs_cpu_partition *part;
+	struct cfs_cpu_partition *part2;
+
+	if (!node_isset(node, *cptab->ctb_nodemask)) {
+		/* first time node is added to the CPT table */
+		node_set(node, *cptab->ctb_nodemask);
+		cptab->ctb_node2cpt[node] = cpt;
+		/* table-wide max distance may have grown */
+		cptab->ctb_distance = cfs_cpt_distance_calculate(
+			cptab->ctb_nodemask,
+			cptab->ctb_nodemask);
+	}
+
+	part = &cptab->ctb_parts[cpt];
+	if (!node_isset(node, *part->cpt_nodemask)) {
+		/* first time node is added to this CPT */
+		node_set(node, *part->cpt_nodemask);
+		/* refresh both directions of the distance matrix row/column
+		 * for this cpt; symmetry of node_distance() is not assumed */
+		for (cpt2 = 0; cpt2 < cptab->ctb_nparts; cpt2++) {
+			part2 = &cptab->ctb_parts[cpt2];
+			part->cpt_distance[cpt2] = cfs_cpt_distance_calculate(
+				part->cpt_nodemask,
+				part2->cpt_nodemask);
+			part2->cpt_distance[cpt] = cfs_cpt_distance_calculate(
+				part2->cpt_nodemask,
+				part->cpt_nodemask);
+		}
+	}
+}
+
+/*
+ * Drop NUMA node \a node from partition \a cpt if no CPU of that node
+ * remains in the partition, updating the distance matrix, the
+ * node->cpt reverse map and the table-wide nodemask/distance.
+ */
+static void cfs_cpt_del_node(struct cfs_cpt_table *cptab, int cpt, int node)
+{
+	int cpu;
+	int cpt2;
+	struct cfs_cpu_partition *part;
+	struct cfs_cpu_partition *part2;
+
+	part = &cptab->ctb_parts[cpt];
+
+	for_each_cpu(cpu, part->cpt_cpumask) {
+		/* this CPT has other CPU belonging to this node? */
+		if (cpu_to_node(cpu) == node)
+			break;
+	}
+
+	/* cpu >= nr_cpu_ids means the loop above found no remaining CPU */
+	if (cpu >= nr_cpu_ids && node_isset(node, *part->cpt_nodemask)) {
+		/* No more CPUs in the node for this CPT. */
+		node_clear(node, *part->cpt_nodemask);
+		for (cpt2 = 0; cpt2 < cptab->ctb_nparts; cpt2++) {
+			part2 = &cptab->ctb_parts[cpt2];
+			/* re-point node to some other partition still
+			 * holding it (last match wins) */
+			if (node_isset(node, *part2->cpt_nodemask))
+				cptab->ctb_node2cpt[node] = cpt2;
+			part->cpt_distance[cpt2] = cfs_cpt_distance_calculate(
+				part->cpt_nodemask,
+				part2->cpt_nodemask);
+			part2->cpt_distance[cpt] = cfs_cpt_distance_calculate(
+				part2->cpt_nodemask,
+				part->cpt_nodemask);
+		}
+	}
+
+	for_each_cpu(cpu, cptab->ctb_cpumask) {
+		/* this CPT-table has other CPUs belonging to this node? */
+		if (cpu_to_node(cpu) == node)
+			break;
+	}
+
+	if (cpu >= nr_cpu_ids && node_isset(node, *cptab->ctb_nodemask)) {
+		/* No more CPUs in the table for this node. */
+		node_clear(node, *cptab->ctb_nodemask);
+		cptab->ctb_node2cpt[node] = -1;
+		cptab->ctb_distance =
+			cfs_cpt_distance_calculate(cptab->ctb_nodemask,
+						   cptab->ctb_nodemask);
+	}
+}
+
int
cfs_cpt_set_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu)
{
- int node;
-
LASSERT(cpt >= 0 && cpt < cptab->ctb_nparts);
if (cpu < 0 || cpu >= nr_cpu_ids || !cpu_online(cpu)) {
return 0;
}
- cptab->ctb_cpu2cpt[cpu] = cpt;
-
LASSERT(!cpumask_test_cpu(cpu, cptab->ctb_cpumask));
LASSERT(!cpumask_test_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask));
- cpumask_set_cpu(cpu, cptab->ctb_cpumask);
- cpumask_set_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask);
-
- node = cpu_to_node(cpu);
-
- /* first CPU of @node in this CPT table */
- if (!node_isset(node, *cptab->ctb_nodemask))
- node_set(node, *cptab->ctb_nodemask);
-
- /* first CPU of @node in this partition */
- if (!node_isset(node, *cptab->ctb_parts[cpt].cpt_nodemask))
- node_set(node, *cptab->ctb_parts[cpt].cpt_nodemask);
+ cfs_cpt_add_cpu(cptab, cpt, cpu);
+ cfs_cpt_add_node(cptab, cpt, cpu_to_node(cpu));
return 1;
}
void
cfs_cpt_unset_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu)
{
- int node;
- int i;
-
LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));
if (cpu < 0 || cpu >= nr_cpu_ids) {
LASSERT(cpumask_test_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask));
LASSERT(cpumask_test_cpu(cpu, cptab->ctb_cpumask));
- cpumask_clear_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask);
- cpumask_clear_cpu(cpu, cptab->ctb_cpumask);
- cptab->ctb_cpu2cpt[cpu] = -1;
-
- node = cpu_to_node(cpu);
-
- LASSERT(node_isset(node, *cptab->ctb_parts[cpt].cpt_nodemask));
- LASSERT(node_isset(node, *cptab->ctb_nodemask));
-
- for_each_cpu(i, cptab->ctb_parts[cpt].cpt_cpumask) {
- /* this CPT has other CPU belonging to this node? */
- if (cpu_to_node(i) == node)
- break;
- }
-
- if (i >= nr_cpu_ids)
- node_clear(node, *cptab->ctb_parts[cpt].cpt_nodemask);
-
- for_each_cpu(i, cptab->ctb_cpumask) {
- /* this CPT-table has other CPU belonging to this node? */
- if (cpu_to_node(i) == node)
- break;
- }
-
- if (i >= nr_cpu_ids)
- node_clear(node, *cptab->ctb_nodemask);
+ cfs_cpt_del_cpu(cptab, cpt, cpu);
+ cfs_cpt_del_node(cptab, cpt, cpu_to_node(cpu));
}
EXPORT_SYMBOL(cfs_cpt_unset_cpu);
}
for_each_cpu(cpu, mask) {
- if (!cfs_cpt_set_cpu(cptab, cpt, cpu))
- return 0;
+ cfs_cpt_add_cpu(cptab, cpt, cpu);
+ cfs_cpt_add_node(cptab, cpt, cpu_to_node(cpu));
}
return 1;
cfs_cpt_set_node(struct cfs_cpt_table *cptab, int cpt, int node)
{
const cpumask_t *mask;
- int rc;
+ int cpu;
- if (node < 0 || node >= MAX_NUMNODES) {
+ if (node < 0 || node >= nr_node_ids) {
CDEBUG(D_INFO,
"Invalid NUMA id %d for CPU partition %d\n", node, cpt);
return 0;
}
mask = cpumask_of_node(node);
- rc = cfs_cpt_set_cpumask(cptab, cpt, mask);
- return rc;
+ for_each_cpu(cpu, mask)
+ cfs_cpt_add_cpu(cptab, cpt, cpu);
+
+ cfs_cpt_add_node(cptab, cpt, node);
+
+ return 1;
}
EXPORT_SYMBOL(cfs_cpt_set_node);
cfs_cpt_unset_node(struct cfs_cpt_table *cptab, int cpt, int node)
{
const cpumask_t *mask;
+ int cpu;
- if (node < 0 || node >= MAX_NUMNODES) {
+ if (node < 0 || node >= nr_node_ids) {
CDEBUG(D_INFO,
"Invalid NUMA id %d for CPU partition %d\n", node, cpt);
return;
}
mask = cpumask_of_node(node);
- cfs_cpt_unset_cpumask(cptab, cpt, mask);
+ for_each_cpu(cpu, mask)
+ cfs_cpt_del_cpu(cptab, cpt, cpu);
+
+ cfs_cpt_del_node(cptab, cpt, node);
}
EXPORT_SYMBOL(cfs_cpt_unset_node);
EXPORT_SYMBOL(cfs_cpt_of_cpu);
+/*
+ * Map HW NUMA node \a node to its CPU-partition ID, or CFS_CPT_ANY
+ * when the node ID is invalid or currently owned by no partition.
+ */
int
+cfs_cpt_of_node(struct cfs_cpt_table *cptab, int node)
+{
+	/* ctb_node2cpt[] is allocated with nr_node_ids entries, so the
+	 * valid index range is [0, nr_node_ids): reject with >=, not >,
+	 * or node == nr_node_ids reads one past the end of the array. */
+	if (node < 0 || node >= nr_node_ids)
+		return CFS_CPT_ANY;
+
+	return cptab->ctb_node2cpt[node];
+}
+EXPORT_SYMBOL(cfs_cpt_of_node);
+
+int
cfs_cpt_bind(struct cfs_cpt_table *cptab, int cpt)
{
cpumask_t *cpumask;
return cptab;
}
- high = node ? MAX_NUMNODES - 1 : nr_cpu_ids - 1;
+ high = node ? nr_node_ids - 1 : nr_cpu_ids - 1;
for (str = cfs_trimwhite(pattern), c = 0;; c++) {
struct cfs_range_expr *range;
__proc_cpt_table);
}
+/*
+ * Read-only proc handler for "cpu_partition_distance": render the CPT
+ * distance table, doubling the temporary buffer until
+ * cfs_cpt_distance_print() fits, then copy the slice at \a pos out to
+ * the user buffer.
+ */
+static int __proc_cpt_distance(void *data, int write,
+			       loff_t pos, void __user *buffer, int nob)
+{
+	char *buf = NULL;
+	int len = 4096;
+	int rc = 0;
+
+	if (write)
+		return -EPERM;	/* read-only file */
+
+	LASSERT(cfs_cpt_table != NULL);
+
+	while (1) {
+		LIBCFS_ALLOC(buf, len);
+		if (buf == NULL)
+			return -ENOMEM;
+
+		rc = cfs_cpt_distance_print(cfs_cpt_table, buf, len);
+		if (rc >= 0)
+			break;
+
+		if (rc == -EFBIG) {
+			/* buffer too small: double and retry */
+			LIBCFS_FREE(buf, len);
+			len <<= 1;
+			continue;
+		}
+		goto out;
+	}
+
+	/* rc is now the rendered length; nothing left past EOF */
+	if (pos >= rc) {
+		rc = 0;
+		goto out;
+	}
+
+	rc = cfs_trace_copyout_string(buffer, nob, buf + pos, NULL);
+ out:
+	if (buf != NULL)
+		LIBCFS_FREE(buf, len);
+	return rc;
+}
+
+/*
+ * ctl_table hook: dispatch through lprocfs_call_handler() to
+ * __proc_cpt_distance().
+ */
+static int
+proc_cpt_distance(struct ctl_table *table, int write, void __user *buffer,
+		  size_t *lenp, loff_t *ppos)
+{
+	return lprocfs_call_handler(table->data, write, ppos, buffer, lenp,
+				    __proc_cpt_distance);
+}
+
static struct ctl_table lnet_table[] = {
/*
* NB No .strategy entries have been provided since sysctl(8) prefers
},
{
INIT_CTL_NAME
+ .procname = "cpu_partition_distance",
+ .maxlen = 128,
+ .mode = 0444,
+ .proc_handler = &proc_cpt_distance,
+ },
+ {
+ INIT_CTL_NAME
.procname = "debug_log_upcall",
.data = lnet_debug_log_upcall,
.maxlen = sizeof(lnet_debug_log_upcall),
__u32 lic_status;
__u32 lic_tcp_bonding;
__u32 lic_idx;
+ __s32 lic_dev_cpt;
char lic_bulk[0];
};
char prcfg_bulk[0];
};
+/* payload of IOC_LIBCFS_SET_NUMA_RANGE / IOC_LIBCFS_GET_NUMA_RANGE */
+struct lnet_ioctl_numa_range {
+	struct libcfs_ioctl_hdr nr_hdr;
+	/* relative NUMA breadth considered during Multi-Rail NI selection */
+	__u32 nr_range;
+};
+
struct lnet_ioctl_lnet_stats {
struct libcfs_ioctl_hdr st_hdr;
struct lnet_counters st_cntrs;
bool lnet_ni_unique_net(struct list_head *nilist, char *iface);
void lnet_incr_dlc_seq(void);
__u32 lnet_get_dlc_seq_locked(void);
+inline __u32 lnet_get_numa_range(void);
struct lnet_peer_ni *lnet_get_next_peer_ni_locked(struct lnet_peer *peer,
struct lnet_peer_net *peer_net,
/* lnd tunables set explicitly */
bool ni_lnd_tunables_set;
+ /* physical device CPT */
+ int dev_cpt;
+
/* sequence number used to round robin over nis within a net */
__u32 ni_seq;
unsigned long flags;
int rc;
int newdev;
+ int node_id;
LASSERT (ni->ni_net->net_lnd == &the_o2iblnd);
newdev = ibdev == NULL;
/* hmm...create kib_dev even for alias */
if (ibdev == NULL || strcmp(&ibdev->ibd_ifname[0], ifname) != 0)
- ibdev = kiblnd_create_dev(ifname);
+ ibdev = kiblnd_create_dev(ifname);
- if (ibdev == NULL)
- goto failed;
+ if (ibdev == NULL)
+ goto failed;
+
+ node_id = dev_to_node(ibdev->ibd_hdev->ibh_ibdev->dma_device);
+ ni->dev_cpt = cfs_cpt_of_node(lnet_cpt_table(), node_id);
- net->ibn_dev = ibdev;
- ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), ibdev->ibd_ifip);
+ net->ibn_dev = ibdev;
+ ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), ibdev->ibd_ifip);
rc = kiblnd_dev_start_threads(ibdev, newdev,
ni->ni_cpts, ni->ni_ncpts);
* Author: Eric Barton <eric@bartonsoftware.com>
*/
+#include <linux/pci.h>
#include "socklnd.h"
static lnd_t the_ksocklnd;
int
ksocknal_startup (lnet_ni_t *ni)
{
- ksock_net_t *net;
- int rc;
- int i;
+ ksock_net_t *net;
+ int rc;
+ int i;
+ struct net_device *net_dev;
+ int node_id;
LASSERT (ni->ni_net->net_lnd == &the_ksocklnd);
ni->ni_net->net_tunables_set = true;
}
+
if (ni->ni_interfaces[0] == NULL) {
rc = ksocknal_enumerate_interfaces(net);
if (rc <= 0)
strlcpy(net->ksnn_interfaces[i].ksni_name,
ni->ni_interfaces[i],
sizeof(net->ksnn_interfaces[i].ksni_name));
+
}
net->ksnn_ninterfaces = i;
}
+ net_dev = dev_get_by_name(&init_net,
+ net->ksnn_interfaces[0].ksni_name);
+ if (net_dev != NULL) {
+ node_id = dev_to_node(&net_dev->dev);
+ ni->dev_cpt = cfs_cpt_of_node(lnet_cpt_table(), node_id);
+ dev_put(net_dev);
+ } else {
+ ni->dev_cpt = CFS_CPT_ANY;
+ }
+
/* call it before add it to ksocknal_data.ksnd_nets */
rc = ksocknal_net_start_threads(net, ni->ni_cpts, ni->ni_ncpts);
if (rc != 0)
MODULE_PARM_DESC(use_tcp_bonding,
"Set to 1 to use socklnd bonding. 0 to use Multi-Rail");
+static __u32 lnet_numa_range = 0;
+module_param(lnet_numa_range, int, 0444);
+MODULE_PARM_DESC(lnet_numa_range,
+ "NUMA range to consider during Multi-Rail selection");
+
/*
* This sequence number keeps track of how many times DLC was used to
* update the configuration. It is incremented on any DLC update and
cfg_ni->lic_nid = ni->ni_nid;
cfg_ni->lic_status = ni->ni_status->ns_status;
cfg_ni->lic_tcp_bonding = use_tcp_bonding;
+ cfg_ni->lic_dev_cpt = ni->dev_cpt;
memcpy(&tun->lt_cmn, &ni->ni_net->net_tunables, sizeof(tun->lt_cmn));
return atomic_read(&lnet_dlc_seq_no);
}
+/*
+ * Accessor for the lnet_numa_range module parameter.
+ * NOTE(review): non-static "inline" definition paired with an extern
+ * prototype in the header -- relies on kernel (GNU) inline semantics;
+ * confirm this links on all supported toolchains.
+ */
+inline __u32 lnet_get_numa_range(void)
+{
+	return lnet_numa_range;
+}
+
/**
* LNet ioctl handler.
*
mutex_unlock(&the_lnet.ln_api_mutex);
return rc;
+ case IOC_LIBCFS_SET_NUMA_RANGE: {
+ struct lnet_ioctl_numa_range *numa;
+ numa = arg;
+ if (numa->nr_hdr.ioc_len != sizeof(*numa))
+ return -EINVAL;
+ lnet_numa_range = numa->nr_range;
+ return 0;
+ }
+
+ case IOC_LIBCFS_GET_NUMA_RANGE: {
+ struct lnet_ioctl_numa_range *numa;
+ numa = arg;
+ if (numa->nr_hdr.ioc_len != sizeof(*numa))
+ return -EINVAL;
+ numa->nr_range = lnet_numa_range;
+ return 0;
+ }
+
case IOC_LIBCFS_GET_BUF: {
struct lnet_ioctl_pool_cfg *pool_cfg;
size_t total = sizeof(*config) + sizeof(*pool_cfg);
int best_credits = 0;
__u32 seq, seq2;
int best_lpni_credits = INT_MIN;
+ int md_cpt = 0;
+ int shortest_distance = INT_MAX;
+ int distance = 0;
+ bool found_ir = false;
again:
/*
routing = false;
local_net = NULL;
best_ni = NULL;
+ shortest_distance = INT_MAX;
+ found_ir = false;
if (the_lnet.ln_shutdown) {
lnet_net_unlock(cpt);
return -ESHUTDOWN;
}
+ if (msg->msg_md != NULL)
+ /* get the cpt of the MD, used during NUMA based selection */
+ md_cpt = lnet_cpt_of_cookie(msg->msg_md->md_lh.lh_cookie);
+ else
+ md_cpt = CFS_CPT_ANY;
+
/*
* initialize the variables which could be reused if we go to
* again
continue;
/*
- * Second jab at determining best_ni
- * if we get here then the peer we're trying to send
- * to is on a directly connected network, and we'll
- * need to pick the local_ni on that network to send
- * from
+ * Iterate through the NIs in this local Net and select
+ * the NI to send from. The selection is determined by
+ * these 3 criterion in the following priority:
+ * 1. NUMA
+ * 2. NI available credits
+ * 3. Round Robin
*/
while ((ni = lnet_get_next_ni_locked(local_net, ni))) {
if (!lnet_is_ni_healthy_locked(ni))
continue;
- /* TODO: compare NUMA distance */
- if (ni->ni_tx_queues[cpt]->tq_credits <=
- best_credits) {
+
+ /*
+ * calculate the distance from the cpt on which
+ * the message memory is allocated to the CPT of
+ * the NI's physical device
+ */
+ distance = cfs_cpt_distance(lnet_cpt_table(),
+ md_cpt,
+ ni->dev_cpt);
+
+ /*
+ * If we already have a closer NI within the NUMA
+ * range provided, then there is no need to
+ * consider the current NI. Move on to the next
+ * one.
+ */
+ if (distance > shortest_distance &&
+ distance > lnet_get_numa_range())
+ continue;
+
+ if (distance < shortest_distance &&
+ distance > lnet_get_numa_range()) {
/*
- * all we want is to read tq_credits
- * value as an approximation of how
- * busy the NI is. No need to grab a lock
+ * The current NI is the closest one that we
+ * have found, even though it's not in the
+ * NUMA range specified. This occurs if
+ * the NUMA range is less than the least
+ * of the distances in the system.
+ * In effect NUMA range consideration is
+ * turned off.
*/
- continue;
- } else if (best_ni) {
- if ((best_ni)->ni_seq - ni->ni_seq <= 0)
+ shortest_distance = distance;
+ } else if ((distance <= shortest_distance &&
+ distance < lnet_get_numa_range()) ||
+ distance == shortest_distance) {
+ /*
+ * This NI is either within range or it's
+ * equidistant. In both of these cases we
+ * would want to select the NI based on
+ * its available credits first, and then
+ * via Round Robin.
+ */
+ if (distance <= shortest_distance &&
+ distance < lnet_get_numa_range()) {
+ /*
+ * If this is the first NI that's
+ * within range, then set the
+ * shortest distance to the range
+ * specified by the user. In
+ * effect we're saying that all
+ * NIs that fall within this NUMA
+ * range shall be dealt with as
+ * having equal NUMA weight. Which
+ * will mean that we should select
+ * through that set by their
+ * available credits first
+ * followed by Round Robin.
+ *
+ * And since this is the first NI
+ * in the range, let's just set it
+ * as our best_ni for now. The
+ * following NIs found in the
+ * range will be dealt with as
+ * mentioned previously.
+ */
+ shortest_distance = lnet_get_numa_range();
+ if (!found_ir) {
+ found_ir = true;
+ goto set_ni;
+ }
+ }
+ /*
+ * This NI is NUMA equidistant let's
+ * select using credits followed by Round
+ * Robin.
+ */
+ if (ni->ni_tx_queues[cpt]->tq_credits <
+ best_credits) {
continue;
- (best_ni)->ni_seq = ni->ni_seq + 1;
+ } else if (ni->ni_tx_queues[cpt]->tq_credits ==
+ best_credits) {
+ if (best_ni) {
+ if (best_ni->ni_seq <= ni->ni_seq)
+ continue;
+ }
+ }
}
-
+set_ni:
best_ni = ni;
best_credits = ni->ni_tx_queues[cpt]->tq_credits;
}
}
+ /*
+ * Now that we selected the NI to use increment its sequence
+ * number so the Round Robin algorithm will detect that it has
+ * been used and pick the next NI.
+ */
+ best_ni->ni_seq++;
if (!best_ni) {
lnet_net_unlock(cpt);
best_lpni = NULL;
while ((lpni = lnet_get_next_peer_ni_locked(peer, peer_net, lpni))) {
/*
- * if this peer ni is not healty just skip it, no point in
+ * if this peer ni is not healthy just skip it, no point in
* examining it further
*/
if (!lnet_is_peer_ni_healthy_locked(lpni))
continue;
ni_is_pref = lnet_peer_is_ni_pref_locked(lpni, best_ni);
+ /* if this is a preferred peer use it */
if (!preferred && ni_is_pref) {
preferred = true;
} else if (preferred && !ni_is_pref) {
+ /*
+ * this is not the preferred peer so let's ignore
+ * it.
+ */
continue;
- } if (lpni->lpni_txcredits <= best_lpni_credits)
+ } if (lpni->lpni_txcredits < best_lpni_credits)
+ /*
+ * We already have a peer that has more credits
+ * available than this one. No need to consider
+ * this peer further.
+ */
continue;
- else if (best_lpni) {
- if (best_lpni->lpni_seq - lpni->lpni_seq <= 0)
- continue;
- best_lpni->lpni_seq = lpni->lpni_seq + 1;
+ else if (lpni->lpni_txcredits == best_lpni_credits) {
+ /*
+ * The best peer found so far and the current peer
+ * have the same number of available credits let's
+ * make sure to select between them using Round
+ * Robin
+ */
+ if (best_lpni) {
+ if (best_lpni->lpni_seq <= lpni->lpni_seq)
+ continue;
+ }
}
best_lpni = lpni;
best_lpni_credits = lpni->lpni_txcredits;
}
+ /*
+ * Increment sequence number of the peer selected so that we can
+ * pick the next one in Round Robin.
+ */
+ best_lpni->lpni_seq++;
+
/* if we still can't find a peer ni then we can't reach it */
if (!best_lpni) {
__u32 net_id = peer_net->lpn_net_id;
lnet_net_unlock(cpt);
LCONSOLE_WARN("no peer_ni found on peer net %s\n",
libcfs_net2str(net_id));
- goto again;
+ return -EHOSTUNREACH;
}
send:
== NULL)
goto out;
+ if (cYAML_create_number(item, "dev cpt",
+ ni_data->lic_dev_cpt) == NULL)
+ goto out;
+
/* out put the CPTs in the format: "[x,x,x,...]" */
limit = str_buf + str_buf_len - 3;
pos += snprintf(pos, limit - pos, "\"[");
return rc;
}
+/*
+ * Push the NUMA range down to the kernel via
+ * IOC_LIBCFS_SET_NUMA_RANGE.  \a range must be >= 0; errors are
+ * reported through \a err_rc (cYAML tree, freed by the caller).
+ */
+int lustre_lnet_config_numa_range(int range, int seq_no, struct cYAML **err_rc)
+{
+	struct lnet_ioctl_numa_range data;
+	int rc = LUSTRE_CFG_RC_NO_ERR;
+	char err_str[LNET_MAX_STR_LEN];
+
+	snprintf(err_str, sizeof(err_str), "\"success\"");
+
+	if (range < 0) {
+		snprintf(err_str,
+			 sizeof(err_str),
+			 "\"range must be >= 0\"");
+		rc = LUSTRE_CFG_RC_OUT_OF_RANGE_PARAM;
+		goto out;
+	}
+
+	LIBCFS_IOC_INIT_V2(data, nr_hdr);
+	data.nr_range = range;
+
+	rc = l_ioctl(LNET_DEV_ID, IOC_LIBCFS_SET_NUMA_RANGE, &data);
+	if (rc != 0) {
+		rc = -errno;
+		/* message fixed: this path configures the NUMA range,
+		 * not the routing buffers */
+		snprintf(err_str,
+			 sizeof(err_str),
+			 "\"cannot configure numa_range: %s\"",
+			 strerror(errno));
+		goto out;
+	}
+
+out:
+	cYAML_build_error(rc, seq_no, ADD_CMD, "numa_range", err_str, err_rc);
+
+	return rc;
+}
+
int lustre_lnet_config_buffers(int tiny, int small, int large, int seq_no,
struct cYAML **err_rc)
{
return rc;
}
+/*
+ * Fetch the current NUMA range via IOC_LIBCFS_GET_NUMA_RANGE and
+ * report it as a cYAML tree ("numa: { range: N }").  The tree is
+ * printed to stdout when \a show_rc is NULL, otherwise merged into
+ * *show_rc for the caller.
+ */
+int lustre_lnet_show_numa_range(int seq_no, struct cYAML **show_rc,
+				struct cYAML **err_rc)
+{
+	struct lnet_ioctl_numa_range data;
+	int rc = LUSTRE_CFG_RC_OUT_OF_MEM;
+	int l_errno;
+	char err_str[LNET_MAX_STR_LEN];
+	struct cYAML *root = NULL, *range = NULL;
+
+	snprintf(err_str, sizeof(err_str), "\"out of memory\"");
+
+	LIBCFS_IOC_INIT_V2(data, nr_hdr);
+
+	rc = l_ioctl(LNET_DEV_ID, IOC_LIBCFS_GET_NUMA_RANGE, &data);
+	if (rc != 0) {
+		/* save errno before any call can clobber it */
+		l_errno = errno;
+		snprintf(err_str,
+			 sizeof(err_str),
+			 "\"cannot get numa range: %s\"",
+			 strerror(l_errno));
+		rc = -l_errno;
+		goto out;
+	}
+
+	root = cYAML_create_object(NULL, NULL);
+	if (root == NULL)
+		goto out;
+
+	range = cYAML_create_object(root, "numa");
+	if (range == NULL)
+		goto out;
+
+	if (cYAML_create_number(range, "range",
+				data.nr_range) == NULL)
+		goto out;
+
+	if (show_rc == NULL)
+		cYAML_print_tree(root);
+
+	snprintf(err_str, sizeof(err_str), "\"success\"");
+out:
+	if (show_rc == NULL || rc != LUSTRE_CFG_RC_NO_ERR) {
+		cYAML_free_tree(root);
+	} else if (show_rc != NULL && *show_rc != NULL) {
+		/* graft our subtree onto the caller's existing tree */
+		cYAML_insert_sibling((*show_rc)->cy_child,
+					root->cy_child);
+		free(root);
+	} else {
+		*show_rc = root;
+	}
+
+	cYAML_build_error(rc, seq_no, SHOW_CMD, "numa", err_str, err_rc);
+
+	return rc;
+}
+
int lustre_lnet_show_stats(int seq_no, struct cYAML **show_rc,
struct cYAML **err_rc)
{
struct cYAML **err_rc);
/*
+ * lustre_lnet_config_numa_range
+ * Set the NUMA range which impacts the NIs to be selected
+ * during sending. If the NUMA range is large the NUMA
+ * distance between the message memory and the NI becomes
+ * less significant. The NUMA range is a relative number
+ * with no other meaning besides allowing a wider breadth
+ * for picking an NI to send from.
+ *
+ * range - numa range value.
+ * seq_no - sequence number of the request
+ * err_rc - [OUT] struct cYAML tree describing the error. Freed by
+ * caller
+ */
+int lustre_lnet_config_numa_range(int range, int seq_no,
+ struct cYAML **err_rc);
+
+/*
+ * lustre_lnet_show_numa_range
+ * Get the currently set NUMA range
+ *
+ * seq_no - sequence number of the request
+ * show_rc - [OUT] struct cYAML tree containing NUMA range info
+ * err_rc - [OUT] struct cYAML tree describing the error. Freed by
+ * caller
+ */
+int lustre_lnet_show_numa_range(int seq_no, struct cYAML **show_rc,
+ struct cYAML **err_rc);
+
+/*
* lustre_lnet_config_buffers
* Send down an IOCTL to configure routing buffer sizes. A value of 0 means
* default that particular buffer to default size. A value of -1 means
* James Simmons <jsimmons@infradead.org>
*/
+#include <limits.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
static int jt_show_routing(int argc, char **argv);
static int jt_show_stats(int argc, char **argv);
static int jt_show_peer(int argc, char **argv);
+static int jt_show_numa(int argc, char **argv);
static int jt_set_tiny(int argc, char **argv);
static int jt_set_small(int argc, char **argv);
static int jt_set_large(int argc, char **argv);
+static int jt_set_numa(int argc, char **argv);
static int jt_add_peer_nid(int argc, char **argv);
static int jt_del_peer_nid(int argc, char **argv);
/*static int jt_show_peer(int argc, char **argv);*/
{ 0, 0, 0, NULL }
};
+command_t numa_cmds[] = {
+ {"show", jt_show_numa, 0, "show NUMA range\n"},
+ { 0, 0, 0, NULL }
+};
+
command_t set_cmds[] = {
{"tiny_buffers", jt_set_tiny, 0, "set tiny routing buffers\n"
"\tVALUE must be greater than 0\n"},
{"routing", jt_set_routing, 0, "enable/disable routing\n"
"\t0 - disable routing\n"
"\t1 - enable routing\n"},
+ {"numa_range", jt_set_numa, 0, "set NUMA range for NI selection\n"
+ "\tVALUE must be at least 0\n"},
{ 0, 0, 0, NULL }
};
return rc;
}
+/*
+ * "lnetctl set numa_range VALUE": parse VALUE and push it to the
+ * kernel through lustre_lnet_config_numa_range().
+ */
+static int jt_set_numa(int argc, char **argv)
+{
+	long int value;
+	int rc;
+	struct cYAML *err_rc = NULL;
+
+	if (handle_help(set_cmds, "set", "numa_range", argc, argv) == 0)
+		return 0;
+
+	rc = parse_long(argv[1], &value);
+	if (rc != 0) {
+		cYAML_build_error(-1, -1, "parser", "set",
+				  "cannot parse numa_range value", &err_rc);
+		cYAML_print_tree2file(stderr, err_rc);
+		cYAML_free_tree(err_rc);
+		return -1;
+	}
+
+	/* was mistakenly calling lustre_lnet_config_buffers(), which
+	 * applied the NUMA value as the tiny-buffer count */
+	rc = lustre_lnet_config_numa_range(value, -1, &err_rc);
+	if (rc != LUSTRE_CFG_RC_NO_ERR)
+		cYAML_print_tree2file(stderr, err_rc);
+
+	cYAML_free_tree(err_rc);
+
+	return rc;
+}
+
static int jt_set_tiny(int argc, char **argv)
{
long int value;
return rc;
}
+/*
+ * "lnetctl numa show": query and print the current NUMA range.
+ */
+static int jt_show_numa(int argc, char **argv)
+{
+	int rc;
+	struct cYAML *show_rc = NULL, *err_rc = NULL;
+
+	if (handle_help(numa_cmds, "numa", "show", argc, argv) == 0)
+		return 0;
+
+	rc = lustre_lnet_show_numa_range(-1, &show_rc, &err_rc);
+
+	if (rc != LUSTRE_CFG_RC_NO_ERR)
+		cYAML_print_tree2file(stderr, err_rc);
+	else if (show_rc)
+		cYAML_print_tree(show_rc);
+
+	cYAML_free_tree(err_rc);
+	cYAML_free_tree(show_rc);
+
+	return rc;
+}
+
static inline int jt_lnet(int argc, char **argv)
{
if (argc < 2)
return Parser_execarg(argc - 1, &argv[1], stats_cmds);
}
+/*
+ * Top-level dispatcher for the "numa" command group; forwards
+ * sub-commands to numa_cmds[].
+ */
+static inline int jt_numa(int argc, char **argv)
+{
+	if (argc < 2)
+		return CMD_HELP;
+
+	if (argc == 2 &&
+	    handle_help(numa_cmds, "numa", NULL, argc, argv) == 0)
+		return 0;
+
+	return Parser_execarg(argc - 1, &argv[1], numa_cmds);
+}
+
static inline int jt_peers(int argc, char **argv)
{
if (argc < 2)
"--help} FILE.yaml"},
{"export", jt_export, 0, "export {--help} FILE.yaml"},
{"stats", jt_stats, 0, "stats {show | help}"},
+ {"numa", jt_numa, 0, "numa {show | help}"},
{"peer", jt_peers, 0, "peer {add | del | show | help}"},
{"help", Parser_help, 0, "help"},
{"exit", Parser_quit, 0, "quit"},