Whamcloud - gitweb
LU-7734 lnet: multi-rail feature 87/25087/1
author: Amir Shehata <amir.shehata@intel.com>
Wed, 25 Jan 2017 20:28:42 +0000 (12:28 -0800)
committer: Amir Shehata <amir.shehata@intel.com>
Wed, 25 Jan 2017 20:33:17 +0000 (12:33 -0800)
Merge branch 'multi-rail'

Signed-off-by: Amir Shehata <amir.shehata@intel.com>
Change-Id: I88d3d86d81681802387fc70dba2b9315a9720470

54 files changed:
libcfs/include/libcfs/libcfs_cpu.h
libcfs/include/libcfs/libcfs_ioctl.h
libcfs/include/libcfs/libcfs_string.h
libcfs/include/libcfs/linux/linux-cpu.h
libcfs/include/libcfs/util/string.h
libcfs/libcfs/libcfs_cpu.c
libcfs/libcfs/libcfs_string.c
libcfs/libcfs/linux/linux-cpu.c
libcfs/libcfs/module.c
libcfs/libcfs/util/string.c
lnet/include/lnet/api.h
lnet/include/lnet/lib-dlc.h
lnet/include/lnet/lib-lnet.h
lnet/include/lnet/lib-types.h
lnet/include/lnet/types.h
lnet/klnds/gnilnd/gnilnd.c
lnet/klnds/o2iblnd/o2iblnd.c
lnet/klnds/o2iblnd/o2iblnd.h
lnet/klnds/o2iblnd/o2iblnd_cb.c
lnet/klnds/o2iblnd/o2iblnd_modparams.c
lnet/klnds/socklnd/socklnd.c
lnet/klnds/socklnd/socklnd.h
lnet/klnds/socklnd/socklnd_cb.c
lnet/klnds/socklnd/socklnd_lib.c
lnet/klnds/socklnd/socklnd_proto.c
lnet/lnet/acceptor.c
lnet/lnet/api-ni.c
lnet/lnet/config.c
lnet/lnet/lib-md.c
lnet/lnet/lib-move.c
lnet/lnet/lib-msg.c
lnet/lnet/lib-ptl.c
lnet/lnet/lo.c
lnet/lnet/module.c
lnet/lnet/net_fault.c
lnet/lnet/peer.c
lnet/lnet/router.c
lnet/lnet/router_proc.c
lnet/selftest/brw_test.c
lnet/selftest/framework.c
lnet/selftest/selftest.h
lnet/utils/cyaml/cyaml.c
lnet/utils/cyaml/cyaml.h
lnet/utils/lnetconfig/Makefile.am
lnet/utils/lnetconfig/liblnd.h
lnet/utils/lnetconfig/liblnetconfig.c
lnet/utils/lnetconfig/liblnetconfig.h
lnet/utils/lnetconfig/liblnetconfig_lnd.c
lnet/utils/lnetctl.c
lustre/doc/lnetctl.8
lustre/include/lustre_net.h
lustre/ptlrpc/connection.c
lustre/ptlrpc/events.c
lustre/ptlrpc/niobuf.c

index b14e29e..3b459ab 100644 (file)
@@ -109,6 +109,10 @@ struct cfs_cpt_table *cfs_cpt_table_alloc(unsigned int ncpt);
  */
 int cfs_cpt_table_print(struct cfs_cpt_table *cptab, char *buf, int len);
 /**
+ * print distance information of cpt-table
+ */
+int cfs_cpt_distance_print(struct cfs_cpt_table *cptab, char *buf, int len);
+/**
  * return total number of CPU partitions in \a cptab
  */
 int
@@ -138,6 +142,14 @@ int cfs_cpt_current(struct cfs_cpt_table *cptab, int remap);
  */
 int cfs_cpt_of_cpu(struct cfs_cpt_table *cptab, int cpu);
 /**
+ * shadow HW node ID \a NODE to CPU-partition ID by \a cptab
+ */
+int cfs_cpt_of_node(struct cfs_cpt_table *cptab, int node);
+/**
+ * NUMA distance between \a cpt1 and \a cpt2 in \a cptab
+ */
+unsigned cfs_cpt_distance(struct cfs_cpt_table *cptab, int cpt1, int cpt2);
+/**
  * bind current thread on a CPU-partition \a cpt of \a cptab
  */
 int cfs_cpt_bind(struct cfs_cpt_table *cptab, int cpt);
index 40ff9f9..027cb5e 100644 (file)
@@ -128,18 +128,29 @@ struct libcfs_debug_ioctl_data
  * tools which might be accessing the IOCTL numbers, a new group of IOCTL
  * number have been allocated.
  */
-#define IOCTL_CONFIG_SIZE              struct lnet_ioctl_config_data
-#define IOC_LIBCFS_ADD_ROUTE           _IOWR(IOC_LIBCFS_TYPE, 81, IOCTL_CONFIG_SIZE)
-#define IOC_LIBCFS_DEL_ROUTE           _IOWR(IOC_LIBCFS_TYPE, 82, IOCTL_CONFIG_SIZE)
-#define IOC_LIBCFS_GET_ROUTE           _IOWR(IOC_LIBCFS_TYPE, 83, IOCTL_CONFIG_SIZE)
-#define IOC_LIBCFS_ADD_NET             _IOWR(IOC_LIBCFS_TYPE, 84, IOCTL_CONFIG_SIZE)
-#define IOC_LIBCFS_DEL_NET             _IOWR(IOC_LIBCFS_TYPE, 85, IOCTL_CONFIG_SIZE)
-#define IOC_LIBCFS_GET_NET             _IOWR(IOC_LIBCFS_TYPE, 86, IOCTL_CONFIG_SIZE)
-#define IOC_LIBCFS_CONFIG_RTR          _IOWR(IOC_LIBCFS_TYPE, 87, IOCTL_CONFIG_SIZE)
-#define IOC_LIBCFS_ADD_BUF             _IOWR(IOC_LIBCFS_TYPE, 88, IOCTL_CONFIG_SIZE)
-#define IOC_LIBCFS_GET_BUF             _IOWR(IOC_LIBCFS_TYPE, 89, IOCTL_CONFIG_SIZE)
-#define IOC_LIBCFS_GET_PEER_INFO       _IOWR(IOC_LIBCFS_TYPE, 90, IOCTL_CONFIG_SIZE)
-#define IOC_LIBCFS_GET_LNET_STATS      _IOWR(IOC_LIBCFS_TYPE, 91, IOCTL_CONFIG_SIZE)
-#define IOC_LIBCFS_MAX_NR              91
+#define IOCTL_CONFIG_SIZE                 struct lnet_ioctl_config_data
+#define IOC_LIBCFS_ADD_ROUTE              _IOWR(IOC_LIBCFS_TYPE, 81, IOCTL_CONFIG_SIZE)
+#define IOC_LIBCFS_DEL_ROUTE              _IOWR(IOC_LIBCFS_TYPE, 82, IOCTL_CONFIG_SIZE)
+#define IOC_LIBCFS_GET_ROUTE              _IOWR(IOC_LIBCFS_TYPE, 83, IOCTL_CONFIG_SIZE)
+#define IOC_LIBCFS_ADD_NET                _IOWR(IOC_LIBCFS_TYPE, 84, IOCTL_CONFIG_SIZE)
+#define IOC_LIBCFS_DEL_NET                _IOWR(IOC_LIBCFS_TYPE, 85, IOCTL_CONFIG_SIZE)
+#define IOC_LIBCFS_GET_NET                _IOWR(IOC_LIBCFS_TYPE, 86, IOCTL_CONFIG_SIZE)
+#define IOC_LIBCFS_CONFIG_RTR             _IOWR(IOC_LIBCFS_TYPE, 87, IOCTL_CONFIG_SIZE)
+#define IOC_LIBCFS_ADD_BUF                _IOWR(IOC_LIBCFS_TYPE, 88, IOCTL_CONFIG_SIZE)
+#define IOC_LIBCFS_GET_BUF                _IOWR(IOC_LIBCFS_TYPE, 89, IOCTL_CONFIG_SIZE)
+#define IOC_LIBCFS_GET_PEER_INFO          _IOWR(IOC_LIBCFS_TYPE, 90, IOCTL_CONFIG_SIZE)
+#define IOC_LIBCFS_GET_LNET_STATS         _IOWR(IOC_LIBCFS_TYPE, 91, IOCTL_CONFIG_SIZE)
+#define IOC_LIBCFS_ADD_PEER_NI            _IOWR(IOC_LIBCFS_TYPE, 92, IOCTL_CONFIG_SIZE)
+#define IOC_LIBCFS_DEL_PEER_NI            _IOWR(IOC_LIBCFS_TYPE, 93, IOCTL_CONFIG_SIZE)
+#define IOC_LIBCFS_GET_PEER_NI            _IOWR(IOC_LIBCFS_TYPE, 94, IOCTL_CONFIG_SIZE)
+#define IOC_LIBCFS_ADD_LOCAL_NI                   _IOWR(IOC_LIBCFS_TYPE, 95, IOCTL_CONFIG_SIZE)
+#define IOC_LIBCFS_DEL_LOCAL_NI                   _IOWR(IOC_LIBCFS_TYPE, 96, IOCTL_CONFIG_SIZE)
+#define IOC_LIBCFS_GET_LOCAL_NI                   _IOWR(IOC_LIBCFS_TYPE, 97, IOCTL_CONFIG_SIZE)
+#define IOC_LIBCFS_SET_NUMA_RANGE         _IOWR(IOC_LIBCFS_TYPE, 98, IOCTL_CONFIG_SIZE)
+#define IOC_LIBCFS_GET_NUMA_RANGE         _IOWR(IOC_LIBCFS_TYPE, 99, IOCTL_CONFIG_SIZE)
+#define IOC_LIBCFS_DBG                    _IOWR(IOC_LIBCFS_TYPE, 100, IOCTL_CONFIG_SIZE)
+#define IOC_LIBCFS_MAX_NR                                        100
+
+extern int libcfs_ioctl_data_adjust(struct libcfs_ioctl_data *data);
 
 #endif /* __LIBCFS_IOCTL_H__ */
index f743fc6..3c34071 100644 (file)
@@ -82,15 +82,7 @@ int cfs_expr_list_print(char *buffer, int count,
                        struct cfs_expr_list *expr_list);
 int cfs_expr_list_values(struct cfs_expr_list *expr_list,
                         int max, __u32 **values);
-static inline void
-cfs_expr_list_values_free(__u32 *values, int num)
-{
-       /* This array is allocated by LIBCFS_ALLOC(), so it shouldn't be freed
-        * by OBD_FREE() if it's called by module other than libcfs & LNet,
-        * otherwise we will see fake memory leak */
-       LIBCFS_FREE(values, num * sizeof(values[0]));
-}
-
+void cfs_expr_list_values_free(__u32 *values, int num);
 void cfs_expr_list_free(struct cfs_expr_list *expr_list);
 int cfs_expr_list_parse(char *str, int len, unsigned min, unsigned max,
                        struct cfs_expr_list **elpp);
index a8f132a..36b763f 100644 (file)
@@ -62,6 +62,8 @@ struct cfs_cpu_partition {
        cpumask_t                       *cpt_cpumask;
        /* nodes mask for this partition */
        nodemask_t                      *cpt_nodemask;
+       /* NUMA distance between CPTs */
+       unsigned                        *cpt_distance;
        /* spread rotor for NUMA allocator */
        unsigned                        cpt_spread_rotor;
 };
@@ -70,6 +72,8 @@ struct cfs_cpu_partition {
 struct cfs_cpt_table {
        /* spread rotor for NUMA allocator */
        unsigned                        ctb_spread_rotor;
+       /* maximum NUMA distance between all nodes in table */
+       unsigned                        ctb_distance;
        /* # of CPU partitions */
        unsigned                        ctb_nparts;
        /* partitions tables */
@@ -78,6 +82,8 @@ struct cfs_cpt_table {
        int                             *ctb_cpu2cpt;
        /* all cpus in this partition table */
        cpumask_t                       *ctb_cpumask;
+       /* shadow HW node to CPU partition ID */
+       int                             *ctb_node2cpt;
        /* all nodes in this partition table */
        nodemask_t                      *ctb_nodemask;
 };
index 2a1c1b7..4dc7abf 100644 (file)
@@ -78,6 +78,7 @@ struct cfs_expr_list {
        struct list_head        el_exprs;
 };
 
+int cfs_expr_list_values(struct cfs_expr_list *expr_list, int max, __u32 **valpp);
 int cfs_gettok(struct cfs_lstr *next, char delim, struct cfs_lstr *res);
 int cfs_str2num_check(char *str, int nob, unsigned *num,
                      unsigned min, unsigned max);
@@ -86,6 +87,7 @@ int cfs_expr_list_print(char *buffer, int count,
                        struct cfs_expr_list *expr_list);
 int cfs_expr_list_parse(char *str, int len, unsigned min, unsigned max,
                        struct cfs_expr_list **elpp);
+void cfs_expr_list_free(struct cfs_expr_list *expr_list);
 void cfs_expr_list_free_list(struct list_head *list);
 int cfs_ip_addr_parse(char *str, int len, struct list_head *list);
 int cfs_ip_addr_match(__u32 addr, struct list_head *list);
index 1007cf9..f36e2a3 100644 (file)
@@ -45,6 +45,8 @@ EXPORT_SYMBOL(cfs_cpt_table);
 
 #define CFS_CPU_VERSION_MAGIC           0xbabecafe
 
+#define CFS_CPT_DISTANCE               1       /* Arbitrary positive value */
+
 struct cfs_cpt_table *
 cfs_cpt_table_alloc(unsigned int ncpt)
 {
@@ -58,6 +60,7 @@ cfs_cpt_table_alloc(unsigned int ncpt)
        LIBCFS_ALLOC(cptab, sizeof(*cptab));
        if (cptab != NULL) {
                cptab->ctb_version = CFS_CPU_VERSION_MAGIC;
+               cpu_set(0, cptab->ctb_cpumask);
                node_set(0, cptab->ctb_nodemask);
                cptab->ctb_nparts  = ncpt;
        }
@@ -90,6 +93,20 @@ cfs_cpt_table_print(struct cfs_cpt_table *cptab, char *buf, int len)
 EXPORT_SYMBOL(cfs_cpt_table_print);
 
 int
+cfs_cpt_distance_print(struct cfs_cpt_table *cptab, char *buf, int len)
+{
+	int	rc = 0;
+
+	/* UP table has a single partition (0) whose only distance entry is
+	 * to itself; the format needs three values: cpt, peer cpt, distance.
+	 * The original passed only two arguments for three %d conversions,
+	 * which is undefined behaviour. */
+	rc = snprintf(buf, len, "%d\t: %d:%d\n", 0, 0, CFS_CPT_DISTANCE);
+	len -= rc;
+	if (len <= 0)
+		return -EFBIG;
+
+	return rc;
+}
+EXPORT_SYMBOL(cfs_cpt_distance_print);
+
+int
 cfs_cpt_number(struct cfs_cpt_table *cptab)
 {
        return 1;
@@ -110,12 +127,26 @@ cfs_cpt_online(struct cfs_cpt_table *cptab, int cpt)
 }
 EXPORT_SYMBOL(cfs_cpt_online);
 
+cpumask_t *
+cfs_cpt_cpumask(struct cfs_cpt_table *cptab, int cpt)
+{
+	/* single-partition (UP) table: hand back the table-wide mask.
+	 * The field is ctb_cpumask — see cfs_cpt_table_alloc(), which does
+	 * cpu_set(0, cptab->ctb_cpumask); "ctb_mask" is not a field of
+	 * struct cfs_cpt_table. */
+	return &cptab->ctb_cpumask;
+}
+EXPORT_SYMBOL(cfs_cpt_cpumask);
+
 nodemask_t *
 cfs_cpt_nodemask(struct cfs_cpt_table *cptab, int cpt)
 {
        return &cptab->ctb_nodemask;
 }
-EXPORT_SYMBOL(cfs_cpt_cpumask);
+EXPORT_SYMBOL(cfs_cpt_nodemask);
+
+/*
+ * UP build: there is only one partition, so every CPT pair sits at the
+ * same arbitrary positive distance (CFS_CPT_DISTANCE).
+ */
+unsigned
+cfs_cpt_distance(struct cfs_cpt_table *cptab, int cpt1, int cpt2)
+{
+	return CFS_CPT_DISTANCE;
+}
+EXPORT_SYMBOL(cfs_cpt_distance);
 
 int
 cfs_cpt_set_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu)
@@ -199,6 +230,13 @@ cfs_cpt_of_cpu(struct cfs_cpt_table *cptab, int cpu)
 EXPORT_SYMBOL(cfs_cpt_of_cpu);
 
 int
+cfs_cpt_of_node(struct cfs_cpt_table *cptab, int node)
+{
+	/* UP build: every HW node maps to the single partition 0 */
+	return 0;
+}
+EXPORT_SYMBOL(cfs_cpt_of_node);
+
+int
 cfs_cpt_bind(struct cfs_cpt_table *cptab, int cpt)
 {
        return 0;
index bf83793..04e1dd5 100644 (file)
@@ -480,6 +480,16 @@ cfs_expr_list_values(struct cfs_expr_list *expr_list, int max, __u32 **valpp)
 }
 EXPORT_SYMBOL(cfs_expr_list_values);
 
+void
+cfs_expr_list_values_free(__u32 *values, int num)
+{
+       /* This array is allocated by LIBCFS_ALLOC(), so it shouldn't be freed
+        * by OBD_FREE() if it's called by module other than libcfs & LNet,
+        * otherwise we will see fake memory leak */
+       LIBCFS_FREE(values, num * sizeof(values[0]));
+}
+EXPORT_SYMBOL(cfs_expr_list_values_free);
+
 /**
  * Frees cfs_range_expr structures of \a expr_list.
  *
index 369f146..069211a 100644 (file)
@@ -83,8 +83,12 @@ cfs_cpt_table_free(struct cfs_cpt_table *cptab)
 
        if (cptab->ctb_cpu2cpt != NULL) {
                LIBCFS_FREE(cptab->ctb_cpu2cpt,
-                           num_possible_cpus() *
-                           sizeof(cptab->ctb_cpu2cpt[0]));
+                           nr_cpu_ids * sizeof(cptab->ctb_cpu2cpt[0]));
+       }
+
+       if (cptab->ctb_node2cpt != NULL) {
+               LIBCFS_FREE(cptab->ctb_node2cpt,
+                           nr_node_ids * sizeof(cptab->ctb_node2cpt[0]));
        }
 
        for (i = 0; cptab->ctb_parts != NULL && i < cptab->ctb_nparts; i++) {
@@ -97,6 +101,12 @@ cfs_cpt_table_free(struct cfs_cpt_table *cptab)
 
                if (part->cpt_cpumask != NULL)
                        LIBCFS_FREE(part->cpt_cpumask, cpumask_size());
+
+               if (part->cpt_distance) {
+                       LIBCFS_FREE(part->cpt_distance,
+                               cptab->ctb_nparts *
+                                       sizeof(part->cpt_distance[0]));
+               }
        }
 
        if (cptab->ctb_parts != NULL) {
@@ -132,12 +142,20 @@ cfs_cpt_table_alloc(unsigned int ncpt)
                goto failed;
 
        LIBCFS_ALLOC(cptab->ctb_cpu2cpt,
-                    num_possible_cpus() * sizeof(cptab->ctb_cpu2cpt[0]));
+                    nr_cpu_ids * sizeof(cptab->ctb_cpu2cpt[0]));
        if (cptab->ctb_cpu2cpt == NULL)
                goto failed;
 
        memset(cptab->ctb_cpu2cpt, -1,
-              num_possible_cpus() * sizeof(cptab->ctb_cpu2cpt[0]));
+              nr_cpu_ids * sizeof(cptab->ctb_cpu2cpt[0]));
+
+       LIBCFS_ALLOC(cptab->ctb_node2cpt,
+                    nr_node_ids * sizeof(cptab->ctb_node2cpt[0]));
+       if (cptab->ctb_node2cpt == NULL)
+               goto failed;
+
+       memset(cptab->ctb_node2cpt, -1,
+              nr_node_ids * sizeof(cptab->ctb_node2cpt[0]));
 
        LIBCFS_ALLOC(cptab->ctb_parts, ncpt * sizeof(cptab->ctb_parts[0]));
        if (cptab->ctb_parts == NULL)
@@ -147,8 +165,16 @@ cfs_cpt_table_alloc(unsigned int ncpt)
                struct cfs_cpu_partition *part = &cptab->ctb_parts[i];
 
                LIBCFS_ALLOC(part->cpt_cpumask, cpumask_size());
+               if (!part->cpt_cpumask)
+                       goto failed;
+
                LIBCFS_ALLOC(part->cpt_nodemask, sizeof(*part->cpt_nodemask));
-               if (part->cpt_cpumask == NULL || part->cpt_nodemask == NULL)
+               if (!part->cpt_nodemask)
+                       goto failed;
+
+               LIBCFS_ALLOC(part->cpt_distance,
+                       cptab->ctb_nparts * sizeof(part->cpt_distance[0]));
+               if (!part->cpt_distance)
                        goto failed;
        }
 
@@ -164,29 +190,26 @@ int
 cfs_cpt_table_print(struct cfs_cpt_table *cptab, char *buf, int len)
 {
        char    *tmp = buf;
-       int     rc = 0;
+       int     rc = -EFBIG;
        int     i;
        int     j;
 
        for (i = 0; i < cptab->ctb_nparts; i++) {
-               if (len > 0) {
-                       rc = snprintf(tmp, len, "%d\t: ", i);
-                       len -= rc;
-               }
+               if (len <= 0)
+                       goto out;
 
-               if (len <= 0) {
-                       rc = -EFBIG;
+               rc = snprintf(tmp, len, "%d\t:", i);
+               len -= rc;
+
+               if (len <= 0)
                        goto out;
-               }
 
                tmp += rc;
                for_each_cpu(j, cptab->ctb_parts[i].cpt_cpumask) {
-                       rc = snprintf(tmp, len, "%d ", j);
+                       rc = snprintf(tmp, len, " %d", j);
                        len -= rc;
-                       if (len <= 0) {
-                               rc = -EFBIG;
+                       if (len <= 0)
                                goto out;
-                       }
                        tmp += rc;
                }
 
@@ -194,8 +217,8 @@ cfs_cpt_table_print(struct cfs_cpt_table *cptab, char *buf, int len)
                tmp++;
                len--;
        }
-
-out:
+       rc = 0;
+ out:
        if (rc < 0)
                return rc;
 
@@ -204,6 +227,47 @@ out:
 EXPORT_SYMBOL(cfs_cpt_table_print);
 
 int
+cfs_cpt_distance_print(struct cfs_cpt_table *cptab, char *buf, int len)
+{
+	char	*tmp = buf;
+	int	rc = -EFBIG;
+	int	i;
+	int	j;
+
+	/* render one line per CPT: "<cpt>\t: <cpt2>:<distance> ...";
+	 * rc starts at -EFBIG so that running out of buffer anywhere
+	 * below reports -EFBIG to the caller */
+	for (i = 0; i < cptab->ctb_nparts; i++) {
+		if (len <= 0)
+			goto out;
+
+		rc = snprintf(tmp, len, "%d\t:", i);
+		len -= rc;
+
+		if (len <= 0)
+			goto out;
+
+		tmp += rc;
+		for (j = 0; j < cptab->ctb_nparts; j++) {
+			rc = snprintf(tmp, len, " %d:%d",
+				j, cptab->ctb_parts[i].cpt_distance[j]);
+			len -= rc;
+			if (len <= 0)
+				goto out;
+			tmp += rc;
+		}
+
+		*tmp = '\n';
+		tmp++;
+		len--;
+	}
+	rc = 0;
+ out:
+	if (rc < 0)
+		return rc;
+
+	/* success: number of bytes written into \a buf */
+	return tmp - buf;
+}
+EXPORT_SYMBOL(cfs_cpt_distance_print);
+
+int
 cfs_cpt_number(struct cfs_cpt_table *cptab)
 {
        return cptab->ctb_nparts;
@@ -254,11 +318,139 @@ cfs_cpt_nodemask(struct cfs_cpt_table *cptab, int cpt)
 }
 EXPORT_SYMBOL(cfs_cpt_nodemask);
 
+unsigned
+cfs_cpt_distance(struct cfs_cpt_table *cptab, int cpt1, int cpt2)
+{
+	LASSERT(cpt1 == CFS_CPT_ANY || (cpt1 >= 0 && cpt1 < cptab->ctb_nparts));
+	LASSERT(cpt2 == CFS_CPT_ANY || (cpt2 >= 0 && cpt2 < cptab->ctb_nparts));
+
+	/* CFS_CPT_ANY on either side: return the cached worst-case
+	 * (maximum) distance across the whole table */
+	if (cpt1 == CFS_CPT_ANY || cpt2 == CFS_CPT_ANY)
+		return cptab->ctb_distance;
+
+	return cptab->ctb_parts[cpt1].cpt_distance[cpt2];
+}
+EXPORT_SYMBOL(cfs_cpt_distance);
+
+/*
+ * Calculate the maximum NUMA distance between all nodes in the
+ * from_mask and all nodes in the to_mask.
+ */
+static unsigned
+cfs_cpt_distance_calculate(nodemask_t *from_mask, nodemask_t *to_mask)
+{
+	unsigned maximum;
+	unsigned distance;
+	int to;
+	int from;
+
+	/* visit every (from, to) node pair and keep the largest
+	 * node_distance() seen; returns 0 if either mask is empty */
+	maximum = 0;
+	for_each_node_mask(from, *from_mask) {
+		for_each_node_mask(to, *to_mask) {
+			distance = node_distance(from, to);
+			if (maximum < distance)
+				maximum = distance;
+		}
+	}
+	return maximum;
+}
+
+static void cfs_cpt_add_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu)
+{
+	/* record the cpu->cpt reverse mapping, then mark the CPU present
+	 * in both the table-wide and per-partition CPU masks */
+	cptab->ctb_cpu2cpt[cpu] = cpt;
+
+	cpumask_set_cpu(cpu, cptab->ctb_cpumask);
+	cpumask_set_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask);
+}
+
+static void cfs_cpt_del_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu)
+{
+	/* inverse of cfs_cpt_add_cpu(): clear both masks, then
+	 * invalidate the cpu->cpt reverse mapping */
+	cpumask_clear_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask);
+	cpumask_clear_cpu(cpu, cptab->ctb_cpumask);
+
+	cptab->ctb_cpu2cpt[cpu] = -1;
+}
+
+/*
+ * Account for NUMA node \a node joining partition \a cpt: update both
+ * nodemasks, the node->cpt reverse map, and recompute the cached NUMA
+ * distances the change can affect.
+ */
+static void cfs_cpt_add_node(struct cfs_cpt_table *cptab, int cpt, int node)
+{
+	int cpt2;
+	struct cfs_cpu_partition *part;
+	struct cfs_cpu_partition *part2;
+
+	if (!node_isset(node, *cptab->ctb_nodemask)) {
+		/* first time node is added to the CPT table */
+		node_set(node, *cptab->ctb_nodemask);
+		cptab->ctb_node2cpt[node] = cpt;
+		/* table-wide worst-case distance may have grown */
+		cptab->ctb_distance = cfs_cpt_distance_calculate(
+							cptab->ctb_nodemask,
+							cptab->ctb_nodemask);
+	}
+
+	part = &cptab->ctb_parts[cpt];
+	if (!node_isset(node, *part->cpt_nodemask)) {
+		/* first time node is added to this CPT */
+		node_set(node, *part->cpt_nodemask);
+		/* refresh this CPT's distance to/from every other CPT */
+		for (cpt2 = 0; cpt2 < cptab->ctb_nparts; cpt2++) {
+			part2 = &cptab->ctb_parts[cpt2];
+			part->cpt_distance[cpt2] = cfs_cpt_distance_calculate(
+						part->cpt_nodemask,
+						part2->cpt_nodemask);
+			part2->cpt_distance[cpt] = cfs_cpt_distance_calculate(
+						part2->cpt_nodemask,
+						part->cpt_nodemask);
+		}
+	}
+}
+
+/*
+ * Account for NUMA node \a node possibly leaving partition \a cpt after a
+ * CPU was removed: if no CPU of the partition (or of the whole table)
+ * still lives on \a node, drop it from the corresponding nodemask,
+ * repair the node->cpt reverse map and recompute cached NUMA distances.
+ */
+static void cfs_cpt_del_node(struct cfs_cpt_table *cptab, int cpt, int node)
+{
+	int cpu;
+	int cpt2;
+	struct cfs_cpu_partition *part;
+	struct cfs_cpu_partition *part2;
+
+	part = &cptab->ctb_parts[cpt];
+
+	for_each_cpu(cpu, part->cpt_cpumask) {
+		/* this CPT has other CPU belonging to this node? */
+		if (cpu_to_node(cpu) == node)
+			break;
+	}
+
+	if (cpu >= nr_cpu_ids && node_isset(node,  *part->cpt_nodemask)) {
+		/* No more CPUs in the node for this CPT. */
+		node_clear(node, *part->cpt_nodemask);
+		for (cpt2 = 0; cpt2 < cptab->ctb_nparts; cpt2++) {
+			part2 = &cptab->ctb_parts[cpt2];
+			/* repoint node->cpt at a partition that still
+			 * contains the node (last match wins) */
+			if (node_isset(node, *part2->cpt_nodemask))
+				cptab->ctb_node2cpt[node] = cpt2;
+			part->cpt_distance[cpt2] = cfs_cpt_distance_calculate(
+						part->cpt_nodemask,
+						part2->cpt_nodemask);
+			part2->cpt_distance[cpt] = cfs_cpt_distance_calculate(
+						part2->cpt_nodemask,
+						part->cpt_nodemask);
+		}
+	}
+
+	for_each_cpu(cpu, cptab->ctb_cpumask) {
+		/* this CPT-table has other CPUs belonging to this node? */
+		if (cpu_to_node(cpu) == node)
+			break;
+	}
+
+	if (cpu >= nr_cpu_ids && node_isset(node, *cptab->ctb_nodemask)) {
+		/* No more CPUs in the table for this node. */
+		node_clear(node, *cptab->ctb_nodemask);
+		cptab->ctb_node2cpt[node] = -1;
+		cptab->ctb_distance =
+			cfs_cpt_distance_calculate(cptab->ctb_nodemask,
+					cptab->ctb_nodemask);
+	}
+}
+
 int
 cfs_cpt_set_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu)
 {
-       int     node;
-
        LASSERT(cpt >= 0 && cpt < cptab->ctb_nparts);
 
        if (cpu < 0 || cpu >= nr_cpu_ids || !cpu_online(cpu)) {
@@ -272,23 +464,11 @@ cfs_cpt_set_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu)
                return 0;
        }
 
-       cptab->ctb_cpu2cpt[cpu] = cpt;
-
        LASSERT(!cpumask_test_cpu(cpu, cptab->ctb_cpumask));
        LASSERT(!cpumask_test_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask));
 
-       cpumask_set_cpu(cpu, cptab->ctb_cpumask);
-       cpumask_set_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask);
-
-       node = cpu_to_node(cpu);
-
-       /* first CPU of @node in this CPT table */
-       if (!node_isset(node, *cptab->ctb_nodemask))
-               node_set(node, *cptab->ctb_nodemask);
-
-       /* first CPU of @node in this partition */
-       if (!node_isset(node, *cptab->ctb_parts[cpt].cpt_nodemask))
-               node_set(node, *cptab->ctb_parts[cpt].cpt_nodemask);
+       cfs_cpt_add_cpu(cptab, cpt, cpu);
+       cfs_cpt_add_node(cptab, cpt, cpu_to_node(cpu));
 
        return 1;
 }
@@ -297,9 +477,6 @@ EXPORT_SYMBOL(cfs_cpt_set_cpu);
 void
 cfs_cpt_unset_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu)
 {
-       int     node;
-       int     i;
-
        LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));
 
        if (cpu < 0 || cpu >= nr_cpu_ids) {
@@ -325,32 +502,8 @@ cfs_cpt_unset_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu)
        LASSERT(cpumask_test_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask));
        LASSERT(cpumask_test_cpu(cpu, cptab->ctb_cpumask));
 
-       cpumask_clear_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask);
-       cpumask_clear_cpu(cpu, cptab->ctb_cpumask);
-       cptab->ctb_cpu2cpt[cpu] = -1;
-
-       node = cpu_to_node(cpu);
-
-       LASSERT(node_isset(node, *cptab->ctb_parts[cpt].cpt_nodemask));
-       LASSERT(node_isset(node, *cptab->ctb_nodemask));
-
-       for_each_cpu(i, cptab->ctb_parts[cpt].cpt_cpumask) {
-               /* this CPT has other CPU belonging to this node? */
-               if (cpu_to_node(i) == node)
-                       break;
-       }
-
-       if (i >= nr_cpu_ids)
-               node_clear(node, *cptab->ctb_parts[cpt].cpt_nodemask);
-
-       for_each_cpu(i, cptab->ctb_cpumask) {
-               /* this CPT-table has other CPU belonging to this node? */
-               if (cpu_to_node(i) == node)
-                       break;
-       }
-
-       if (i >= nr_cpu_ids)
-               node_clear(node, *cptab->ctb_nodemask);
+       cfs_cpt_del_cpu(cptab, cpt, cpu);
+       cfs_cpt_del_node(cptab, cpt, cpu_to_node(cpu));
 }
 EXPORT_SYMBOL(cfs_cpt_unset_cpu);
 
@@ -367,8 +520,8 @@ cfs_cpt_set_cpumask(struct cfs_cpt_table *cptab, int cpt, const cpumask_t *mask)
        }
 
        for_each_cpu(cpu, mask) {
-               if (!cfs_cpt_set_cpu(cptab, cpt, cpu))
-                       return 0;
+               cfs_cpt_add_cpu(cptab, cpt, cpu);
+               cfs_cpt_add_node(cptab, cpt, cpu_to_node(cpu));
        }
 
        return 1;
@@ -390,18 +543,22 @@ int
 cfs_cpt_set_node(struct cfs_cpt_table *cptab, int cpt, int node)
 {
        const cpumask_t *mask;
-       int             rc;
+       int             cpu;
 
-       if (node < 0 || node >= MAX_NUMNODES) {
+       if (node < 0 || node >= nr_node_ids) {
                CDEBUG(D_INFO,
                       "Invalid NUMA id %d for CPU partition %d\n", node, cpt);
                return 0;
        }
 
        mask = cpumask_of_node(node);
-       rc = cfs_cpt_set_cpumask(cptab, cpt, mask);
 
-       return rc;
+       for_each_cpu(cpu, mask)
+               cfs_cpt_add_cpu(cptab, cpt, cpu);
+
+       cfs_cpt_add_node(cptab, cpt, node);
+
+       return 1;
 }
 EXPORT_SYMBOL(cfs_cpt_set_node);
 
@@ -409,16 +566,20 @@ void
 cfs_cpt_unset_node(struct cfs_cpt_table *cptab, int cpt, int node)
 {
        const cpumask_t *mask;
+       int cpu;
 
-       if (node < 0 || node >= MAX_NUMNODES) {
+       if (node < 0 || node >= nr_node_ids) {
                CDEBUG(D_INFO,
                       "Invalid NUMA id %d for CPU partition %d\n", node, cpt);
                return;
        }
 
        mask = cpumask_of_node(node);
-       cfs_cpt_unset_cpumask(cptab, cpt, mask);
 
+       for_each_cpu(cpu, mask)
+               cfs_cpt_del_cpu(cptab, cpt, cpu);
+
+       cfs_cpt_del_node(cptab, cpt, node);
 }
 EXPORT_SYMBOL(cfs_cpt_unset_node);
 
@@ -507,6 +668,16 @@ cfs_cpt_of_cpu(struct cfs_cpt_table *cptab, int cpu)
 EXPORT_SYMBOL(cfs_cpt_of_cpu);
 
 int
+cfs_cpt_of_node(struct cfs_cpt_table *cptab, int node)
+{
+	/* valid node IDs are 0 .. nr_node_ids - 1; use ">=" (not ">") so
+	 * node == nr_node_ids cannot index one slot past ctb_node2cpt[],
+	 * matching the bound checks in cfs_cpt_set_node()/unset_node() */
+	if (node < 0 || node >= nr_node_ids)
+		return CFS_CPT_ANY;
+
+	return cptab->ctb_node2cpt[node];
+}
+EXPORT_SYMBOL(cfs_cpt_of_node);
+
+int
 cfs_cpt_bind(struct cfs_cpt_table *cptab, int cpt)
 {
        cpumask_t       *cpumask;
@@ -834,7 +1005,7 @@ cfs_cpt_table_create_pattern(char *pattern)
                return cptab;
        }
 
-       high = node ? MAX_NUMNODES - 1 : nr_cpu_ids - 1;
+       high = node ? nr_node_ids - 1 : nr_cpu_ids - 1;
 
        for (str = cfs_trimwhite(pattern), c = 0;; c++) {
                struct cfs_range_expr   *range;
index 3920e39..e8080f3 100644 (file)
@@ -463,6 +463,55 @@ proc_cpt_table(struct ctl_table *table, int write, void __user *buffer,
                                    __proc_cpt_table);
 }
 
+/* read handler for the "cpu_partition_distance" sysctl entry: renders the
+ * CPT distance table into a temporary buffer that is doubled until the
+ * whole table fits, then copies the requested slice to userspace */
+static int __proc_cpt_distance(void *data, int write,
+			       loff_t pos, void __user *buffer, int nob)
+{
+	char *buf = NULL;
+	int   len = 4096;
+	int   rc  = 0;
+
+	if (write)
+		return -EPERM;
+
+	LASSERT(cfs_cpt_table != NULL);
+
+	while (1) {
+		LIBCFS_ALLOC(buf, len);
+		if (buf == NULL)
+			return -ENOMEM;
+
+		rc = cfs_cpt_distance_print(cfs_cpt_table, buf, len);
+		if (rc >= 0)
+			break;
+
+		/* -EFBIG: buffer too small, retry with twice the size */
+		if (rc == -EFBIG) {
+			LIBCFS_FREE(buf, len);
+			len <<= 1;
+			continue;
+		}
+		goto out;
+	}
+
+	if (pos >= rc) {
+		/* read offset is past the rendered table: EOF */
+		rc = 0;
+		goto out;
+	}
+
+	rc = cfs_trace_copyout_string(buffer, nob, buf + pos, NULL);
+ out:
+	if (buf != NULL)
+		LIBCFS_FREE(buf, len);
+	return rc;
+}
+
+static int
+proc_cpt_distance(struct ctl_table *table, int write, void __user *buffer,
+	       size_t *lenp, loff_t *ppos)
+{
+	/* bridge the ctl_table handler signature to __proc_cpt_distance() */
+	return lprocfs_call_handler(table->data, write, ppos, buffer, lenp,
+				     __proc_cpt_distance);
+}
+
 static struct ctl_table lnet_table[] = {
        /*
         * NB No .strategy entries have been provided since sysctl(8) prefers
@@ -538,6 +587,13 @@ static struct ctl_table lnet_table[] = {
        },
        {
                INIT_CTL_NAME
+               .procname       = "cpu_partition_distance",
+               .maxlen         = 128,
+               .mode           = 0444,
+               .proc_handler   = &proc_cpt_distance,
+       },
+       {
+               INIT_CTL_NAME
                .procname       = "debug_log_upcall",
                .data           = lnet_debug_log_upcall,
                .maxlen         = sizeof(lnet_debug_log_upcall),
index 97b5f7f..9078500 100644 (file)
@@ -327,11 +327,64 @@ cfs_expr_list_match(__u32 value, struct cfs_expr_list *expr_list)
 }
 
 /**
+ * Convert express list (\a expr_list) to an array of all matched values
+ *
+ * \retval N N is total number of all matched values
+ * \retval 0 if expression list is empty
+ * \retval < 0 for failure
+ */
+int
+cfs_expr_list_values(struct cfs_expr_list *expr_list, int max, __u32 **valpp)
+{
+	struct cfs_range_expr	*expr;
+	__u32			*val;
+	int			count = 0;
+	int			i;
+
+	/* first pass: count matching values (assumes re_stride >= 1 —
+	 * presumably guaranteed by the expression parser; confirm) */
+	list_for_each_entry(expr, &expr_list->el_exprs, re_link) {
+		for (i = expr->re_lo; i <= expr->re_hi; i++) {
+			if (((i - expr->re_lo) % expr->re_stride) == 0)
+				count++;
+		}
+	}
+
+	if (count == 0) /* empty expression list */
+		return 0;
+
+	if (count > max)
+		return -EINVAL;
+
+	/* calloc(nmemb, size): element count first, then element size */
+	val = calloc(count, sizeof(val[0]));
+	if (val == NULL)
+		return -ENOMEM;
+
+	/* second pass: fill the array in expression order */
+	count = 0;
+	list_for_each_entry(expr, &expr_list->el_exprs, re_link) {
+		for (i = expr->re_lo; i <= expr->re_hi; i++) {
+			if (((i - expr->re_lo) % expr->re_stride) == 0)
+				val[count++] = i;
+		}
+	}
+
+	*valpp = val;
+	return count;
+}
+
+void
+cfs_expr_list_values_free(__u32 *values, int num)
+{
+	/* Userspace variant: the array comes from calloc() in
+	 * cfs_expr_list_values(), so plain free() is correct here.
+	 * \a num is unused but kept so the prototype matches the kernel
+	 * version, which needs the size for LIBCFS_FREE(). */
+	free(values);
+}
+
+/**
  * Frees cfs_range_expr structures of \a expr_list.
  *
  * \retval none
  */
-static void
+void
 cfs_expr_list_free(struct cfs_expr_list *expr_list)
 {
        while (!list_empty(&expr_list->el_exprs)) {
index 6098439..337b986 100644 (file)
@@ -78,6 +78,7 @@ int LNetNIFini(void);
 int LNetGetId(unsigned int index, lnet_process_id_t *id);
 int LNetDist(lnet_nid_t nid, lnet_nid_t *srcnid, __u32 *order);
 void LNetSnprintHandle(char *str, int str_len, lnet_handle_any_t handle);
+lnet_nid_t LNetPrimaryNID(lnet_nid_t nid);
 
 /** @} lnet_addr */
 
index 436d9e8..1a67738 100644 (file)
 #define LNET_MAX_SHOW_NUM_CPT  128
 #define LNET_UNDEFINED_HOPS    ((__u32) -1)
 
+/*
+ * To allow for future enhancements to extend the tunables,
+ * a header is added to this structure so that the version can be set
+ * and checked for backwards compatibility. Newer versions of LNet
+ * can still work with older versions of lnetctl. The restriction is
+ * that the structure can only be added to, never removed from, in
+ * order to not invalidate older lnetctl utilities. Moreover, the
+ * order of fields must remain the same, and new fields must be
+ * appended to the structure.
+ *
+ * That said, all existing LND tunables will be added in this
+ * structure to avoid future changes. */
 struct lnet_ioctl_config_lnd_cmn_tunables {
        __u32 lct_version;
-       __u32 lct_peer_timeout;
-       __u32 lct_peer_tx_credits;
-       __u32 lct_peer_rtr_credits;
-       __u32 lct_max_tx_credits;
+       __s32 lct_peer_timeout;
+       __s32 lct_peer_tx_credits;
+       __s32 lct_peer_rtr_credits;
+       __s32 lct_max_tx_credits;
 };
 
 struct lnet_ioctl_config_o2iblnd_tunables {
@@ -56,11 +68,15 @@ struct lnet_ioctl_config_o2iblnd_tunables {
        __u32 pad;
 };
 
+struct lnet_lnd_tunables {
+       union {
+               struct lnet_ioctl_config_o2iblnd_tunables lnd_o2ib;
+       } lnd_tun_u;
+};
+
 struct lnet_ioctl_config_lnd_tunables {
        struct lnet_ioctl_config_lnd_cmn_tunables lt_cmn;
-       union {
-               struct lnet_ioctl_config_o2iblnd_tunables lt_o2ib;
-       } lt_tun_u;
+       struct lnet_lnd_tunables lt_tun;
 };
 
 struct lnet_ioctl_net_config {
@@ -77,6 +93,10 @@ struct lnet_ioctl_net_config {
 /* # different router buffer pools */
 #define LNET_NRBPOOLS          (LNET_LARGE_BUF_IDX + 1)
 
+enum lnet_dbg_task {
+       LNET_DBG_INCR_DLC_SEQ = 0
+};
+
 struct lnet_ioctl_pool_cfg {
        struct {
                __u32 pl_npages;
@@ -121,26 +141,92 @@ struct lnet_ioctl_config_data {
        char cfg_bulk[0];
 };
 
+struct lnet_ioctl_element_stats {
+       __u32   send_count;
+       __u32   recv_count;
+       __u32   drop_count;
+};
+
+/*
+ * lnet_ioctl_config_ni
+ *  This structure describes an NI configuration. There are multiple
+ *  components when configuring an NI: Net, Interfaces, CPT list and
+ *  LND tunables. A network is passed as a string to the DLC and
+ *  translated using libcfs_str2net().
+ *  An interface is the name of the system configured interface
+ *  (ex. eth0, ib1).
+ *  CPT is the list of CPTs to bind the NI to. LND tunables are
+ *  passed in the lic_bulk area. */
+struct lnet_ioctl_config_ni {
+       struct libcfs_ioctl_hdr lic_cfg_hdr;
+       lnet_nid_t              lic_nid;
+       char                    lic_ni_intf[LNET_MAX_INTERFACES][LNET_MAX_STR_LEN];
+       char                    lic_legacy_ip2nets[LNET_MAX_STR_LEN];
+       __u32                   lic_cpts[LNET_MAX_SHOW_NUM_CPT];
+       __u32                   lic_ncpts;
+       __u32                   lic_status;
+       __u32                   lic_tcp_bonding;
+       __u32                   lic_idx;
+       __s32                   lic_dev_cpt;
+       char                    pad[4];
+       char                    lic_bulk[0];
+};
+
+struct lnet_peer_ni_credit_info {
+       char cr_aliveness[LNET_MAX_STR_LEN];
+       __u32 cr_refcount;
+       __s32 cr_ni_peer_tx_credits;
+       __s32 cr_peer_tx_credits;
+       __s32 cr_peer_min_tx_credits;
+       __u32 cr_peer_tx_qnob;
+       __s32 cr_peer_rtr_credits;
+       __s32 cr_peer_min_rtr_credits;
+       __u32 cr_ncpt;
+};
+
 struct lnet_ioctl_peer {
        struct libcfs_ioctl_hdr pr_hdr;
        __u32 pr_count;
        __u32 pr_pad;
-       __u64 pr_nid;
+       lnet_nid_t pr_nid;
 
        union {
-               struct {
-                       char cr_aliveness[LNET_MAX_STR_LEN];
-                       __u32 cr_refcount;
-                       __u32 cr_ni_peer_tx_credits;
-                       __u32 cr_peer_tx_credits;
-                       __u32 cr_peer_rtr_credits;
-                       __u32 cr_peer_min_rtr_credits;
-                       __u32 cr_peer_tx_qnob;
-                       __u32 cr_ncpt;
-               } pr_peer_credits;
+               struct lnet_peer_ni_credit_info  pr_peer_credits;
        } pr_lnd_u;
 };
 
+struct lnet_dbg_task_info {
+       /*
+        * TODO: a union can be added if the task requires more
+        * information from user space to be carried out in kernel space.
+        */
+};
+
+/*
+ * This structure is intended to allow execution of debugging tasks. This
+ * is not intended to be backwards compatible. Extra tasks can be added in
+ * the future
+ */
+struct lnet_ioctl_dbg {
+       struct libcfs_ioctl_hdr dbg_hdr;
+       enum lnet_dbg_task dbg_task;
+       char dbg_bulk[0];
+};
+
+struct lnet_ioctl_peer_cfg {
+       struct libcfs_ioctl_hdr prcfg_hdr;
+       lnet_nid_t prcfg_prim_nid;
+       lnet_nid_t prcfg_cfg_nid;
+       __u32 prcfg_idx;
+       bool prcfg_mr;
+       char prcfg_bulk[0];
+};
+
+struct lnet_ioctl_numa_range {
+       struct libcfs_ioctl_hdr nr_hdr;
+       __u32 nr_range;
+};
+
 struct lnet_ioctl_lnet_stats {
        struct libcfs_ioctl_hdr st_hdr;
        struct lnet_counters st_cntrs;
index 3b29582..ff680eb 100644 (file)
@@ -72,9 +72,9 @@ extern lnet_t  the_lnet;                      /* THE network */
 
 static inline int lnet_is_route_alive(lnet_route_t *route)
 {
-       if (!route->lr_gateway->lp_alive)
+       if (!route->lr_gateway->lpni_alive)
                return 0; /* gateway is down */
-       if ((route->lr_gateway->lp_ping_feats &
+       if ((route->lr_gateway->lpni_ping_feats &
             LNET_PING_FEAT_NI_STATUS) == 0)
                return 1; /* no NI status, assume it's alive */
        /* has NI status, check # down NIs */
@@ -279,24 +279,6 @@ lnet_me_free(lnet_me_t *me)
        kmem_cache_free(lnet_mes_cachep, me);
 }
 
-static inline lnet_msg_t *
-lnet_msg_alloc(void)
-{
-       lnet_msg_t *msg;
-
-       LIBCFS_ALLOC(msg, sizeof(*msg));
-
-       /* no need to zero, LIBCFS_ALLOC does for us */
-       return (msg);
-}
-
-static inline void
-lnet_msg_free(lnet_msg_t *msg)
-{
-       LASSERT(!msg->msg_onactivelist);
-       LIBCFS_FREE(msg, sizeof(*msg));
-}
-
 lnet_libhandle_t *lnet_res_lh_lookup(struct lnet_res_container *rec,
                                     __u64 cookie);
 void lnet_res_lh_initialize(struct lnet_res_container *rec,
@@ -397,27 +379,27 @@ lnet_handle2me(lnet_handle_me_t *handle)
 }
 
 static inline void
-lnet_peer_addref_locked(lnet_peer_t *lp)
+lnet_peer_ni_addref_locked(struct lnet_peer_ni *lp)
 {
-       LASSERT(lp->lp_refcount > 0);
-       lp->lp_refcount++;
+       LASSERT (atomic_read(&lp->lpni_refcount) > 0);
+       atomic_inc(&lp->lpni_refcount);
 }
 
-extern void lnet_destroy_peer_locked(lnet_peer_t *lp);
+extern void lnet_destroy_peer_ni_locked(struct lnet_peer_ni *lp);
 
 static inline void
-lnet_peer_decref_locked(lnet_peer_t *lp)
+lnet_peer_ni_decref_locked(struct lnet_peer_ni *lp)
 {
-       LASSERT(lp->lp_refcount > 0);
-       lp->lp_refcount--;
-       if (lp->lp_refcount == 0)
-               lnet_destroy_peer_locked(lp);
+       LASSERT (atomic_read(&lp->lpni_refcount) > 0);
+       atomic_dec(&lp->lpni_refcount);
+       if (atomic_read(&lp->lpni_refcount) == 0)
+               lnet_destroy_peer_ni_locked(lp);
 }
 
 static inline int
-lnet_isrouter(lnet_peer_t *lp)
+lnet_isrouter(struct lnet_peer_ni *lp)
 {
-       return lp->lp_rtr_refcount != 0;
+       return lp->lpni_rtr_refcount != 0;
 }
 
 static inline void
@@ -454,9 +436,36 @@ lnet_ni_decref(lnet_ni_t *ni)
        lnet_net_unlock(0);
 }
 
-void lnet_ni_free(lnet_ni_t *ni);
-lnet_ni_t *
-lnet_ni_alloc(__u32 net, struct cfs_expr_list *el, struct list_head *nilist);
+static inline lnet_msg_t *
+lnet_msg_alloc(void)
+{
+       lnet_msg_t *msg;
+
+       LIBCFS_ALLOC(msg, sizeof(*msg));
+
+       /* no need to zero, LIBCFS_ALLOC does for us */
+       return (msg);
+}
+
+static inline void
+lnet_msg_free(lnet_msg_t *msg)
+{
+       LASSERT(!msg->msg_onactivelist);
+       LIBCFS_FREE(msg, sizeof(*msg));
+}
+
+void lnet_ni_free(struct lnet_ni *ni);
+void lnet_net_free(struct lnet_net *net);
+
+struct lnet_net *
+lnet_net_alloc(__u32 net_type, struct list_head *netlist);
+
+struct lnet_ni *
+lnet_ni_alloc(struct lnet_net *net, struct cfs_expr_list *el,
+             char *iface);
+struct lnet_ni *
+lnet_ni_alloc_w_cpt_array(struct lnet_net *net, __u32 *cpts, __u32 ncpts,
+                         char *iface);
 
 static inline int
 lnet_nid2peerhash(lnet_nid_t nid)
@@ -475,19 +484,25 @@ lnet_net2rnethash(__u32 net)
 extern lnd_t the_lolnd;
 extern int avoid_asym_router_failure;
 
-extern int lnet_cpt_of_nid_locked(lnet_nid_t nid);
-extern int lnet_cpt_of_nid(lnet_nid_t nid);
+extern unsigned int lnet_nid_cpt_hash(lnet_nid_t nid, unsigned int number);
+extern int lnet_cpt_of_nid_locked(lnet_nid_t nid, struct lnet_ni *ni);
+extern int lnet_cpt_of_nid(lnet_nid_t nid, struct lnet_ni *ni);
 extern lnet_ni_t *lnet_nid2ni_locked(lnet_nid_t nid, int cpt);
+extern lnet_ni_t *lnet_nid2ni_addref(lnet_nid_t nid);
 extern lnet_ni_t *lnet_net2ni_locked(__u32 net, int cpt);
-extern lnet_ni_t *lnet_net2ni(__u32 net);
+extern lnet_ni_t *lnet_net2ni_addref(__u32 net);
+bool lnet_is_ni_healthy_locked(struct lnet_ni *ni);
+struct lnet_net *lnet_get_net_locked(__u32 net_id);
 
 int lnet_lib_init(void);
 void lnet_lib_exit(void);
 
+extern unsigned int lnet_numa_range;
 extern int portal_rotor;
 
 int lnet_notify(lnet_ni_t *ni, lnet_nid_t peer, int alive, cfs_time_t when);
-void lnet_notify_locked(lnet_peer_t *lp, int notifylnd, int alive, cfs_time_t when);
+void lnet_notify_locked(struct lnet_peer_ni *lp, int notifylnd, int alive,
+                       cfs_time_t when);
 int lnet_add_route(__u32 net, __u32 hops, lnet_nid_t gateway_nid,
                   unsigned int priority);
 int lnet_check_routes(void);
@@ -496,6 +511,9 @@ void lnet_destroy_routes(void);
 int lnet_get_route(int idx, __u32 *net, __u32 *hops,
                   lnet_nid_t *gateway, __u32 *alive, __u32 *priority);
 int lnet_get_rtr_pool_cfg(int idx, struct lnet_ioctl_pool_cfg *pool_cfg);
+struct lnet_ni *lnet_get_next_ni_locked(struct lnet_net *mynet,
+                                       struct lnet_ni *prev);
+struct lnet_ni *lnet_get_ni_idx_locked(int idx);
 
 struct libcfs_ioctl_handler {
        struct list_head item;
@@ -521,11 +539,13 @@ int  lnet_rtrpools_adjust(int tiny, int small, int large);
 int lnet_rtrpools_enable(void);
 void lnet_rtrpools_disable(void);
 void lnet_rtrpools_free(int keep_pools);
-lnet_remotenet_t *lnet_find_net_locked (__u32 net);
-int lnet_dyn_add_ni(lnet_pid_t requested_pid,
-                   struct lnet_ioctl_config_data *conf);
-int lnet_dyn_del_ni(__u32 net);
+lnet_remotenet_t *lnet_find_rnet_locked(__u32 net);
+int lnet_dyn_add_net(struct lnet_ioctl_config_data *conf);
+int lnet_dyn_del_net(__u32 net);
+int lnet_dyn_add_ni(struct lnet_ioctl_config_ni *conf);
+int lnet_dyn_del_ni(struct lnet_ioctl_config_ni *conf);
 int lnet_clear_lazy_portal(struct lnet_ni *ni, int portal, char *reason);
+struct lnet_net *lnet_get_net_locked(__u32 net_id);
 
 int lnet_islocalnid(lnet_nid_t nid);
 int lnet_islocalnet(__u32 net);
@@ -724,6 +744,7 @@ void lnet_me_unlink(lnet_me_t *me);
 
 void lnet_md_unlink(lnet_libmd_t *md);
 void lnet_md_deconstruct(lnet_libmd_t *lmd, lnet_md_t *umd);
+int lnet_cpt_of_md(lnet_libmd_t *md);
 
 void lnet_register_lnd(lnd_t *lnd);
 void lnet_unregister_lnd(lnd_t *lnd);
@@ -731,8 +752,8 @@ void lnet_unregister_lnd(lnd_t *lnd);
 int lnet_connect(struct socket **sockp, lnet_nid_t peer_nid,
                 __u32 local_ip, __u32 peer_ip, int peer_port);
 void lnet_connect_console_error(int rc, lnet_nid_t peer_nid,
-                               __u32 peer_ip, int port);
-int lnet_count_acceptor_nis(void);
+                                __u32 peer_ip, int port);
+int lnet_count_acceptor_nets(void);
 int lnet_acceptor_timeout(void);
 int lnet_acceptor_port(void);
 int lnet_acceptor_start(void);
@@ -754,38 +775,113 @@ int lnet_sock_connect(struct socket **sockp, int *fatal,
                        __u32 peer_ip, int peer_port);
 
 int lnet_peers_start_down(void);
-int lnet_peer_buffer_credits(lnet_ni_t *ni);
+int lnet_peer_buffer_credits(struct lnet_net *net);
 
 int lnet_router_checker_start(void);
 void lnet_router_checker_stop(void);
-void lnet_router_ni_update_locked(lnet_peer_t *gw, __u32 net);
+void lnet_router_ni_update_locked(struct lnet_peer_ni *gw, __u32 net);
 void lnet_swap_pinginfo(struct lnet_ping_info *info);
 
 int lnet_parse_ip2nets(char **networksp, char *ip2nets);
 int lnet_parse_routes(char *route_str, int *im_a_router);
-int lnet_parse_networks(struct list_head *nilist, char *networks);
-int lnet_net_unique(__u32 net, struct list_head *nilist);
-
-int lnet_nid2peer_locked(lnet_peer_t **lpp, lnet_nid_t nid, int cpt);
-lnet_peer_t *lnet_find_peer_locked(struct lnet_peer_table *ptable,
-                                  lnet_nid_t nid);
-void lnet_peer_tables_cleanup(lnet_ni_t *ni);
-void lnet_peer_tables_destroy(void);
+int lnet_parse_networks(struct list_head *nilist, char *networks,
+                       bool use_tcp_bonding);
+bool lnet_net_unique(__u32 net_id, struct list_head *nilist,
+                    struct lnet_net **net);
+bool lnet_ni_unique_net(struct list_head *nilist, char *iface);
+void lnet_incr_dlc_seq(void);
+__u32 lnet_get_dlc_seq_locked(void);
+
+struct lnet_peer_ni *lnet_get_next_peer_ni_locked(struct lnet_peer *peer,
+                                                 struct lnet_peer_net *peer_net,
+                                                 struct lnet_peer_ni *prev);
+struct lnet_peer *lnet_find_or_create_peer_locked(lnet_nid_t dst_nid, int cpt);
+struct lnet_peer_ni *lnet_nid2peerni_locked(lnet_nid_t nid, int cpt);
+struct lnet_peer_ni *lnet_nid2peerni_ex(lnet_nid_t nid, int cpt);
+struct lnet_peer_ni *lnet_find_peer_ni_locked(lnet_nid_t nid);
+void lnet_peer_net_added(struct lnet_net *net);
+lnet_nid_t lnet_peer_primary_nid(lnet_nid_t nid);
+void lnet_peer_tables_cleanup(struct lnet_net *net);
+void lnet_peer_uninit(void);
 int lnet_peer_tables_create(void);
 void lnet_debug_peer(lnet_nid_t nid);
-int lnet_get_peer_info(__u32 peer_index, __u64 *nid,
-                      char alivness[LNET_MAX_STR_LEN],
-                      __u32 *cpt_iter, __u32 *refcount,
-                      __u32 *ni_peer_tx_credits, __u32 *peer_tx_credits,
-                      __u32 *peer_rtr_credits, __u32 *peer_min_rtr_credtis,
-                      __u32 *peer_tx_qnob);
+struct lnet_peer_net *lnet_peer_get_net_locked(struct lnet_peer *peer,
+                                              __u32 net_id);
+bool lnet_peer_is_ni_pref_locked(struct lnet_peer_ni *lpni,
+                                struct lnet_ni *ni);
+int lnet_add_peer_ni_to_peer(lnet_nid_t key_nid, lnet_nid_t nid, bool mr);
+int lnet_del_peer_ni_from_peer(lnet_nid_t key_nid, lnet_nid_t nid);
+int lnet_get_peer_info(__u32 idx, lnet_nid_t *primary_nid, lnet_nid_t *nid,
+                      bool *mr, struct lnet_peer_ni_credit_info *peer_ni_info,
+                      struct lnet_ioctl_element_stats *peer_ni_stats);
+int lnet_get_peer_ni_info(__u32 peer_index, __u64 *nid,
+                         char alivness[LNET_MAX_STR_LEN],
+                         __u32 *cpt_iter, __u32 *refcount,
+                         __u32 *ni_peer_tx_credits, __u32 *peer_tx_credits,
+                         __u32 *peer_rtr_credits, __u32 *peer_min_rtr_credtis,
+                         __u32 *peer_tx_qnob);
+
+
+static inline __u32
+lnet_get_num_peer_nis(struct lnet_peer *peer)
+{
+       struct lnet_peer_net *lpn;
+       struct lnet_peer_ni *lpni;
+       __u32 count = 0;
+
+       list_for_each_entry(lpn, &peer->lp_peer_nets, lpn_on_peer_list)
+               list_for_each_entry(lpni, &lpn->lpn_peer_nis,
+                                   lpni_on_peer_net_list)
+                       count++;
+
+       return count;
+}
+
+static inline bool
+lnet_is_peer_ni_healthy_locked(struct lnet_peer_ni *lpni)
+{
+       return lpni->lpni_healthy;
+}
+
+static inline void
+lnet_set_peer_ni_health_locked(struct lnet_peer_ni *lpni, bool health)
+{
+       lpni->lpni_healthy = health;
+}
+
+static inline bool
+lnet_is_peer_net_healthy_locked(struct lnet_peer_net *peer_net)
+{
+       struct lnet_peer_ni *lpni;
+
+       list_for_each_entry(lpni, &peer_net->lpn_peer_nis,
+                           lpni_on_peer_net_list) {
+               if (lnet_is_peer_ni_healthy_locked(lpni))
+                       return true;
+       }
+
+       return false;
+}
+
+static inline bool
+lnet_is_peer_healthy_locked(struct lnet_peer *peer)
+{
+       struct lnet_peer_net *peer_net;
+
+       list_for_each_entry(peer_net, &peer->lp_peer_nets, lpn_on_peer_list) {
+               if (lnet_is_peer_net_healthy_locked(peer_net))
+                       return true;
+       }
+
+       return false;
+}
 
 static inline void
-lnet_peer_set_alive(lnet_peer_t *lp)
+lnet_peer_set_alive(struct lnet_peer_ni *lp)
 {
-       lp->lp_last_alive = lp->lp_last_query = cfs_time_current();
-       if (!lp->lp_alive)
-               lnet_notify_locked(lp, 0, 1, lp->lp_last_alive);
+       lp->lpni_last_alive = lp->lpni_last_query = cfs_time_current();
+       if (!lp->lpni_alive)
+               lnet_notify_locked(lp, 0, 1, lp->lpni_last_alive);
 }
 
 #endif
index fca5ace..13ca7d5 100644 (file)
@@ -68,6 +68,8 @@ typedef struct lnet_msg {
        struct list_head        msg_list;       /* Q for credits/MD */
 
        lnet_process_id_t       msg_target;
+       /* Primary NID of the source. */
+       lnet_nid_t              msg_initiator;
        /* where is it from, it's only for building event */
        lnet_nid_t              msg_from;
        __u32                   msg_type;
@@ -87,34 +89,37 @@ typedef struct lnet_msg {
        /* ready for pending on RX delay list */
        unsigned int            msg_rx_ready_delay:1;
 
-       unsigned int          msg_vmflush:1;      /* VM trying to free memory */
-       unsigned int          msg_target_is_router:1; /* sending to a router */
-       unsigned int          msg_routing:1;      /* being forwarded */
-       unsigned int          msg_ack:1;          /* ack on finalize (PUT) */
-       unsigned int          msg_sending:1;      /* outgoing message */
-       unsigned int          msg_receiving:1;    /* being received */
-       unsigned int          msg_txcredit:1;     /* taken an NI send credit */
-       unsigned int          msg_peertxcredit:1; /* taken a peer send credit */
-       unsigned int          msg_rtrcredit:1;    /* taken a globel router credit */
-       unsigned int          msg_peerrtrcredit:1; /* taken a peer router credit */
-       unsigned int          msg_onactivelist:1; /* on the activelist */
+       unsigned int          msg_vmflush:1;      /* VM trying to free memory */
+       unsigned int          msg_target_is_router:1; /* sending to a router */
+       unsigned int          msg_routing:1;      /* being forwarded */
+       unsigned int          msg_ack:1;          /* ack on finalize (PUT) */
+       unsigned int          msg_sending:1;      /* outgoing message */
+       unsigned int          msg_receiving:1;    /* being received */
+       unsigned int          msg_txcredit:1;     /* taken an NI send credit */
+       unsigned int          msg_peertxcredit:1; /* taken a peer send credit */
+       unsigned int          msg_rtrcredit:1;    /* taken a globel router credit */
+       unsigned int          msg_peerrtrcredit:1; /* taken a peer router credit */
+       unsigned int          msg_onactivelist:1; /* on the activelist */
        unsigned int          msg_rdma_get:1;
 
-       struct lnet_peer     *msg_txpeer;         /* peer I'm sending to */
-       struct lnet_peer     *msg_rxpeer;         /* peer I received from */
+       struct lnet_peer_ni  *msg_txpeer;         /* peer I'm sending to */
+       struct lnet_peer_ni  *msg_rxpeer;         /* peer I received from */
 
-       void                 *msg_private;
+       void                 *msg_private;
        struct lnet_libmd    *msg_md;
-
-       unsigned int          msg_len;
-       unsigned int          msg_wanted;
-       unsigned int          msg_offset;
-       unsigned int          msg_niov;
+       /* the NI the message was sent or received over */
+       struct lnet_ni       *msg_txni;
+       struct lnet_ni       *msg_rxni;
+
+       unsigned int          msg_len;
+       unsigned int          msg_wanted;
+       unsigned int          msg_offset;
+       unsigned int          msg_niov;
        struct kvec          *msg_iov;
-       lnet_kiov_t          *msg_kiov;
+       lnet_kiov_t          *msg_kiov;
 
-       lnet_event_t          msg_ev;
-       lnet_hdr_t            msg_hdr;
+       lnet_event_t          msg_ev;
+       lnet_hdr_t            msg_hdr;
 } lnet_msg_t;
 
 
@@ -164,6 +169,7 @@ typedef struct lnet_libmd {
        unsigned int            md_niov;        /* # frags at end of struct */
        void                   *md_user_ptr;
        lnet_eq_t              *md_eq;
+       lnet_handle_md_t       md_bulk_handle;
        union {
                struct kvec     iov[LNET_MAX_IOV];
                lnet_kiov_t     kiov[LNET_MAX_IOV];
@@ -263,29 +269,141 @@ struct lnet_tx_queue {
        struct list_head        tq_delayed;     /* delayed TXs */
 };
 
+enum lnet_net_state {
+       /* set when net block is allocated */
+       LNET_NET_STATE_INIT = 0,
+       /* set when NIs in net are started successfully */
+       LNET_NET_STATE_ACTIVE,
+       /* set if all NIs in net are in FAILED state */
+       LNET_NET_STATE_INACTIVE,
+       /* set when shutting down a NET */
+       LNET_NET_STATE_DELETING
+};
+
+enum lnet_ni_state {
+       /* set when NI block is allocated */
+       LNET_NI_STATE_INIT = 0,
+       /* set when NI is started successfully */
+       LNET_NI_STATE_ACTIVE,
+       /* set when LND notifies NI failed */
+       LNET_NI_STATE_FAILED,
+       /* set when LND notifies NI degraded */
+       LNET_NI_STATE_DEGRADED,
+       /* set when shuttding down NI */
+       LNET_NI_STATE_DELETING
+};
+
+struct lnet_element_stats {
+       atomic_t        send_count;
+       atomic_t        recv_count;
+       atomic_t        drop_count;
+};
+
+struct lnet_net {
+       /* chain on the ln_nets */
+       struct list_head        net_list;
+
+       /* net ID, which is composed of
+        * (net_type << 16) | net_num.
+        * net_type can be one of the enumerated types defined in
+        * lnet/include/lnet/nidstr.h */
+       __u32                   net_id;
+
+       /* priority of the network */
+       __u32                   net_prio;
+
+       /* total number of CPTs in the array */
+       __u32                   net_ncpts;
+
+       /* cumulative CPTs of all NIs in this net */
+       __u32                   *net_cpts;
+
+       /* network tunables */
+       struct lnet_ioctl_config_lnd_cmn_tunables net_tunables;
+
+       /*
+        * boolean to indicate that the tunables have been set and
+        * shouldn't be reset
+        */
+       bool                    net_tunables_set;
+
+       /* procedural interface */
+       lnd_t                   *net_lnd;
+
+       /* list of NIs on this net */
+       struct list_head        net_ni_list;
+
+       /* list of NIs being added, but not started yet */
+       struct list_head        net_ni_added;
+
+       /* dying LND instances */
+       struct list_head        net_ni_zombie;
+
+       /* network state */
+       enum lnet_net_state     net_state;
+};
+
 typedef struct lnet_ni {
+       /* chain on the lnet_net structure */
+       struct list_head        ni_netlist;
+
+       /* chain on net_ni_cpt */
+       struct list_head        ni_cptlist;
+
        spinlock_t              ni_lock;
-       struct list_head        ni_list;        /* chain on ln_nis */
-       struct list_head        ni_cptlist;     /* chain on ln_nis_cpt */
-       int                     ni_maxtxcredits; /* # tx credits  */
-       /* # per-peer send credits */
-       int                     ni_peertxcredits;
-       /* # per-peer router buffer credits */
-       int                     ni_peerrtrcredits;
-       /* seconds to consider peer dead */
-       int                     ni_peertimeout;
-       int                     ni_ncpts;       /* number of CPTs */
-       __u32                   *ni_cpts;       /* bond NI on some CPTs */
-       lnet_nid_t              ni_nid;         /* interface's NID */
-       void                    *ni_data;       /* instance-specific data */
-       lnd_t                   *ni_lnd;        /* procedural interface */
-       struct lnet_tx_queue    **ni_tx_queues; /* percpt TX queues */
-       int                     **ni_refs;      /* percpt reference count */
-       time64_t                ni_last_alive;  /* when I was last alive */
-       struct lnet_ni_status   *ni_status;     /* my health status */
+
+       /* number of CPTs */
+       int                     ni_ncpts;
+
+       /* bond NI on some CPTs */
+       __u32                   *ni_cpts;
+
+       /* interface's NID */
+       lnet_nid_t              ni_nid;
+
+       /* instance-specific data */
+       void                    *ni_data;
+
+       /* per ni credits */
+       atomic_t                ni_tx_credits;
+
+       /* percpt TX queues */
+       struct lnet_tx_queue    **ni_tx_queues;
+
+       /* percpt reference count */
+       int                     **ni_refs;
+
+       /* when I was last alive */
+       long                    ni_last_alive;
+
+       /* pointer to parent network */
+       struct lnet_net         *ni_net;
+
+       /* my health status */
+       lnet_ni_status_t        *ni_status;
+
+       /* NI FSM */
+       enum lnet_ni_state      ni_state;
+
        /* per NI LND tunables */
-       struct lnet_ioctl_config_lnd_tunables *ni_lnd_tunables;
-       /* equivalent interfaces to use */
+       struct lnet_lnd_tunables ni_lnd_tunables;
+
+       /* lnd tunables set explicitly */
+       bool ni_lnd_tunables_set;
+
+       /* NI statistics */
+       struct lnet_element_stats ni_stats;
+
+       /* physical device CPT */
+       int                     ni_dev_cpt;
+
+       /* sequence number used to round robin over nis within a net */
+       __u32                   ni_seq;
+
+       /*
+        * equivalent interfaces to use
+        * This is an array because socklnd bonding can still be configured
+        */
        char                    *ni_interfaces[LNET_MAX_INTERFACES];
        struct net              *ni_net_ns;     /* original net namespace */
 } lnet_ni_t;
@@ -317,63 +435,116 @@ typedef struct {
        /* chain on the_lnet.ln_zombie_rcd or ln_deathrow_rcd */
        struct list_head        rcd_list;
        lnet_handle_md_t        rcd_mdh;        /* ping buffer MD */
-       struct lnet_peer        *rcd_gateway;   /* reference to gateway */
+       struct lnet_peer_ni     *rcd_gateway;   /* reference to gateway */
        struct lnet_ping_info   *rcd_pinginfo;  /* ping buffer */
 } lnet_rc_data_t;
 
-typedef struct lnet_peer {
+struct lnet_peer_ni {
+       /* chain on peer_net */
+       struct list_head        lpni_on_peer_net_list;
+       /* chain on remote peer list */
+       struct list_head        lpni_on_remote_peer_ni_list;
        /* chain on peer hash */
-       struct list_head        lp_hashlist;
+       struct list_head        lpni_hashlist;
        /* messages blocking for tx credits */
-       struct list_head        lp_txq;
+       struct list_head        lpni_txq;
        /* messages blocking for router credits */
-       struct list_head        lp_rtrq;
+       struct list_head        lpni_rtrq;
        /* chain on router list */
-       struct list_head        lp_rtr_list;
+       struct list_head        lpni_rtr_list;
+       /* pointer to peer net I'm part of */
+       struct lnet_peer_net    *lpni_peer_net;
+       /* statistics kept on each peer NI */
+       struct lnet_element_stats lpni_stats;
+       /* spin lock protecting credits and lpni_txq / lpni_rtrq */
+       spinlock_t              lpni_lock;
        /* # tx credits available */
-       int                     lp_txcredits;
+       int                     lpni_txcredits;
        /* low water mark */
-       int                     lp_mintxcredits;
+       int                     lpni_mintxcredits;
        /* # router credits */
-       int                     lp_rtrcredits;
+       int                     lpni_rtrcredits;
        /* low water mark */
-       int                     lp_minrtrcredits;
+       int                     lpni_minrtrcredits;
+       /* bytes queued for sending */
+       long                    lpni_txqnob;
        /* alive/dead? */
-       unsigned int            lp_alive:1;
+       bool                    lpni_alive;
        /* notification outstanding? */
-       unsigned int            lp_notify:1;
+       bool                    lpni_notify;
        /* outstanding notification for LND? */
-       unsigned int            lp_notifylnd:1;
+       bool                    lpni_notifylnd;
        /* some thread is handling notification */
-       unsigned int            lp_notifying:1;
+       bool                    lpni_notifying;
        /* SEND event outstanding from ping */
-       unsigned int            lp_ping_notsent;
-       /* # times router went dead<->alive */
-       int                     lp_alive_count;
-       /* bytes queued for sending */
-       long                    lp_txqnob;
+       bool                    lpni_ping_notsent;
+       /* # times router went dead<->alive. Protected with lpni_lock */
+       int                     lpni_alive_count;
        /* time of last aliveness news */
-       cfs_time_t              lp_timestamp;
+       cfs_time_t              lpni_timestamp;
        /* time of last ping attempt */
-       cfs_time_t              lp_ping_timestamp;
+       cfs_time_t              lpni_ping_timestamp;
        /* != 0 if ping reply expected */
-       cfs_time_t              lp_ping_deadline;
+       cfs_time_t              lpni_ping_deadline;
        /* when I was last alive */
-       cfs_time_t              lp_last_alive;
-       /* when lp_ni was queried last time */
-       cfs_time_t              lp_last_query;
-       /* interface peer is on */
-       lnet_ni_t               *lp_ni;
-       lnet_nid_t              lp_nid;         /* peer's NID */
-       int                     lp_refcount;    /* # refs */
-       int                     lp_cpt;         /* CPT this peer attached on */
+       cfs_time_t              lpni_last_alive;
+       /* when lpni_ni was queried last time */
+       cfs_time_t              lpni_last_query;
+       /* network peer is on */
+       struct lnet_net         *lpni_net;
+       /* peer's NID */
+       lnet_nid_t              lpni_nid;
+       /* # refs */
+       atomic_t                lpni_refcount;
+       /* CPT this peer attached on */
+       int                     lpni_cpt;
        /* # refs from lnet_route_t::lr_gateway */
-       int                     lp_rtr_refcount;
-       /* returned RC ping features */
-       unsigned int            lp_ping_feats;
-       struct list_head        lp_routes;      /* routers on this peer */
-       lnet_rc_data_t          *lp_rcd;        /* router checker state */
-} lnet_peer_t;
+       int                     lpni_rtr_refcount;
+       /* sequence number used to round robin over peer nis within a net */
+       __u32                   lpni_seq;
+       /* sequence number used to round robin over gateways */
+       __u32                   lpni_gw_seq;
+       /* health flag */
+       bool                    lpni_healthy;
+       /* returned RC ping features. Protected with lpni_lock */
+       unsigned int            lpni_ping_feats;
+       /* routes on this peer */
+       struct list_head        lpni_routes;
+       /* array of preferred local nids */
+       lnet_nid_t              *lpni_pref_nids;
+       /* number of preferred NIDs in lnpi_pref_nids */
+       __u32                   lpni_pref_nnids;
+       /* router checker state */
+       lnet_rc_data_t          *lpni_rcd;
+};
+
+struct lnet_peer {
+       /* chain on global peer list */
+       struct list_head        lp_on_lnet_peer_list;
+
+       /* list of peer nets */
+       struct list_head        lp_peer_nets;
+
+       /* primary NID of the peer */
+       lnet_nid_t              lp_primary_nid;
+
+       /* peer is Multi-Rail enabled peer */
+       bool                    lp_multi_rail;
+};
+
+struct lnet_peer_net {
+       /* chain on peer block */
+       struct list_head        lpn_on_peer_list;
+
+       /* list of peer_nis on this network */
+       struct list_head        lpn_peer_nis;
+
+       /* pointer to the peer I'm part of */
+       struct lnet_peer        *lpn_peer;
+
+       /* Net ID */
+       __u32                   lpn_net_id;
+};
 
 /* peer hash size */
 #define LNET_PEER_HASH_BITS    9
@@ -382,22 +553,23 @@ typedef struct lnet_peer {
 /* peer hash table */
 struct lnet_peer_table {
        int                     pt_version;     /* /proc validity stamp */
-       int                     pt_number;      /* # peers extant */
-       int                     pt_zombies;     /* # zombies to go to deathrow
-                                                * (and not there yet) */
-       struct list_head        pt_deathrow;    /* zombie peers */
+       atomic_t                pt_number;      /* # peers extant */
        struct list_head        *pt_hash;       /* NID->peer hash */
+       struct list_head        pt_zombie_list; /* zombie peers */
+       int                     pt_zombies;     /* # zombie peers */
+       spinlock_t              pt_zombie_lock; /* protect list and count */
 };
 
 /* peer aliveness is enabled only on routers for peers in a network where the
  * lnet_ni_t::ni_peertimeout has been set to a positive value */
 #define lnet_peer_aliveness_enabled(lp) (the_lnet.ln_routing != 0 && \
-                                        (lp)->lp_ni->ni_peertimeout > 0)
+                                       ((lp)->lpni_net) && \
+                                       (lp)->lpni_net->net_tunables.lct_peer_timeout > 0)
 
 typedef struct {
        struct list_head        lr_list;        /* chain on net */
        struct list_head        lr_gwlist;      /* chain on gateway */
-       lnet_peer_t             *lr_gateway;    /* router node */
+       struct lnet_peer_ni     *lr_gateway;    /* router node */
        __u32                   lr_net;         /* remote network number */
        int                     lr_seq;         /* sequence for round-robin */
        unsigned int            lr_downis;      /* number of down NIs */
@@ -470,6 +642,7 @@ enum {
 struct lnet_match_info {
        __u64                   mi_mbits;
        lnet_process_id_t       mi_id;
+       unsigned int            mi_cpt;
        unsigned int            mi_opc;
        unsigned int            mi_portal;
        unsigned int            mi_rlength;
@@ -593,17 +766,20 @@ typedef struct
        struct lnet_msg_container       **ln_msg_containers;
        lnet_counters_t                 **ln_counters;
        struct lnet_peer_table          **ln_peer_tables;
+       /* list of configured or discovered peers */
+       struct list_head                ln_peers;
+       /* list of peer nis not on a local network */
+       struct list_head                ln_remote_peer_ni_list;
        /* failure simulation */
        struct list_head                ln_test_peers;
        struct list_head                ln_drop_rules;
        struct list_head                ln_delay_rules;
-
-       struct list_head                ln_nis;         /* LND instances */
-       /* NIs bond on specific CPT(s) */
-       struct list_head                ln_nis_cpt;
-       /* dying LND instances */
-       struct list_head                ln_nis_zombie;
-       lnet_ni_t                       *ln_loni;       /* the loopback NI */
+       /* LND instances */
+       struct list_head                ln_nets;
+       /* the loopback NI */
+       struct lnet_ni                  *ln_loni;
+       /* network zombie list */
+       struct list_head                ln_net_zombie;
 
        /* remote networks with routes to them */
        struct list_head                *ln_remote_nets_hash;
index a5cec58..6b9a949 100644 (file)
@@ -409,6 +409,7 @@ typedef struct {
         * - LNET_MD_IOVEC: The start and length fields specify an array of
         *   struct iovec.
         * - LNET_MD_MAX_SIZE: The max_size field is valid.
+        * - LNET_MD_BULK_HANDLE: The bulk_handle field is valid.
         *
         * Note:
         * - LNET_MD_KIOV or LNET_MD_IOVEC allows for a scatter/gather
@@ -432,6 +433,15 @@ typedef struct {
         * descriptor are not logged.
         */
        lnet_handle_eq_t eq_handle;
+       /**
+        * The bulk MD handle which was registered to describe the buffers
+        * either to be used to transfer data to the peer or receive data
+        * from the peer. This allows LNet to properly determine the NUMA
+        * node on which the memory was allocated and use that to select the
+        * nearest local network interface. This value is only used
+        * if the LNET_MD_BULK_HANDLE option is set.
+        */
+       lnet_handle_md_t bulk_handle;
 } lnet_md_t;
 
 /* Max Transfer Unit (minimum supported everywhere).
@@ -462,6 +472,8 @@ typedef struct {
 #define LNET_MD_MAX_SIZE            (1 << 7)
 /** See lnet_md_t::options. */
 #define LNET_MD_KIOV                (1 << 8)
+/** See lnet_md_t::options. */
+#define LNET_MD_BULK_HANDLE         (1 << 9)
 
 /* For compatibility with Cray Portals */
 #define LNET_MD_PHYS                        0
@@ -545,20 +557,22 @@ typedef struct {
        lnet_process_id_t   target;
        /** The identifier (nid, pid) of the initiator. */
        lnet_process_id_t   initiator;
+       /** The source NID on the initiator. */
+       lnet_process_id_t   source;
        /**
         * The NID of the immediate sender. If the request has been forwarded
         * by routers, this is the NID of the last hop; otherwise it's the
-        * same as the initiator.
+        * same as the source.
         */
-       lnet_nid_t          sender;
+       lnet_nid_t          sender;
        /** Indicates the type of the event. */
        lnet_event_kind_t   type;
        /** The portal table index specified in the request */
-       unsigned int        pt_index;
+       unsigned int        pt_index;
        /** A copy of the match bits specified in the request. */
-       __u64               match_bits;
+       __u64               match_bits;
        /** The length (in bytes) specified in the request. */
-       unsigned int        rlength;
+       unsigned int        rlength;
        /**
         * The length (in bytes) of the data that was manipulated by the
         * operation. For truncated operations, the manipulated length will be
@@ -566,7 +580,7 @@ typedef struct {
         * see lnet_md_t). For all other operations, the manipulated length
         * will be the length of the requested operation, i.e. rlength.
         */
-       unsigned int        mlength;
+       unsigned int        mlength;
        /**
         * The handle to the MD associated with the event. The handle may be
         * invalid if the MD has been unlinked.
@@ -577,31 +591,31 @@ typedef struct {
         * been processed. In particular, the threshold field in md will
         * reflect the value of the threshold after the operation occurred.
         */
-       lnet_md_t           md;
+       lnet_md_t           md;
        /**
         * 64 bits of out-of-band user data. Only valid for LNET_EVENT_PUT.
         * \see LNetPut
         */
-       __u64               hdr_data;
+       __u64               hdr_data;
        /**
         * Indicates the completion status of the operation. It's 0 for
         * successful operations, otherwise it's an error code.
         */
-       int                 status;
+       int                 status;
        /**
         * Indicates whether the MD has been unlinked. Note that:
         * - An event with unlinked set is the last event on the MD.
         * - This field is also set for an explicit LNET_EVENT_UNLINK event.
         * \see LNetMDUnlink
         */
-       int                 unlinked;
+       int                 unlinked;
        /**
         * The displacement (in bytes) into the memory region that the
         * operation used. The offset can be determined by the operation for
         * a remote managed MD or by the local MD.
         * \see lnet_md_t::options
         */
-       unsigned int        offset;
+       unsigned int        offset;
        /**
         * The sequence number for this event. Sequence numbers are unique
         * to each event.
index 4e1d708..2922f5e 100644 (file)
@@ -2684,9 +2684,9 @@ kgnilnd_startup(lnet_ni_t *ni)
        kgn_net_t        *net;
        ENTRY;
 
-       LASSERTF(ni->ni_lnd == &the_kgnilnd,
+       LASSERTF(ni->ni_net->net_lnd == &the_kgnilnd,
                "bad LND 0x%p != the_kgnilnd @ 0x%p\n",
-               ni->ni_lnd, &the_kgnilnd);
+               ni->ni_net->net_lnd, &the_kgnilnd);
 
        if (kgnilnd_data.kgn_init == GNILND_INIT_NOTHING) {
                rc = kgnilnd_base_startup();
index ee5a01f..cd136b3 100644 (file)
@@ -255,7 +255,7 @@ kiblnd_unpack_msg(kib_msg_t *msg, int nob)
         msg->ibm_cksum = msg_cksum;
 
         if (flip) {
-                /* leave magic unflipped as a clue to peer endianness */
+                /* leave magic unflipped as a clue to peer_ni endianness */
                 msg->ibm_version = version;
                 CLASSERT (sizeof(msg->ibm_type) == 1);
                 CLASSERT (sizeof(msg->ibm_credits) == 1);
@@ -313,33 +313,33 @@ kiblnd_unpack_msg(kib_msg_t *msg, int nob)
 }
 
 int
-kiblnd_create_peer(lnet_ni_t *ni, kib_peer_t **peerp, lnet_nid_t nid)
+kiblnd_create_peer(lnet_ni_t *ni, kib_peer_ni_t **peerp, lnet_nid_t nid)
 {
-       kib_peer_t      *peer;
+       kib_peer_ni_t   *peer_ni;
        kib_net_t       *net = ni->ni_data;
-       int             cpt = lnet_cpt_of_nid(nid);
+       int             cpt = lnet_cpt_of_nid(nid, ni);
        unsigned long   flags;
 
        LASSERT(net != NULL);
        LASSERT(nid != LNET_NID_ANY);
 
-       LIBCFS_CPT_ALLOC(peer, lnet_cpt_table(), cpt, sizeof(*peer));
-        if (peer == NULL) {
-                CERROR("Cannot allocate peer\n");
+       LIBCFS_CPT_ALLOC(peer_ni, lnet_cpt_table(), cpt, sizeof(*peer_ni));
+        if (peer_ni == NULL) {
+                CERROR("Cannot allocate peer_ni\n");
                 return -ENOMEM;
         }
 
-       peer->ibp_ni = ni;
-       peer->ibp_nid = nid;
-       peer->ibp_error = 0;
-       peer->ibp_last_alive = 0;
-       peer->ibp_max_frags = kiblnd_cfg_rdma_frags(peer->ibp_ni);
-       peer->ibp_queue_depth = ni->ni_peertxcredits;
-       atomic_set(&peer->ibp_refcount, 1);     /* 1 ref for caller */
+       peer_ni->ibp_ni = ni;
+       peer_ni->ibp_nid = nid;
+       peer_ni->ibp_error = 0;
+       peer_ni->ibp_last_alive = 0;
+       peer_ni->ibp_max_frags = kiblnd_cfg_rdma_frags(peer_ni->ibp_ni);
+       peer_ni->ibp_queue_depth = ni->ni_net->net_tunables.lct_peer_tx_credits;
+       atomic_set(&peer_ni->ibp_refcount, 1);  /* 1 ref for caller */
 
-       INIT_LIST_HEAD(&peer->ibp_list);        /* not in the peer table yet */
-       INIT_LIST_HEAD(&peer->ibp_conns);
-       INIT_LIST_HEAD(&peer->ibp_tx_queue);
+       INIT_LIST_HEAD(&peer_ni->ibp_list);     /* not in the peer_ni table yet */
+       INIT_LIST_HEAD(&peer_ni->ibp_conns);
+       INIT_LIST_HEAD(&peer_ni->ibp_tx_queue);
 
        write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
 
@@ -351,72 +351,79 @@ kiblnd_create_peer(lnet_ni_t *ni, kib_peer_t **peerp, lnet_nid_t nid)
 
        write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
 
-       *peerp = peer;
+       *peerp = peer_ni;
        return 0;
 }
 
 void
-kiblnd_destroy_peer (kib_peer_t *peer)
+kiblnd_destroy_peer (kib_peer_ni_t *peer_ni)
 {
-       kib_net_t *net = peer->ibp_ni->ni_data;
+       kib_net_t *net = peer_ni->ibp_ni->ni_data;
 
        LASSERT(net != NULL);
-       LASSERT (atomic_read(&peer->ibp_refcount) == 0);
-       LASSERT(!kiblnd_peer_active(peer));
-       LASSERT(kiblnd_peer_idle(peer));
-       LASSERT(list_empty(&peer->ibp_tx_queue));
+       LASSERT (atomic_read(&peer_ni->ibp_refcount) == 0);
+       LASSERT(!kiblnd_peer_active(peer_ni));
+       LASSERT(kiblnd_peer_idle(peer_ni));
+       LASSERT(list_empty(&peer_ni->ibp_tx_queue));
 
-       LIBCFS_FREE(peer, sizeof(*peer));
+       LIBCFS_FREE(peer_ni, sizeof(*peer_ni));
 
-       /* NB a peer's connections keep a reference on their peer until
+       /* NB a peer_ni's connections keep a reference on their peer_ni until
         * they are destroyed, so we can be assured that _all_ state to do
-        * with this peer has been cleaned up when its refcount drops to
+        * with this peer_ni has been cleaned up when its refcount drops to
         * zero. */
        atomic_dec(&net->ibn_npeers);
 }
 
-kib_peer_t *
-kiblnd_find_peer_locked (lnet_nid_t nid)
+kib_peer_ni_t *
+kiblnd_find_peer_locked(struct lnet_ni *ni, lnet_nid_t nid)
 {
        /* the caller is responsible for accounting the additional reference
         * that this creates */
        struct list_head        *peer_list = kiblnd_nid2peerlist(nid);
        struct list_head        *tmp;
-       kib_peer_t              *peer;
+       kib_peer_ni_t           *peer_ni;
 
        list_for_each(tmp, peer_list) {
 
-               peer = list_entry(tmp, kib_peer_t, ibp_list);
-               LASSERT(!kiblnd_peer_idle(peer));
-
-               if (peer->ibp_nid != nid)
+               peer_ni = list_entry(tmp, kib_peer_ni_t, ibp_list);
+               LASSERT(!kiblnd_peer_idle(peer_ni));
+
+               /*
+                * Match a peer if its NID and the NID of the local NI it
+                * communicates over are the same. Otherwise don't match
+                * the peer, which will result in a new lnd peer being
+                * created.
+                */
+               if (peer_ni->ibp_nid != nid ||
+                   peer_ni->ibp_ni->ni_nid != ni->ni_nid)
                        continue;
 
-               CDEBUG(D_NET, "got peer [%p] -> %s (%d) version: %x\n",
-                      peer, libcfs_nid2str(nid),
-                      atomic_read(&peer->ibp_refcount),
-                      peer->ibp_version);
-               return peer;
+               CDEBUG(D_NET, "got peer_ni [%p] -> %s (%d) version: %x\n",
+                      peer_ni, libcfs_nid2str(nid),
+                      atomic_read(&peer_ni->ibp_refcount),
+                      peer_ni->ibp_version);
+               return peer_ni;
        }
        return NULL;
 }
 
 void
-kiblnd_unlink_peer_locked (kib_peer_t *peer)
+kiblnd_unlink_peer_locked (kib_peer_ni_t *peer_ni)
 {
-       LASSERT(list_empty(&peer->ibp_conns));
+       LASSERT(list_empty(&peer_ni->ibp_conns));
 
-        LASSERT (kiblnd_peer_active(peer));
-       list_del_init(&peer->ibp_list);
+        LASSERT (kiblnd_peer_active(peer_ni));
+       list_del_init(&peer_ni->ibp_list);
         /* lose peerlist's ref */
-        kiblnd_peer_decref(peer);
+        kiblnd_peer_decref(peer_ni);
 }
 
 static int
 kiblnd_get_peer_info(lnet_ni_t *ni, int index,
                     lnet_nid_t *nidp, int *count)
 {
-       kib_peer_t              *peer;
+       kib_peer_ni_t           *peer_ni;
        struct list_head        *ptmp;
        int                      i;
        unsigned long            flags;
@@ -427,17 +434,17 @@ kiblnd_get_peer_info(lnet_ni_t *ni, int index,
 
                list_for_each(ptmp, &kiblnd_data.kib_peers[i]) {
 
-                       peer = list_entry(ptmp, kib_peer_t, ibp_list);
-                       LASSERT(!kiblnd_peer_idle(peer));
+                       peer_ni = list_entry(ptmp, kib_peer_ni_t, ibp_list);
+                       LASSERT(!kiblnd_peer_idle(peer_ni));
 
-                       if (peer->ibp_ni != ni)
+                       if (peer_ni->ibp_ni != ni)
                                continue;
 
                        if (index-- > 0)
                                continue;
 
-                       *nidp = peer->ibp_nid;
-                       *count = atomic_read(&peer->ibp_refcount);
+                       *nidp = peer_ni->ibp_nid;
+                       *count = atomic_read(&peer_ni->ibp_refcount);
 
                        read_unlock_irqrestore(&kiblnd_data.kib_global_lock,
                                               flags);
@@ -450,23 +457,23 @@ kiblnd_get_peer_info(lnet_ni_t *ni, int index,
 }
 
 static void
-kiblnd_del_peer_locked (kib_peer_t *peer)
+kiblnd_del_peer_locked (kib_peer_ni_t *peer_ni)
 {
        struct list_head        *ctmp;
        struct list_head        *cnxt;
        kib_conn_t              *conn;
 
-       if (list_empty(&peer->ibp_conns)) {
-               kiblnd_unlink_peer_locked(peer);
+       if (list_empty(&peer_ni->ibp_conns)) {
+               kiblnd_unlink_peer_locked(peer_ni);
        } else {
-               list_for_each_safe(ctmp, cnxt, &peer->ibp_conns) {
+               list_for_each_safe(ctmp, cnxt, &peer_ni->ibp_conns) {
                        conn = list_entry(ctmp, kib_conn_t, ibc_list);
 
                        kiblnd_close_conn_locked(conn, 0);
                }
-               /* NB closing peer's last conn unlinked it. */
+               /* NB closing peer_ni's last conn unlinked it. */
        }
-       /* NB peer now unlinked; might even be freed if the peer table had the
+       /* NB peer_ni now unlinked; might even be freed if the peer_ni table had the
         * last ref on it. */
 }
 
@@ -476,7 +483,7 @@ kiblnd_del_peer (lnet_ni_t *ni, lnet_nid_t nid)
        struct list_head        zombies = LIST_HEAD_INIT(zombies);
        struct list_head        *ptmp;
        struct list_head        *pnxt;
-       kib_peer_t              *peer;
+       kib_peer_ni_t           *peer_ni;
        int                     lo;
        int                     hi;
        int                     i;
@@ -494,23 +501,23 @@ kiblnd_del_peer (lnet_ni_t *ni, lnet_nid_t nid)
 
        for (i = lo; i <= hi; i++) {
                list_for_each_safe(ptmp, pnxt, &kiblnd_data.kib_peers[i]) {
-                       peer = list_entry(ptmp, kib_peer_t, ibp_list);
-                       LASSERT(!kiblnd_peer_idle(peer));
+                       peer_ni = list_entry(ptmp, kib_peer_ni_t, ibp_list);
+                       LASSERT(!kiblnd_peer_idle(peer_ni));
 
-                       if (peer->ibp_ni != ni)
+                       if (peer_ni->ibp_ni != ni)
                                continue;
 
-                       if (!(nid == LNET_NID_ANY || peer->ibp_nid == nid))
+                       if (!(nid == LNET_NID_ANY || peer_ni->ibp_nid == nid))
                                continue;
 
-                       if (!list_empty(&peer->ibp_tx_queue)) {
-                               LASSERT(list_empty(&peer->ibp_conns));
+                       if (!list_empty(&peer_ni->ibp_tx_queue)) {
+                               LASSERT(list_empty(&peer_ni->ibp_conns));
 
-                               list_splice_init(&peer->ibp_tx_queue,
+                               list_splice_init(&peer_ni->ibp_tx_queue,
                                                 &zombies);
                        }
 
-                       kiblnd_del_peer_locked(peer);
+                       kiblnd_del_peer_locked(peer_ni);
                        rc = 0;         /* matched something */
                }
        }
@@ -525,7 +532,7 @@ kiblnd_del_peer (lnet_ni_t *ni, lnet_nid_t nid)
 static kib_conn_t *
 kiblnd_get_conn_by_idx(lnet_ni_t *ni, int index)
 {
-       kib_peer_t              *peer;
+       kib_peer_ni_t           *peer_ni;
        struct list_head        *ptmp;
        kib_conn_t              *conn;
        struct list_head        *ctmp;
@@ -537,13 +544,13 @@ kiblnd_get_conn_by_idx(lnet_ni_t *ni, int index)
        for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++) {
                list_for_each(ptmp, &kiblnd_data.kib_peers[i]) {
 
-                       peer = list_entry(ptmp, kib_peer_t, ibp_list);
-                       LASSERT(!kiblnd_peer_idle(peer));
+                       peer_ni = list_entry(ptmp, kib_peer_ni_t, ibp_list);
+                       LASSERT(!kiblnd_peer_idle(peer_ni));
 
-                       if (peer->ibp_ni != ni)
+                       if (peer_ni->ibp_ni != ni)
                                continue;
 
-                       list_for_each(ctmp, &peer->ibp_conns) {
+                       list_for_each(ctmp, &peer_ni->ibp_conns) {
                                if (index-- > 0)
                                        continue;
 
@@ -692,18 +699,18 @@ kiblnd_get_completion_vector(kib_conn_t *conn, int cpt)
 }
 
 kib_conn_t *
-kiblnd_create_conn(kib_peer_t *peer, struct rdma_cm_id *cmid,
+kiblnd_create_conn(kib_peer_ni_t *peer_ni, struct rdma_cm_id *cmid,
                   int state, int version)
 {
        /* CAVEAT EMPTOR:
         * If the new conn is created successfully it takes over the caller's
-        * ref on 'peer'.  It also "owns" 'cmid' and destroys it when it itself
-        * is destroyed.  On failure, the caller's ref on 'peer' remains and
+        * ref on 'peer_ni'.  It also "owns" 'cmid' and destroys it when it itself
+        * is destroyed.  On failure, the caller's ref on 'peer_ni' remains and
         * she must dispose of 'cmid'.  (Actually I'd block forever if I tried
         * to destroy 'cmid' here since I'm called from the CM which still has
         * its ref on 'cmid'). */
        rwlock_t               *glock = &kiblnd_data.kib_global_lock;
-       kib_net_t              *net = peer->ibp_ni->ni_data;
+       kib_net_t              *net = peer_ni->ibp_ni->ni_data;
        kib_dev_t              *dev;
        struct ib_qp_init_attr *init_qp_attr;
        struct kib_sched_info   *sched;
@@ -722,7 +729,7 @@ kiblnd_create_conn(kib_peer_t *peer, struct rdma_cm_id *cmid,
 
        dev = net->ibn_dev;
 
-       cpt = lnet_cpt_of_nid(peer->ibp_nid);
+       cpt = lnet_cpt_of_nid(peer_ni->ibp_nid, peer_ni->ibp_ni);
        sched = kiblnd_data.kib_scheds[cpt];
 
        LASSERT(sched->ibs_nthreads > 0);
@@ -731,24 +738,24 @@ kiblnd_create_conn(kib_peer_t *peer, struct rdma_cm_id *cmid,
                         sizeof(*init_qp_attr));
        if (init_qp_attr == NULL) {
                CERROR("Can't allocate qp_attr for %s\n",
-                      libcfs_nid2str(peer->ibp_nid));
+                      libcfs_nid2str(peer_ni->ibp_nid));
                goto failed_0;
        }
 
        LIBCFS_CPT_ALLOC(conn, lnet_cpt_table(), cpt, sizeof(*conn));
        if (conn == NULL) {
                CERROR("Can't allocate connection for %s\n",
-                      libcfs_nid2str(peer->ibp_nid));
+                      libcfs_nid2str(peer_ni->ibp_nid));
                goto failed_1;
        }
 
        conn->ibc_state = IBLND_CONN_INIT;
        conn->ibc_version = version;
-       conn->ibc_peer = peer;                  /* I take the caller's ref */
+       conn->ibc_peer = peer_ni;                       /* I take the caller's ref */
        cmid->context = conn;                   /* for future CM callbacks */
        conn->ibc_cmid = cmid;
-       conn->ibc_max_frags = peer->ibp_max_frags;
-       conn->ibc_queue_depth = peer->ibp_queue_depth;
+       conn->ibc_max_frags = peer_ni->ibp_max_frags;
+       conn->ibc_queue_depth = peer_ni->ibp_queue_depth;
 
        INIT_LIST_HEAD(&conn->ibc_early_rxs);
        INIT_LIST_HEAD(&conn->ibc_tx_noops);
@@ -921,7 +928,7 @@ void
 kiblnd_destroy_conn(kib_conn_t *conn, bool free_conn)
 {
        struct rdma_cm_id *cmid = conn->ibc_cmid;
-       kib_peer_t        *peer = conn->ibc_peer;
+       kib_peer_ni_t        *peer_ni = conn->ibc_peer;
        int                rc;
 
        LASSERT (!in_interrupt());
@@ -975,9 +982,9 @@ kiblnd_destroy_conn(kib_conn_t *conn, bool free_conn)
 
        /* See CAVEAT EMPTOR above in kiblnd_create_conn */
        if (conn->ibc_state != IBLND_CONN_INIT) {
-               kib_net_t *net = peer->ibp_ni->ni_data;
+               kib_net_t *net = peer_ni->ibp_ni->ni_data;
 
-               kiblnd_peer_decref(peer);
+               kiblnd_peer_decref(peer_ni);
                rdma_destroy_id(cmid);
                atomic_dec(&net->ibn_nconns);
        }
@@ -987,19 +994,19 @@ kiblnd_destroy_conn(kib_conn_t *conn, bool free_conn)
 }
 
 int
-kiblnd_close_peer_conns_locked(kib_peer_t *peer, int why)
+kiblnd_close_peer_conns_locked(kib_peer_ni_t *peer_ni, int why)
 {
        kib_conn_t              *conn;
        struct list_head        *ctmp;
        struct list_head        *cnxt;
        int                     count = 0;
 
-       list_for_each_safe(ctmp, cnxt, &peer->ibp_conns) {
+       list_for_each_safe(ctmp, cnxt, &peer_ni->ibp_conns) {
                conn = list_entry(ctmp, kib_conn_t, ibc_list);
 
                CDEBUG(D_NET, "Closing conn -> %s, "
                              "version: %x, reason: %d\n",
-                      libcfs_nid2str(peer->ibp_nid),
+                      libcfs_nid2str(peer_ni->ibp_nid),
                       conn->ibc_version, why);
 
                kiblnd_close_conn_locked(conn, why);
@@ -1010,7 +1017,7 @@ kiblnd_close_peer_conns_locked(kib_peer_t *peer, int why)
 }
 
 int
-kiblnd_close_stale_conns_locked(kib_peer_t *peer,
+kiblnd_close_stale_conns_locked(kib_peer_ni_t *peer_ni,
                                int version, __u64 incarnation)
 {
        kib_conn_t              *conn;
@@ -1018,7 +1025,7 @@ kiblnd_close_stale_conns_locked(kib_peer_t *peer,
        struct list_head        *cnxt;
        int                     count = 0;
 
-       list_for_each_safe(ctmp, cnxt, &peer->ibp_conns) {
+       list_for_each_safe(ctmp, cnxt, &peer_ni->ibp_conns) {
                conn = list_entry(ctmp, kib_conn_t, ibc_list);
 
                if (conn->ibc_version     == version &&
@@ -1027,7 +1034,7 @@ kiblnd_close_stale_conns_locked(kib_peer_t *peer,
 
                CDEBUG(D_NET, "Closing stale conn -> %s version: %x, "
                              "incarnation:%#llx(%x, %#llx)\n",
-                      libcfs_nid2str(peer->ibp_nid),
+                      libcfs_nid2str(peer_ni->ibp_nid),
                       conn->ibc_version, conn->ibc_incarnation,
                       version, incarnation);
 
@@ -1041,7 +1048,7 @@ kiblnd_close_stale_conns_locked(kib_peer_t *peer,
 static int
 kiblnd_close_matching_conns(lnet_ni_t *ni, lnet_nid_t nid)
 {
-       kib_peer_t              *peer;
+       kib_peer_ni_t           *peer_ni;
        struct list_head        *ptmp;
        struct list_head        *pnxt;
        int                     lo;
@@ -1062,16 +1069,16 @@ kiblnd_close_matching_conns(lnet_ni_t *ni, lnet_nid_t nid)
        for (i = lo; i <= hi; i++) {
                list_for_each_safe(ptmp, pnxt, &kiblnd_data.kib_peers[i]) {
 
-                       peer = list_entry(ptmp, kib_peer_t, ibp_list);
-                       LASSERT(!kiblnd_peer_idle(peer));
+                       peer_ni = list_entry(ptmp, kib_peer_ni_t, ibp_list);
+                       LASSERT(!kiblnd_peer_idle(peer_ni));
 
-                       if (peer->ibp_ni != ni)
+                       if (peer_ni->ibp_ni != ni)
                                continue;
 
-                       if (!(nid == LNET_NID_ANY || nid == peer->ibp_nid))
+                       if (!(nid == LNET_NID_ANY || nid == peer_ni->ibp_nid))
                                continue;
 
-                       count += kiblnd_close_peer_conns_locked(peer, 0);
+                       count += kiblnd_close_peer_conns_locked(peer_ni, 0);
                }
        }
 
@@ -1144,27 +1151,27 @@ kiblnd_query(lnet_ni_t *ni, lnet_nid_t nid, cfs_time_t *when)
        cfs_time_t      last_alive = 0;
        cfs_time_t      now = cfs_time_current();
        rwlock_t        *glock = &kiblnd_data.kib_global_lock;
-       kib_peer_t      *peer;
+       kib_peer_ni_t   *peer_ni;
        unsigned long   flags;
 
        read_lock_irqsave(glock, flags);
 
-       peer = kiblnd_find_peer_locked(nid);
-       if (peer != NULL)
-               last_alive = peer->ibp_last_alive;
+       peer_ni = kiblnd_find_peer_locked(ni, nid);
+       if (peer_ni != NULL)
+               last_alive = peer_ni->ibp_last_alive;
 
        read_unlock_irqrestore(glock, flags);
 
        if (last_alive != 0)
                *when = last_alive;
 
-       /* peer is not persistent in hash, trigger peer creation
+       /* peer_ni is not persistent in hash, trigger peer_ni creation
         * and connection establishment with a NULL tx */
-       if (peer == NULL)
+       if (peer_ni == NULL)
                kiblnd_launch_tx(ni, NULL, nid);
 
-       CDEBUG(D_NET, "Peer %s %p, alive %ld secs ago\n",
-              libcfs_nid2str(nid), peer,
+       CDEBUG(D_NET, "peer_ni %s %p, alive %ld secs ago\n",
+              libcfs_nid2str(nid), peer_ni,
               last_alive ? cfs_duration_sec(now - last_alive) : -1);
        return;
 }
@@ -1391,7 +1398,7 @@ kiblnd_find_rd_dma_mr(struct lnet_ni *ni, kib_rdma_desc_t *rd,
        int     mod;
        __u16   nfrags;
 
-       tunables = &ni->ni_lnd_tunables->lt_tun_u.lt_o2ib;
+       tunables = &ni->ni_lnd_tunables.lnd_tun_u.lnd_o2ib;
        mod = tunables->lnd_map_on_demand;
        nfrags = (negotiated_nfrags != -1) ? negotiated_nfrags : mod;
 
@@ -2395,7 +2402,7 @@ kiblnd_net_init_pools(kib_net_t *net, lnet_ni_t *ni, __u32 *cpts, int ncpts)
        int             rc;
        int             i;
 
-       tunables = &ni->ni_lnd_tunables->lt_tun_u.lt_o2ib;
+       tunables = &ni->ni_lnd_tunables.lnd_tun_u.lnd_o2ib;
 
        read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
        if (tunables->lnd_map_on_demand == 0) {
@@ -2926,7 +2933,7 @@ kiblnd_shutdown (lnet_ni_t *ni)
                 /* nuke all existing peers within this net */
                 kiblnd_del_peer(ni, LNET_NID_ANY);
 
-               /* Wait for all peer state to clean up */
+               /* Wait for all peer_ni state to clean up */
                i = 2;
                while (atomic_read(&net->ibn_npeers) != 0) {
                        i++;
@@ -3176,8 +3183,9 @@ kiblnd_startup (lnet_ni_t *ni)
         unsigned long             flags;
         int                       rc;
        int                       newdev;
+       int                       node_id;
 
-        LASSERT (ni->ni_lnd == &the_o2iblnd);
+        LASSERT (ni->ni_net->net_lnd == &the_o2iblnd);
 
         if (kiblnd_data.kib_init == IBLND_INIT_NOTHING) {
                 rc = kiblnd_base_startup();
@@ -3219,13 +3227,16 @@ kiblnd_startup (lnet_ni_t *ni)
        newdev = ibdev == NULL;
        /* hmm...create kib_dev even for alias */
        if (ibdev == NULL || strcmp(&ibdev->ibd_ifname[0], ifname) != 0)
-                ibdev = kiblnd_create_dev(ifname);
+               ibdev = kiblnd_create_dev(ifname);
 
-        if (ibdev == NULL)
-                goto failed;
+       if (ibdev == NULL)
+               goto failed;
+
+       node_id = dev_to_node(ibdev->ibd_hdev->ibh_ibdev->dma_device);
+       ni->ni_dev_cpt = cfs_cpt_of_node(lnet_cpt_table(), node_id);
 
-        net->ibn_dev = ibdev;
-        ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), ibdev->ibd_ifip);
+       net->ibn_dev = ibdev;
+       ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), ibdev->ibd_ifip);
 
        rc = kiblnd_dev_start_threads(ibdev, newdev,
                                      ni->ni_cpts, ni->ni_ncpts);
index a617b63..25382c2 100644 (file)
@@ -78,7 +78,7 @@
 #include <lnet/lnet.h>
 #include <lnet/lib-lnet.h>
 
-#define IBLND_PEER_HASH_SIZE           101     /* # peer lists */
+#define IBLND_PEER_HASH_SIZE           101     /* # peer_ni lists */
 /* # scheduler loops before reschedule */
 #define IBLND_RESCHED                  100
 
@@ -110,8 +110,8 @@ extern kib_tunables_t  kiblnd_tunables;
 #define IBLND_MSG_QUEUE_SIZE_V1      8          /* V1 only : # messages/RDMAs in-flight */
 #define IBLND_CREDIT_HIGHWATER_V1    7          /* V1 only : when eagerly to return credits */
 
-#define IBLND_CREDITS_DEFAULT        8          /* default # of peer credits */
-#define IBLND_CREDITS_MAX          ((typeof(((kib_msg_t*) 0)->ibm_credits)) - 1)  /* Max # of peer credits */
+#define IBLND_CREDITS_DEFAULT        8          /* default # of peer_ni credits */
+#define IBLND_CREDITS_MAX          ((typeof(((kib_msg_t*) 0)->ibm_credits)) - 1)  /* Max # of peer_ni credits */
 
 /* when eagerly to return credits */
 #define IBLND_CREDITS_HIGHWATER(t, v) ((v) == IBLND_MSG_VERSION_1 ? \
@@ -398,7 +398,7 @@ typedef struct
        /* schedulers sleep here */
        wait_queue_head_t       kib_failover_waitq;
        atomic_t                kib_nthreads;   /* # live threads */
-       /* stabilize net/dev/peer/conn ops */
+       /* stabilize net/dev/peer_ni/conn ops */
        rwlock_t                kib_global_lock;
        /* hash table of all my known peers */
        struct list_head        *kib_peers;
@@ -535,7 +535,7 @@ typedef struct {
         __u16            ibr_version;           /* sender's version */
         __u8             ibr_why;               /* reject reason */
         __u8             ibr_padding;           /* padding */
-        __u64            ibr_incarnation;       /* incarnation of peer */
+        __u64            ibr_incarnation;       /* incarnation of peer_ni */
         kib_connparams_t ibr_cp;                /* connection parameters */
 } WIRE_ATTR kib_rej_t;
 
@@ -544,12 +544,12 @@ typedef struct {
 #define IBLND_REJECT_NO_RESOURCES    2          /* Out of memory/conns etc */
 #define IBLND_REJECT_FATAL           3          /* Anything else */
 
-#define IBLND_REJECT_CONN_UNCOMPAT   4          /* incompatible version peer */
-#define IBLND_REJECT_CONN_STALE      5          /* stale peer */
+#define IBLND_REJECT_CONN_UNCOMPAT   4          /* incompatible version peer_ni */
+#define IBLND_REJECT_CONN_STALE      5          /* stale peer_ni */
 
-/* peer's rdma frags doesn't match mine */
+/* peer_ni's rdma frags doesn't match mine */
 #define IBLND_REJECT_RDMA_FRAGS      6
-/* peer's msg queue size doesn't match mine */
+/* peer_ni's msg queue size doesn't match mine */
 #define IBLND_REJECT_MSG_QUEUE_SIZE  7
 
 /***********************************************************************/
@@ -578,7 +578,7 @@ typedef struct kib_rx                           /* receive message */
 
 #define IBLND_POSTRX_DONT_POST    0             /* don't post */
 #define IBLND_POSTRX_NO_CREDIT    1             /* post: no credits */
-#define IBLND_POSTRX_PEER_CREDIT  2             /* post: give peer back 1 credit */
+#define IBLND_POSTRX_PEER_CREDIT  2             /* post: give peer_ni back 1 credit */
 #define IBLND_POSTRX_RSRVD_CREDIT 3             /* post: give myself back 1 reserved credit */
 
 typedef struct kib_tx                           /* transmit message */
@@ -593,7 +593,7 @@ typedef struct kib_tx                           /* transmit message */
        short                   tx_sending;
        /* queued for sending */
        short                   tx_queued;
-       /* waiting for peer */
+       /* waiting for peer_ni */
        short                   tx_waiting;
        /* LNET completion status */
        int                     tx_status;
@@ -639,11 +639,11 @@ typedef struct kib_conn
 {
        /* scheduler information */
        struct kib_sched_info   *ibc_sched;
-       /* owning peer */
+       /* owning peer_ni */
        struct kib_peer         *ibc_peer;
        /* HCA bound on */
        kib_hca_dev_t           *ibc_hdev;
-       /* stash on peer's conn list */
+       /* stash on peer_ni's conn list */
        struct list_head        ibc_list;
        /* schedule for attention */
        struct list_head        ibc_sched_list;
@@ -720,7 +720,7 @@ typedef struct kib_conn
 
 typedef struct kib_peer
 {
-       /* stash on global peer list */
+       /* stash on global peer_ni list */
        struct list_head        ibp_list;
        /* who's on the other end(s) */
        lnet_nid_t              ibp_nid;
@@ -730,31 +730,31 @@ typedef struct kib_peer
        struct list_head        ibp_conns;
        /* msgs waiting for a conn */
        struct list_head        ibp_tx_queue;
-       /* incarnation of peer */
+       /* incarnation of peer_ni */
        __u64                   ibp_incarnation;
        /* when (in jiffies) I was last alive */
        cfs_time_t              ibp_last_alive;
        /* # users */
        atomic_t                ibp_refcount;
-       /* version of peer */
+       /* version of peer_ni */
        __u16                   ibp_version;
        /* current passive connection attempts */
        unsigned short          ibp_accepting;
        /* current active connection attempts */
        unsigned short          ibp_connecting;
-       /* reconnect this peer later */
+       /* reconnect this peer_ni later */
        unsigned short          ibp_reconnecting:1;
        /* counter of how many times we triggered a conn race */
        unsigned char           ibp_races;
        /* # consecutive reconnection attempts to this peer */
        unsigned int            ibp_reconnected;
-       /* errno on closing this peer */
+       /* errno on closing this peer_ni */
        int                     ibp_error;
        /* max map_on_demand */
        __u16                   ibp_max_frags;
        /* max_peer_credits */
        __u16                   ibp_queue_depth;
-} kib_peer_t;
+} kib_peer_ni_t;
 
 #ifndef HAVE_IB_INC_RKEY
 /**
@@ -782,7 +782,7 @@ kiblnd_cfg_rdma_frags(struct lnet_ni *ni)
        struct lnet_ioctl_config_o2iblnd_tunables *tunables;
        int mod;
 
-       tunables = &ni->ni_lnd_tunables->lt_tun_u.lt_o2ib;
+       tunables = &ni->ni_lnd_tunables.lnd_tun_u.lnd_o2ib;
        mod = tunables->lnd_map_on_demand;
        return mod != 0 ? mod : IBLND_MAX_RDMA_FRAGS;
 }
@@ -801,7 +801,7 @@ kiblnd_concurrent_sends(int version, struct lnet_ni *ni)
        struct lnet_ioctl_config_o2iblnd_tunables *tunables;
        int concurrent_sends;
 
-       tunables = &ni->ni_lnd_tunables->lt_tun_u.lt_o2ib;
+       tunables = &ni->ni_lnd_tunables.lnd_tun_u.lnd_o2ib;
        concurrent_sends = tunables->lnd_concurrent_sends;
 
        if (version == IBLND_MSG_VERSION_1) {
@@ -868,36 +868,36 @@ do {                                                                      \
        }                                                               \
 } while (0)
 
-#define kiblnd_peer_addref(peer)                                \
+#define kiblnd_peer_addref(peer_ni)                                \
 do {                                                            \
-       CDEBUG(D_NET, "peer[%p] -> %s (%d)++\n",                \
-              (peer), libcfs_nid2str((peer)->ibp_nid),         \
-              atomic_read (&(peer)->ibp_refcount));            \
-       atomic_inc(&(peer)->ibp_refcount);                      \
+       CDEBUG(D_NET, "peer_ni[%p] -> %s (%d)++\n",                \
+              (peer_ni), libcfs_nid2str((peer_ni)->ibp_nid),         \
+              atomic_read (&(peer_ni)->ibp_refcount));         \
+       atomic_inc(&(peer_ni)->ibp_refcount);                   \
 } while (0)
 
-#define kiblnd_peer_decref(peer)                                \
+#define kiblnd_peer_decref(peer_ni)                                \
 do {                                                            \
-       CDEBUG(D_NET, "peer[%p] -> %s (%d)--\n",                \
-              (peer), libcfs_nid2str((peer)->ibp_nid),         \
-              atomic_read (&(peer)->ibp_refcount));            \
-       LASSERT_ATOMIC_POS(&(peer)->ibp_refcount);              \
-       if (atomic_dec_and_test(&(peer)->ibp_refcount))         \
-               kiblnd_destroy_peer(peer);                      \
+       CDEBUG(D_NET, "peer_ni[%p] -> %s (%d)--\n",                \
+              (peer_ni), libcfs_nid2str((peer_ni)->ibp_nid),         \
+              atomic_read (&(peer_ni)->ibp_refcount));         \
+       LASSERT_ATOMIC_POS(&(peer_ni)->ibp_refcount);              \
+       if (atomic_dec_and_test(&(peer_ni)->ibp_refcount))      \
+               kiblnd_destroy_peer(peer_ni);                      \
 } while (0)
 
 static inline bool
-kiblnd_peer_connecting(kib_peer_t *peer)
+kiblnd_peer_connecting(kib_peer_ni_t *peer_ni)
 {
-       return peer->ibp_connecting != 0 ||
-              peer->ibp_reconnecting != 0 ||
-              peer->ibp_accepting != 0;
+       return peer_ni->ibp_connecting != 0 ||
+              peer_ni->ibp_reconnecting != 0 ||
+              peer_ni->ibp_accepting != 0;
 }
 
 static inline bool
-kiblnd_peer_idle(kib_peer_t *peer)
+kiblnd_peer_idle(kib_peer_ni_t *peer_ni)
 {
-       return !kiblnd_peer_connecting(peer) && list_empty(&peer->ibp_conns);
+       return !kiblnd_peer_connecting(peer_ni) && list_empty(&peer_ni->ibp_conns);
 }
 
 static inline struct list_head *
@@ -910,19 +910,19 @@ kiblnd_nid2peerlist (lnet_nid_t nid)
 }
 
 static inline int
-kiblnd_peer_active (kib_peer_t *peer)
+kiblnd_peer_active (kib_peer_ni_t *peer_ni)
 {
-       /* Am I in the peer hash table? */
-       return !list_empty(&peer->ibp_list);
+       /* Am I in the peer_ni hash table? */
+       return !list_empty(&peer_ni->ibp_list);
 }
 
 static inline kib_conn_t *
-kiblnd_get_conn_locked (kib_peer_t *peer)
+kiblnd_get_conn_locked (kib_peer_ni_t *peer_ni)
 {
-       LASSERT(!list_empty(&peer->ibp_conns));
+       LASSERT(!list_empty(&peer_ni->ibp_conns));
 
         /* just return the first connection */
-       return list_entry(peer->ibp_conns.next, kib_conn_t, ibc_list);
+       return list_entry(peer_ni->ibp_conns.next, kib_conn_t, ibc_list);
 }
 
 static inline int
@@ -941,7 +941,7 @@ kiblnd_need_noop(kib_conn_t *conn)
        struct lnet_ioctl_config_o2iblnd_tunables *tunables;
 
        LASSERT(conn->ibc_state >= IBLND_CONN_ESTABLISHED);
-       tunables = &ni->ni_lnd_tunables->lt_tun_u.lt_o2ib;
+       tunables = &ni->ni_lnd_tunables.lnd_tun_u.lnd_o2ib;
 
         if (conn->ibc_outstanding_credits <
            IBLND_CREDITS_HIGHWATER(tunables, conn->ibc_version) &&
@@ -1179,17 +1179,17 @@ int  kiblnd_cm_callback(struct rdma_cm_id *cmid,
 int  kiblnd_translate_mtu(int value);
 
 int  kiblnd_dev_failover(kib_dev_t *dev);
-int  kiblnd_create_peer(lnet_ni_t *ni, kib_peer_t **peerp, lnet_nid_t nid);
-void kiblnd_destroy_peer (kib_peer_t *peer);
-bool kiblnd_reconnect_peer(kib_peer_t *peer);
+int  kiblnd_create_peer(lnet_ni_t *ni, kib_peer_ni_t **peerp, lnet_nid_t nid);
+void kiblnd_destroy_peer (kib_peer_ni_t *peer);
+bool kiblnd_reconnect_peer(kib_peer_ni_t *peer);
 void kiblnd_destroy_dev (kib_dev_t *dev);
-void kiblnd_unlink_peer_locked (kib_peer_t *peer);
-kib_peer_t *kiblnd_find_peer_locked (lnet_nid_t nid);
-int  kiblnd_close_stale_conns_locked (kib_peer_t *peer,
+void kiblnd_unlink_peer_locked (kib_peer_ni_t *peer_ni);
+kib_peer_ni_t *kiblnd_find_peer_locked(struct lnet_ni *ni, lnet_nid_t nid);
+int  kiblnd_close_stale_conns_locked (kib_peer_ni_t *peer_ni,
                                       int version, __u64 incarnation);
-int  kiblnd_close_peer_conns_locked (kib_peer_t *peer, int why);
+int  kiblnd_close_peer_conns_locked (kib_peer_ni_t *peer_ni, int why);
 
-kib_conn_t *kiblnd_create_conn(kib_peer_t *peer, struct rdma_cm_id *cmid,
+kib_conn_t *kiblnd_create_conn(kib_peer_ni_t *peer_ni, struct rdma_cm_id *cmid,
                               int state, int version);
 void kiblnd_destroy_conn(kib_conn_t *conn, bool free_conn);
 void kiblnd_close_conn (kib_conn_t *conn, int error);
index a72bdba..3901be7 100644 (file)
@@ -38,8 +38,8 @@
 
 #define MAX_CONN_RACES_BEFORE_ABORT 20
 
-static void kiblnd_peer_alive(kib_peer_t *peer);
-static void kiblnd_peer_connect_failed(kib_peer_t *peer, int active, int error);
+static void kiblnd_peer_alive(kib_peer_ni_t *peer_ni);
+static void kiblnd_peer_connect_failed(kib_peer_ni_t *peer_ni, int active, int error);
 static void kiblnd_init_tx_msg(lnet_ni_t *ni, kib_tx_t *tx,
                               int type, int body_nob);
 static int kiblnd_init_rdma(kib_conn_t *conn, kib_tx_t *tx, int type,
@@ -61,7 +61,7 @@ kiblnd_tx_done (lnet_ni_t *ni, kib_tx_t *tx)
        LASSERT (!in_interrupt());
        LASSERT (!tx->tx_queued);               /* mustn't be queued for sending */
        LASSERT (tx->tx_sending == 0);          /* mustn't be awaiting sent callback */
-       LASSERT (!tx->tx_waiting);              /* mustn't be awaiting peer response */
+       LASSERT (!tx->tx_waiting);              /* mustn't be awaiting peer_ni response */
        LASSERT (tx->tx_pool != NULL);
 
        kiblnd_unmap_tx(ni, tx);
@@ -116,7 +116,7 @@ kiblnd_get_idle_tx(lnet_ni_t *ni, lnet_nid_t target)
        kib_tx_t                *tx;
        kib_tx_poolset_t        *tps;
 
-       tps = net->ibn_tx_ps[lnet_cpt_of_nid(target)];
+       tps = net->ibn_tx_ps[lnet_cpt_of_nid(target, ni)];
        node = kiblnd_pool_alloc_node(&tps->tps_poolset);
         if (node == NULL)
                 return NULL;
@@ -416,7 +416,7 @@ kiblnd_handle_rx (kib_rx_t *rx)
 
                 LASSERT (tx->tx_waiting);
                 /* CAVEAT EMPTOR: I could be racing with tx_complete, but...
-                 * (a) I can overwrite tx_msg since my peer has received it!
+                 * (a) I can overwrite tx_msg since my peer_ni has received it!
                  * (b) tx_waiting set tells tx_complete() it's not done. */
 
                 tx->tx_nwrq = 0;                /* overwrite PUT_REQ */
@@ -580,7 +580,7 @@ kiblnd_fmr_map_tx(kib_net_t *net, kib_tx_t *tx, kib_rdma_desc_t *rd, __u32 nob)
                return rc;
        }
 
-       /* If rd is not tx_rd, it's going to get sent to a peer, who will need
+       /* If rd is not tx_rd, it's going to get sent to a peer_ni, who will need
         * the rkey */
        rd->rd_key = tx->fmr.fmr_key;
        rd->rd_frags[0].rf_addr &= ~hdev->ibh_page_mask;
@@ -616,7 +616,7 @@ kiblnd_map_tx(lnet_ni_t *ni, kib_tx_t *tx, kib_rdma_desc_t *rd, int nfrags)
        __u32 nob;
        int i;
 
-        /* If rd is not tx_rd, it's going to get sent to a peer and I'm the
+        /* If rd is not tx_rd, it's going to get sent to a peer_ni and I'm the
          * RDMA sink */
         tx->tx_dmadir = (rd != tx->tx_rd) ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
        tx->tx_nfrags = nfrags;
@@ -753,12 +753,12 @@ static int
 kiblnd_post_tx_locked (kib_conn_t *conn, kib_tx_t *tx, int credit)
 __must_hold(&conn->ibc_lock)
 {
-        kib_msg_t         *msg = tx->tx_msg;
-        kib_peer_t        *peer = conn->ibc_peer;
-       struct lnet_ni    *ni = peer->ibp_ni;
-        int                ver = conn->ibc_version;
-        int                rc;
-        int                done;
+       kib_msg_t *msg = tx->tx_msg;
+       kib_peer_ni_t *peer_ni = conn->ibc_peer;
+       struct lnet_ni *ni = peer_ni->ibp_ni;
+       int ver = conn->ibc_version;
+       int rc;
+       int done;
 
        LASSERT(tx->tx_queued);
        /* We rely on this for QP sizing */
@@ -775,13 +775,13 @@ __must_hold(&conn->ibc_lock)
            kiblnd_concurrent_sends(ver, ni)) {
                 /* tx completions outstanding... */
                 CDEBUG(D_NET, "%s: posted enough\n",
-                       libcfs_nid2str(peer->ibp_nid));
+                       libcfs_nid2str(peer_ni->ibp_nid));
                 return -EAGAIN;
         }
 
         if (credit != 0 && conn->ibc_credits == 0) {   /* no credits */
                 CDEBUG(D_NET, "%s: no credits\n",
-                       libcfs_nid2str(peer->ibp_nid));
+                       libcfs_nid2str(peer_ni->ibp_nid));
                 return -EAGAIN;
         }
 
@@ -789,7 +789,7 @@ __must_hold(&conn->ibc_lock)
             conn->ibc_credits == 1 &&   /* last credit reserved */
             msg->ibm_type != IBLND_MSG_NOOP) {      /* for NOOP */
                 CDEBUG(D_NET, "%s: not using last credit\n",
-                       libcfs_nid2str(peer->ibp_nid));
+                       libcfs_nid2str(peer_ni->ibp_nid));
                 return -EAGAIN;
         }
 
@@ -805,16 +805,16 @@ __must_hold(&conn->ibc_lock)
                 * kiblnd_check_sends_locked will queue NOOP again when
                 * posted NOOPs complete */
                spin_unlock(&conn->ibc_lock);
-               kiblnd_tx_done(peer->ibp_ni, tx);
+               kiblnd_tx_done(peer_ni->ibp_ni, tx);
                spin_lock(&conn->ibc_lock);
                 CDEBUG(D_NET, "%s(%d): redundant or enough NOOP\n",
-                       libcfs_nid2str(peer->ibp_nid),
+                       libcfs_nid2str(peer_ni->ibp_nid),
                        conn->ibc_noops_posted);
                 return 0;
         }
 
-        kiblnd_pack_msg(peer->ibp_ni, msg, ver, conn->ibc_outstanding_credits,
-                        peer->ibp_nid, conn->ibc_incarnation);
+        kiblnd_pack_msg(peer_ni->ibp_ni, msg, ver, conn->ibc_outstanding_credits,
+                        peer_ni->ibp_nid, conn->ibc_incarnation);
 
         conn->ibc_credits -= credit;
         conn->ibc_outstanding_credits = 0;
@@ -854,7 +854,7 @@ __must_hold(&conn->ibc_lock)
                }
 
                LASSERTF(bad->wr_id == kiblnd_ptr2wreqid(tx, IBLND_WID_TX),
-                        "bad wr_id %#llx, opc %d, flags %d, peer: %s\n",
+                        "bad wr_id %#llx, opc %d, flags %d, peer_ni: %s\n",
                         bad->wr_id, bad->opcode, bad->send_flags,
                         libcfs_nid2str(conn->ibc_peer->ibp_nid));
 
@@ -887,15 +887,15 @@ __must_hold(&conn->ibc_lock)
 
         if (conn->ibc_state == IBLND_CONN_ESTABLISHED)
                 CERROR("Error %d posting transmit to %s\n",
-                       rc, libcfs_nid2str(peer->ibp_nid));
+                       rc, libcfs_nid2str(peer_ni->ibp_nid));
         else
                 CDEBUG(D_NET, "Error %d posting transmit to %s\n",
-                       rc, libcfs_nid2str(peer->ibp_nid));
+                       rc, libcfs_nid2str(peer_ni->ibp_nid));
 
         kiblnd_close_conn(conn, rc);
 
         if (done)
-                kiblnd_tx_done(peer->ibp_ni, tx);
+                kiblnd_tx_done(peer_ni->ibp_ni, tx);
 
        spin_lock(&conn->ibc_lock);
 
@@ -1000,12 +1000,12 @@ kiblnd_tx_complete (kib_tx_t *tx, int status)
                 conn->ibc_noops_posted--;
 
         if (failed) {
-                tx->tx_waiting = 0;             /* don't wait for peer */
+                tx->tx_waiting = 0;             /* don't wait for peer_ni */
                 tx->tx_status = -EIO;
         }
 
         idle = (tx->tx_sending == 0) &&         /* This is the final callback */
-               !tx->tx_waiting &&               /* Not waiting for peer */
+               !tx->tx_waiting &&               /* Not waiting for peer_ni */
                !tx->tx_queued;                  /* Not re-queued (PUT_DONE) */
         if (idle)
                list_del(&tx->tx_list);
@@ -1084,7 +1084,7 @@ kiblnd_init_rdma(kib_conn_t *conn, kib_tx_t *tx, int type,
                 }
 
                if (tx->tx_nwrq >= conn->ibc_max_frags) {
-                       CERROR("RDMA has too many fragments for peer %s (%d), "
+                       CERROR("RDMA has too many fragments for peer_ni %s (%d), "
                               "src idx/frags: %d/%d dst idx/frags: %d/%d\n",
                               libcfs_nid2str(conn->ibc_peer->ibp_nid),
                               conn->ibc_max_frags,
@@ -1242,25 +1242,25 @@ static int kiblnd_resolve_addr(struct rdma_cm_id *cmid,
 }
 
 static void
-kiblnd_connect_peer (kib_peer_t *peer)
+kiblnd_connect_peer (kib_peer_ni_t *peer_ni)
 {
         struct rdma_cm_id *cmid;
         kib_dev_t         *dev;
-        kib_net_t         *net = peer->ibp_ni->ni_data;
+        kib_net_t         *net = peer_ni->ibp_ni->ni_data;
         struct sockaddr_in srcaddr;
         struct sockaddr_in dstaddr;
         int                rc;
 
         LASSERT (net != NULL);
-        LASSERT (peer->ibp_connecting > 0);
-       LASSERT(!peer->ibp_reconnecting);
+        LASSERT (peer_ni->ibp_connecting > 0);
+       LASSERT(!peer_ni->ibp_reconnecting);
 
-        cmid = kiblnd_rdma_create_id(kiblnd_cm_callback, peer, RDMA_PS_TCP,
+        cmid = kiblnd_rdma_create_id(kiblnd_cm_callback, peer_ni, RDMA_PS_TCP,
                                      IB_QPT_RC);
 
         if (IS_ERR(cmid)) {
                 CERROR("Can't create CMID for %s: %ld\n",
-                       libcfs_nid2str(peer->ibp_nid), PTR_ERR(cmid));
+                       libcfs_nid2str(peer_ni->ibp_nid), PTR_ERR(cmid));
                 rc = PTR_ERR(cmid);
                 goto failed;
         }
@@ -1273,9 +1273,9 @@ kiblnd_connect_peer (kib_peer_t *peer)
         memset(&dstaddr, 0, sizeof(dstaddr));
         dstaddr.sin_family = AF_INET;
         dstaddr.sin_port = htons(*kiblnd_tunables.kib_service);
-        dstaddr.sin_addr.s_addr = htonl(LNET_NIDADDR(peer->ibp_nid));
+        dstaddr.sin_addr.s_addr = htonl(LNET_NIDADDR(peer_ni->ibp_nid));
 
-        kiblnd_peer_addref(peer);               /* cmid's ref */
+        kiblnd_peer_addref(peer_ni);               /* cmid's ref */
 
         if (*kiblnd_tunables.kib_use_priv_port) {
                 rc = kiblnd_resolve_addr(cmid, &srcaddr, &dstaddr,
@@ -1289,28 +1289,28 @@ kiblnd_connect_peer (kib_peer_t *peer)
         if (rc != 0) {
                 /* Can't initiate address resolution:  */
                 CERROR("Can't resolve addr for %s: %d\n",
-                       libcfs_nid2str(peer->ibp_nid), rc);
+                       libcfs_nid2str(peer_ni->ibp_nid), rc);
                 goto failed2;
         }
 
         LASSERT (cmid->device != NULL);
        CDEBUG(D_NET, "%s: connection bound to %s:%pI4h:%s\n",
-               libcfs_nid2str(peer->ibp_nid), dev->ibd_ifname,
+               libcfs_nid2str(peer_ni->ibp_nid), dev->ibd_ifname,
               &dev->ibd_ifip, cmid->device->name);
 
        return;
 
  failed2:
-       kiblnd_peer_connect_failed(peer, 1, rc);
-       kiblnd_peer_decref(peer);               /* cmid's ref */
+       kiblnd_peer_connect_failed(peer_ni, 1, rc);
+       kiblnd_peer_decref(peer_ni);               /* cmid's ref */
        rdma_destroy_id(cmid);
        return;
  failed:
-       kiblnd_peer_connect_failed(peer, 1, rc);
+       kiblnd_peer_connect_failed(peer_ni, 1, rc);
 }
 
 bool
-kiblnd_reconnect_peer(kib_peer_t *peer)
+kiblnd_reconnect_peer(kib_peer_ni_t *peer_ni)
 {
        rwlock_t         *glock = &kiblnd_data.kib_global_lock;
        char             *reason = NULL;
@@ -1320,12 +1320,12 @@ kiblnd_reconnect_peer(kib_peer_t *peer)
        INIT_LIST_HEAD(&txs);
 
        write_lock_irqsave(glock, flags);
-       if (peer->ibp_reconnecting == 0) {
-               if (peer->ibp_accepting)
+       if (peer_ni->ibp_reconnecting == 0) {
+               if (peer_ni->ibp_accepting)
                        reason = "accepting";
-               else if (peer->ibp_connecting)
+               else if (peer_ni->ibp_connecting)
                        reason = "connecting";
-               else if (!list_empty(&peer->ibp_conns))
+               else if (!list_empty(&peer_ni->ibp_conns))
                        reason = "connected";
                else /* connected then closed */
                        reason = "closed";
@@ -1333,37 +1333,38 @@ kiblnd_reconnect_peer(kib_peer_t *peer)
                goto no_reconnect;
        }
 
-       LASSERT(!peer->ibp_accepting && !peer->ibp_connecting &&
-               list_empty(&peer->ibp_conns));
-       peer->ibp_reconnecting = 0;
+       LASSERT(!peer_ni->ibp_accepting && !peer_ni->ibp_connecting &&
+               list_empty(&peer_ni->ibp_conns));
+       peer_ni->ibp_reconnecting = 0;
 
-       if (!kiblnd_peer_active(peer)) {
-               list_splice_init(&peer->ibp_tx_queue, &txs);
+       if (!kiblnd_peer_active(peer_ni)) {
+               list_splice_init(&peer_ni->ibp_tx_queue, &txs);
                reason = "unlinked";
                goto no_reconnect;
        }
 
-       peer->ibp_connecting++;
-       peer->ibp_reconnected++;
+       peer_ni->ibp_connecting++;
+       peer_ni->ibp_reconnected++;
+
        write_unlock_irqrestore(glock, flags);
 
-       kiblnd_connect_peer(peer);
+       kiblnd_connect_peer(peer_ni);
        return true;
 
  no_reconnect:
        write_unlock_irqrestore(glock, flags);
 
        CWARN("Abort reconnection of %s: %s\n",
-             libcfs_nid2str(peer->ibp_nid), reason);
-       kiblnd_txlist_done(peer->ibp_ni, &txs, -ECONNABORTED);
+             libcfs_nid2str(peer_ni->ibp_nid), reason);
+       kiblnd_txlist_done(peer_ni->ibp_ni, &txs, -ECONNABORTED);
        return false;
 }
 
 void
 kiblnd_launch_tx (lnet_ni_t *ni, kib_tx_t *tx, lnet_nid_t nid)
 {
-        kib_peer_t        *peer;
-        kib_peer_t        *peer2;
+        kib_peer_ni_t        *peer_ni;
+        kib_peer_ni_t        *peer2;
         kib_conn_t        *conn;
        rwlock_t        *g_lock = &kiblnd_data.kib_global_lock;
         unsigned long      flags;
@@ -1375,14 +1376,14 @@ kiblnd_launch_tx (lnet_ni_t *ni, kib_tx_t *tx, lnet_nid_t nid)
         LASSERT (tx == NULL || tx->tx_conn == NULL); /* only set when assigned a conn */
         LASSERT (tx == NULL || tx->tx_nwrq > 0);     /* work items have been set up */
 
-        /* First time, just use a read lock since I expect to find my peer
+        /* First time, just use a read lock since I expect to find my peer_ni
          * connected */
        read_lock_irqsave(g_lock, flags);
 
-        peer = kiblnd_find_peer_locked(nid);
-       if (peer != NULL && !list_empty(&peer->ibp_conns)) {
-                /* Found a peer with an established connection */
-                conn = kiblnd_get_conn_locked(peer);
+        peer_ni = kiblnd_find_peer_locked(ni, nid);
+       if (peer_ni != NULL && !list_empty(&peer_ni->ibp_conns)) {
+                /* Found a peer_ni with an established connection */
+                conn = kiblnd_get_conn_locked(peer_ni);
                 kiblnd_conn_addref(conn); /* 1 ref for me... */
 
                read_unlock_irqrestore(g_lock, flags);
@@ -1397,17 +1398,17 @@ kiblnd_launch_tx (lnet_ni_t *ni, kib_tx_t *tx, lnet_nid_t nid)
        /* Re-try with a write lock */
        write_lock(g_lock);
 
-        peer = kiblnd_find_peer_locked(nid);
-        if (peer != NULL) {
-               if (list_empty(&peer->ibp_conns)) {
-                        /* found a peer, but it's still connecting... */
-                       LASSERT(kiblnd_peer_connecting(peer));
+        peer_ni = kiblnd_find_peer_locked(ni, nid);
+        if (peer_ni != NULL) {
+               if (list_empty(&peer_ni->ibp_conns)) {
+                        /* found a peer_ni, but it's still connecting... */
+                       LASSERT(kiblnd_peer_connecting(peer_ni));
                         if (tx != NULL)
                                list_add_tail(&tx->tx_list,
-                                                  &peer->ibp_tx_queue);
+                                                  &peer_ni->ibp_tx_queue);
                        write_unlock_irqrestore(g_lock, flags);
                } else {
-                       conn = kiblnd_get_conn_locked(peer);
+                       conn = kiblnd_get_conn_locked(peer_ni);
                        kiblnd_conn_addref(conn); /* 1 ref for me... */
 
                        write_unlock_irqrestore(g_lock, flags);
@@ -1421,10 +1422,10 @@ kiblnd_launch_tx (lnet_ni_t *ni, kib_tx_t *tx, lnet_nid_t nid)
 
        write_unlock_irqrestore(g_lock, flags);
 
-       /* Allocate a peer ready to add to the peer table and retry */
-       rc = kiblnd_create_peer(ni, &peer, nid);
+       /* Allocate a peer_ni ready to add to the peer_ni table and retry */
+       rc = kiblnd_create_peer(ni, &peer_ni, nid);
        if (rc != 0) {
-               CERROR("Can't create peer %s\n", libcfs_nid2str(nid));
+               CERROR("Can't create peer_ni %s\n", libcfs_nid2str(nid));
                if (tx != NULL) {
                        tx->tx_status = -EHOSTUNREACH;
                        tx->tx_waiting = 0;
@@ -1435,10 +1436,10 @@ kiblnd_launch_tx (lnet_ni_t *ni, kib_tx_t *tx, lnet_nid_t nid)
 
        write_lock_irqsave(g_lock, flags);
 
-        peer2 = kiblnd_find_peer_locked(nid);
+        peer2 = kiblnd_find_peer_locked(ni, nid);
         if (peer2 != NULL) {
                if (list_empty(&peer2->ibp_conns)) {
-                        /* found a peer, but it's still connecting... */
+                        /* found a peer_ni, but it's still connecting... */
                        LASSERT(kiblnd_peer_connecting(peer2));
                         if (tx != NULL)
                                list_add_tail(&tx->tx_list,
@@ -1455,27 +1456,27 @@ kiblnd_launch_tx (lnet_ni_t *ni, kib_tx_t *tx, lnet_nid_t nid)
                         kiblnd_conn_decref(conn); /* ...to here */
                 }
 
-                kiblnd_peer_decref(peer);
+                kiblnd_peer_decref(peer_ni);
                 return;
         }
 
-        /* Brand new peer */
-        LASSERT (peer->ibp_connecting == 0);
-        peer->ibp_connecting = 1;
+        /* Brand new peer_ni */
+        LASSERT (peer_ni->ibp_connecting == 0);
+        peer_ni->ibp_connecting = 1;
 
         /* always called with a ref on ni, which prevents ni being shutdown */
         LASSERT (((kib_net_t *)ni->ni_data)->ibn_shutdown == 0);
 
         if (tx != NULL)
-               list_add_tail(&tx->tx_list, &peer->ibp_tx_queue);
+               list_add_tail(&tx->tx_list, &peer_ni->ibp_tx_queue);
 
-        kiblnd_peer_addref(peer);
-       list_add_tail(&peer->ibp_list, kiblnd_nid2peerlist(nid));
+        kiblnd_peer_addref(peer_ni);
+       list_add_tail(&peer_ni->ibp_list, kiblnd_nid2peerlist(nid));
 
        write_unlock_irqrestore(g_lock, flags);
 
-        kiblnd_connect_peer(peer);
-        kiblnd_peer_decref(peer);
+        kiblnd_connect_peer(peer_ni);
+        kiblnd_peer_decref(peer_ni);
 }
 
 int
@@ -1787,7 +1788,7 @@ kiblnd_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, int delayed,
                        CERROR("Can't setup PUT sink for %s: %d\n",
                               libcfs_nid2str(conn->ibc_peer->ibp_nid), rc);
                        kiblnd_tx_done(ni, tx);
-                       /* tell peer it's over */
+                       /* tell peer_ni it's over */
                        kiblnd_send_completion(rx->rx_conn, IBLND_MSG_PUT_NAK, rc,
                                               rxmsg->ibm_u.putreq.ibprm_cookie);
                        break;
@@ -1844,15 +1845,15 @@ kiblnd_thread_fini (void)
 }
 
 static void
-kiblnd_peer_alive (kib_peer_t *peer)
+kiblnd_peer_alive (kib_peer_ni_t *peer_ni)
 {
        /* This is racy, but everyone's only writing cfs_time_current() */
-       peer->ibp_last_alive = cfs_time_current();
+       peer_ni->ibp_last_alive = cfs_time_current();
        smp_mb();
 }
 
 static void
-kiblnd_peer_notify (kib_peer_t *peer)
+kiblnd_peer_notify (kib_peer_ni_t *peer_ni)
 {
         int           error = 0;
         cfs_time_t    last_alive = 0;
@@ -1860,18 +1861,18 @@ kiblnd_peer_notify (kib_peer_t *peer)
 
        read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
 
-       if (kiblnd_peer_idle(peer) && peer->ibp_error != 0) {
-                error = peer->ibp_error;
-                peer->ibp_error = 0;
+       if (kiblnd_peer_idle(peer_ni) && peer_ni->ibp_error != 0) {
+                error = peer_ni->ibp_error;
+                peer_ni->ibp_error = 0;
 
-                last_alive = peer->ibp_last_alive;
+                last_alive = peer_ni->ibp_last_alive;
         }
 
        read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
 
         if (error != 0)
-                lnet_notify(peer->ibp_ni,
-                            peer->ibp_nid, 0, last_alive);
+                lnet_notify(peer_ni->ibp_ni,
+                            peer_ni->ibp_nid, 0, last_alive);
 }
 
 void
@@ -1883,7 +1884,7 @@ kiblnd_close_conn_locked (kib_conn_t *conn, int error)
          * connection to be finished off by the connd.  Otherwise the connd is
          * already dealing with it (either to set it up or tear it down).
          * Caller holds kib_global_lock exclusively in irq context */
-        kib_peer_t       *peer = conn->ibc_peer;
+        kib_peer_ni_t       *peer_ni = conn->ibc_peer;
         kib_dev_t        *dev;
         unsigned long     flags;
 
@@ -1902,10 +1903,10 @@ kiblnd_close_conn_locked (kib_conn_t *conn, int error)
            list_empty(&conn->ibc_tx_queue_nocred) &&
            list_empty(&conn->ibc_active_txs)) {
                 CDEBUG(D_NET, "closing conn to %s\n", 
-                       libcfs_nid2str(peer->ibp_nid));
+                       libcfs_nid2str(peer_ni->ibp_nid));
         } else {
                 CNETERR("Closing conn to %s: error %d%s%s%s%s%s\n",
-                       libcfs_nid2str(peer->ibp_nid), error,
+                       libcfs_nid2str(peer_ni->ibp_nid), error,
                       list_empty(&conn->ibc_tx_queue) ? "" : "(sending)",
                       list_empty(&conn->ibc_tx_noops) ? "" : "(sending_noops)",
                       list_empty(&conn->ibc_tx_queue_rsrvd) ?
@@ -1915,16 +1916,16 @@ kiblnd_close_conn_locked (kib_conn_t *conn, int error)
                       list_empty(&conn->ibc_active_txs) ? "" : "(waiting)");
         }
 
-        dev = ((kib_net_t *)peer->ibp_ni->ni_data)->ibn_dev;
+        dev = ((kib_net_t *)peer_ni->ibp_ni->ni_data)->ibn_dev;
        list_del(&conn->ibc_list);
         /* connd (see below) takes over ibc_list's ref */
 
-       if (list_empty(&peer->ibp_conns) &&    /* no more conns */
-            kiblnd_peer_active(peer)) {         /* still in peer table */
-                kiblnd_unlink_peer_locked(peer);
+       if (list_empty(&peer_ni->ibp_conns) &&    /* no more conns */
+            kiblnd_peer_active(peer_ni)) {         /* still in peer_ni table */
+                kiblnd_unlink_peer_locked(peer_ni);
 
                 /* set/clear error on last conn */
-                peer->ibp_error = conn->ibc_comms_error;
+                peer_ni->ibp_error = conn->ibc_comms_error;
         }
 
         kiblnd_set_conn_state(conn, IBLND_CONN_CLOSING);
@@ -2041,7 +2042,7 @@ kiblnd_finalise_conn (kib_conn_t *conn)
 }
 
 static void
-kiblnd_peer_connect_failed(kib_peer_t *peer, int active, int error)
+kiblnd_peer_connect_failed(kib_peer_ni_t *peer_ni, int active, int error)
 {
        struct list_head zombies = LIST_HEAD_INIT(zombies);
        unsigned long   flags;
@@ -2052,52 +2053,52 @@ kiblnd_peer_connect_failed(kib_peer_t *peer, int active, int error)
        write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
 
        if (active) {
-               LASSERT (peer->ibp_connecting > 0);
-               peer->ibp_connecting--;
+               LASSERT(peer_ni->ibp_connecting > 0);
+               peer_ni->ibp_connecting--;
        } else {
-               LASSERT (peer->ibp_accepting > 0);
-               peer->ibp_accepting--;
+               LASSERT (peer_ni->ibp_accepting > 0);
+               peer_ni->ibp_accepting--;
        }
 
-       if (kiblnd_peer_connecting(peer)) {
+       if (kiblnd_peer_connecting(peer_ni)) {
                /* another connection attempt under way... */
                write_unlock_irqrestore(&kiblnd_data.kib_global_lock,
                                        flags);
                return;
        }
 
-       peer->ibp_reconnected = 0;
-       if (list_empty(&peer->ibp_conns)) {
-               /* Take peer's blocked transmits to complete with error */
-               list_add(&zombies, &peer->ibp_tx_queue);
-               list_del_init(&peer->ibp_tx_queue);
+       peer_ni->ibp_reconnected = 0;
+       if (list_empty(&peer_ni->ibp_conns)) {
+               /* Take peer_ni's blocked transmits to complete with error */
+               list_add(&zombies, &peer_ni->ibp_tx_queue);
+               list_del_init(&peer_ni->ibp_tx_queue);
 
-               if (kiblnd_peer_active(peer))
-                       kiblnd_unlink_peer_locked(peer);
+               if (kiblnd_peer_active(peer_ni))
+                       kiblnd_unlink_peer_locked(peer_ni);
 
-               peer->ibp_error = error;
+               peer_ni->ibp_error = error;
        } else {
                /* Can't have blocked transmits if there are connections */
-               LASSERT(list_empty(&peer->ibp_tx_queue));
+               LASSERT(list_empty(&peer_ni->ibp_tx_queue));
        }
 
        write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
 
-       kiblnd_peer_notify(peer);
+       kiblnd_peer_notify(peer_ni);
 
        if (list_empty(&zombies))
                return;
 
        CNETERR("Deleting messages for %s: connection failed\n",
-               libcfs_nid2str(peer->ibp_nid));
+               libcfs_nid2str(peer_ni->ibp_nid));
 
-       kiblnd_txlist_done(peer->ibp_ni, &zombies, -EHOSTUNREACH);
+       kiblnd_txlist_done(peer_ni->ibp_ni, &zombies, -EHOSTUNREACH);
 }
 
 static void
 kiblnd_connreq_done(kib_conn_t *conn, int status)
 {
-       kib_peer_t       *peer = conn->ibc_peer;
+       kib_peer_ni_t    *peer_ni = conn->ibc_peer;
        kib_tx_t         *tx;
        struct list_head txs;
        unsigned long    flags;
@@ -2106,21 +2107,21 @@ kiblnd_connreq_done(kib_conn_t *conn, int status)
         active = (conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT);
 
        CDEBUG(D_NET,"%s: active(%d), version(%x), status(%d)\n",
-              libcfs_nid2str(peer->ibp_nid), active,
+              libcfs_nid2str(peer_ni->ibp_nid), active,
               conn->ibc_version, status);
 
        LASSERT (!in_interrupt());
        LASSERT ((conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT &&
-                 peer->ibp_connecting > 0) ||
+                 peer_ni->ibp_connecting > 0) ||
                 (conn->ibc_state == IBLND_CONN_PASSIVE_WAIT &&
-                 peer->ibp_accepting > 0));
+                 peer_ni->ibp_accepting > 0));
 
         LIBCFS_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars));
         conn->ibc_connvars = NULL;
 
         if (status != 0) {
                 /* failed to establish connection */
-                kiblnd_peer_connect_failed(peer, active, status);
+                kiblnd_peer_connect_failed(peer_ni, active, status);
                 kiblnd_finalise_conn(conn);
                 return;
         }
@@ -2130,38 +2131,38 @@ kiblnd_connreq_done(kib_conn_t *conn, int status)
 
         conn->ibc_last_send = jiffies;
         kiblnd_set_conn_state(conn, IBLND_CONN_ESTABLISHED);
-        kiblnd_peer_alive(peer);
+        kiblnd_peer_alive(peer_ni);
 
-       /* Add conn to peer's list and nuke any dangling conns from a different
-        * peer instance... */
+       /* Add conn to peer_ni's list and nuke any dangling conns from a different
+        * peer_ni instance... */
        kiblnd_conn_addref(conn);       /* +1 ref for ibc_list */
-       list_add(&conn->ibc_list, &peer->ibp_conns);
-       peer->ibp_reconnected = 0;
+       list_add(&conn->ibc_list, &peer_ni->ibp_conns);
+       peer_ni->ibp_reconnected = 0;
        if (active)
-               peer->ibp_connecting--;
+               peer_ni->ibp_connecting--;
        else
-               peer->ibp_accepting--;
+               peer_ni->ibp_accepting--;
 
-        if (peer->ibp_version == 0) {
-                peer->ibp_version     = conn->ibc_version;
-                peer->ibp_incarnation = conn->ibc_incarnation;
+        if (peer_ni->ibp_version == 0) {
+                peer_ni->ibp_version     = conn->ibc_version;
+                peer_ni->ibp_incarnation = conn->ibc_incarnation;
         }
 
-        if (peer->ibp_version     != conn->ibc_version ||
-            peer->ibp_incarnation != conn->ibc_incarnation) {
-                kiblnd_close_stale_conns_locked(peer, conn->ibc_version,
+        if (peer_ni->ibp_version     != conn->ibc_version ||
+            peer_ni->ibp_incarnation != conn->ibc_incarnation) {
+                kiblnd_close_stale_conns_locked(peer_ni, conn->ibc_version,
                                                 conn->ibc_incarnation);
-                peer->ibp_version     = conn->ibc_version;
-                peer->ibp_incarnation = conn->ibc_incarnation;
+                peer_ni->ibp_version     = conn->ibc_version;
+                peer_ni->ibp_incarnation = conn->ibc_incarnation;
         }
 
        /* grab pending txs while I have the lock */
-       list_add(&txs, &peer->ibp_tx_queue);
-       list_del_init(&peer->ibp_tx_queue);
+       list_add(&txs, &peer_ni->ibp_tx_queue);
+       list_del_init(&peer_ni->ibp_tx_queue);
 
-        if (!kiblnd_peer_active(peer) ||        /* peer has been deleted */
+        if (!kiblnd_peer_active(peer_ni) ||        /* peer_ni has been deleted */
             conn->ibc_comms_error != 0) {       /* error has happened already */
-                lnet_ni_t *ni = peer->ibp_ni;
+                lnet_ni_t *ni = peer_ni->ibp_ni;
 
                 /* start to shut down connection */
                 kiblnd_close_conn_locked(conn, -ECONNABORTED);
@@ -2214,8 +2215,8 @@ kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob)
         kib_msg_t             *reqmsg = priv;
         kib_msg_t             *ackmsg;
         kib_dev_t             *ibdev;
-        kib_peer_t            *peer;
-        kib_peer_t            *peer2;
+        kib_peer_ni_t            *peer_ni;
+        kib_peer_ni_t            *peer2;
         kib_conn_t            *conn;
         lnet_ni_t             *ni  = NULL;
         kib_net_t             *net = NULL;
@@ -2241,77 +2242,77 @@ kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob)
         if (*kiblnd_tunables.kib_require_priv_port &&
             ntohs(peer_addr->sin_port) >= PROT_SOCK) {
                __u32 ip = ntohl(peer_addr->sin_addr.s_addr);
-               CERROR("Peer's port (%pI4h:%hu) is not privileged\n",
+               CERROR("peer_ni's port (%pI4h:%hu) is not privileged\n",
                       &ip, ntohs(peer_addr->sin_port));
-                goto failed;
-        }
+               goto failed;
+       }
 
-        if (priv_nob < offsetof(kib_msg_t, ibm_type)) {
-                CERROR("Short connection request\n");
-                goto failed;
-        }
+       if (priv_nob < offsetof(kib_msg_t, ibm_type)) {
+               CERROR("Short connection request\n");
+               goto failed;
+       }
 
-        /* Future protocol version compatibility support!  If the
-         * o2iblnd-specific protocol changes, or when LNET unifies
-         * protocols over all LNDs, the initial connection will
-         * negotiate a protocol version.  I trap this here to avoid
-         * console errors; the reject tells the peer which protocol I
-         * speak. */
-        if (reqmsg->ibm_magic == LNET_PROTO_MAGIC ||
-            reqmsg->ibm_magic == __swab32(LNET_PROTO_MAGIC))
-                goto failed;
-        if (reqmsg->ibm_magic == IBLND_MSG_MAGIC &&
-            reqmsg->ibm_version != IBLND_MSG_VERSION &&
-            reqmsg->ibm_version != IBLND_MSG_VERSION_1)
-                goto failed;
-        if (reqmsg->ibm_magic == __swab32(IBLND_MSG_MAGIC) &&
-            reqmsg->ibm_version != __swab16(IBLND_MSG_VERSION) &&
-            reqmsg->ibm_version != __swab16(IBLND_MSG_VERSION_1))
-                goto failed;
+       /* Future protocol version compatibility support!  If the
+        * o2iblnd-specific protocol changes, or when LNET unifies
+        * protocols over all LNDs, the initial connection will
+        * negotiate a protocol version.  I trap this here to avoid
+        * console errors; the reject tells the peer_ni which protocol I
+        * speak. */
+       if (reqmsg->ibm_magic == LNET_PROTO_MAGIC ||
+           reqmsg->ibm_magic == __swab32(LNET_PROTO_MAGIC))
+               goto failed;
+       if (reqmsg->ibm_magic == IBLND_MSG_MAGIC &&
+           reqmsg->ibm_version != IBLND_MSG_VERSION &&
+           reqmsg->ibm_version != IBLND_MSG_VERSION_1)
+               goto failed;
+       if (reqmsg->ibm_magic == __swab32(IBLND_MSG_MAGIC) &&
+           reqmsg->ibm_version != __swab16(IBLND_MSG_VERSION) &&
+           reqmsg->ibm_version != __swab16(IBLND_MSG_VERSION_1))
+               goto failed;
 
-        rc = kiblnd_unpack_msg(reqmsg, priv_nob);
-        if (rc != 0) {
-                CERROR("Can't parse connection request: %d\n", rc);
-                goto failed;
-        }
+       rc = kiblnd_unpack_msg(reqmsg, priv_nob);
+       if (rc != 0) {
+               CERROR("Can't parse connection request: %d\n", rc);
+               goto failed;
+       }
 
-        nid = reqmsg->ibm_srcnid;
-        ni  = lnet_net2ni(LNET_NIDNET(reqmsg->ibm_dstnid));
+       nid = reqmsg->ibm_srcnid;
+       ni  = lnet_nid2ni_addref(reqmsg->ibm_dstnid);
 
-        if (ni != NULL) {
-                net = (kib_net_t *)ni->ni_data;
-                rej.ibr_incarnation = net->ibn_incarnation;
-        }
+       if (ni != NULL) {
+               net = (kib_net_t *)ni->ni_data;
+               rej.ibr_incarnation = net->ibn_incarnation;
+       }
 
-        if (ni == NULL ||                         /* no matching net */
-            ni->ni_nid != reqmsg->ibm_dstnid ||   /* right NET, wrong NID! */
-            net->ibn_dev != ibdev) {              /* wrong device */
+       if (ni == NULL ||                         /* no matching net */
+           ni->ni_nid != reqmsg->ibm_dstnid ||   /* right NET, wrong NID! */
+           net->ibn_dev != ibdev) {              /* wrong device */
                CERROR("Can't accept conn from %s on %s (%s:%d:%pI4h): "
-                       "bad dst nid %s\n", libcfs_nid2str(nid),
-                       ni == NULL ? "NA" : libcfs_nid2str(ni->ni_nid),
-                       ibdev->ibd_ifname, ibdev->ibd_nnets,
+                      "bad dst nid %s\n", libcfs_nid2str(nid),
+                      ni == NULL ? "NA" : libcfs_nid2str(ni->ni_nid),
+                      ibdev->ibd_ifname, ibdev->ibd_nnets,
                        &ibdev->ibd_ifip,
-                       libcfs_nid2str(reqmsg->ibm_dstnid));
+                      libcfs_nid2str(reqmsg->ibm_dstnid));
 
-                goto failed;
-        }
+               goto failed;
+       }
 
        /* check time stamp as soon as possible */
-        if (reqmsg->ibm_dststamp != 0 &&
-            reqmsg->ibm_dststamp != net->ibn_incarnation) {
-                CWARN("Stale connection request\n");
-                rej.ibr_why = IBLND_REJECT_CONN_STALE;
-                goto failed;
-        }
+       if (reqmsg->ibm_dststamp != 0 &&
+           reqmsg->ibm_dststamp != net->ibn_incarnation) {
+               CWARN("Stale connection request\n");
+               rej.ibr_why = IBLND_REJECT_CONN_STALE;
+               goto failed;
+       }
 
-        /* I can accept peer's version */
-        version = reqmsg->ibm_version;
+       /* I can accept peer_ni's version */
+       version = reqmsg->ibm_version;
 
-        if (reqmsg->ibm_type != IBLND_MSG_CONNREQ) {
-                CERROR("Unexpected connreq msg type: %x from %s\n",
-                       reqmsg->ibm_type, libcfs_nid2str(nid));
-                goto failed;
-        }
+       if (reqmsg->ibm_type != IBLND_MSG_CONNREQ) {
+               CERROR("Unexpected connreq msg type: %x from %s\n",
+                      reqmsg->ibm_type, libcfs_nid2str(nid));
+               goto failed;
+       }
 
        if (reqmsg->ibm_u.connparams.ibcp_queue_depth >
            kiblnd_msg_queue_size(version, ni)) {
@@ -2363,21 +2364,21 @@ kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob)
                 goto failed;
         }
 
-       /* assume 'nid' is a new peer; create  */
-       rc = kiblnd_create_peer(ni, &peer, nid);
+       /* assume 'nid' is a new peer_ni; create  */
+       rc = kiblnd_create_peer(ni, &peer_ni, nid);
        if (rc != 0) {
-               CERROR("Can't create peer for %s\n", libcfs_nid2str(nid));
+               CERROR("Can't create peer_ni for %s\n", libcfs_nid2str(nid));
                rej.ibr_why = IBLND_REJECT_NO_RESOURCES;
                goto failed;
        }
 
        /* We have validated the peer's parameters so use those */
-       peer->ibp_max_frags = reqmsg->ibm_u.connparams.ibcp_max_frags;
-       peer->ibp_queue_depth = reqmsg->ibm_u.connparams.ibcp_queue_depth;
+       peer_ni->ibp_max_frags = reqmsg->ibm_u.connparams.ibcp_max_frags;
+       peer_ni->ibp_queue_depth = reqmsg->ibm_u.connparams.ibcp_queue_depth;
 
        write_lock_irqsave(g_lock, flags);
 
-        peer2 = kiblnd_find_peer_locked(nid);
+        peer2 = kiblnd_find_peer_locked(ni, nid);
         if (peer2 != NULL) {
                 if (peer2->ibp_version == 0) {
                         peer2->ibp_version     = version;
@@ -2399,10 +2400,10 @@ kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob)
                              libcfs_nid2str(nid), peer2->ibp_version, version,
                              peer2->ibp_incarnation, reqmsg->ibm_srcstamp);
 
-                       kiblnd_peer_decref(peer);
-                       rej.ibr_why = IBLND_REJECT_CONN_STALE;
-                       goto failed;
-               }
+                        kiblnd_peer_decref(peer_ni);
+                        rej.ibr_why = IBLND_REJECT_CONN_STALE;
+                        goto failed;
+                }
 
                /* Tie-break connection race in favour of the higher NID.
                 * If we keep running into a race condition multiple times,
@@ -2420,7 +2421,7 @@ kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob)
                        CDEBUG(D_NET, "Conn race %s\n",
                               libcfs_nid2str(peer2->ibp_nid));
 
-                       kiblnd_peer_decref(peer);
+                       kiblnd_peer_decref(peer_ni);
                        rej.ibr_why = IBLND_REJECT_CONN_RACE;
                        goto failed;
                }
@@ -2429,7 +2430,7 @@ kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob)
                                libcfs_nid2str(peer2->ibp_nid),
                                MAX_CONN_RACES_BEFORE_ABORT);
                /*
-                * passive connection is allowed even this peer is waiting for
+                * passive connection is allowed even this peer_ni is waiting for
                 * reconnection.
                 */
                peer2->ibp_reconnecting = 0;
@@ -2437,38 +2438,38 @@ kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob)
                peer2->ibp_accepting++;
                kiblnd_peer_addref(peer2);
 
-               /* Race with kiblnd_launch_tx (active connect) to create peer
+               /* Race with kiblnd_launch_tx (active connect) to create peer_ni
                 * so copy validated parameters since we now know what the
-                * peer's limits are */
-               peer2->ibp_max_frags = peer->ibp_max_frags;
-               peer2->ibp_queue_depth = peer->ibp_queue_depth;
+                * peer_ni's limits are */
+               peer2->ibp_max_frags = peer_ni->ibp_max_frags;
+               peer2->ibp_queue_depth = peer_ni->ibp_queue_depth;
 
                write_unlock_irqrestore(g_lock, flags);
-                kiblnd_peer_decref(peer);
-                peer = peer2;
+                kiblnd_peer_decref(peer_ni);
+                peer_ni = peer2;
         } else {
-                /* Brand new peer */
-                LASSERT (peer->ibp_accepting == 0);
-                LASSERT (peer->ibp_version == 0 &&
-                         peer->ibp_incarnation == 0);
+                /* Brand new peer_ni */
+                LASSERT (peer_ni->ibp_accepting == 0);
+                LASSERT (peer_ni->ibp_version == 0 &&
+                         peer_ni->ibp_incarnation == 0);
 
-                peer->ibp_accepting   = 1;
-                peer->ibp_version     = version;
-                peer->ibp_incarnation = reqmsg->ibm_srcstamp;
+                peer_ni->ibp_accepting   = 1;
+                peer_ni->ibp_version     = version;
+                peer_ni->ibp_incarnation = reqmsg->ibm_srcstamp;
 
                 /* I have a ref on ni that prevents it being shutdown */
                 LASSERT (net->ibn_shutdown == 0);
 
-                kiblnd_peer_addref(peer);
-               list_add_tail(&peer->ibp_list, kiblnd_nid2peerlist(nid));
+                kiblnd_peer_addref(peer_ni);
+               list_add_tail(&peer_ni->ibp_list, kiblnd_nid2peerlist(nid));
 
                write_unlock_irqrestore(g_lock, flags);
         }
 
-       conn = kiblnd_create_conn(peer, cmid, IBLND_CONN_PASSIVE_WAIT, version);
+       conn = kiblnd_create_conn(peer_ni, cmid, IBLND_CONN_PASSIVE_WAIT, version);
         if (conn == NULL) {
-                kiblnd_peer_connect_failed(peer, 0, -ENOMEM);
-                kiblnd_peer_decref(peer);
+                kiblnd_peer_connect_failed(peer_ni, 0, -ENOMEM);
+                kiblnd_peer_decref(peer_ni);
                 rej.ibr_why = IBLND_REJECT_NO_RESOURCES;
                 goto failed;
         }
@@ -2536,7 +2537,7 @@ kiblnd_check_reconnect(kib_conn_t *conn, int version,
                       __u64 incarnation, int why, kib_connparams_t *cp)
 {
        rwlock_t        *glock = &kiblnd_data.kib_global_lock;
-       kib_peer_t      *peer = conn->ibc_peer;
+       kib_peer_ni_t   *peer_ni = conn->ibc_peer;
        char            *reason;
        int              msg_size = IBLND_MSG_SIZE;
        int              frag_num = -1;
@@ -2545,8 +2546,8 @@ kiblnd_check_reconnect(kib_conn_t *conn, int version,
        unsigned long    flags;
 
        LASSERT(conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT);
-       LASSERT(peer->ibp_connecting > 0);      /* 'conn' at least */
-       LASSERT(!peer->ibp_reconnecting);
+       LASSERT(peer_ni->ibp_connecting > 0);   /* 'conn' at least */
+       LASSERT(!peer_ni->ibp_reconnecting);
 
        if (cp) {
                msg_size        = cp->ibcp_max_msg_size;
@@ -2560,10 +2561,10 @@ kiblnd_check_reconnect(kib_conn_t *conn, int version,
          * NB: reconnect is still needed even when ibp_tx_queue is
          * empty if ibp_version != version because reconnect may be
          * initiated by kiblnd_query() */
-       reconnect = (!list_empty(&peer->ibp_tx_queue) ||
-                    peer->ibp_version != version) &&
-                   peer->ibp_connecting == 1 &&
-                   peer->ibp_accepting == 0;
+       reconnect = (!list_empty(&peer_ni->ibp_tx_queue) ||
+                    peer_ni->ibp_version != version) &&
+                   peer_ni->ibp_connecting == 1 &&
+                   peer_ni->ibp_accepting == 0;
        if (!reconnect) {
                reason = "no need";
                goto out;
@@ -2575,14 +2576,14 @@ kiblnd_check_reconnect(kib_conn_t *conn, int version,
                 break;
 
        case IBLND_REJECT_RDMA_FRAGS: {
-               struct lnet_ioctl_config_lnd_tunables *tunables;
+               struct lnet_ioctl_config_o2iblnd_tunables *tunables;
 
                if (!cp) {
                        reason = "can't negotiate max frags";
                        goto out;
                }
-               tunables = peer->ibp_ni->ni_lnd_tunables;
-               if (!tunables->lt_tun_u.lt_o2ib.lnd_map_on_demand) {
+               tunables = &peer_ni->ibp_ni->ni_lnd_tunables.lnd_tun_u.lnd_o2ib;
+               if (!tunables->lnd_map_on_demand) {
                        reason = "map_on_demand must be enabled";
                        goto out;
                }
@@ -2591,7 +2592,7 @@ kiblnd_check_reconnect(kib_conn_t *conn, int version,
                        goto out;
                }
 
-               peer->ibp_max_frags = frag_num;
+               peer_ni->ibp_max_frags = frag_num;
                reason = "rdma fragments";
                break;
        }
@@ -2605,7 +2606,7 @@ kiblnd_check_reconnect(kib_conn_t *conn, int version,
                        goto out;
                }
 
-               peer->ibp_queue_depth = queue_dep;
+               peer_ni->ibp_queue_depth = queue_dep;
                reason = "queue depth";
                break;
 
@@ -2623,21 +2624,21 @@ kiblnd_check_reconnect(kib_conn_t *conn, int version,
         }
 
        conn->ibc_reconnect = 1;
-       peer->ibp_reconnecting = 1;
-       peer->ibp_version = version;
+       peer_ni->ibp_reconnecting = 1;
+       peer_ni->ibp_version = version;
        if (incarnation != 0)
-               peer->ibp_incarnation = incarnation;
+               peer_ni->ibp_incarnation = incarnation;
  out:
        write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
 
        CNETERR("%s: %s (%s), %x, %x, msg_size: %d, queue_depth: %d/%d, max_frags: %d/%d\n",
-               libcfs_nid2str(peer->ibp_nid),
+               libcfs_nid2str(peer_ni->ibp_nid),
                reconnect ? "reconnect" : "don't reconnect",
                reason, IBLND_MSG_VERSION, version, msg_size,
                conn->ibc_queue_depth, queue_dep,
                conn->ibc_max_frags, frag_num);
        /*
-        * if conn::ibc_reconnect is TRUE, connd will reconnect to the peer
+        * if conn::ibc_reconnect is TRUE, connd will reconnect to the peer_ni
         * while destroying the zombie
         */
 }
@@ -2645,7 +2646,7 @@ kiblnd_check_reconnect(kib_conn_t *conn, int version,
 static void
 kiblnd_rejected (kib_conn_t *conn, int reason, void *priv, int priv_nob)
 {
-       kib_peer_t    *peer = conn->ibc_peer;
+       kib_peer_ni_t    *peer_ni = conn->ibc_peer;
 
        LASSERT (!in_interrupt());
        LASSERT (conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT);
@@ -2658,7 +2659,7 @@ kiblnd_rejected (kib_conn_t *conn, int reason, void *priv, int priv_nob)
 
         case IB_CM_REJ_INVALID_SERVICE_ID:
                 CNETERR("%s rejected: no listener at %d\n",
-                        libcfs_nid2str(peer->ibp_nid),
+                        libcfs_nid2str(peer_ni->ibp_nid),
                         *kiblnd_tunables.kib_service);
                 break;
 
@@ -2674,7 +2675,7 @@ kiblnd_rejected (kib_conn_t *conn, int reason, void *priv, int priv_nob)
                          * b) V2 will provide incarnation while rejecting me,
                          *    -1 will be overwrote.
                          *
-                         * if I try to connect to a V1 peer with V2 protocol,
+                         * if I try to connect to a V1 peer_ni with V2 protocol,
                          * it rejected me then upgrade to V2, I have no idea
                          * about the upgrading and try to reconnect with V1,
                          * in this case upgraded V2 can find out I'm trying to
@@ -2708,22 +2709,22 @@ kiblnd_rejected (kib_conn_t *conn, int reason, void *priv, int priv_nob)
                         if (rej->ibr_magic != IBLND_MSG_MAGIC &&
                             rej->ibr_magic != LNET_PROTO_MAGIC) {
                                 CERROR("%s rejected: consumer defined fatal error\n",
-                                       libcfs_nid2str(peer->ibp_nid));
+                                       libcfs_nid2str(peer_ni->ibp_nid));
                                 break;
                         }
 
                         if (rej->ibr_version != IBLND_MSG_VERSION &&
                             rej->ibr_version != IBLND_MSG_VERSION_1) {
                                 CERROR("%s rejected: o2iblnd version %x error\n",
-                                       libcfs_nid2str(peer->ibp_nid),
+                                       libcfs_nid2str(peer_ni->ibp_nid),
                                        rej->ibr_version);
                                 break;
                         }
 
                         if (rej->ibr_why     == IBLND_REJECT_FATAL &&
                             rej->ibr_version == IBLND_MSG_VERSION_1) {
-                                CDEBUG(D_NET, "rejected by old version peer %s: %x\n",
-                                       libcfs_nid2str(peer->ibp_nid), rej->ibr_version);
+                                CDEBUG(D_NET, "rejected by old version peer_ni %s: %x\n",
+                                       libcfs_nid2str(peer_ni->ibp_nid), rej->ibr_version);
 
                                 if (conn->ibc_version != IBLND_MSG_VERSION_1)
                                         rej->ibr_why = IBLND_REJECT_CONN_UNCOMPAT;
@@ -2741,17 +2742,17 @@ kiblnd_rejected (kib_conn_t *conn, int reason, void *priv, int priv_nob)
 
                         case IBLND_REJECT_NO_RESOURCES:
                                 CERROR("%s rejected: o2iblnd no resources\n",
-                                       libcfs_nid2str(peer->ibp_nid));
+                                       libcfs_nid2str(peer_ni->ibp_nid));
                                 break;
 
                         case IBLND_REJECT_FATAL:
                                 CERROR("%s rejected: o2iblnd fatal error\n",
-                                       libcfs_nid2str(peer->ibp_nid));
+                                       libcfs_nid2str(peer_ni->ibp_nid));
                                 break;
 
                         default:
                                 CERROR("%s rejected: o2iblnd reason %d\n",
-                                       libcfs_nid2str(peer->ibp_nid),
+                                       libcfs_nid2str(peer_ni->ibp_nid),
                                        rej->ibr_why);
                                 break;
                         }
@@ -2760,7 +2761,7 @@ kiblnd_rejected (kib_conn_t *conn, int reason, void *priv, int priv_nob)
                 /* fall through */
         default:
                 CNETERR("%s rejected: reason %d, size %d\n",
-                        libcfs_nid2str(peer->ibp_nid), reason, priv_nob);
+                        libcfs_nid2str(peer_ni->ibp_nid), reason, priv_nob);
                 break;
         }
 
@@ -2770,8 +2771,8 @@ kiblnd_rejected (kib_conn_t *conn, int reason, void *priv, int priv_nob)
 static void
 kiblnd_check_connreply (kib_conn_t *conn, void *priv, int priv_nob)
 {
-        kib_peer_t    *peer = conn->ibc_peer;
-        lnet_ni_t     *ni   = peer->ibp_ni;
+        kib_peer_ni_t    *peer_ni = conn->ibc_peer;
+        lnet_ni_t     *ni   = peer_ni->ibp_ni;
         kib_net_t     *net  = ni->ni_data;
         kib_msg_t     *msg  = priv;
         int            ver  = conn->ibc_version;
@@ -2782,13 +2783,13 @@ kiblnd_check_connreply (kib_conn_t *conn, void *priv, int priv_nob)
 
         if (rc != 0) {
                 CERROR("Can't unpack connack from %s: %d\n",
-                       libcfs_nid2str(peer->ibp_nid), rc);
+                       libcfs_nid2str(peer_ni->ibp_nid), rc);
                 goto failed;
         }
 
         if (msg->ibm_type != IBLND_MSG_CONNACK) {
                 CERROR("Unexpected message %d from %s\n",
-                       msg->ibm_type, libcfs_nid2str(peer->ibp_nid));
+                       msg->ibm_type, libcfs_nid2str(peer_ni->ibp_nid));
                 rc = -EPROTO;
                 goto failed;
         }
@@ -2796,7 +2797,7 @@ kiblnd_check_connreply (kib_conn_t *conn, void *priv, int priv_nob)
         if (ver != msg->ibm_version) {
                 CERROR("%s replied version %x is different with "
                        "requested version %x\n",
-                       libcfs_nid2str(peer->ibp_nid), msg->ibm_version, ver);
+                       libcfs_nid2str(peer_ni->ibp_nid), msg->ibm_version, ver);
                 rc = -EPROTO;
                 goto failed;
         }
@@ -2804,7 +2805,7 @@ kiblnd_check_connreply (kib_conn_t *conn, void *priv, int priv_nob)
        if (msg->ibm_u.connparams.ibcp_queue_depth >
            conn->ibc_queue_depth) {
                CERROR("%s has incompatible queue depth %d (<=%d wanted)\n",
-                      libcfs_nid2str(peer->ibp_nid),
+                      libcfs_nid2str(peer_ni->ibp_nid),
                       msg->ibm_u.connparams.ibcp_queue_depth,
                       conn->ibc_queue_depth);
                rc = -EPROTO;
@@ -2814,7 +2815,7 @@ kiblnd_check_connreply (kib_conn_t *conn, void *priv, int priv_nob)
        if (msg->ibm_u.connparams.ibcp_max_frags >
            conn->ibc_max_frags) {
                CERROR("%s has incompatible max_frags %d (<=%d wanted)\n",
-                      libcfs_nid2str(peer->ibp_nid),
+                      libcfs_nid2str(peer_ni->ibp_nid),
                       msg->ibm_u.connparams.ibcp_max_frags,
                       conn->ibc_max_frags);
                rc = -EPROTO;
@@ -2823,7 +2824,7 @@ kiblnd_check_connreply (kib_conn_t *conn, void *priv, int priv_nob)
 
         if (msg->ibm_u.connparams.ibcp_max_msg_size > IBLND_MSG_SIZE) {
                 CERROR("%s max message size %d too big (%d max)\n",
-                       libcfs_nid2str(peer->ibp_nid),
+                       libcfs_nid2str(peer_ni->ibp_nid),
                        msg->ibm_u.connparams.ibcp_max_msg_size,
                        IBLND_MSG_SIZE);
                 rc = -EPROTO;
@@ -2841,7 +2842,7 @@ kiblnd_check_connreply (kib_conn_t *conn, void *priv, int priv_nob)
         if (rc != 0) {
                 CERROR("Bad connection reply from %s, rc = %d, "
                        "version: %x max_frags: %d\n",
-                       libcfs_nid2str(peer->ibp_nid), rc,
+                       libcfs_nid2str(peer_ni->ibp_nid), rc,
                        msg->ibm_version, msg->ibm_u.connparams.ibcp_max_frags);
                 goto failed;
         }
@@ -2871,7 +2872,7 @@ kiblnd_check_connreply (kib_conn_t *conn, void *priv, int priv_nob)
 static int
 kiblnd_active_connect (struct rdma_cm_id *cmid)
 {
-        kib_peer_t              *peer = (kib_peer_t *)cmid->context;
+        kib_peer_ni_t              *peer_ni = (kib_peer_ni_t *)cmid->context;
         kib_conn_t              *conn;
         kib_msg_t               *msg;
         struct rdma_conn_param   cp;
@@ -2882,23 +2883,23 @@ kiblnd_active_connect (struct rdma_cm_id *cmid)
 
        read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
 
-       incarnation = peer->ibp_incarnation;
-       version     = (peer->ibp_version == 0) ? IBLND_MSG_VERSION :
-                                                peer->ibp_version;
+       incarnation = peer_ni->ibp_incarnation;
+       version     = (peer_ni->ibp_version == 0) ? IBLND_MSG_VERSION :
+                                                peer_ni->ibp_version;
 
        read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
 
-       conn = kiblnd_create_conn(peer, cmid, IBLND_CONN_ACTIVE_CONNECT,
+       conn = kiblnd_create_conn(peer_ni, cmid, IBLND_CONN_ACTIVE_CONNECT,
                                  version);
         if (conn == NULL) {
-                kiblnd_peer_connect_failed(peer, 1, -ENOMEM);
-                kiblnd_peer_decref(peer); /* lose cmid's ref */
+                kiblnd_peer_connect_failed(peer_ni, 1, -ENOMEM);
+                kiblnd_peer_decref(peer_ni); /* lose cmid's ref */
                 return -ENOMEM;
         }
 
         /* conn "owns" cmid now, so I return success from here on to ensure the
          * CM callback doesn't destroy cmid. conn also takes over cmid's ref
-         * on peer */
+         * on peer_ni */
 
         msg = &conn->ibc_connvars->cv_msg;
 
@@ -2908,8 +2909,8 @@ kiblnd_active_connect (struct rdma_cm_id *cmid)
        msg->ibm_u.connparams.ibcp_max_frags    = conn->ibc_max_frags;
        msg->ibm_u.connparams.ibcp_max_msg_size = IBLND_MSG_SIZE;
 
-        kiblnd_pack_msg(peer->ibp_ni, msg, version,
-                        0, peer->ibp_nid, incarnation);
+        kiblnd_pack_msg(peer_ni->ibp_ni, msg, version,
+                        0, peer_ni->ibp_nid, incarnation);
 
         memset(&cp, 0, sizeof(cp));
         cp.private_data        = msg;
@@ -2926,7 +2927,7 @@ kiblnd_active_connect (struct rdma_cm_id *cmid)
         rc = rdma_connect(cmid, &cp);
         if (rc != 0) {
                 CERROR("Can't connect to %s: %d\n",
-                       libcfs_nid2str(peer->ibp_nid), rc);
+                       libcfs_nid2str(peer_ni->ibp_nid), rc);
                 kiblnd_connreq_done(conn, rc);
                 kiblnd_conn_decref(conn);
         }
@@ -2937,7 +2938,7 @@ kiblnd_active_connect (struct rdma_cm_id *cmid)
 int
 kiblnd_cm_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event)
 {
-        kib_peer_t  *peer;
+        kib_peer_ni_t  *peer_ni;
         kib_conn_t  *conn;
        int          rc;
 
@@ -2956,22 +2957,22 @@ kiblnd_cm_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event)
                 return rc;
                 
        case RDMA_CM_EVENT_ADDR_ERROR:
-                peer = (kib_peer_t *)cmid->context;
+                peer_ni = (kib_peer_ni_t *)cmid->context;
                 CNETERR("%s: ADDR ERROR %d\n",
-                       libcfs_nid2str(peer->ibp_nid), event->status);
-                kiblnd_peer_connect_failed(peer, 1, -EHOSTUNREACH);
-                kiblnd_peer_decref(peer);
+                       libcfs_nid2str(peer_ni->ibp_nid), event->status);
+                kiblnd_peer_connect_failed(peer_ni, 1, -EHOSTUNREACH);
+                kiblnd_peer_decref(peer_ni);
                 return -EHOSTUNREACH;      /* rc != 0 destroys cmid */
 
        case RDMA_CM_EVENT_ADDR_RESOLVED:
-                peer = (kib_peer_t *)cmid->context;
+                peer_ni = (kib_peer_ni_t *)cmid->context;
 
                 CDEBUG(D_NET,"%s Addr resolved: %d\n",
-                       libcfs_nid2str(peer->ibp_nid), event->status);
+                       libcfs_nid2str(peer_ni->ibp_nid), event->status);
 
                 if (event->status != 0) {
                         CNETERR("Can't resolve address for %s: %d\n",
-                                libcfs_nid2str(peer->ibp_nid), event->status);
+                                libcfs_nid2str(peer_ni->ibp_nid), event->status);
                         rc = event->status;
                 } else {
                         rc = rdma_resolve_route(
@@ -2980,32 +2981,32 @@ kiblnd_cm_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event)
                                 return 0;
                         /* Can't initiate route resolution */
                         CERROR("Can't resolve route for %s: %d\n",
-                               libcfs_nid2str(peer->ibp_nid), rc);
+                               libcfs_nid2str(peer_ni->ibp_nid), rc);
                 }
-                kiblnd_peer_connect_failed(peer, 1, rc);
-                kiblnd_peer_decref(peer);
+                kiblnd_peer_connect_failed(peer_ni, 1, rc);
+                kiblnd_peer_decref(peer_ni);
                 return rc;                      /* rc != 0 destroys cmid */
 
        case RDMA_CM_EVENT_ROUTE_ERROR:
-                peer = (kib_peer_t *)cmid->context;
+                peer_ni = (kib_peer_ni_t *)cmid->context;
                 CNETERR("%s: ROUTE ERROR %d\n",
-                        libcfs_nid2str(peer->ibp_nid), event->status);
-                kiblnd_peer_connect_failed(peer, 1, -EHOSTUNREACH);
-                kiblnd_peer_decref(peer);
+                        libcfs_nid2str(peer_ni->ibp_nid), event->status);
+                kiblnd_peer_connect_failed(peer_ni, 1, -EHOSTUNREACH);
+                kiblnd_peer_decref(peer_ni);
                 return -EHOSTUNREACH;           /* rc != 0 destroys cmid */
 
        case RDMA_CM_EVENT_ROUTE_RESOLVED:
-                peer = (kib_peer_t *)cmid->context;
+                peer_ni = (kib_peer_ni_t *)cmid->context;
                 CDEBUG(D_NET,"%s Route resolved: %d\n",
-                       libcfs_nid2str(peer->ibp_nid), event->status);
+                       libcfs_nid2str(peer_ni->ibp_nid), event->status);
 
                 if (event->status == 0)
                         return kiblnd_active_connect(cmid);
 
                 CNETERR("Can't resolve route for %s: %d\n",
-                       libcfs_nid2str(peer->ibp_nid), event->status);
-                kiblnd_peer_connect_failed(peer, 1, event->status);
-                kiblnd_peer_decref(peer);
+                       libcfs_nid2str(peer_ni->ibp_nid), event->status);
+                kiblnd_peer_connect_failed(peer_ni, 1, event->status);
+                kiblnd_peer_decref(peer_ni);
                 return event->status;           /* rc != 0 destroys cmid */
                 
        case RDMA_CM_EVENT_UNREACHABLE:
@@ -3148,7 +3149,7 @@ kiblnd_check_conns (int idx)
        struct list_head  checksends = LIST_HEAD_INIT(checksends);
        struct list_head *peers = &kiblnd_data.kib_peers[idx];
        struct list_head *ptmp;
-       kib_peer_t       *peer;
+       kib_peer_ni_t    *peer_ni;
        kib_conn_t       *conn;
        struct list_head *ctmp;
        unsigned long     flags;
@@ -3159,9 +3160,9 @@ kiblnd_check_conns (int idx)
        read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
 
        list_for_each(ptmp, peers) {
-               peer = list_entry(ptmp, kib_peer_t, ibp_list);
+               peer_ni = list_entry(ptmp, kib_peer_ni_t, ibp_list);
 
-               list_for_each(ctmp, &peer->ibp_conns) {
+               list_for_each(ctmp, &peer_ni->ibp_conns) {
                        int timedout;
                        int sendnoop;
 
@@ -3181,9 +3182,9 @@ kiblnd_check_conns (int idx)
                        if (timedout) {
                                CERROR("Timed out RDMA with %s (%lu): "
                                       "c: %u, oc: %u, rc: %u\n",
-                                      libcfs_nid2str(peer->ibp_nid),
+                                      libcfs_nid2str(peer_ni->ibp_nid),
                                       cfs_duration_sec(cfs_time_current() -
-                                                       peer->ibp_last_alive),
+                                                       peer_ni->ibp_last_alive),
                                       conn->ibc_credits,
                                       conn->ibc_outstanding_credits,
                                       conn->ibc_reserved_credits);
@@ -3241,7 +3242,7 @@ kiblnd_disconnect_conn (kib_conn_t *conn)
 }
 
 /*
- * High-water for reconnection to the same peer, reconnection attempt should
+ * High-water for reconnection to the same peer_ni, reconnection attempt should
  * be delayed after trying more than KIB_RECONN_HIGH_RACE.
  */
 #define KIB_RECONN_HIGH_RACE   10
@@ -3277,27 +3278,27 @@ kiblnd_connd (void *arg)
                 dropped_lock = 0;
 
                if (!list_empty(&kiblnd_data.kib_connd_zombies)) {
-                       kib_peer_t *peer = NULL;
+                       kib_peer_ni_t *peer_ni = NULL;
 
                        conn = list_entry(kiblnd_data.kib_connd_zombies.next,
                                          kib_conn_t, ibc_list);
                        list_del(&conn->ibc_list);
                        if (conn->ibc_reconnect) {
-                               peer = conn->ibc_peer;
-                               kiblnd_peer_addref(peer);
+                               peer_ni = conn->ibc_peer;
+                               kiblnd_peer_addref(peer_ni);
                        }
 
                        spin_unlock_irqrestore(lock, flags);
                        dropped_lock = 1;
 
-                       kiblnd_destroy_conn(conn, !peer);
+                       kiblnd_destroy_conn(conn, !peer_ni);
 
                        spin_lock_irqsave(lock, flags);
-                       if (!peer)
+                       if (!peer_ni)
                                continue;
 
-                       conn->ibc_peer = peer;
-                       if (peer->ibp_reconnected < KIB_RECONN_HIGH_RACE)
+                       conn->ibc_peer = peer_ni;
+                       if (peer_ni->ibp_reconnected < KIB_RECONN_HIGH_RACE)
                                list_add_tail(&conn->ibc_list,
                                              &kiblnd_data.kib_reconn_list);
                        else
@@ -3356,7 +3357,7 @@ kiblnd_connd (void *arg)
 
                         /* Time to check for RDMA timeouts on a few more
                          * peers: I do checks every 'p' seconds on a
-                         * proportion of the peer table and I need to check
+                         * proportion of the peer_ni table and I need to check
                          * every connection 'n' times within a timeout
                          * interval, to ensure I detect a timeout on any
                          * connection within (n+1)/n times the timeout
index 1466dd9..54a81b5 100644 (file)
@@ -164,7 +164,7 @@ kiblnd_msg_queue_size(int version, lnet_ni_t *ni)
        if (version == IBLND_MSG_VERSION_1)
                return IBLND_MSG_QUEUE_SIZE_V1;
        else if (ni)
-               return ni->ni_peertxcredits;
+               return ni->ni_net->net_tunables.lct_peer_tx_credits;
        else
                return peer_credits;
 }
@@ -173,21 +173,17 @@ int
 kiblnd_tunables_setup(lnet_ni_t *ni)
 {
        struct lnet_ioctl_config_o2iblnd_tunables *tunables;
+       struct lnet_ioctl_config_lnd_cmn_tunables *net_tunables;
 
        /*
         * if there was no tunables specified, setup the tunables to be
         * defaulted
         */
-       if (!ni->ni_lnd_tunables) {
-               LIBCFS_ALLOC(ni->ni_lnd_tunables,
-                            sizeof(*ni->ni_lnd_tunables));
-               if (!ni->ni_lnd_tunables)
-                       return -ENOMEM;
-
-               memcpy(&ni->ni_lnd_tunables->lt_tun_u.lt_o2ib,
+       if (!ni->ni_lnd_tunables_set)
+               memcpy(&ni->ni_lnd_tunables.lnd_tun_u.lnd_o2ib,
                       &default_tunables, sizeof(*tunables));
-       }
-       tunables = &ni->ni_lnd_tunables->lt_tun_u.lt_o2ib;
+
+       tunables = &ni->ni_lnd_tunables.lnd_tun_u.lnd_o2ib;
 
        /* Current API version */
        tunables->lnd_version = 0;
@@ -198,35 +194,39 @@ kiblnd_tunables_setup(lnet_ni_t *ni)
                return -EINVAL;
        }
 
-       if (!ni->ni_peertimeout)
-               ni->ni_peertimeout = peer_timeout;
+       net_tunables = &ni->ni_net->net_tunables;
 
-       if (!ni->ni_maxtxcredits)
-               ni->ni_maxtxcredits = credits;
+       if (net_tunables->lct_peer_timeout == -1)
+               net_tunables->lct_peer_timeout = peer_timeout;
 
-       if (!ni->ni_peertxcredits)
-               ni->ni_peertxcredits = peer_credits;
+       if (net_tunables->lct_max_tx_credits == -1)
+               net_tunables->lct_max_tx_credits = credits;
 
-       if (!ni->ni_peerrtrcredits)
-               ni->ni_peerrtrcredits = peer_buffer_credits;
+       if (net_tunables->lct_peer_tx_credits == -1)
+               net_tunables->lct_peer_tx_credits = peer_credits;
 
-       if (ni->ni_peertxcredits < IBLND_CREDITS_DEFAULT)
-               ni->ni_peertxcredits = IBLND_CREDITS_DEFAULT;
+       if (net_tunables->lct_peer_rtr_credits == -1)
+               net_tunables->lct_peer_rtr_credits = peer_buffer_credits;
 
-       if (ni->ni_peertxcredits > IBLND_CREDITS_MAX)
-               ni->ni_peertxcredits = IBLND_CREDITS_MAX;
+       if (net_tunables->lct_peer_tx_credits < IBLND_CREDITS_DEFAULT)
+               net_tunables->lct_peer_tx_credits = IBLND_CREDITS_DEFAULT;
 
-       if (ni->ni_peertxcredits > credits)
-               ni->ni_peertxcredits = credits;
+       if (net_tunables->lct_peer_tx_credits > IBLND_CREDITS_MAX)
+               net_tunables->lct_peer_tx_credits = IBLND_CREDITS_MAX;
+
+       if (net_tunables->lct_peer_tx_credits >
+           net_tunables->lct_max_tx_credits)
+               net_tunables->lct_peer_tx_credits =
+                       net_tunables->lct_max_tx_credits;
 
        if (!tunables->lnd_peercredits_hiw)
                tunables->lnd_peercredits_hiw = peer_credits_hiw;
 
-       if (tunables->lnd_peercredits_hiw < ni->ni_peertxcredits / 2)
-               tunables->lnd_peercredits_hiw = ni->ni_peertxcredits / 2;
+       if (tunables->lnd_peercredits_hiw < net_tunables->lct_peer_tx_credits / 2)
+               tunables->lnd_peercredits_hiw = net_tunables->lct_peer_tx_credits / 2;
 
-       if (tunables->lnd_peercredits_hiw >= ni->ni_peertxcredits)
-               tunables->lnd_peercredits_hiw = ni->ni_peertxcredits - 1;
+       if (tunables->lnd_peercredits_hiw >= net_tunables->lct_peer_tx_credits)
+               tunables->lnd_peercredits_hiw = net_tunables->lct_peer_tx_credits - 1;
 
        if (tunables->lnd_map_on_demand < 0 ||
            tunables->lnd_map_on_demand > IBLND_MAX_RDMA_FRAGS) {
@@ -243,22 +243,24 @@ kiblnd_tunables_setup(lnet_ni_t *ni)
                if (tunables->lnd_map_on_demand > 0 &&
                    tunables->lnd_map_on_demand <= IBLND_MAX_RDMA_FRAGS / 8) {
                        tunables->lnd_concurrent_sends =
-                                               ni->ni_peertxcredits * 2;
+                                       net_tunables->lct_peer_tx_credits * 2;
                } else {
-                       tunables->lnd_concurrent_sends = ni->ni_peertxcredits;
+                       tunables->lnd_concurrent_sends =
+                               net_tunables->lct_peer_tx_credits;
                }
        }
 
-       if (tunables->lnd_concurrent_sends > ni->ni_peertxcredits * 2)
-               tunables->lnd_concurrent_sends = ni->ni_peertxcredits * 2;
+       if (tunables->lnd_concurrent_sends > net_tunables->lct_peer_tx_credits * 2)
+               tunables->lnd_concurrent_sends = net_tunables->lct_peer_tx_credits * 2;
 
-       if (tunables->lnd_concurrent_sends < ni->ni_peertxcredits / 2)
-               tunables->lnd_concurrent_sends = ni->ni_peertxcredits / 2;
+       if (tunables->lnd_concurrent_sends < net_tunables->lct_peer_tx_credits / 2)
+               tunables->lnd_concurrent_sends = net_tunables->lct_peer_tx_credits / 2;
 
-       if (tunables->lnd_concurrent_sends < ni->ni_peertxcredits) {
+       if (tunables->lnd_concurrent_sends < net_tunables->lct_peer_tx_credits) {
                CWARN("Concurrent sends %d is lower than message "
                      "queue size: %d, performance may drop slightly.\n",
-                     tunables->lnd_concurrent_sends, ni->ni_peertxcredits);
+                     tunables->lnd_concurrent_sends,
+                     net_tunables->lct_peer_tx_credits);
        }
 
        if (!tunables->lnd_fmr_pool_size)
index 33c34cd..ad32457 100644 (file)
@@ -37,6 +37,7 @@
  * Author: Eric Barton <eric@bartonsoftware.com>
  */
 
+#include <linux/pci.h>
 #include "socklnd.h"
 
 static lnd_t                   the_ksocklnd;
@@ -96,42 +97,42 @@ ksocknal_destroy_route (ksock_route_t *route)
 }
 
 static int
-ksocknal_create_peer(ksock_peer_t **peerp, lnet_ni_t *ni, lnet_process_id_t id)
+ksocknal_create_peer(ksock_peer_ni_t **peerp, lnet_ni_t *ni, lnet_process_id_t id)
 {
-       int             cpt = lnet_cpt_of_nid(id.nid);
+       int             cpt = lnet_cpt_of_nid(id.nid, ni);
        ksock_net_t     *net = ni->ni_data;
-       ksock_peer_t    *peer;
+       ksock_peer_ni_t *peer_ni;
 
        LASSERT(id.nid != LNET_NID_ANY);
        LASSERT(id.pid != LNET_PID_ANY);
        LASSERT(!in_interrupt());
 
-       LIBCFS_CPT_ALLOC(peer, lnet_cpt_table(), cpt, sizeof(*peer));
-       if (peer == NULL)
+       LIBCFS_CPT_ALLOC(peer_ni, lnet_cpt_table(), cpt, sizeof(*peer_ni));
+       if (peer_ni == NULL)
                return -ENOMEM;
 
-       peer->ksnp_ni = ni;
-       peer->ksnp_id = id;
-       atomic_set(&peer->ksnp_refcount, 1);    /* 1 ref for caller */
-       peer->ksnp_closing = 0;
-       peer->ksnp_accepting = 0;
-       peer->ksnp_proto = NULL;
-       peer->ksnp_last_alive = 0;
-       peer->ksnp_zc_next_cookie = SOCKNAL_KEEPALIVE_PING + 1;
-
-       INIT_LIST_HEAD(&peer->ksnp_conns);
-       INIT_LIST_HEAD(&peer->ksnp_routes);
-       INIT_LIST_HEAD(&peer->ksnp_tx_queue);
-       INIT_LIST_HEAD(&peer->ksnp_zc_req_list);
-       spin_lock_init(&peer->ksnp_lock);
+       peer_ni->ksnp_ni = ni;
+       peer_ni->ksnp_id = id;
+       atomic_set(&peer_ni->ksnp_refcount, 1); /* 1 ref for caller */
+       peer_ni->ksnp_closing = 0;
+       peer_ni->ksnp_accepting = 0;
+       peer_ni->ksnp_proto = NULL;
+       peer_ni->ksnp_last_alive = 0;
+       peer_ni->ksnp_zc_next_cookie = SOCKNAL_KEEPALIVE_PING + 1;
+
+       INIT_LIST_HEAD(&peer_ni->ksnp_conns);
+       INIT_LIST_HEAD(&peer_ni->ksnp_routes);
+       INIT_LIST_HEAD(&peer_ni->ksnp_tx_queue);
+       INIT_LIST_HEAD(&peer_ni->ksnp_zc_req_list);
+       spin_lock_init(&peer_ni->ksnp_lock);
 
        spin_lock_bh(&net->ksnn_lock);
 
        if (net->ksnn_shutdown) {
                spin_unlock_bh(&net->ksnn_lock);
 
-               LIBCFS_FREE(peer, sizeof(*peer));
-               CERROR("Can't create peer: network shutdown\n");
+               LIBCFS_FREE(peer_ni, sizeof(*peer_ni));
+               CERROR("Can't create peer_ni: network shutdown\n");
                return -ESHUTDOWN;
        }
 
@@ -139,106 +140,106 @@ ksocknal_create_peer(ksock_peer_t **peerp, lnet_ni_t *ni, lnet_process_id_t id)
 
        spin_unlock_bh(&net->ksnn_lock);
 
-       *peerp = peer;
+       *peerp = peer_ni;
        return 0;
 }
 
 void
-ksocknal_destroy_peer (ksock_peer_t *peer)
+ksocknal_destroy_peer (ksock_peer_ni_t *peer_ni)
 {
-       ksock_net_t    *net = peer->ksnp_ni->ni_data;
+       ksock_net_t    *net = peer_ni->ksnp_ni->ni_data;
 
-       CDEBUG (D_NET, "peer %s %p deleted\n",
-               libcfs_id2str(peer->ksnp_id), peer);
+       CDEBUG (D_NET, "peer_ni %s %p deleted\n",
+               libcfs_id2str(peer_ni->ksnp_id), peer_ni);
 
-       LASSERT(atomic_read(&peer->ksnp_refcount) == 0);
-       LASSERT(peer->ksnp_accepting == 0);
-       LASSERT(list_empty(&peer->ksnp_conns));
-       LASSERT(list_empty(&peer->ksnp_routes));
-       LASSERT(list_empty(&peer->ksnp_tx_queue));
-       LASSERT(list_empty(&peer->ksnp_zc_req_list));
+       LASSERT(atomic_read(&peer_ni->ksnp_refcount) == 0);
+       LASSERT(peer_ni->ksnp_accepting == 0);
+       LASSERT(list_empty(&peer_ni->ksnp_conns));
+       LASSERT(list_empty(&peer_ni->ksnp_routes));
+       LASSERT(list_empty(&peer_ni->ksnp_tx_queue));
+       LASSERT(list_empty(&peer_ni->ksnp_zc_req_list));
 
-       LIBCFS_FREE(peer, sizeof(*peer));
+       LIBCFS_FREE(peer_ni, sizeof(*peer_ni));
 
-        /* NB a peer's connections and routes keep a reference on their peer
+        /* NB a peer_ni's connections and routes keep a reference on their peer_ni
          * until they are destroyed, so we can be assured that _all_ state to
-         * do with this peer has been cleaned up when its refcount drops to
+         * do with this peer_ni has been cleaned up when its refcount drops to
          * zero. */
        spin_lock_bh(&net->ksnn_lock);
        net->ksnn_npeers--;
        spin_unlock_bh(&net->ksnn_lock);
 }
 
-ksock_peer_t *
+ksock_peer_ni_t *
 ksocknal_find_peer_locked (lnet_ni_t *ni, lnet_process_id_t id)
 {
        struct list_head *peer_list = ksocknal_nid2peerlist(id.nid);
        struct list_head *tmp;
-       ksock_peer_t     *peer;
+       ksock_peer_ni_t  *peer_ni;
 
        list_for_each(tmp, peer_list) {
 
-               peer = list_entry(tmp, ksock_peer_t, ksnp_list);
+               peer_ni = list_entry(tmp, ksock_peer_ni_t, ksnp_list);
 
-               LASSERT(!peer->ksnp_closing);
+               LASSERT(!peer_ni->ksnp_closing);
 
-               if (peer->ksnp_ni != ni)
+               if (peer_ni->ksnp_ni != ni)
                        continue;
 
-               if (peer->ksnp_id.nid != id.nid ||
-                   peer->ksnp_id.pid != id.pid)
+               if (peer_ni->ksnp_id.nid != id.nid ||
+                   peer_ni->ksnp_id.pid != id.pid)
                        continue;
 
-               CDEBUG(D_NET, "got peer [%p] -> %s (%d)\n",
-                      peer, libcfs_id2str(id),
-                      atomic_read(&peer->ksnp_refcount));
-               return peer;
+               CDEBUG(D_NET, "got peer_ni [%p] -> %s (%d)\n",
+                      peer_ni, libcfs_id2str(id),
+                      atomic_read(&peer_ni->ksnp_refcount));
+               return peer_ni;
        }
        return NULL;
 }
 
-ksock_peer_t *
+ksock_peer_ni_t *
 ksocknal_find_peer (lnet_ni_t *ni, lnet_process_id_t id)
 {
-        ksock_peer_t     *peer;
+        ksock_peer_ni_t     *peer_ni;
 
        read_lock(&ksocknal_data.ksnd_global_lock);
-       peer = ksocknal_find_peer_locked(ni, id);
-       if (peer != NULL)                       /* +1 ref for caller? */
-               ksocknal_peer_addref(peer);
+       peer_ni = ksocknal_find_peer_locked(ni, id);
+       if (peer_ni != NULL)                    /* +1 ref for caller? */
+               ksocknal_peer_addref(peer_ni);
        read_unlock(&ksocknal_data.ksnd_global_lock);
 
-        return (peer);
+        return (peer_ni);
 }
 
 static void
-ksocknal_unlink_peer_locked (ksock_peer_t *peer)
+ksocknal_unlink_peer_locked (ksock_peer_ni_t *peer_ni)
 {
         int                i;
         __u32              ip;
         ksock_interface_t *iface;
 
-        for (i = 0; i < peer->ksnp_n_passive_ips; i++) {
+        for (i = 0; i < peer_ni->ksnp_n_passive_ips; i++) {
                 LASSERT (i < LNET_MAX_INTERFACES);
-                ip = peer->ksnp_passive_ips[i];
+                ip = peer_ni->ksnp_passive_ips[i];
 
-                iface = ksocknal_ip2iface(peer->ksnp_ni, ip);
-                /* All IPs in peer->ksnp_passive_ips[] come from the
+                iface = ksocknal_ip2iface(peer_ni->ksnp_ni, ip);
+                /* All IPs in peer_ni->ksnp_passive_ips[] come from the
                  * interface list, therefore the call must succeed. */
                 LASSERT (iface != NULL);
 
-                CDEBUG(D_NET, "peer=%p iface=%p ksni_nroutes=%d\n",
-                       peer, iface, iface->ksni_nroutes);
+                CDEBUG(D_NET, "peer_ni=%p iface=%p ksni_nroutes=%d\n",
+                       peer_ni, iface, iface->ksni_nroutes);
                 iface->ksni_npeers--;
         }
 
-       LASSERT(list_empty(&peer->ksnp_conns));
-       LASSERT(list_empty(&peer->ksnp_routes));
-       LASSERT(!peer->ksnp_closing);
-       peer->ksnp_closing = 1;
-       list_del(&peer->ksnp_list);
+       LASSERT(list_empty(&peer_ni->ksnp_conns));
+       LASSERT(list_empty(&peer_ni->ksnp_routes));
+       LASSERT(!peer_ni->ksnp_closing);
+       peer_ni->ksnp_closing = 1;
+       list_del(&peer_ni->ksnp_list);
        /* lose peerlist's ref */
-       ksocknal_peer_decref(peer);
+       ksocknal_peer_decref(peer_ni);
 }
 
 static int
@@ -246,7 +247,7 @@ ksocknal_get_peer_info (lnet_ni_t *ni, int index,
                         lnet_process_id_t *id, __u32 *myip, __u32 *peer_ip,
                         int *port, int *conn_count, int *share_count)
 {
-       ksock_peer_t      *peer;
+       ksock_peer_ni_t   *peer_ni;
        struct list_head  *ptmp;
        ksock_route_t     *route;
        struct list_head  *rtmp;
@@ -258,17 +259,17 @@ ksocknal_get_peer_info (lnet_ni_t *ni, int index,
 
        for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) {
                list_for_each(ptmp, &ksocknal_data.ksnd_peers[i]) {
-                       peer = list_entry(ptmp, ksock_peer_t, ksnp_list);
+                       peer_ni = list_entry(ptmp, ksock_peer_ni_t, ksnp_list);
 
-                       if (peer->ksnp_ni != ni)
+                       if (peer_ni->ksnp_ni != ni)
                                continue;
 
-                       if (peer->ksnp_n_passive_ips == 0 &&
-                           list_empty(&peer->ksnp_routes)) {
+                       if (peer_ni->ksnp_n_passive_ips == 0 &&
+                           list_empty(&peer_ni->ksnp_routes)) {
                                if (index-- > 0)
                                        continue;
 
-                                *id = peer->ksnp_id;
+                                *id = peer_ni->ksnp_id;
                                 *myip = 0;
                                 *peer_ip = 0;
                                 *port = 0;
@@ -278,12 +279,12 @@ ksocknal_get_peer_info (lnet_ni_t *ni, int index,
                                 goto out;
                         }
 
-                       for (j = 0; j < peer->ksnp_n_passive_ips; j++) {
+                       for (j = 0; j < peer_ni->ksnp_n_passive_ips; j++) {
                                if (index-- > 0)
                                        continue;
 
-                                *id = peer->ksnp_id;
-                                *myip = peer->ksnp_passive_ips[j];
+                                *id = peer_ni->ksnp_id;
+                                *myip = peer_ni->ksnp_passive_ips[j];
                                 *peer_ip = 0;
                                 *port = 0;
                                 *conn_count = 0;
@@ -292,14 +293,14 @@ ksocknal_get_peer_info (lnet_ni_t *ni, int index,
                                 goto out;
                         }
 
-                       list_for_each(rtmp, &peer->ksnp_routes) {
+                       list_for_each(rtmp, &peer_ni->ksnp_routes) {
                                if (index-- > 0)
                                        continue;
 
                                route = list_entry(rtmp, ksock_route_t,
                                                   ksnr_list);
 
-                               *id = peer->ksnp_id;
+                               *id = peer_ni->ksnp_id;
                                *myip = route->ksnr_myipaddr;
                                *peer_ip = route->ksnr_ipaddr;
                                *port = route->ksnr_port;
@@ -318,7 +319,7 @@ out:
 static void
 ksocknal_associate_route_conn_locked(ksock_route_t *route, ksock_conn_t *conn)
 {
-       ksock_peer_t      *peer = route->ksnr_peer;
+       ksock_peer_ni_t   *peer_ni = route->ksnr_peer;
        int                type = conn->ksnc_type;
        ksock_interface_t *iface;
 
@@ -329,12 +330,12 @@ ksocknal_associate_route_conn_locked(ksock_route_t *route, ksock_conn_t *conn)
                if (route->ksnr_myipaddr == 0) {
                        /* route wasn't bound locally yet (the initial route) */
                        CDEBUG(D_NET, "Binding %s %pI4h to %pI4h\n",
-                              libcfs_id2str(peer->ksnp_id),
+                              libcfs_id2str(peer_ni->ksnp_id),
                               &route->ksnr_ipaddr,
                               &conn->ksnc_myipaddr);
                } else {
                        CDEBUG(D_NET, "Rebinding %s %pI4h from %pI4h "
-                              "to %pI4h\n", libcfs_id2str(peer->ksnp_id),
+                              "to %pI4h\n", libcfs_id2str(peer_ni->ksnp_id),
                               &route->ksnr_ipaddr,
                               &route->ksnr_myipaddr,
                               &conn->ksnc_myipaddr);
@@ -360,36 +361,36 @@ ksocknal_associate_route_conn_locked(ksock_route_t *route, ksock_conn_t *conn)
 }
 
 static void
-ksocknal_add_route_locked (ksock_peer_t *peer, ksock_route_t *route)
+ksocknal_add_route_locked (ksock_peer_ni_t *peer_ni, ksock_route_t *route)
 {
        struct list_head *tmp;
        ksock_conn_t     *conn;
        ksock_route_t    *route2;
 
-       LASSERT(!peer->ksnp_closing);
+       LASSERT(!peer_ni->ksnp_closing);
        LASSERT(route->ksnr_peer == NULL);
        LASSERT(!route->ksnr_scheduled);
        LASSERT(!route->ksnr_connecting);
        LASSERT(route->ksnr_connected == 0);
 
        /* LASSERT(unique) */
-       list_for_each(tmp, &peer->ksnp_routes) {
+       list_for_each(tmp, &peer_ni->ksnp_routes) {
                route2 = list_entry(tmp, ksock_route_t, ksnr_list);
 
                if (route2->ksnr_ipaddr == route->ksnr_ipaddr) {
                        CERROR("Duplicate route %s %pI4h\n",
-                              libcfs_id2str(peer->ksnp_id),
+                              libcfs_id2str(peer_ni->ksnp_id),
                               &route->ksnr_ipaddr);
                        LBUG();
                }
        }
 
-       route->ksnr_peer = peer;
-       ksocknal_peer_addref(peer);
-       /* peer's routelist takes over my ref on 'route' */
-       list_add_tail(&route->ksnr_list, &peer->ksnp_routes);
+       route->ksnr_peer = peer_ni;
+       ksocknal_peer_addref(peer_ni);
+       /* peer_ni's routelist takes over my ref on 'route' */
+       list_add_tail(&route->ksnr_list, &peer_ni->ksnp_routes);
 
-       list_for_each(tmp, &peer->ksnp_conns) {
+       list_for_each(tmp, &peer_ni->ksnp_conns) {
                conn = list_entry(tmp, ksock_conn_t, ksnc_list);
 
                if (conn->ksnc_ipaddr != route->ksnr_ipaddr)
@@ -403,7 +404,7 @@ ksocknal_add_route_locked (ksock_peer_t *peer, ksock_route_t *route)
 static void
 ksocknal_del_route_locked (ksock_route_t *route)
 {
-       ksock_peer_t      *peer = route->ksnr_peer;
+       ksock_peer_ni_t   *peer_ni = route->ksnr_peer;
        ksock_interface_t *iface;
        ksock_conn_t      *conn;
        struct list_head  *ctmp;
@@ -412,7 +413,7 @@ ksocknal_del_route_locked (ksock_route_t *route)
        LASSERT(!route->ksnr_deleted);
 
        /* Close associated conns */
-       list_for_each_safe(ctmp, cnxt, &peer->ksnp_conns) {
+       list_for_each_safe(ctmp, cnxt, &peer_ni->ksnp_conns) {
                conn = list_entry(ctmp, ksock_conn_t, ksnc_list);
 
                if (conn->ksnc_route != route)
@@ -430,13 +431,13 @@ ksocknal_del_route_locked (ksock_route_t *route)
 
        route->ksnr_deleted = 1;
        list_del(&route->ksnr_list);
-       ksocknal_route_decref(route);           /* drop peer's ref */
+       ksocknal_route_decref(route);           /* drop peer_ni's ref */
 
-       if (list_empty(&peer->ksnp_routes) &&
-           list_empty(&peer->ksnp_conns)) {
-               /* I've just removed the last route to a peer with no active
+       if (list_empty(&peer_ni->ksnp_routes) &&
+           list_empty(&peer_ni->ksnp_conns)) {
+               /* I've just removed the last route to a peer_ni with no active
                 * connections */
-               ksocknal_unlink_peer_locked(peer);
+               ksocknal_unlink_peer_locked(peer_ni);
        }
 }
 
@@ -444,8 +445,8 @@ int
 ksocknal_add_peer(lnet_ni_t *ni, lnet_process_id_t id, __u32 ipaddr, int port)
 {
        struct list_head *tmp;
-       ksock_peer_t     *peer;
-       ksock_peer_t     *peer2;
+       ksock_peer_ni_t  *peer_ni;
+       ksock_peer_ni_t  *peer2;
        ksock_route_t    *route;
        ksock_route_t    *route2;
        int               rc;
@@ -454,14 +455,14 @@ ksocknal_add_peer(lnet_ni_t *ni, lnet_process_id_t id, __u32 ipaddr, int port)
             id.pid == LNET_PID_ANY)
                 return (-EINVAL);
 
-        /* Have a brand new peer ready... */
-        rc = ksocknal_create_peer(&peer, ni, id);
+        /* Have a brand new peer_ni ready... */
+        rc = ksocknal_create_peer(&peer_ni, ni, id);
         if (rc != 0)
                 return rc;
 
         route = ksocknal_create_route (ipaddr, port);
         if (route == NULL) {
-                ksocknal_peer_decref(peer);
+                ksocknal_peer_decref(peer_ni);
                 return (-ENOMEM);
         }
 
@@ -472,16 +473,16 @@ ksocknal_add_peer(lnet_ni_t *ni, lnet_process_id_t id, __u32 ipaddr, int port)
 
        peer2 = ksocknal_find_peer_locked(ni, id);
        if (peer2 != NULL) {
-               ksocknal_peer_decref(peer);
-               peer = peer2;
+               ksocknal_peer_decref(peer_ni);
+               peer_ni = peer2;
        } else {
-               /* peer table takes my ref on peer */
-               list_add_tail(&peer->ksnp_list,
+               /* peer_ni table takes my ref on peer_ni */
+               list_add_tail(&peer_ni->ksnp_list,
                              ksocknal_nid2peerlist(id.nid));
        }
 
        route2 = NULL;
-       list_for_each(tmp, &peer->ksnp_routes) {
+       list_for_each(tmp, &peer_ni->ksnp_routes) {
                route2 = list_entry(tmp, ksock_route_t, ksnr_list);
 
                if (route2->ksnr_ipaddr == ipaddr)
@@ -490,7 +491,7 @@ ksocknal_add_peer(lnet_ni_t *ni, lnet_process_id_t id, __u32 ipaddr, int port)
                route2 = NULL;
        }
        if (route2 == NULL) {
-               ksocknal_add_route_locked(peer, route);
+               ksocknal_add_route_locked(peer_ni, route);
                route->ksnr_share_count++;
        } else {
                ksocknal_route_decref(route);
@@ -503,7 +504,7 @@ ksocknal_add_peer(lnet_ni_t *ni, lnet_process_id_t id, __u32 ipaddr, int port)
 }
 
 static void
-ksocknal_del_peer_locked (ksock_peer_t *peer, __u32 ip)
+ksocknal_del_peer_locked (ksock_peer_ni_t *peer_ni, __u32 ip)
 {
        ksock_conn_t     *conn;
        ksock_route_t    *route;
@@ -511,12 +512,12 @@ ksocknal_del_peer_locked (ksock_peer_t *peer, __u32 ip)
        struct list_head *nxt;
        int               nshared;
 
-       LASSERT(!peer->ksnp_closing);
+       LASSERT(!peer_ni->ksnp_closing);
 
-       /* Extra ref prevents peer disappearing until I'm done with it */
-       ksocknal_peer_addref(peer);
+       /* Extra ref prevents peer_ni disappearing until I'm done with it */
+       ksocknal_peer_addref(peer_ni);
 
-       list_for_each_safe(tmp, nxt, &peer->ksnp_routes) {
+       list_for_each_safe(tmp, nxt, &peer_ni->ksnp_routes) {
                route = list_entry(tmp, ksock_route_t, ksnr_list);
 
                /* no match */
@@ -529,7 +530,7 @@ ksocknal_del_peer_locked (ksock_peer_t *peer, __u32 ip)
        }
 
        nshared = 0;
-       list_for_each_safe(tmp, nxt, &peer->ksnp_routes) {
+       list_for_each_safe(tmp, nxt, &peer_ni->ksnp_routes) {
                route = list_entry(tmp, ksock_route_t, ksnr_list);
                nshared += route->ksnr_share_count;
        }
@@ -538,7 +539,7 @@ ksocknal_del_peer_locked (ksock_peer_t *peer, __u32 ip)
                /* remove everything else if there are no explicit entries
                 * left */
 
-               list_for_each_safe(tmp, nxt, &peer->ksnp_routes) {
+               list_for_each_safe(tmp, nxt, &peer_ni->ksnp_routes) {
                        route = list_entry(tmp, ksock_route_t, ksnr_list);
 
                        /* we should only be removing auto-entries */
@@ -546,15 +547,15 @@ ksocknal_del_peer_locked (ksock_peer_t *peer, __u32 ip)
                        ksocknal_del_route_locked(route);
                }
 
-               list_for_each_safe(tmp, nxt, &peer->ksnp_conns) {
+               list_for_each_safe(tmp, nxt, &peer_ni->ksnp_conns) {
                        conn = list_entry(tmp, ksock_conn_t, ksnc_list);
 
                        ksocknal_close_conn_locked(conn, 0);
                }
        }
 
-       ksocknal_peer_decref(peer);
-               /* NB peer unlinks itself when last conn/route is removed */
+       ksocknal_peer_decref(peer_ni);
+               /* NB peer_ni unlinks itself when last conn/route is removed */
 }
 
 static int
@@ -563,7 +564,7 @@ ksocknal_del_peer (lnet_ni_t *ni, lnet_process_id_t id, __u32 ip)
        struct list_head  zombies = LIST_HEAD_INIT(zombies);
        struct list_head *ptmp;
        struct list_head *pnxt;
-       ksock_peer_t     *peer;
+       ksock_peer_ni_t  *peer_ni;
        int               lo;
        int               hi;
        int               i;
@@ -583,31 +584,31 @@ ksocknal_del_peer (lnet_ni_t *ni, lnet_process_id_t id, __u32 ip)
        for (i = lo; i <= hi; i++) {
                list_for_each_safe(ptmp, pnxt,
                                   &ksocknal_data.ksnd_peers[i]) {
-                       peer = list_entry(ptmp, ksock_peer_t, ksnp_list);
+                       peer_ni = list_entry(ptmp, ksock_peer_ni_t, ksnp_list);
 
-                       if (peer->ksnp_ni != ni)
+                       if (peer_ni->ksnp_ni != ni)
                                continue;
 
                        if (!((id.nid == LNET_NID_ANY ||
-                              peer->ksnp_id.nid == id.nid) &&
+                              peer_ni->ksnp_id.nid == id.nid) &&
                              (id.pid == LNET_PID_ANY ||
-                              peer->ksnp_id.pid == id.pid)))
+                              peer_ni->ksnp_id.pid == id.pid)))
                                continue;
 
-                       ksocknal_peer_addref(peer);     /* a ref for me... */
+                       ksocknal_peer_addref(peer_ni);  /* a ref for me... */
 
-                       ksocknal_del_peer_locked(peer, ip);
+                       ksocknal_del_peer_locked(peer_ni, ip);
 
-                       if (peer->ksnp_closing &&
-                           !list_empty(&peer->ksnp_tx_queue)) {
-                               LASSERT(list_empty(&peer->ksnp_conns));
-                               LASSERT(list_empty(&peer->ksnp_routes));
+                       if (peer_ni->ksnp_closing &&
+                           !list_empty(&peer_ni->ksnp_tx_queue)) {
+                               LASSERT(list_empty(&peer_ni->ksnp_conns));
+                               LASSERT(list_empty(&peer_ni->ksnp_routes));
 
-                               list_splice_init(&peer->ksnp_tx_queue,
+                               list_splice_init(&peer_ni->ksnp_tx_queue,
                                                 &zombies);
                        }
 
-                       ksocknal_peer_decref(peer);     /* ...till here */
+                       ksocknal_peer_decref(peer_ni);  /* ...till here */
 
                        rc = 0;                         /* matched! */
                }
@@ -623,7 +624,7 @@ ksocknal_del_peer (lnet_ni_t *ni, lnet_process_id_t id, __u32 ip)
 static ksock_conn_t *
 ksocknal_get_conn_by_idx (lnet_ni_t *ni, int index)
 {
-       ksock_peer_t     *peer;
+       ksock_peer_ni_t  *peer_ni;
        struct list_head *ptmp;
        ksock_conn_t     *conn;
        struct list_head *ctmp;
@@ -633,14 +634,14 @@ ksocknal_get_conn_by_idx (lnet_ni_t *ni, int index)
 
        for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) {
                list_for_each(ptmp, &ksocknal_data.ksnd_peers[i]) {
-                       peer = list_entry(ptmp, ksock_peer_t, ksnp_list);
+                       peer_ni = list_entry(ptmp, ksock_peer_ni_t, ksnp_list);
 
-                       LASSERT(!peer->ksnp_closing);
+                       LASSERT(!peer_ni->ksnp_closing);
 
-                       if (peer->ksnp_ni != ni)
+                       if (peer_ni->ksnp_ni != ni)
                                continue;
 
-                       list_for_each(ctmp, &peer->ksnp_conns) {
+                       list_for_each(ctmp, &peer_ni->ksnp_conns) {
                                if (index-- > 0)
                                        continue;
 
@@ -742,10 +743,10 @@ ksocknal_match_peerip (ksock_interface_t *iface, __u32 *ips, int nips)
 }
 
 static int
-ksocknal_select_ips(ksock_peer_t *peer, __u32 *peerips, int n_peerips)
+ksocknal_select_ips(ksock_peer_ni_t *peer_ni, __u32 *peerips, int n_peerips)
 {
        rwlock_t                *global_lock = &ksocknal_data.ksnd_global_lock;
-        ksock_net_t        *net = peer->ksnp_ni->ni_data;
+        ksock_net_t        *net = peer_ni->ksnp_ni->ni_data;
         ksock_interface_t  *iface;
         ksock_interface_t  *best_iface;
         int                 n_ips;
@@ -776,25 +777,25 @@ ksocknal_select_ips(ksock_peer_t *peer, __u32 *peerips, int n_peerips)
         n_ips = (net->ksnn_ninterfaces < 2) ? 0 :
                 MIN(n_peerips, net->ksnn_ninterfaces);
 
-        for (i = 0; peer->ksnp_n_passive_ips < n_ips; i++) {
+        for (i = 0; peer_ni->ksnp_n_passive_ips < n_ips; i++) {
                 /*              ^ yes really... */
 
                 /* If we have any new interfaces, first tick off all the
-                 * peer IPs that match old interfaces, then choose new
-                 * interfaces to match the remaining peer IPS.
+                 * peer_ni IPs that match old interfaces, then choose new
+                 * interfaces to match the remaining peer_ni IPs.
                  * We don't forget interfaces we've stopped using; we might
                  * start using them again... */
 
-                if (i < peer->ksnp_n_passive_ips) {
+                if (i < peer_ni->ksnp_n_passive_ips) {
                         /* Old interface. */
-                        ip = peer->ksnp_passive_ips[i];
-                        best_iface = ksocknal_ip2iface(peer->ksnp_ni, ip);
+                        ip = peer_ni->ksnp_passive_ips[i];
+                        best_iface = ksocknal_ip2iface(peer_ni->ksnp_ni, ip);
 
-                        /* peer passive ips are kept up to date */
+                        /* peer_ni passive ips are kept up to date */
                         LASSERT(best_iface != NULL);
                 } else {
                         /* choose a new interface */
-                        LASSERT (i == peer->ksnp_n_passive_ips);
+                        LASSERT (i == peer_ni->ksnp_n_passive_ips);
 
                         best_iface = NULL;
                         best_netmatch = 0;
@@ -804,11 +805,11 @@ ksocknal_select_ips(ksock_peer_t *peer, __u32 *peerips, int n_peerips)
                                 iface = &net->ksnn_interfaces[j];
                                 ip = iface->ksni_ipaddr;
 
-                                for (k = 0; k < peer->ksnp_n_passive_ips; k++)
-                                        if (peer->ksnp_passive_ips[k] == ip)
+                                for (k = 0; k < peer_ni->ksnp_n_passive_ips; k++)
+                                        if (peer_ni->ksnp_passive_ips[k] == ip)
                                                 break;
 
-                                if (k < peer->ksnp_n_passive_ips) /* using it already */
+                                if (k < peer_ni->ksnp_n_passive_ips) /* using it already */
                                         continue;
 
                                 k = ksocknal_match_peerip(iface, peerips, n_peerips);
@@ -830,17 +831,17 @@ ksocknal_select_ips(ksock_peer_t *peer, __u32 *peerips, int n_peerips)
 
                         best_iface->ksni_npeers++;
                         ip = best_iface->ksni_ipaddr;
-                        peer->ksnp_passive_ips[i] = ip;
-                        peer->ksnp_n_passive_ips = i+1;
+                        peer_ni->ksnp_passive_ips[i] = ip;
+                        peer_ni->ksnp_n_passive_ips = i+1;
                 }
 
-                /* mark the best matching peer IP used */
+                /* mark the best matching peer_ni IP used */
                 j = ksocknal_match_peerip(best_iface, peerips, n_peerips);
                 peerips[j] = 0;
         }
 
-        /* Overwrite input peer IP addresses */
-        memcpy(peerips, peer->ksnp_passive_ips, n_ips * sizeof(*peerips));
+        /* Overwrite input peer_ni IP addresses */
+        memcpy(peerips, peer_ni->ksnp_passive_ips, n_ips * sizeof(*peerips));
 
        write_unlock_bh(global_lock);
 
@@ -848,12 +849,12 @@ ksocknal_select_ips(ksock_peer_t *peer, __u32 *peerips, int n_peerips)
 }
 
 static void
-ksocknal_create_routes(ksock_peer_t *peer, int port,
+ksocknal_create_routes(ksock_peer_ni_t *peer_ni, int port,
                        __u32 *peer_ipaddrs, int npeer_ipaddrs)
 {
        ksock_route_t           *newroute = NULL;
        rwlock_t                *global_lock = &ksocknal_data.ksnd_global_lock;
-       lnet_ni_t               *ni = peer->ksnp_ni;
+       lnet_ni_t               *ni = peer_ni->ksnp_ni;
        ksock_net_t             *net = ni->ni_data;
        struct list_head        *rtmp;
        ksock_route_t           *route;
@@ -894,14 +895,14 @@ ksocknal_create_routes(ksock_peer_t *peer, int port,
                        write_lock_bh(global_lock);
                 }
 
-                if (peer->ksnp_closing) {
-                        /* peer got closed under me */
+                if (peer_ni->ksnp_closing) {
+                        /* peer_ni got closed under me */
                         break;
                 }
 
                /* Already got a route? */
                route = NULL;
-               list_for_each(rtmp, &peer->ksnp_routes) {
+               list_for_each(rtmp, &peer_ni->ksnp_routes) {
                        route = list_entry(rtmp, ksock_route_t, ksnr_list);
 
                        if (route->ksnr_ipaddr == newroute->ksnr_ipaddr)
@@ -923,7 +924,7 @@ ksocknal_create_routes(ksock_peer_t *peer, int port,
                        iface = &net->ksnn_interfaces[j];
 
                        /* Using this interface already? */
-                       list_for_each(rtmp, &peer->ksnp_routes) {
+                       list_for_each(rtmp, &peer_ni->ksnp_routes) {
                                route = list_entry(rtmp, ksock_route_t,
                                                   ksnr_list);
 
@@ -956,7 +957,7 @@ ksocknal_create_routes(ksock_peer_t *peer, int port,
                 newroute->ksnr_myipaddr = best_iface->ksni_ipaddr;
                 best_iface->ksni_nroutes++;
 
-                ksocknal_add_route_locked(peer, newroute);
+                ksocknal_add_route_locked(peer_ni, newroute);
                 newroute = NULL;
         }
 
@@ -997,11 +998,11 @@ ksocknal_accept(lnet_ni_t *ni, struct socket *sock)
 }
 
 static int
-ksocknal_connecting (ksock_peer_t *peer, __u32 ipaddr)
+ksocknal_connecting (ksock_peer_ni_t *peer_ni, __u32 ipaddr)
 {
        ksock_route_t *route;
 
-       list_for_each_entry(route, &peer->ksnp_routes, ksnr_list) {
+       list_for_each_entry(route, &peer_ni->ksnp_routes, ksnr_list) {
                if (route->ksnr_ipaddr == ipaddr)
                        return route->ksnr_connecting;
        }
@@ -1019,8 +1020,8 @@ ksocknal_create_conn(lnet_ni_t *ni, ksock_route_t *route,
         __u64              incarnation;
         ksock_conn_t      *conn;
         ksock_conn_t      *conn2;
-        ksock_peer_t      *peer = NULL;
-        ksock_peer_t      *peer2;
+        ksock_peer_ni_t   *peer_ni = NULL;
+        ksock_peer_ni_t   *peer2;
         ksock_sched_t     *sched;
        struct ksock_hello_msg *hello;
        int                cpt;
@@ -1071,21 +1072,21 @@ ksocknal_create_conn(lnet_ni_t *ni, ksock_route_t *route,
         if (rc != 0)
                 goto failed_1;
 
-        /* Find out/confirm peer's NID and connection type and get the
+        /* Find out/confirm peer_ni's NID and connection type and get the
          * vector of interfaces she's willing to let me connect to.
-         * Passive connections use the listener timeout since the peer sends
+         * Passive connections use the listener timeout since the peer_ni sends
          * eagerly */
 
         if (active) {
-                peer = route->ksnr_peer;
-                LASSERT(ni == peer->ksnp_ni);
+                peer_ni = route->ksnr_peer;
+                LASSERT(ni == peer_ni->ksnp_ni);
 
                 /* Active connection sends HELLO eagerly */
                 hello->kshm_nips = ksocknal_local_ipvec(ni, hello->kshm_ips);
-                peerid = peer->ksnp_id;
+                peerid = peer_ni->ksnp_id;
 
                write_lock_bh(global_lock);
-                conn->ksnc_proto = peer->ksnp_proto;
+                conn->ksnc_proto = peer_ni->ksnp_proto;
                write_unlock_bh(global_lock);
 
                 if (conn->ksnc_proto == NULL) {
@@ -1105,7 +1106,7 @@ ksocknal_create_conn(lnet_ni_t *ni, ksock_route_t *route,
                 peerid.nid = LNET_NID_ANY;
                 peerid.pid = LNET_PID_ANY;
 
-                /* Passive, get protocol from peer */
+                /* Passive, get protocol from peer_ni */
                 conn->ksnc_proto = NULL;
         }
 
@@ -1117,13 +1118,13 @@ ksocknal_create_conn(lnet_ni_t *ni, ksock_route_t *route,
         LASSERT (conn->ksnc_proto != NULL);
         LASSERT (peerid.nid != LNET_NID_ANY);
 
-       cpt = lnet_cpt_of_nid(peerid.nid);
+       cpt = lnet_cpt_of_nid(peerid.nid, ni);
 
         if (active) {
-                ksocknal_peer_addref(peer);
+                ksocknal_peer_addref(peer_ni);
                write_lock_bh(global_lock);
         } else {
-                rc = ksocknal_create_peer(&peer, ni, peerid);
+                rc = ksocknal_create_peer(&peer_ni, ni, peerid);
                 if (rc != 0)
                         goto failed_1;
 
@@ -1134,57 +1135,57 @@ ksocknal_create_conn(lnet_ni_t *ni, ksock_route_t *route,
 
                peer2 = ksocknal_find_peer_locked(ni, peerid);
                if (peer2 == NULL) {
-                       /* NB this puts an "empty" peer in the peer
+                       /* NB this puts an "empty" peer_ni in the peer_ni
                         * table (which takes my ref) */
-                       list_add_tail(&peer->ksnp_list,
+                       list_add_tail(&peer_ni->ksnp_list,
                                      ksocknal_nid2peerlist(peerid.nid));
                } else {
-                       ksocknal_peer_decref(peer);
-                       peer = peer2;
+                       ksocknal_peer_decref(peer_ni);
+                       peer_ni = peer2;
                }
 
                 /* +1 ref for me */
-                ksocknal_peer_addref(peer);
-                peer->ksnp_accepting++;
+                ksocknal_peer_addref(peer_ni);
+                peer_ni->ksnp_accepting++;
 
                 /* Am I already connecting to this guy?  Resolve in
                  * favour of higher NID... */
                 if (peerid.nid < ni->ni_nid &&
-                    ksocknal_connecting(peer, conn->ksnc_ipaddr)) {
+                    ksocknal_connecting(peer_ni, conn->ksnc_ipaddr)) {
                         rc = EALREADY;
                         warn = "connection race resolution";
                         goto failed_2;
                 }
         }
 
-        if (peer->ksnp_closing ||
+        if (peer_ni->ksnp_closing ||
             (active && route->ksnr_deleted)) {
-                /* peer/route got closed under me */
+                /* peer_ni/route got closed under me */
                 rc = -ESTALE;
-                warn = "peer/route removed";
+                warn = "peer_ni/route removed";
                 goto failed_2;
         }
 
-       if (peer->ksnp_proto == NULL) {
+       if (peer_ni->ksnp_proto == NULL) {
                /* Never connected before.
-                * NB recv_hello may have returned EPROTO to signal my peer
+                * NB recv_hello may have returned EPROTO to signal my peer_ni
                 * wants a different protocol than the one I asked for.
                 */
-               LASSERT(list_empty(&peer->ksnp_conns));
+               LASSERT(list_empty(&peer_ni->ksnp_conns));
 
-               peer->ksnp_proto = conn->ksnc_proto;
-               peer->ksnp_incarnation = incarnation;
+               peer_ni->ksnp_proto = conn->ksnc_proto;
+               peer_ni->ksnp_incarnation = incarnation;
        }
 
-        if (peer->ksnp_proto != conn->ksnc_proto ||
-            peer->ksnp_incarnation != incarnation) {
-                /* Peer rebooted or I've got the wrong protocol version */
-                ksocknal_close_peer_conns_locked(peer, 0, 0);
+        if (peer_ni->ksnp_proto != conn->ksnc_proto ||
+            peer_ni->ksnp_incarnation != incarnation) {
+                /* peer_ni rebooted or I've got the wrong protocol version */
+                ksocknal_close_peer_conns_locked(peer_ni, 0, 0);
 
-                peer->ksnp_proto = NULL;
+                peer_ni->ksnp_proto = NULL;
                 rc = ESTALE;
-                warn = peer->ksnp_incarnation != incarnation ?
-                       "peer rebooted" :
+                warn = peer_ni->ksnp_incarnation != incarnation ?
+                       "peer_ni rebooted" :
                        "wrong proto version";
                 goto failed_2;
         }
@@ -1205,7 +1206,7 @@ ksocknal_create_conn(lnet_ni_t *ni, ksock_route_t *route,
        /* Refuse to duplicate an existing connection, unless this is a
         * loopback connection */
        if (conn->ksnc_ipaddr != conn->ksnc_myipaddr) {
-               list_for_each(tmp, &peer->ksnp_conns) {
+               list_for_each(tmp, &peer_ni->ksnp_conns) {
                        conn2 = list_entry(tmp, ksock_conn_t, ksnc_list);
 
                         if (conn2->ksnc_ipaddr != conn->ksnc_ipaddr ||
@@ -1213,7 +1214,7 @@ ksocknal_create_conn(lnet_ni_t *ni, ksock_route_t *route,
                             conn2->ksnc_type != conn->ksnc_type)
                                 continue;
 
-                        /* Reply on a passive connection attempt so the peer
+                        /* Reply on a passive connection attempt so the peer_ni
                          * realises we're connected. */
                         LASSERT (rc == 0);
                         if (!active)
@@ -1230,16 +1231,16 @@ ksocknal_create_conn(lnet_ni_t *ni, ksock_route_t *route,
         if (active &&
             route->ksnr_ipaddr != conn->ksnc_ipaddr) {
                CERROR("Route %s %pI4h connected to %pI4h\n",
-                       libcfs_id2str(peer->ksnp_id),
+                       libcfs_id2str(peer_ni->ksnp_id),
                       &route->ksnr_ipaddr,
                       &conn->ksnc_ipaddr);
         }
 
        /* Search for a route corresponding to the new connection and
         * create an association.  This allows incoming connections created
-        * by routes in my peer to match my own route entries so I don't
+        * by routes in my peer_ni to match my own route entries so I don't
         * continually create duplicate routes. */
-       list_for_each(tmp, &peer->ksnp_routes) {
+       list_for_each(tmp, &peer_ni->ksnp_routes) {
                route = list_entry(tmp, ksock_route_t, ksnr_list);
 
                if (route->ksnr_ipaddr != conn->ksnc_ipaddr)
@@ -1249,10 +1250,10 @@ ksocknal_create_conn(lnet_ni_t *ni, ksock_route_t *route,
                break;
        }
 
-        conn->ksnc_peer = peer;                 /* conn takes my ref on peer */
-       peer->ksnp_last_alive = ktime_get_real_seconds();
-        peer->ksnp_send_keepalive = 0;
-        peer->ksnp_error = 0;
+       conn->ksnc_peer = peer_ni;      /* conn takes my ref on peer_ni */
+       peer_ni->ksnp_last_alive = ktime_get_real_seconds();
+       peer_ni->ksnp_send_keepalive = 0;
+       peer_ni->ksnp_error = 0;
 
        sched = ksocknal_choose_scheduler_locked(cpt);
         sched->kss_nconns++;
@@ -1262,9 +1263,9 @@ ksocknal_create_conn(lnet_ni_t *ni, ksock_route_t *route,
        /* Set the deadline for the outgoing HELLO to drain */
        conn->ksnc_tx_bufnob = sock->sk->sk_wmem_queued;
        conn->ksnc_tx_deadline = cfs_time_shift(*ksocknal_tunables.ksnd_timeout);
-       smp_mb();   /* order with adding to peer's conn list */
+       smp_mb();   /* order with adding to peer_ni's conn list */
 
-       list_add(&conn->ksnc_list, &peer->ksnp_conns);
+       list_add(&conn->ksnc_list, &peer_ni->ksnp_conns);
        ksocknal_conn_addref(conn);
 
        ksocknal_new_packet(conn, 0);
@@ -1272,7 +1273,7 @@ ksocknal_create_conn(lnet_ni_t *ni, ksock_route_t *route,
         conn->ksnc_zc_capable = ksocknal_lib_zc_capable(conn);
 
        /* Take packets blocking for this connection. */
-       list_for_each_entry_safe(tx, txtmp, &peer->ksnp_tx_queue, tx_list) {
+       list_for_each_entry_safe(tx, txtmp, &peer_ni->ksnp_tx_queue, tx_list) {
                if (conn->ksnc_proto->pro_match_tx(conn, tx, tx->tx_nonblk) ==
                    SOCKNAL_MATCH_NO)
                        continue;
@@ -1300,10 +1301,10 @@ ksocknal_create_conn(lnet_ni_t *ni, ksock_route_t *route,
 
         if (active) {
                 /* additional routes after interface exchange? */
-                ksocknal_create_routes(peer, conn->ksnc_port,
+                ksocknal_create_routes(peer_ni, conn->ksnc_port,
                                        hello->kshm_ips, hello->kshm_nips);
         } else {
-                hello->kshm_nips = ksocknal_select_ips(peer, hello->kshm_ips,
+                hello->kshm_nips = ksocknal_select_ips(peer_ni, hello->kshm_ips,
                                                        hello->kshm_nips);
                 rc = ksocknal_send_hello(ni, conn, peerid.nid, hello);
         }
@@ -1325,7 +1326,7 @@ ksocknal_create_conn(lnet_ni_t *ni, ksock_route_t *route,
         ksocknal_lib_set_callback(sock, conn);
 
         if (!active)
-                peer->ksnp_accepting--;
+                peer_ni->ksnp_accepting--;
 
        write_unlock_bh(global_lock);
 
@@ -1348,12 +1349,12 @@ ksocknal_create_conn(lnet_ni_t *ni, ksock_route_t *route,
         return rc;
 
 failed_2:
-       if (!peer->ksnp_closing &&
-           list_empty(&peer->ksnp_conns) &&
-           list_empty(&peer->ksnp_routes)) {
-               list_add(&zombies, &peer->ksnp_tx_queue);
-               list_del_init(&peer->ksnp_tx_queue);
-               ksocknal_unlink_peer_locked(peer);
+       if (!peer_ni->ksnp_closing &&
+           list_empty(&peer_ni->ksnp_conns) &&
+           list_empty(&peer_ni->ksnp_routes)) {
+               list_add(&zombies, &peer_ni->ksnp_tx_queue);
+               list_del_init(&peer_ni->ksnp_tx_queue);
+               ksocknal_unlink_peer_locked(peer_ni);
        }
 
        write_unlock_bh(global_lock);
@@ -1377,12 +1378,12 @@ failed_2:
                 }
 
                write_lock_bh(global_lock);
-                peer->ksnp_accepting--;
+                peer_ni->ksnp_accepting--;
                write_unlock_bh(global_lock);
         }
 
         ksocknal_txlist_done(ni, &zombies, 1);
-        ksocknal_peer_decref(peer);
+        ksocknal_peer_decref(peer_ni);
 
 failed_1:
        if (hello != NULL)
@@ -1402,16 +1403,16 @@ ksocknal_close_conn_locked (ksock_conn_t *conn, int error)
         /* This just does the immmediate housekeeping, and queues the
          * connection for the reaper to terminate.
          * Caller holds ksnd_global_lock exclusively in irq context */
-        ksock_peer_t      *peer = conn->ksnc_peer;
+        ksock_peer_ni_t   *peer_ni = conn->ksnc_peer;
         ksock_route_t     *route;
         ksock_conn_t      *conn2;
        struct list_head  *tmp;
 
-       LASSERT(peer->ksnp_error == 0);
+       LASSERT(peer_ni->ksnp_error == 0);
        LASSERT(!conn->ksnc_closing);
        conn->ksnc_closing = 1;
 
-       /* ksnd_deathrow_conns takes over peer's ref */
+       /* ksnd_deathrow_conns takes over peer_ni's ref */
        list_del(&conn->ksnc_list);
 
        route = conn->ksnc_route;
@@ -1421,7 +1422,7 @@ ksocknal_close_conn_locked (ksock_conn_t *conn, int error)
                LASSERT((route->ksnr_connected & (1 << conn->ksnc_type)) != 0);
 
                conn2 = NULL;
-               list_for_each(tmp, &peer->ksnp_conns) {
+               list_for_each(tmp, &peer_ni->ksnp_conns) {
                        conn2 = list_entry(tmp, ksock_conn_t, ksnc_list);
 
                        if (conn2->ksnc_route == route &&
@@ -1438,35 +1439,35 @@ ksocknal_close_conn_locked (ksock_conn_t *conn, int error)
                ksocknal_route_decref(route);   /* drop conn's ref on route */
        }
 
-       if (list_empty(&peer->ksnp_conns)) {
-               /* No more connections to this peer */
+       if (list_empty(&peer_ni->ksnp_conns)) {
+               /* No more connections to this peer_ni */
 
-               if (!list_empty(&peer->ksnp_tx_queue)) {
+               if (!list_empty(&peer_ni->ksnp_tx_queue)) {
                                ksock_tx_t *tx;
 
                        LASSERT(conn->ksnc_proto == &ksocknal_protocol_v3x);
 
                        /* throw them to the last connection...,
                         * these TXs will be send to /dev/null by scheduler */
-                       list_for_each_entry(tx, &peer->ksnp_tx_queue,
+                       list_for_each_entry(tx, &peer_ni->ksnp_tx_queue,
                                            tx_list)
                                ksocknal_tx_prep(conn, tx);
 
                        spin_lock_bh(&conn->ksnc_scheduler->kss_lock);
-                       list_splice_init(&peer->ksnp_tx_queue,
+                       list_splice_init(&peer_ni->ksnp_tx_queue,
                                         &conn->ksnc_tx_queue);
                        spin_unlock_bh(&conn->ksnc_scheduler->kss_lock);
                }
 
                /* renegotiate protocol version */
-               peer->ksnp_proto = NULL;
+               peer_ni->ksnp_proto = NULL;
                /* stash last conn close reason */
-               peer->ksnp_error = error;
+               peer_ni->ksnp_error = error;
 
-               if (list_empty(&peer->ksnp_routes)) {
+               if (list_empty(&peer_ni->ksnp_routes)) {
                        /* I've just closed last conn belonging to a
-                        * peer with no routes to it */
-                       ksocknal_unlink_peer_locked(peer);
+                        * peer_ni with no routes to it */
+                       ksocknal_unlink_peer_locked(peer_ni);
                }
        }
 
@@ -1480,36 +1481,36 @@ ksocknal_close_conn_locked (ksock_conn_t *conn, int error)
 }
 
 void
-ksocknal_peer_failed (ksock_peer_t *peer)
+ksocknal_peer_failed (ksock_peer_ni_t *peer_ni)
 {
         int        notify = 0;
         cfs_time_t last_alive = 0;
 
        /* There has been a connection failure or comms error; but I'll only
-        * tell LNET I think the peer is dead if it's to another kernel and
+        * tell LNET I think the peer_ni is dead if it's to another kernel and
         * there are no connections or connection attempts in existence. */
 
        read_lock(&ksocknal_data.ksnd_global_lock);
 
-       if ((peer->ksnp_id.pid & LNET_PID_USERFLAG) == 0 &&
-            list_empty(&peer->ksnp_conns) &&
-            peer->ksnp_accepting == 0 &&
-            ksocknal_find_connecting_route_locked(peer) == NULL) {
+       if ((peer_ni->ksnp_id.pid & LNET_PID_USERFLAG) == 0 &&
+            list_empty(&peer_ni->ksnp_conns) &&
+            peer_ni->ksnp_accepting == 0 &&
+            ksocknal_find_connecting_route_locked(peer_ni) == NULL) {
                notify = 1;
-               last_alive = peer->ksnp_last_alive;
+               last_alive = peer_ni->ksnp_last_alive;
        }
 
        read_unlock(&ksocknal_data.ksnd_global_lock);
 
        if (notify)
-               lnet_notify(peer->ksnp_ni, peer->ksnp_id.nid, 0,
+               lnet_notify(peer_ni->ksnp_ni, peer_ni->ksnp_id.nid, 0,
                            last_alive);
 }
 
 void
 ksocknal_finalize_zcreq(ksock_conn_t *conn)
 {
-       ksock_peer_t     *peer = conn->ksnc_peer;
+       ksock_peer_ni_t  *peer_ni = conn->ksnc_peer;
        ksock_tx_t       *tx;
        ksock_tx_t       *tmp;
        struct list_head  zlist = LIST_HEAD_INIT(zlist);
@@ -1518,9 +1519,9 @@ ksocknal_finalize_zcreq(ksock_conn_t *conn)
         * abort all buffered data */
        LASSERT(conn->ksnc_sock == NULL);
 
-       spin_lock(&peer->ksnp_lock);
+       spin_lock(&peer_ni->ksnp_lock);
 
-       list_for_each_entry_safe(tx, tmp, &peer->ksnp_zc_req_list, tx_zc_list) {
+       list_for_each_entry_safe(tx, tmp, &peer_ni->ksnp_zc_req_list, tx_zc_list) {
                if (tx->tx_conn != conn)
                        continue;
 
@@ -1532,7 +1533,7 @@ ksocknal_finalize_zcreq(ksock_conn_t *conn)
                list_add(&tx->tx_zc_list, &zlist);
        }
 
-       spin_unlock(&peer->ksnp_lock);
+       spin_unlock(&peer_ni->ksnp_lock);
 
        while (!list_empty(&zlist)) {
                tx = list_entry(zlist.next, ksock_tx_t, tx_zc_list);
@@ -1549,7 +1550,7 @@ ksocknal_terminate_conn(ksock_conn_t *conn)
          * disengage the socket from its callbacks and close it.
          * ksnc_refcount will eventually hit zero, and then the reaper will
          * destroy it. */
-        ksock_peer_t     *peer = conn->ksnc_peer;
+        ksock_peer_ni_t     *peer_ni = conn->ksnc_peer;
         ksock_sched_t    *sched = conn->ksnc_scheduler;
         int               failed = 0;
 
@@ -1583,17 +1584,17 @@ ksocknal_terminate_conn(ksock_conn_t *conn)
          * scheduler yet, but it _has_ committed to terminate... */
         conn->ksnc_scheduler->kss_nconns--;
 
-        if (peer->ksnp_error != 0) {
-                /* peer's last conn closed in error */
-               LASSERT(list_empty(&peer->ksnp_conns));
+        if (peer_ni->ksnp_error != 0) {
+                /* peer_ni's last conn closed in error */
+               LASSERT(list_empty(&peer_ni->ksnp_conns));
                 failed = 1;
-                peer->ksnp_error = 0;     /* avoid multiple notifications */
+                peer_ni->ksnp_error = 0;     /* avoid multiple notifications */
         }
 
        write_unlock_bh(&ksocknal_data.ksnd_global_lock);
 
         if (failed)
-                ksocknal_peer_failed(peer);
+                ksocknal_peer_failed(peer_ni);
 
         /* The socket is closed on the final put; either here, or in
          * ksocknal_{send,recv}msg().  Since we set up the linger2 option
@@ -1683,14 +1684,14 @@ ksocknal_destroy_conn (ksock_conn_t *conn)
 }
 
 int
-ksocknal_close_peer_conns_locked (ksock_peer_t *peer, __u32 ipaddr, int why)
+ksocknal_close_peer_conns_locked (ksock_peer_ni_t *peer_ni, __u32 ipaddr, int why)
 {
         ksock_conn_t       *conn;
        struct list_head         *ctmp;
        struct list_head         *cnxt;
         int                 count = 0;
 
-       list_for_each_safe(ctmp, cnxt, &peer->ksnp_conns) {
+       list_for_each_safe(ctmp, cnxt, &peer_ni->ksnp_conns) {
                conn = list_entry(ctmp, ksock_conn_t, ksnc_list);
 
                 if (ipaddr == 0 ||
@@ -1706,13 +1707,13 @@ ksocknal_close_peer_conns_locked (ksock_peer_t *peer, __u32 ipaddr, int why)
 int
 ksocknal_close_conn_and_siblings (ksock_conn_t *conn, int why)
 {
-        ksock_peer_t     *peer = conn->ksnc_peer;
+        ksock_peer_ni_t     *peer_ni = conn->ksnc_peer;
         __u32             ipaddr = conn->ksnc_ipaddr;
         int               count;
 
        write_lock_bh(&ksocknal_data.ksnd_global_lock);
 
-        count = ksocknal_close_peer_conns_locked (peer, ipaddr, why);
+        count = ksocknal_close_peer_conns_locked (peer_ni, ipaddr, why);
 
        write_unlock_bh(&ksocknal_data.ksnd_global_lock);
 
@@ -1722,7 +1723,7 @@ ksocknal_close_conn_and_siblings (ksock_conn_t *conn, int why)
 int
 ksocknal_close_matching_conns (lnet_process_id_t id, __u32 ipaddr)
 {
-        ksock_peer_t       *peer;
+        ksock_peer_ni_t       *peer_ni;
        struct list_head         *ptmp;
        struct list_head         *pnxt;
         int                 lo;
@@ -1742,13 +1743,13 @@ ksocknal_close_matching_conns (lnet_process_id_t id, __u32 ipaddr)
         for (i = lo; i <= hi; i++) {
                list_for_each_safe(ptmp, pnxt, &ksocknal_data.ksnd_peers[i]) {
 
-                       peer = list_entry(ptmp, ksock_peer_t, ksnp_list);
+                       peer_ni = list_entry(ptmp, ksock_peer_ni_t, ksnp_list);
 
-                        if (!((id.nid == LNET_NID_ANY || id.nid == peer->ksnp_id.nid) &&
-                              (id.pid == LNET_PID_ANY || id.pid == peer->ksnp_id.pid)))
+                        if (!((id.nid == LNET_NID_ANY || id.nid == peer_ni->ksnp_id.nid) &&
+                              (id.pid == LNET_PID_ANY || id.pid == peer_ni->ksnp_id.pid)))
                                 continue;
 
-                        count += ksocknal_close_peer_conns_locked (peer, ipaddr, 0);
+                        count += ksocknal_close_peer_conns_locked (peer_ni, ipaddr, 0);
                 }
         }
 
@@ -1790,7 +1791,7 @@ ksocknal_query(lnet_ni_t *ni, lnet_nid_t nid, cfs_time_t *when)
        int connect = 1;
        time64_t last_alive = 0;
        time64_t now = ktime_get_real_seconds();
-       ksock_peer_t *peer = NULL;
+       ksock_peer_ni_t *peer_ni = NULL;
        rwlock_t *glock = &ksocknal_data.ksnd_global_lock;
        lnet_process_id_t id = {
                .nid = nid,
@@ -1799,13 +1800,13 @@ ksocknal_query(lnet_ni_t *ni, lnet_nid_t nid, cfs_time_t *when)
 
        read_lock(glock);
 
-        peer = ksocknal_find_peer_locked(ni, id);
-        if (peer != NULL) {
+        peer_ni = ksocknal_find_peer_locked(ni, id);
+        if (peer_ni != NULL) {
                struct list_head       *tmp;
                 ksock_conn_t     *conn;
                 int               bufnob;
 
-               list_for_each(tmp, &peer->ksnp_conns) {
+               list_for_each(tmp, &peer_ni->ksnp_conns) {
                        conn = list_entry(tmp, ksock_conn_t, ksnc_list);
                        bufnob = conn->ksnc_sock->sk->sk_wmem_queued;
 
@@ -1813,13 +1814,13 @@ ksocknal_query(lnet_ni_t *ni, lnet_nid_t nid, cfs_time_t *when)
                                 /* something got ACKed */
                                 conn->ksnc_tx_deadline =
                                         cfs_time_shift(*ksocknal_tunables.ksnd_timeout);
-                                peer->ksnp_last_alive = now;
+                                peer_ni->ksnp_last_alive = now;
                                 conn->ksnc_tx_bufnob = bufnob;
                         }
                 }
 
-                last_alive = peer->ksnp_last_alive;
-                if (ksocknal_find_connectable_route_locked(peer) == NULL)
+                last_alive = peer_ni->ksnp_last_alive;
+                if (ksocknal_find_connectable_route_locked(peer_ni) == NULL)
                         connect = 0;
         }
 
@@ -1828,8 +1829,8 @@ ksocknal_query(lnet_ni_t *ni, lnet_nid_t nid, cfs_time_t *when)
         if (last_alive != 0)
                 *when = last_alive;
 
-        CDEBUG(D_NET, "Peer %s %p, alive %ld secs ago, connect %d\n",
-               libcfs_nid2str(nid), peer,
+        CDEBUG(D_NET, "peer_ni %s %p, alive %ld secs ago, connect %d\n",
+               libcfs_nid2str(nid), peer_ni,
                last_alive ? cfs_duration_sec(now - last_alive) : -1,
                connect);
 
@@ -1840,16 +1841,16 @@ ksocknal_query(lnet_ni_t *ni, lnet_nid_t nid, cfs_time_t *when)
 
        write_lock_bh(glock);
 
-        peer = ksocknal_find_peer_locked(ni, id);
-        if (peer != NULL)
-                ksocknal_launch_all_connections_locked(peer);
+        peer_ni = ksocknal_find_peer_locked(ni, id);
+        if (peer_ni != NULL)
+                ksocknal_launch_all_connections_locked(peer_ni);
 
        write_unlock_bh(glock);
         return;
 }
 
 static void
-ksocknal_push_peer (ksock_peer_t *peer)
+ksocknal_push_peer (ksock_peer_ni_t *peer_ni)
 {
         int               index;
         int               i;
@@ -1862,7 +1863,7 @@ ksocknal_push_peer (ksock_peer_t *peer)
                 i = 0;
                 conn = NULL;
 
-               list_for_each(tmp, &peer->ksnp_conns) {
+               list_for_each(tmp, &peer_ni->ksnp_conns) {
                         if (i++ == index) {
                                conn = list_entry(tmp, ksock_conn_t,
                                                        ksnc_list);
@@ -1898,22 +1899,22 @@ ksocknal_push (lnet_ni_t *ni, lnet_process_id_t id)
        }
 
        for (tmp = start; tmp <= end; tmp++) {
-               int     peer_off; /* searching offset in peer hash table */
+               int     peer_off; /* searching offset in peer_ni hash table */
 
                for (peer_off = 0; ; peer_off++) {
-                       ksock_peer_t *peer;
+                       ksock_peer_ni_t *peer_ni;
                        int           i = 0;
 
                        read_lock(&ksocknal_data.ksnd_global_lock);
-                       list_for_each_entry(peer, tmp, ksnp_list) {
+                       list_for_each_entry(peer_ni, tmp, ksnp_list) {
                                if (!((id.nid == LNET_NID_ANY ||
-                                      id.nid == peer->ksnp_id.nid) &&
+                                      id.nid == peer_ni->ksnp_id.nid) &&
                                      (id.pid == LNET_PID_ANY ||
-                                      id.pid == peer->ksnp_id.pid)))
+                                      id.pid == peer_ni->ksnp_id.pid)))
                                        continue;
 
                                if (i++ == peer_off) {
-                                       ksocknal_peer_addref(peer);
+                                       ksocknal_peer_addref(peer_ni);
                                        break;
                                }
                        }
@@ -1923,8 +1924,8 @@ ksocknal_push (lnet_ni_t *ni, lnet_process_id_t id)
                                break;
 
                        rc = 0;
-                       ksocknal_push_peer(peer);
-                       ksocknal_peer_decref(peer);
+                       ksocknal_push_peer(peer_ni);
+                       ksocknal_peer_decref(peer_ni);
                }
        }
        return rc;
@@ -1939,7 +1940,7 @@ ksocknal_add_interface(lnet_ni_t *ni, __u32 ipaddress, __u32 netmask)
         int                i;
         int                j;
        struct list_head        *ptmp;
-        ksock_peer_t      *peer;
+        ksock_peer_ni_t      *peer_ni;
        struct list_head        *rtmp;
         ksock_route_t     *route;
 
@@ -1965,14 +1966,14 @@ ksocknal_add_interface(lnet_ni_t *ni, __u32 ipaddress, __u32 netmask)
 
                 for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) {
                        list_for_each(ptmp, &ksocknal_data.ksnd_peers[i]) {
-                               peer = list_entry(ptmp, ksock_peer_t,
+                               peer_ni = list_entry(ptmp, ksock_peer_ni_t,
                                                       ksnp_list);
 
-                                for (j = 0; j < peer->ksnp_n_passive_ips; j++)
-                                        if (peer->ksnp_passive_ips[j] == ipaddress)
+                                for (j = 0; j < peer_ni->ksnp_n_passive_ips; j++)
+                                        if (peer_ni->ksnp_passive_ips[j] == ipaddress)
                                                 iface->ksni_npeers++;
 
-                               list_for_each(rtmp, &peer->ksnp_routes) {
+                               list_for_each(rtmp, &peer_ni->ksnp_routes) {
                                        route = list_entry(rtmp,
                                                                ksock_route_t,
                                                                ksnr_list);
@@ -1993,7 +1994,7 @@ ksocknal_add_interface(lnet_ni_t *ni, __u32 ipaddress, __u32 netmask)
 }
 
 static void
-ksocknal_peer_del_interface_locked(ksock_peer_t *peer, __u32 ipaddr)
+ksocknal_peer_del_interface_locked(ksock_peer_ni_t *peer_ni, __u32 ipaddr)
 {
        struct list_head         *tmp;
        struct list_head         *nxt;
@@ -2002,16 +2003,16 @@ ksocknal_peer_del_interface_locked(ksock_peer_t *peer, __u32 ipaddr)
         int                 i;
         int                 j;
 
-        for (i = 0; i < peer->ksnp_n_passive_ips; i++)
-                if (peer->ksnp_passive_ips[i] == ipaddr) {
-                        for (j = i+1; j < peer->ksnp_n_passive_ips; j++)
-                                peer->ksnp_passive_ips[j-1] =
-                                        peer->ksnp_passive_ips[j];
-                        peer->ksnp_n_passive_ips--;
+        for (i = 0; i < peer_ni->ksnp_n_passive_ips; i++)
+                if (peer_ni->ksnp_passive_ips[i] == ipaddr) {
+                        for (j = i+1; j < peer_ni->ksnp_n_passive_ips; j++)
+                                peer_ni->ksnp_passive_ips[j-1] =
+                                        peer_ni->ksnp_passive_ips[j];
+                        peer_ni->ksnp_n_passive_ips--;
                         break;
                 }
 
-       list_for_each_safe(tmp, nxt, &peer->ksnp_routes) {
+       list_for_each_safe(tmp, nxt, &peer_ni->ksnp_routes) {
                route = list_entry(tmp, ksock_route_t, ksnr_list);
 
                 if (route->ksnr_myipaddr != ipaddr)
@@ -2025,7 +2026,7 @@ ksocknal_peer_del_interface_locked(ksock_peer_t *peer, __u32 ipaddr)
                 }
         }
 
-       list_for_each_safe(tmp, nxt, &peer->ksnp_conns) {
+       list_for_each_safe(tmp, nxt, &peer_ni->ksnp_conns) {
                conn = list_entry(tmp, ksock_conn_t, ksnc_list);
 
                 if (conn->ksnc_myipaddr == ipaddr)
@@ -2040,7 +2041,7 @@ ksocknal_del_interface(lnet_ni_t *ni, __u32 ipaddress)
         int                rc = -ENOENT;
        struct list_head        *tmp;
        struct list_head        *nxt;
-        ksock_peer_t      *peer;
+        ksock_peer_ni_t      *peer_ni;
         __u32              this_ip;
         int                i;
         int                j;
@@ -2065,13 +2066,13 @@ ksocknal_del_interface(lnet_ni_t *ni, __u32 ipaddress)
                 for (j = 0; j < ksocknal_data.ksnd_peer_hash_size; j++) {
                        list_for_each_safe(tmp, nxt,
                                                &ksocknal_data.ksnd_peers[j]) {
-                               peer = list_entry(tmp, ksock_peer_t,
+                               peer_ni = list_entry(tmp, ksock_peer_ni_t,
                                                       ksnp_list);
 
-                                if (peer->ksnp_ni != ni)
+                                if (peer_ni->ksnp_ni != ni)
                                         continue;
 
-                                ksocknal_peer_del_interface_locked(peer, this_ip);
+                                ksocknal_peer_del_interface_locked(peer_ni, this_ip);
                         }
                 }
         }
@@ -2494,7 +2495,7 @@ ksocknal_base_startup(void)
 static void
 ksocknal_debug_peerhash (lnet_ni_t *ni)
 {
-       ksock_peer_t    *peer = NULL;
+       ksock_peer_ni_t *peer_ni = NULL;
        struct list_head        *tmp;
        int             i;
 
@@ -2502,29 +2503,29 @@ ksocknal_debug_peerhash (lnet_ni_t *ni)
 
         for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) {
                list_for_each(tmp, &ksocknal_data.ksnd_peers[i]) {
-                       peer = list_entry(tmp, ksock_peer_t, ksnp_list);
+                       peer_ni = list_entry(tmp, ksock_peer_ni_t, ksnp_list);
 
-                        if (peer->ksnp_ni == ni) break;
+                        if (peer_ni->ksnp_ni == ni) break;
 
-                        peer = NULL;
+                        peer_ni = NULL;
                 }
         }
 
-        if (peer != NULL) {
+        if (peer_ni != NULL) {
                 ksock_route_t *route;
                 ksock_conn_t  *conn;
 
-               CWARN ("Active peer on shutdown: %s, ref %d, scnt %d, "
+               CWARN ("Active peer_ni on shutdown: %s, ref %d, scnt %d, "
                       "closing %d, accepting %d, err %d, zcookie %llu, "
-                      "txq %d, zc_req %d\n", libcfs_id2str(peer->ksnp_id),
-                      atomic_read(&peer->ksnp_refcount),
-                      peer->ksnp_sharecount, peer->ksnp_closing,
-                      peer->ksnp_accepting, peer->ksnp_error,
-                      peer->ksnp_zc_next_cookie,
-                      !list_empty(&peer->ksnp_tx_queue),
-                      !list_empty(&peer->ksnp_zc_req_list));
-
-               list_for_each(tmp, &peer->ksnp_routes) {
+                      "txq %d, zc_req %d\n", libcfs_id2str(peer_ni->ksnp_id),
+                      atomic_read(&peer_ni->ksnp_refcount),
+                      peer_ni->ksnp_sharecount, peer_ni->ksnp_closing,
+                      peer_ni->ksnp_accepting, peer_ni->ksnp_error,
+                      peer_ni->ksnp_zc_next_cookie,
+                      !list_empty(&peer_ni->ksnp_tx_queue),
+                      !list_empty(&peer_ni->ksnp_zc_req_list));
+
+               list_for_each(tmp, &peer_ni->ksnp_routes) {
                        route = list_entry(tmp, ksock_route_t, ksnr_list);
                        CWARN ("Route: ref %d, schd %d, conn %d, cnted %d, "
                               "del %d\n", atomic_read(&route->ksnr_refcount),
@@ -2532,7 +2533,7 @@ ksocknal_debug_peerhash (lnet_ni_t *ni)
                               route->ksnr_connected, route->ksnr_deleted);
                }
 
-               list_for_each(tmp, &peer->ksnp_conns) {
+               list_for_each(tmp, &peer_ni->ksnp_conns) {
                        conn = list_entry(tmp, ksock_conn_t, ksnc_list);
                        CWARN ("Conn: ref %d, sref %d, t %d, c %d\n",
                               atomic_read(&conn->ksnc_conn_refcount),
@@ -2565,7 +2566,7 @@ ksocknal_shutdown (lnet_ni_t *ni)
        /* Delete all peers */
        ksocknal_del_peer(ni, anyid, 0);
 
-       /* Wait for all peer state to clean up */
+       /* Wait for all peer_ni state to clean up */
        i = 2;
        spin_lock_bh(&net->ksnn_lock);
        while (net->ksnn_npeers != 0) {
@@ -2749,7 +2750,8 @@ ksocknal_net_start_threads(ksock_net_t *net, __u32 *cpts, int ncpts)
        int     rc;
        int     i;
 
-       LASSERT(ncpts > 0 && ncpts <= cfs_cpt_number(lnet_cpt_table()));
+       if (ncpts > 0 && ncpts > cfs_cpt_number(lnet_cpt_table()))
+               return -EINVAL;
 
        for (i = 0; i < ncpts; i++) {
                struct ksock_sched_info *info;
@@ -2771,11 +2773,13 @@ ksocknal_net_start_threads(ksock_net_t *net, __u32 *cpts, int ncpts)
 int
 ksocknal_startup (lnet_ni_t *ni)
 {
-        ksock_net_t  *net;
-        int           rc;
-        int           i;
+       ksock_net_t  *net;
+       int           rc;
+       int           i;
+       struct net_device *net_dev;
+       int node_id;
 
-        LASSERT (ni->ni_lnd == &the_ksocklnd);
+        LASSERT (ni->ni_net->net_lnd == &the_ksocklnd);
 
         if (ksocknal_data.ksnd_init == SOCKNAL_INIT_NOTHING) {
                 rc = ksocknal_base_startup();
@@ -2790,10 +2794,18 @@ ksocknal_startup (lnet_ni_t *ni)
        spin_lock_init(&net->ksnn_lock);
         net->ksnn_incarnation = ksocknal_new_incarnation();
         ni->ni_data = net;
-        ni->ni_peertimeout    = *ksocknal_tunables.ksnd_peertimeout;
-        ni->ni_maxtxcredits   = *ksocknal_tunables.ksnd_credits;
-        ni->ni_peertxcredits  = *ksocknal_tunables.ksnd_peertxcredits;
-        ni->ni_peerrtrcredits = *ksocknal_tunables.ksnd_peerrtrcredits;
+       if (!ni->ni_net->net_tunables_set) {
+               ni->ni_net->net_tunables.lct_peer_timeout =
+                       *ksocknal_tunables.ksnd_peertimeout;
+               ni->ni_net->net_tunables.lct_max_tx_credits =
+                       *ksocknal_tunables.ksnd_credits;
+               ni->ni_net->net_tunables.lct_peer_tx_credits =
+                       *ksocknal_tunables.ksnd_peertxcredits;
+               ni->ni_net->net_tunables.lct_peer_rtr_credits =
+                       *ksocknal_tunables.ksnd_peerrtrcredits;
+               ni->ni_net->net_tunables_set = true;
+       }
+
 
         if (ni->ni_interfaces[0] == NULL) {
                 rc = ksocknal_enumerate_interfaces(net);
@@ -2827,10 +2839,21 @@ ksocknal_startup (lnet_ni_t *ni)
                        strlcpy(net->ksnn_interfaces[i].ksni_name,
                                ni->ni_interfaces[i],
                                sizeof(net->ksnn_interfaces[i].ksni_name));
+
                }
                net->ksnn_ninterfaces = i;
        }
 
+       net_dev = dev_get_by_name(&init_net,
+                                 net->ksnn_interfaces[0].ksni_name);
+       if (net_dev != NULL) {
+               node_id = dev_to_node(&net_dev->dev);
+               ni->ni_dev_cpt = cfs_cpt_of_node(lnet_cpt_table(), node_id);
+               dev_put(net_dev);
+       } else {
+               ni->ni_dev_cpt = CFS_CPT_ANY;
+       }
+
        /* call it before add it to ksocknal_data.ksnd_nets */
        rc = ksocknal_net_start_threads(net, ni->ni_cpts, ni->ni_ncpts);
        if (rc != 0)
index 6d38849..fd24314 100644 (file)
@@ -71,7 +71,7 @@
 #define SOCKNAL_NSCHEDS                3
 #define SOCKNAL_NSCHEDS_HIGH   (SOCKNAL_NSCHEDS << 1)
 
-#define SOCKNAL_PEER_HASH_SIZE  101             /* # peer lists */
+#define SOCKNAL_PEER_HASH_SIZE  101             /* # peer_ni lists */
 #define SOCKNAL_RESCHED         100             /* # scheduler loops before reschedule */
 #define SOCKNAL_INSANITY_RECONN 5000            /* connd is trying on reconn infinitely */
 #define SOCKNAL_ENOMEM_RETRY    CFS_TICK        /* jiffies between retries */
@@ -154,9 +154,9 @@ typedef struct
         int              *ksnd_keepalive_count; /* # probes */
         int              *ksnd_keepalive_intvl; /* time between probes */
         int              *ksnd_credits;         /* # concurrent sends */
-        int              *ksnd_peertxcredits;   /* # concurrent sends to 1 peer */
-        int              *ksnd_peerrtrcredits;  /* # per-peer router buffer credits */
-        int              *ksnd_peertimeout;     /* seconds to consider peer dead */
+        int              *ksnd_peertxcredits;   /* # concurrent sends to 1 peer_ni */
+        int              *ksnd_peerrtrcredits;  /* # per-peer_ni router buffer credits */
+        int              *ksnd_peertimeout;     /* seconds to consider peer_ni dead */
         int              *ksnd_enable_csum;     /* enable check sum */
         int              *ksnd_inject_csum_error; /* set non-zero to inject checksum error */
         int              *ksnd_nonblk_zcack;    /* always send zc-ack on non-blocking connection */
@@ -196,7 +196,7 @@ typedef struct
        int                     ksnd_init;      /* initialisation state */
        int                     ksnd_nnets;     /* # networks set up */
        struct list_head        ksnd_nets;      /* list of nets */
-       /* stabilize peer/conn ops */
+       /* stabilize peer_ni/conn ops */
        rwlock_t                ksnd_global_lock;
        /* hash table of all my known peers */
        struct list_head        *ksnd_peers;
@@ -273,7 +273,7 @@ struct ksock_proto;                             /* forward ref */
 typedef struct                                  /* transmit packet */
 {
        struct list_head   tx_list;     /* queue on conn for transmission etc */
-       struct list_head   tx_zc_list;  /* queue on peer for ZC request */
+       struct list_head   tx_zc_list;  /* queue on peer_ni for ZC request */
        atomic_t       tx_refcount;    /* tx reference count */
        int            tx_nob;         /* # packet bytes */
        int            tx_resid;       /* residual bytes */
@@ -321,9 +321,9 @@ typedef union {
 
 typedef struct ksock_conn
 {
-       struct ksock_peer  *ksnc_peer;          /* owning peer */
+       struct ksock_peer  *ksnc_peer;          /* owning peer_ni */
        struct ksock_route *ksnc_route;         /* owning route */
-       struct list_head    ksnc_list;          /* stash on peer's conn list */
+       struct list_head    ksnc_list;          /* stash on peer_ni's conn list */
        struct socket       *ksnc_sock;         /* actual socket */
        void                *ksnc_saved_data_ready; /* socket's original data_ready() callback */
        void                *ksnc_saved_write_space; /* socket's original write_space() callback */
@@ -331,8 +331,8 @@ typedef struct ksock_conn
        atomic_t            ksnc_sock_refcount; /* sock refcount */
        ksock_sched_t       *ksnc_scheduler;  /* who schedules this connection */
        __u32               ksnc_myipaddr;   /* my IP */
-        __u32               ksnc_ipaddr;     /* peer's IP */
-        int                 ksnc_port;       /* peer's port */
+        __u32               ksnc_ipaddr;     /* peer_ni's IP */
+        int                 ksnc_port;       /* peer_ni's port */
        signed int          ksnc_type:3;     /* type of connection,
                                              * should be signed value */
        unsigned int        ksnc_closing:1;  /* being shut down */
@@ -388,9 +388,9 @@ typedef struct ksock_conn
 
 typedef struct ksock_route
 {
-       struct list_head   ksnr_list;           /* chain on peer route list */
+       struct list_head   ksnr_list;           /* chain on peer_ni route list */
        struct list_head   ksnr_connd_list;     /* chain on ksnr_connd_routes */
-       struct ksock_peer *ksnr_peer;           /* owning peer */
+       struct ksock_peer *ksnr_peer;           /* owning peer_ni */
        atomic_t           ksnr_refcount;       /* # users */
        cfs_time_t            ksnr_timeout;     /* when (in jiffies) reconnection can happen next */
        cfs_duration_t        ksnr_retry_interval; /* how long between retries */
@@ -400,7 +400,7 @@ typedef struct ksock_route
         unsigned int          ksnr_scheduled:1; /* scheduled for attention */
         unsigned int          ksnr_connecting:1;/* connection establishment in progress */
         unsigned int          ksnr_connected:4; /* connections established by type */
-        unsigned int          ksnr_deleted:1;   /* been removed from peer? */
+        unsigned int          ksnr_deleted:1;   /* been removed from peer_ni? */
         unsigned int          ksnr_share_count; /* created explicitly? */
         int                   ksnr_conn_count;  /* # conns established by this route */
 } ksock_route_t;
@@ -409,7 +409,7 @@ typedef struct ksock_route
 
 typedef struct ksock_peer
 {
-       struct list_head        ksnp_list;      /* stash on global peer list */
+       struct list_head        ksnp_list;      /* stash on global peer_ni list */
        cfs_time_t            ksnp_last_alive;  /* when (in jiffies) I was last alive */
        lnet_process_id_t     ksnp_id;       /* who's on the other end(s) */
        atomic_t              ksnp_refcount; /* # users */
@@ -418,8 +418,8 @@ typedef struct ksock_peer
         int                   ksnp_accepting;/* # passive connections pending */
         int                   ksnp_error;    /* errno on closing last conn */
         __u64                 ksnp_zc_next_cookie;/* ZC completion cookie */
-        __u64                 ksnp_incarnation;   /* latest known peer incarnation */
-        struct ksock_proto   *ksnp_proto;    /* latest known peer protocol */
+        __u64                 ksnp_incarnation;   /* latest known peer_ni incarnation */
+        struct ksock_proto   *ksnp_proto;    /* latest known peer_ni protocol */
        struct list_head        ksnp_conns;     /* all active connections */
        struct list_head        ksnp_routes;    /* routes */
        struct list_head        ksnp_tx_queue;  /* waiting packets */
@@ -430,7 +430,7 @@ typedef struct ksock_peer
         lnet_ni_t            *ksnp_ni;       /* which network */
         int                   ksnp_n_passive_ips; /* # of... */
         __u32                 ksnp_passive_ips[LNET_MAX_INTERFACES]; /* preferred local interfaces */
-} ksock_peer_t;
+} ksock_peer_ni_t;
 
 typedef struct ksock_connreq
 {
@@ -592,20 +592,20 @@ ksocknal_route_decref (ksock_route_t *route)
 }
 
 static inline void
-ksocknal_peer_addref (ksock_peer_t *peer)
+ksocknal_peer_addref (ksock_peer_ni_t *peer_ni)
 {
-       LASSERT (atomic_read (&peer->ksnp_refcount) > 0);
-       atomic_inc(&peer->ksnp_refcount);
+       LASSERT (atomic_read (&peer_ni->ksnp_refcount) > 0);
+       atomic_inc(&peer_ni->ksnp_refcount);
 }
 
-extern void ksocknal_destroy_peer (ksock_peer_t *peer);
+extern void ksocknal_destroy_peer (ksock_peer_ni_t *peer_ni);
 
 static inline void
-ksocknal_peer_decref (ksock_peer_t *peer)
+ksocknal_peer_decref (ksock_peer_ni_t *peer_ni)
 {
-       LASSERT (atomic_read (&peer->ksnp_refcount) > 0);
-       if (atomic_dec_and_test(&peer->ksnp_refcount))
-               ksocknal_destroy_peer (peer);
+       LASSERT (atomic_read (&peer_ni->ksnp_refcount) > 0);
+       if (atomic_dec_and_test(&peer_ni->ksnp_refcount))
+               ksocknal_destroy_peer (peer_ni);
 }
 
 int ksocknal_startup (lnet_ni_t *ni);
@@ -619,19 +619,19 @@ int ksocknal_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg,
 int ksocknal_accept(lnet_ni_t *ni, struct socket *sock);
 
 extern int ksocknal_add_peer(lnet_ni_t *ni, lnet_process_id_t id, __u32 ip, int port);
-extern ksock_peer_t *ksocknal_find_peer_locked (lnet_ni_t *ni, lnet_process_id_t id);
-extern ksock_peer_t *ksocknal_find_peer (lnet_ni_t *ni, lnet_process_id_t id);
-extern void ksocknal_peer_failed (ksock_peer_t *peer);
+extern ksock_peer_ni_t *ksocknal_find_peer_locked (lnet_ni_t *ni, lnet_process_id_t id);
+extern ksock_peer_ni_t *ksocknal_find_peer (lnet_ni_t *ni, lnet_process_id_t id);
+extern void ksocknal_peer_failed (ksock_peer_ni_t *peer_ni);
 extern int ksocknal_create_conn (lnet_ni_t *ni, ksock_route_t *route,
                                 struct socket *sock, int type);
 extern void ksocknal_close_conn_locked (ksock_conn_t *conn, int why);
 extern void ksocknal_terminate_conn (ksock_conn_t *conn);
 extern void ksocknal_destroy_conn (ksock_conn_t *conn);
-extern int  ksocknal_close_peer_conns_locked (ksock_peer_t *peer,
+extern int  ksocknal_close_peer_conns_locked (ksock_peer_ni_t *peer_ni,
                                               __u32 ipaddr, int why);
 extern int ksocknal_close_conn_and_siblings (ksock_conn_t *conn, int why);
 extern int ksocknal_close_matching_conns(lnet_process_id_t id, __u32 ipaddr);
-extern ksock_conn_t *ksocknal_find_conn_locked(ksock_peer_t *peer,
+extern ksock_conn_t *ksocknal_find_conn_locked(ksock_peer_ni_t *peer_ni,
                                                ksock_tx_t *tx, int nonblk);
 
 extern int  ksocknal_launch_packet(lnet_ni_t *ni, ksock_tx_t *tx,
@@ -647,9 +647,9 @@ extern void ksocknal_notify (lnet_ni_t *ni, lnet_nid_t gw_nid, int alive);
 extern void ksocknal_query (struct lnet_ni *ni, lnet_nid_t nid, cfs_time_t *when);
 extern int ksocknal_thread_start(int (*fn)(void *arg), void *arg, char *name);
 extern void ksocknal_thread_fini (void);
-extern void ksocknal_launch_all_connections_locked (ksock_peer_t *peer);
-extern ksock_route_t *ksocknal_find_connectable_route_locked (ksock_peer_t *peer);
-extern ksock_route_t *ksocknal_find_connecting_route_locked (ksock_peer_t *peer);
+extern void ksocknal_launch_all_connections_locked (ksock_peer_ni_t *peer_ni);
+extern ksock_route_t *ksocknal_find_connectable_route_locked (ksock_peer_ni_t *peer_ni);
+extern ksock_route_t *ksocknal_find_connecting_route_locked (ksock_peer_ni_t *peer_ni);
 extern int ksocknal_new_packet (ksock_conn_t *conn, int skip);
 extern int ksocknal_scheduler (void *arg);
 extern int ksocknal_connd (void *arg);
index 386006e..b69599b 100644 (file)
@@ -439,11 +439,11 @@ static void
 ksocknal_check_zc_req(ksock_tx_t *tx)
 {
         ksock_conn_t   *conn = tx->tx_conn;
-        ksock_peer_t   *peer = conn->ksnc_peer;
+        ksock_peer_ni_t   *peer_ni = conn->ksnc_peer;
 
         /* Set tx_msg.ksm_zc_cookies[0] to a unique non-zero cookie and add tx
          * to ksnp_zc_req_list if some fragment of this message should be sent
-         * zero-copy.  Our peer will send an ACK containing this cookie when
+         * zero-copy.  Our peer_ni will send an ACK containing this cookie when
          * she has received this message to tell us we can signal completion.
          * tx_msg.ksm_zc_cookies[0] remains non-zero while tx is on
          * ksnp_zc_req_list. */
@@ -461,46 +461,46 @@ ksocknal_check_zc_req(ksock_tx_t *tx)
 
         ksocknal_tx_addref(tx);
 
-       spin_lock(&peer->ksnp_lock);
+       spin_lock(&peer_ni->ksnp_lock);
 
-        /* ZC_REQ is going to be pinned to the peer */
+        /* ZC_REQ is going to be pinned to the peer_ni */
         tx->tx_deadline =
                 cfs_time_shift(*ksocknal_tunables.ksnd_timeout);
 
         LASSERT (tx->tx_msg.ksm_zc_cookies[0] == 0);
 
-        tx->tx_msg.ksm_zc_cookies[0] = peer->ksnp_zc_next_cookie++;
+        tx->tx_msg.ksm_zc_cookies[0] = peer_ni->ksnp_zc_next_cookie++;
 
-        if (peer->ksnp_zc_next_cookie == 0)
-                peer->ksnp_zc_next_cookie = SOCKNAL_KEEPALIVE_PING + 1;
+        if (peer_ni->ksnp_zc_next_cookie == 0)
+                peer_ni->ksnp_zc_next_cookie = SOCKNAL_KEEPALIVE_PING + 1;
 
-       list_add_tail(&tx->tx_zc_list, &peer->ksnp_zc_req_list);
+       list_add_tail(&tx->tx_zc_list, &peer_ni->ksnp_zc_req_list);
 
-       spin_unlock(&peer->ksnp_lock);
+       spin_unlock(&peer_ni->ksnp_lock);
 }
 
 static void
 ksocknal_uncheck_zc_req(ksock_tx_t *tx)
 {
-       ksock_peer_t   *peer = tx->tx_conn->ksnc_peer;
+       ksock_peer_ni_t   *peer_ni = tx->tx_conn->ksnc_peer;
 
        LASSERT(tx->tx_msg.ksm_type != KSOCK_MSG_NOOP);
        LASSERT(tx->tx_zc_capable);
 
        tx->tx_zc_checked = 0;
 
-       spin_lock(&peer->ksnp_lock);
+       spin_lock(&peer_ni->ksnp_lock);
 
        if (tx->tx_msg.ksm_zc_cookies[0] == 0) {
                /* Not waiting for an ACK */
-               spin_unlock(&peer->ksnp_lock);
+               spin_unlock(&peer_ni->ksnp_lock);
                return;
        }
 
        tx->tx_msg.ksm_zc_cookies[0] = 0;
        list_del(&tx->tx_zc_list);
 
-       spin_unlock(&peer->ksnp_lock);
+       spin_unlock(&peer_ni->ksnp_lock);
 
        ksocknal_tx_decref(tx);
 }
@@ -606,14 +606,14 @@ ksocknal_launch_connection_locked (ksock_route_t *route)
 }
 
 void
-ksocknal_launch_all_connections_locked (ksock_peer_t *peer)
+ksocknal_launch_all_connections_locked (ksock_peer_ni_t *peer_ni)
 {
         ksock_route_t *route;
 
         /* called holding write lock on ksnd_global_lock */
         for (;;) {
                 /* launch any/all connections that need it */
-                route = ksocknal_find_connectable_route_locked(peer);
+                route = ksocknal_find_connectable_route_locked(peer_ni);
                 if (route == NULL)
                         return;
 
@@ -622,7 +622,7 @@ ksocknal_launch_all_connections_locked (ksock_peer_t *peer)
 }
 
 ksock_conn_t *
-ksocknal_find_conn_locked(ksock_peer_t *peer, ksock_tx_t *tx, int nonblk)
+ksocknal_find_conn_locked(ksock_peer_ni_t *peer_ni, ksock_tx_t *tx, int nonblk)
 {
        struct list_head *tmp;
         ksock_conn_t     *conn;
@@ -631,7 +631,7 @@ ksocknal_find_conn_locked(ksock_peer_t *peer, ksock_tx_t *tx, int nonblk)
         int               tnob     = 0;
         int               fnob     = 0;
 
-       list_for_each(tmp, &peer->ksnp_conns) {
+       list_for_each(tmp, &peer_ni->ksnp_conns) {
                ksock_conn_t *c  = list_entry(tmp, ksock_conn_t, ksnc_list);
                int           nob = atomic_read(&c->ksnc_tx_nob) +
                                        c->ksnc_sock->sk->sk_wmem_queued;
@@ -777,13 +777,13 @@ ksocknal_queue_tx_locked (ksock_tx_t *tx, ksock_conn_t *conn)
 
 
 ksock_route_t *
-ksocknal_find_connectable_route_locked (ksock_peer_t *peer)
+ksocknal_find_connectable_route_locked (ksock_peer_ni_t *peer_ni)
 {
         cfs_time_t     now = cfs_time_current();
        struct list_head    *tmp;
         ksock_route_t *route;
 
-       list_for_each(tmp, &peer->ksnp_routes) {
+       list_for_each(tmp, &peer_ni->ksnp_routes) {
                route = list_entry(tmp, ksock_route_t, ksnr_list);
 
                 LASSERT (!route->ksnr_connecting || route->ksnr_scheduled);
@@ -814,12 +814,12 @@ ksocknal_find_connectable_route_locked (ksock_peer_t *peer)
 }
 
 ksock_route_t *
-ksocknal_find_connecting_route_locked (ksock_peer_t *peer)
+ksocknal_find_connecting_route_locked (ksock_peer_ni_t *peer_ni)
 {
        struct list_head        *tmp;
         ksock_route_t     *route;
 
-       list_for_each(tmp, &peer->ksnp_routes) {
+       list_for_each(tmp, &peer_ni->ksnp_routes) {
                route = list_entry(tmp, ksock_route_t, ksnr_list);
 
                 LASSERT (!route->ksnr_connecting || route->ksnr_scheduled);
@@ -834,7 +834,7 @@ ksocknal_find_connecting_route_locked (ksock_peer_t *peer)
 int
 ksocknal_launch_packet (lnet_ni_t *ni, ksock_tx_t *tx, lnet_process_id_t id)
 {
-        ksock_peer_t     *peer;
+        ksock_peer_ni_t     *peer_ni;
         ksock_conn_t     *conn;
        rwlock_t     *g_lock;
         int               retry;
@@ -846,10 +846,10 @@ ksocknal_launch_packet (lnet_ni_t *ni, ksock_tx_t *tx, lnet_process_id_t id)
 
         for (retry = 0;; retry = 1) {
                read_lock(g_lock);
-                peer = ksocknal_find_peer_locked(ni, id);
-                if (peer != NULL) {
-                        if (ksocknal_find_connectable_route_locked(peer) == NULL) {
-                                conn = ksocknal_find_conn_locked(peer, tx, tx->tx_nonblk);
+                peer_ni = ksocknal_find_peer_locked(ni, id);
+                if (peer_ni != NULL) {
+                        if (ksocknal_find_connectable_route_locked(peer_ni) == NULL) {
+                                conn = ksocknal_find_conn_locked(peer_ni, tx, tx->tx_nonblk);
                                 if (conn != NULL) {
                                         /* I've got no routes that need to be
                                          * connecting and I do have an actual
@@ -866,8 +866,8 @@ ksocknal_launch_packet (lnet_ni_t *ni, ksock_tx_t *tx, lnet_process_id_t id)
 
                write_lock_bh(g_lock);
 
-                peer = ksocknal_find_peer_locked(ni, id);
-                if (peer != NULL)
+                peer_ni = ksocknal_find_peer_locked(ni, id);
+                if (peer_ni != NULL)
                         break;
 
                write_unlock_bh(g_lock);
@@ -879,7 +879,7 @@ ksocknal_launch_packet (lnet_ni_t *ni, ksock_tx_t *tx, lnet_process_id_t id)
                 }
 
                 if (retry) {
-                        CERROR("Can't find peer %s\n", libcfs_id2str(id));
+                        CERROR("Can't find peer_ni %s\n", libcfs_id2str(id));
                         return -EHOSTUNREACH;
                 }
 
@@ -887,15 +887,15 @@ ksocknal_launch_packet (lnet_ni_t *ni, ksock_tx_t *tx, lnet_process_id_t id)
                                        LNET_NIDADDR(id.nid),
                                        lnet_acceptor_port());
                 if (rc != 0) {
-                        CERROR("Can't add peer %s: %d\n",
+                        CERROR("Can't add peer_ni %s: %d\n",
                                libcfs_id2str(id), rc);
                         return rc;
                 }
         }
 
-        ksocknal_launch_all_connections_locked(peer);
+        ksocknal_launch_all_connections_locked(peer_ni);
 
-        conn = ksocknal_find_conn_locked(peer, tx, tx->tx_nonblk);
+        conn = ksocknal_find_conn_locked(peer_ni, tx, tx->tx_nonblk);
         if (conn != NULL) {
                 /* Connection exists; queue message on it */
                 ksocknal_queue_tx_locked (tx, conn);
@@ -903,14 +903,14 @@ ksocknal_launch_packet (lnet_ni_t *ni, ksock_tx_t *tx, lnet_process_id_t id)
                 return (0);
         }
 
-        if (peer->ksnp_accepting > 0 ||
-            ksocknal_find_connecting_route_locked (peer) != NULL) {
-                /* the message is going to be pinned to the peer */
+        if (peer_ni->ksnp_accepting > 0 ||
+            ksocknal_find_connecting_route_locked (peer_ni) != NULL) {
+                /* the message is going to be pinned to the peer_ni */
                 tx->tx_deadline =
                         cfs_time_shift(*ksocknal_tunables.ksnd_timeout);
 
                 /* Queue the message until a connection is established */
-               list_add_tail(&tx->tx_list, &peer->ksnp_tx_queue);
+               list_add_tail(&tx->tx_list, &peer_ni->ksnp_tx_queue);
                write_unlock_bh(g_lock);
                 return 0;
         }
@@ -1235,7 +1235,7 @@ ksocknal_process_receive (ksock_conn_t *conn)
                 conn->ksnc_proto->pro_unpack(&conn->ksnc_msg);
 
                 if ((conn->ksnc_peer->ksnp_id.pid & LNET_PID_USERFLAG) != 0) {
-                        /* Userspace peer */
+                        /* Userspace peer_ni */
                         lhdr = &conn->ksnc_msg.ksm_u.lnetmsg.ksnm_hdr;
                         id   = &conn->ksnc_peer->ksnp_id;
 
@@ -1752,7 +1752,7 @@ ksocknal_recv_hello(lnet_ni_t *ni, ksock_conn_t *conn,
         proto = ksocknal_parse_proto_version(hello);
         if (proto == NULL) {
                 if (!active) {
-                        /* unknown protocol from peer, tell peer my protocol */
+                        /* unknown protocol from peer_ni, tell peer_ni my protocol */
                         conn->ksnc_proto = &ksocknal_protocol_v3x;
 #if SOCKNAL_VERSION_DEBUG
                         if (*ksocknal_tunables.ksnd_protocol == 2)
@@ -1792,7 +1792,7 @@ ksocknal_recv_hello(lnet_ni_t *ni, ksock_conn_t *conn,
 
         if (!active &&
             conn->ksnc_port > LNET_ACCEPTOR_MAX_RESERVED_PORT) {
-                /* Userspace NAL assigns peer process ID from socket */
+                /* Userspace NAL assigns peer_ni process ID from socket */
                 recv_id.pid = conn->ksnc_port | LNET_PID_USERFLAG;
                 recv_id.nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), conn->ksnc_ipaddr);
         } else {
@@ -1803,7 +1803,7 @@ ksocknal_recv_hello(lnet_ni_t *ni, ksock_conn_t *conn,
         if (!active) {
                 *peerid = recv_id;
 
-               /* peer determines type */
+               /* peer_ni determines type */
                conn->ksnc_type = ksocknal_invert_type(hello->kshm_ctype);
                if (conn->ksnc_type == SOCKLND_CONN_NONE) {
                        CERROR("Unexpected type %d from %s ip %pI4h\n",
@@ -1845,7 +1845,7 @@ static int
 ksocknal_connect (ksock_route_t *route)
 {
        struct list_head        zombies = LIST_HEAD_INIT(zombies);
-        ksock_peer_t     *peer = route->ksnr_peer;
+        ksock_peer_ni_t     *peer_ni = route->ksnr_peer;
         int               type;
         int               wanted;
        struct socket     *sock;
@@ -1866,19 +1866,19 @@ ksocknal_connect (ksock_route_t *route)
         for (;;) {
                 wanted = ksocknal_route_mask() & ~route->ksnr_connected;
 
-                /* stop connecting if peer/route got closed under me, or
+                /* stop connecting if peer_ni/route got closed under me, or
                  * route got connected while queued */
-                if (peer->ksnp_closing || route->ksnr_deleted ||
+                if (peer_ni->ksnp_closing || route->ksnr_deleted ||
                     wanted == 0) {
                         retry_later = 0;
                         break;
                 }
 
-                /* reschedule if peer is connecting to me */
-                if (peer->ksnp_accepting > 0) {
+                /* reschedule if peer_ni is connecting to me */
+                if (peer_ni->ksnp_accepting > 0) {
                         CDEBUG(D_NET,
-                               "peer %s(%d) already connecting to me, retry later.\n",
-                               libcfs_nid2str(peer->ksnp_id.nid), peer->ksnp_accepting);
+                               "peer_ni %s(%d) already connecting to me, retry later.\n",
+                               libcfs_nid2str(peer_ni->ksnp_id.nid), peer_ni->ksnp_accepting);
                         retry_later = 1;
                 }
 
@@ -1900,21 +1900,21 @@ ksocknal_connect (ksock_route_t *route)
 
                 if (cfs_time_aftereq(cfs_time_current(), deadline)) {
                         rc = -ETIMEDOUT;
-                        lnet_connect_console_error(rc, peer->ksnp_id.nid,
+                        lnet_connect_console_error(rc, peer_ni->ksnp_id.nid,
                                                    route->ksnr_ipaddr,
                                                    route->ksnr_port);
                         goto failed;
                 }
 
-                rc = lnet_connect(&sock, peer->ksnp_id.nid,
+                rc = lnet_connect(&sock, peer_ni->ksnp_id.nid,
                                   route->ksnr_myipaddr,
                                   route->ksnr_ipaddr, route->ksnr_port);
                 if (rc != 0)
                         goto failed;
 
-                rc = ksocknal_create_conn(peer->ksnp_ni, route, sock, type);
+                rc = ksocknal_create_conn(peer_ni->ksnp_ni, route, sock, type);
                 if (rc < 0) {
-                        lnet_connect_console_error(rc, peer->ksnp_id.nid,
+                        lnet_connect_console_error(rc, peer_ni->ksnp_id.nid,
                                                    route->ksnr_ipaddr,
                                                    route->ksnr_port);
                         goto failed;
@@ -1924,8 +1924,8 @@ ksocknal_connect (ksock_route_t *route)
                  * race or I have to renegotiate protocol version */
                 retry_later = (rc != 0);
                 if (retry_later)
-                        CDEBUG(D_NET, "peer %s: conn race, retry later.\n",
-                               libcfs_nid2str(peer->ksnp_id.nid));
+                        CDEBUG(D_NET, "peer_ni %s: conn race, retry later.\n",
+                               libcfs_nid2str(peer_ni->ksnp_id.nid));
 
                write_lock_bh(&ksocknal_data.ksnd_global_lock);
         }
@@ -1935,10 +1935,10 @@ ksocknal_connect (ksock_route_t *route)
 
         if (retry_later) {
                 /* re-queue for attention; this frees me up to handle
-                 * the peer's incoming connection request */
+                 * the peer_ni's incoming connection request */
 
                 if (rc == EALREADY ||
-                    (rc == 0 && peer->ksnp_accepting > 0)) {
+                    (rc == 0 && peer_ni->ksnp_accepting > 0)) {
                         /* We want to introduce a delay before next
                          * attempt to connect if we lost conn race,
                          * but the race is resolved quickly usually,
@@ -1974,28 +1974,28 @@ ksocknal_connect (ksock_route_t *route)
         route->ksnr_timeout = cfs_time_add(cfs_time_current(),
                                            route->ksnr_retry_interval);
 
-       if (!list_empty(&peer->ksnp_tx_queue) &&
-            peer->ksnp_accepting == 0 &&
-            ksocknal_find_connecting_route_locked(peer) == NULL) {
+       if (!list_empty(&peer_ni->ksnp_tx_queue) &&
+            peer_ni->ksnp_accepting == 0 &&
+            ksocknal_find_connecting_route_locked(peer_ni) == NULL) {
                 ksock_conn_t *conn;
 
                 /* ksnp_tx_queue is queued on a conn on successful
                  * connection for V1.x and V2.x */
-               if (!list_empty(&peer->ksnp_conns)) {
-                       conn = list_entry(peer->ksnp_conns.next,
+               if (!list_empty(&peer_ni->ksnp_conns)) {
+                       conn = list_entry(peer_ni->ksnp_conns.next,
                                               ksock_conn_t, ksnc_list);
                         LASSERT (conn->ksnc_proto == &ksocknal_protocol_v3x);
                 }
 
                 /* take all the blocked packets while I've got the lock and
                  * complete below... */
-               list_splice_init(&peer->ksnp_tx_queue, &zombies);
+               list_splice_init(&peer_ni->ksnp_tx_queue, &zombies);
         }
 
        write_unlock_bh(&ksocknal_data.ksnd_global_lock);
 
-        ksocknal_peer_failed(peer);
-        ksocknal_txlist_done(peer->ksnp_ni, &zombies, 1);
+        ksocknal_peer_failed(peer_ni);
+        ksocknal_txlist_done(peer_ni->ksnp_ni, &zombies, 1);
         return 0;
 }
 
@@ -2242,13 +2242,13 @@ ksocknal_connd (void *arg)
 }
 
 static ksock_conn_t *
-ksocknal_find_timed_out_conn (ksock_peer_t *peer)
+ksocknal_find_timed_out_conn (ksock_peer_ni_t *peer_ni)
 {
         /* We're called with a shared lock on ksnd_global_lock */
         ksock_conn_t      *conn;
        struct list_head        *ctmp;
 
-       list_for_each(ctmp, &peer->ksnp_conns) {
+       list_for_each(ctmp, &peer_ni->ksnp_conns) {
                 int     error;
                conn = list_entry(ctmp, ksock_conn_t, ksnc_list);
 
@@ -2264,7 +2264,7 @@ ksocknal_find_timed_out_conn (ksock_peer_t *peer)
                                 CNETERR("A connection with %s "
                                        "(%pI4h:%d) was reset; "
                                         "it may have rebooted.\n",
-                                        libcfs_id2str(peer->ksnp_id),
+                                        libcfs_id2str(peer_ni->ksnp_id),
                                        &conn->ksnc_ipaddr,
                                         conn->ksnc_port);
                                 break;
@@ -2272,7 +2272,7 @@ ksocknal_find_timed_out_conn (ksock_peer_t *peer)
                                 CNETERR("A connection with %s "
                                        "(%pI4h:%d) timed out; the "
                                         "network or node may be down.\n",
-                                        libcfs_id2str(peer->ksnp_id),
+                                        libcfs_id2str(peer_ni->ksnp_id),
                                        &conn->ksnc_ipaddr,
                                         conn->ksnc_port);
                                 break;
@@ -2280,7 +2280,7 @@ ksocknal_find_timed_out_conn (ksock_peer_t *peer)
                                 CNETERR("An unexpected network error %d "
                                         "occurred with %s "
                                        "(%pI4h:%d\n", error,
-                                        libcfs_id2str(peer->ksnp_id),
+                                        libcfs_id2str(peer_ni->ksnp_id),
                                        &conn->ksnc_ipaddr,
                                         conn->ksnc_port);
                                 break;
@@ -2296,7 +2296,7 @@ ksocknal_find_timed_out_conn (ksock_peer_t *peer)
                         ksocknal_conn_addref(conn);
                        CNETERR("Timeout receiving from %s (%pI4h:%d), "
                                 "state %d wanted %d left %d\n",
-                                libcfs_id2str(peer->ksnp_id),
+                                libcfs_id2str(peer_ni->ksnp_id),
                                &conn->ksnc_ipaddr,
                                 conn->ksnc_port,
                                 conn->ksnc_rx_state,
@@ -2314,7 +2314,7 @@ ksocknal_find_timed_out_conn (ksock_peer_t *peer)
                         ksocknal_conn_addref(conn);
                        CNETERR("Timeout sending data to %s (%pI4h:%d) "
                                 "the network or that node may be down.\n",
-                                libcfs_id2str(peer->ksnp_id),
+                                libcfs_id2str(peer_ni->ksnp_id),
                                &conn->ksnc_ipaddr, conn->ksnc_port);
                         return (conn);
                 }
@@ -2324,15 +2324,15 @@ ksocknal_find_timed_out_conn (ksock_peer_t *peer)
 }
 
 static inline void
-ksocknal_flush_stale_txs(ksock_peer_t *peer)
+ksocknal_flush_stale_txs(ksock_peer_ni_t *peer_ni)
 {
         ksock_tx_t        *tx;
        struct list_head        stale_txs = LIST_HEAD_INIT(stale_txs);
 
        write_lock_bh(&ksocknal_data.ksnd_global_lock);
 
-       while (!list_empty(&peer->ksnp_tx_queue)) {
-               tx = list_entry(peer->ksnp_tx_queue.next,
+       while (!list_empty(&peer_ni->ksnp_tx_queue)) {
+               tx = list_entry(peer_ni->ksnp_tx_queue.next,
                                      ksock_tx_t, tx_list);
 
                 if (!cfs_time_aftereq(cfs_time_current(),
@@ -2345,11 +2345,11 @@ ksocknal_flush_stale_txs(ksock_peer_t *peer)
 
        write_unlock_bh(&ksocknal_data.ksnd_global_lock);
 
-        ksocknal_txlist_done(peer->ksnp_ni, &stale_txs, 1);
+        ksocknal_txlist_done(peer_ni->ksnp_ni, &stale_txs, 1);
 }
 
 static int
-ksocknal_send_keepalive_locked(ksock_peer_t *peer)
+ksocknal_send_keepalive_locked(ksock_peer_ni_t *peer_ni)
 __must_hold(&ksocknal_data.ksnd_global_lock)
 {
         ksock_sched_t  *sched;
@@ -2357,27 +2357,27 @@ __must_hold(&ksocknal_data.ksnd_global_lock)
         ksock_tx_t     *tx;
 
        /* last_alive will be updated by create_conn */
-       if (list_empty(&peer->ksnp_conns))
+       if (list_empty(&peer_ni->ksnp_conns))
                 return 0;
 
-        if (peer->ksnp_proto != &ksocknal_protocol_v3x)
+        if (peer_ni->ksnp_proto != &ksocknal_protocol_v3x)
                 return 0;
 
         if (*ksocknal_tunables.ksnd_keepalive <= 0 ||
             cfs_time_before(cfs_time_current(),
-                            cfs_time_add(peer->ksnp_last_alive,
+                            cfs_time_add(peer_ni->ksnp_last_alive,
                                          cfs_time_seconds(*ksocknal_tunables.ksnd_keepalive))))
                 return 0;
 
         if (cfs_time_before(cfs_time_current(),
-                            peer->ksnp_send_keepalive))
+                            peer_ni->ksnp_send_keepalive))
                 return 0;
 
         /* retry 10 secs later, so we wouldn't put pressure
-         * on this peer if we failed to send keepalive this time */
-        peer->ksnp_send_keepalive = cfs_time_shift(10);
+         * on this peer_ni if we failed to send keepalive this time */
+        peer_ni->ksnp_send_keepalive = cfs_time_shift(10);
 
-        conn = ksocknal_find_conn_locked(peer, NULL, 1);
+        conn = ksocknal_find_conn_locked(peer_ni, NULL, 1);
         if (conn != NULL) {
                 sched = conn->ksnc_scheduler;
 
@@ -2400,7 +2400,7 @@ __must_hold(&ksocknal_data.ksnd_global_lock)
                return -ENOMEM;
        }
 
-       if (ksocknal_launch_packet(peer->ksnp_ni, tx, peer->ksnp_id) == 0) {
+       if (ksocknal_launch_packet(peer_ni->ksnp_ni, tx, peer_ni->ksnp_id) == 0) {
                read_lock(&ksocknal_data.ksnd_global_lock);
                return 1;
        }
@@ -2416,7 +2416,7 @@ static void
 ksocknal_check_peer_timeouts (int idx)
 {
        struct list_head       *peers = &ksocknal_data.ksnd_peers[idx];
-        ksock_peer_t     *peer;
+        ksock_peer_ni_t     *peer_ni;
         ksock_conn_t     *conn;
         ksock_tx_t       *tx;
 
@@ -2426,18 +2426,18 @@ ksocknal_check_peer_timeouts (int idx)
          * take a look... */
        read_lock(&ksocknal_data.ksnd_global_lock);
 
-       list_for_each_entry(peer, peers, ksnp_list) {
+       list_for_each_entry(peer_ni, peers, ksnp_list) {
                ksock_tx_t *tx_stale;
                cfs_time_t  deadline = 0;
                int         resid = 0;
                int         n     = 0;
 
-                if (ksocknal_send_keepalive_locked(peer) != 0) {
+                if (ksocknal_send_keepalive_locked(peer_ni) != 0) {
                        read_unlock(&ksocknal_data.ksnd_global_lock);
                         goto again;
                 }
 
-                conn = ksocknal_find_timed_out_conn (peer);
+                conn = ksocknal_find_timed_out_conn (peer_ni);
 
                 if (conn != NULL) {
                        read_unlock(&ksocknal_data.ksnd_global_lock);
@@ -2445,7 +2445,7 @@ ksocknal_check_peer_timeouts (int idx)
                         ksocknal_close_conn_and_siblings (conn, -ETIMEDOUT);
 
                         /* NB we won't find this one again, but we can't
-                         * just proceed with the next peer, since we dropped
+                         * just proceed with the next peer_ni, since we dropped
                          * ksnd_global_lock and it might be dead already! */
                         ksocknal_conn_decref(conn);
                         goto again;
@@ -2453,30 +2453,30 @@ ksocknal_check_peer_timeouts (int idx)
 
                 /* we can't process stale txs right here because we're
                  * holding only shared lock */
-               if (!list_empty(&peer->ksnp_tx_queue)) {
+               if (!list_empty(&peer_ni->ksnp_tx_queue)) {
                         ksock_tx_t *tx =
-                               list_entry(peer->ksnp_tx_queue.next,
+                               list_entry(peer_ni->ksnp_tx_queue.next,
                                                 ksock_tx_t, tx_list);
 
                         if (cfs_time_aftereq(cfs_time_current(),
                                              tx->tx_deadline)) {
 
-                                ksocknal_peer_addref(peer);
+                                ksocknal_peer_addref(peer_ni);
                                read_unlock(&ksocknal_data.ksnd_global_lock);
 
-                                ksocknal_flush_stale_txs(peer);
+                                ksocknal_flush_stale_txs(peer_ni);
 
-                                ksocknal_peer_decref(peer);
+                                ksocknal_peer_decref(peer_ni);
                                 goto again;
                         }
                 }
 
-               if (list_empty(&peer->ksnp_zc_req_list))
+               if (list_empty(&peer_ni->ksnp_zc_req_list))
                         continue;
 
                tx_stale = NULL;
-               spin_lock(&peer->ksnp_lock);
-               list_for_each_entry(tx, &peer->ksnp_zc_req_list, tx_zc_list) {
+               spin_lock(&peer_ni->ksnp_lock);
+               list_for_each_entry(tx, &peer_ni->ksnp_zc_req_list, tx_zc_list) {
                         if (!cfs_time_aftereq(cfs_time_current(),
                                               tx->tx_deadline))
                                 break;
@@ -2489,7 +2489,7 @@ ksocknal_check_peer_timeouts (int idx)
                 }
 
                if (tx_stale == NULL) {
-                       spin_unlock(&peer->ksnp_lock);
+                       spin_unlock(&peer_ni->ksnp_lock);
                        continue;
                }
 
@@ -2498,13 +2498,13 @@ ksocknal_check_peer_timeouts (int idx)
                conn     = tx_stale->tx_conn;
                ksocknal_conn_addref(conn);
 
-               spin_unlock(&peer->ksnp_lock);
+               spin_unlock(&peer_ni->ksnp_lock);
                read_unlock(&ksocknal_data.ksnd_global_lock);
 
-               CERROR("Total %d stale ZC_REQs for peer %s detected; the "
+               CERROR("Total %d stale ZC_REQs for peer_ni %s detected; the "
                       "oldest(%p) timed out %ld secs ago, "
                       "resid: %d, wmem: %d\n",
-                      n, libcfs_nid2str(peer->ksnp_id.nid), tx_stale,
+                      n, libcfs_nid2str(peer_ni->ksnp_id.nid), tx_stale,
                       cfs_duration_sec(cfs_time_current() - deadline),
                       resid, conn->ksnc_sock->sk->sk_wmem_queued);
 
@@ -2602,7 +2602,7 @@ int ksocknal_reaper(void *arg)
                         int       chunk = ksocknal_data.ksnd_peer_hash_size;
 
                         /* Time to check for timeouts on a few more peers: I do
-                         * checks every 'p' seconds on a proportion of the peer
+                         * checks every 'p' seconds on a proportion of the peer_ni
                          * table and I need to check every connection 'n' times
                          * within a timeout interval, to ensure I detect a
                          * timeout on any connection within (n+1)/n times the
index 6384875..1215488 100644 (file)
@@ -43,7 +43,7 @@ ksocknal_lib_get_conn_addrs (ksock_conn_t *conn)
         LASSERT (!conn->ksnc_closing);
 
         if (rc != 0) {
-                CERROR ("Error %d getting sock peer IP\n", rc);
+                CERROR ("Error %d getting sock peer_ni IP\n", rc);
                 return rc;
         }
 
@@ -189,7 +189,7 @@ ksocknal_lib_eager_ack (ksock_conn_t *conn)
         /* Remind the socket to ACK eagerly.  If I don't, the socket might
          * think I'm about to send something it could piggy-back the ACK
          * on, introducing delay in completing zero-copy sends in my
-         * peer. */
+         * peer_ni. */
 
        kernel_setsockopt(sock, SOL_TCP, TCP_QUICKACK,
                          (char *)&opt, sizeof(opt));
index 12165a3..dff1166 100644 (file)
@@ -361,14 +361,14 @@ ksocknal_match_tx_v3(ksock_conn_t *conn, ksock_tx_t *tx, int nonblk)
 static int
 ksocknal_handle_zcreq(ksock_conn_t *c, __u64 cookie, int remote)
 {
-       ksock_peer_t   *peer = c->ksnc_peer;
+       ksock_peer_ni_t   *peer_ni = c->ksnc_peer;
        ksock_conn_t   *conn;
        ksock_tx_t     *tx;
        int             rc;
 
        read_lock(&ksocknal_data.ksnd_global_lock);
 
-       conn = ksocknal_find_conn_locked(peer, NULL, !!remote);
+       conn = ksocknal_find_conn_locked(peer_ni, NULL, !!remote);
        if (conn != NULL) {
                ksock_sched_t *sched = conn->ksnc_scheduler;
 
@@ -393,7 +393,7 @@ ksocknal_handle_zcreq(ksock_conn_t *c, __u64 cookie, int remote)
         if (tx == NULL)
                 return -ENOMEM;
 
-        if ((rc = ksocknal_launch_packet(peer->ksnp_ni, tx, peer->ksnp_id)) == 0)
+        if ((rc = ksocknal_launch_packet(peer_ni->ksnp_ni, tx, peer_ni->ksnp_id)) == 0)
                 return 0;
 
         ksocknal_free_tx(tx);
@@ -404,7 +404,7 @@ ksocknal_handle_zcreq(ksock_conn_t *c, __u64 cookie, int remote)
 static int
 ksocknal_handle_zcack(ksock_conn_t *conn, __u64 cookie1, __u64 cookie2)
 {
-        ksock_peer_t      *peer = conn->ksnc_peer;
+        ksock_peer_ni_t      *peer_ni = conn->ksnc_peer;
         ksock_tx_t        *tx;
         ksock_tx_t        *tmp;
        struct list_head        zlist = LIST_HEAD_INIT(zlist);
@@ -421,10 +421,10 @@ ksocknal_handle_zcack(ksock_conn_t *conn, __u64 cookie1, __u64 cookie2)
                 return count == 1 ? 0 : -EPROTO;
         }
 
-       spin_lock(&peer->ksnp_lock);
+       spin_lock(&peer_ni->ksnp_lock);
 
        list_for_each_entry_safe(tx, tmp,
-                                     &peer->ksnp_zc_req_list, tx_zc_list) {
+                                     &peer_ni->ksnp_zc_req_list, tx_zc_list) {
                 __u64 c = tx->tx_msg.ksm_zc_cookies[0];
 
                 if (c == cookie1 || c == cookie2 || (cookie1 < c && c < cookie2)) {
@@ -437,7 +437,7 @@ ksocknal_handle_zcack(ksock_conn_t *conn, __u64 cookie1, __u64 cookie2)
                 }
         }
 
-       spin_unlock(&peer->ksnp_lock);
+       spin_unlock(&peer_ni->ksnp_lock);
 
        while (!list_empty(&zlist)) {
                tx = list_entry(zlist.next, ksock_tx_t, tx_zc_list);
index 4de013a..8230ceb 100644 (file)
@@ -310,8 +310,8 @@ lnet_accept(struct socket *sock, __u32 magic)
        if (flip)
                __swab64s(&cr.acr_nid);
 
-       ni = lnet_net2ni(LNET_NIDNET(cr.acr_nid));
-       if (ni == NULL ||               /* no matching net */
+       ni = lnet_nid2ni_addref(cr.acr_nid);
+       if (ni == NULL ||               /* no matching net */
            ni->ni_nid != cr.acr_nid) { /* right NET, wrong NID! */
                if (ni != NULL)
                        lnet_ni_decref(ni);
@@ -321,7 +321,7 @@ lnet_accept(struct socket *sock, __u32 magic)
                return -EPERM;
        }
 
-       if (ni->ni_lnd->lnd_accept == NULL) {
+       if (ni->ni_net->net_lnd->lnd_accept == NULL) {
                /* This catches a request for the loopback LND */
                lnet_ni_decref(ni);
                LCONSOLE_ERROR_MSG(0x121, "Refusing connection from %pI4h "
@@ -333,7 +333,7 @@ lnet_accept(struct socket *sock, __u32 magic)
        CDEBUG(D_NET, "Accept %s from %pI4h\n",
               libcfs_nid2str(cr.acr_nid), &peer_ip);
 
-       rc = ni->ni_lnd->lnd_accept(ni, sock);
+       rc = ni->ni_net->net_lnd->lnd_accept(ni, sock);
 
        lnet_ni_decref(ni);
        return rc;
@@ -476,7 +476,7 @@ lnet_acceptor_start(void)
        if (rc <= 0)
                return rc;
 
-       if (lnet_count_acceptor_nis() == 0)  /* not required */
+       if (lnet_count_acceptor_nets() == 0)  /* not required */
                return 0;
 
        task = kthread_run(lnet_acceptor, (void *)(uintptr_t)secure,
index 270629d..f64c834 100644 (file)
@@ -57,6 +57,25 @@ static int rnet_htable_size = LNET_REMOTE_NETS_HASH_DEFAULT;
 module_param(rnet_htable_size, int, 0444);
 MODULE_PARM_DESC(rnet_htable_size, "size of remote network hash table");
 
+static int use_tcp_bonding = false;
+module_param(use_tcp_bonding, int, 0444);
+MODULE_PARM_DESC(use_tcp_bonding,
+                "Set to 1 to use socklnd bonding. 0 to use Multi-Rail");
+
+unsigned int lnet_numa_range = 0;
+module_param(lnet_numa_range, uint, 0444);
+MODULE_PARM_DESC(lnet_numa_range,
+               "NUMA range to consider during Multi-Rail selection");
+
+/*
+ * This sequence number keeps track of how many times DLC was used to
+ * update the local NIs. It is incremented when a NI is added or
+ * removed and checked when sending a message to determine if there is
+ * a need to re-run the selection algorithm. See lnet_select_pathway()
+ * for more details on its usage.
+ */
+static atomic_t lnet_dlc_seq_no = ATOMIC_INIT(0);
+
 static int lnet_ping(lnet_process_id_t id, signed long timeout,
                     lnet_process_id_t __user *ids, int n_ids);
 
@@ -584,9 +603,9 @@ lnet_prepare(lnet_pid_t requested_pid)
        the_lnet.ln_pid = requested_pid;
 
        INIT_LIST_HEAD(&the_lnet.ln_test_peers);
-       INIT_LIST_HEAD(&the_lnet.ln_nis);
-       INIT_LIST_HEAD(&the_lnet.ln_nis_cpt);
-       INIT_LIST_HEAD(&the_lnet.ln_nis_zombie);
+       INIT_LIST_HEAD(&the_lnet.ln_peers);
+       INIT_LIST_HEAD(&the_lnet.ln_remote_peer_ni_list);
+       INIT_LIST_HEAD(&the_lnet.ln_nets);
        INIT_LIST_HEAD(&the_lnet.ln_routers);
        INIT_LIST_HEAD(&the_lnet.ln_drop_rules);
        INIT_LIST_HEAD(&the_lnet.ln_delay_rules);
@@ -667,9 +686,7 @@ lnet_unprepare (void)
 
        LASSERT(the_lnet.ln_refcount == 0);
        LASSERT(list_empty(&the_lnet.ln_test_peers));
-       LASSERT(list_empty(&the_lnet.ln_nis));
-       LASSERT(list_empty(&the_lnet.ln_nis_cpt));
-       LASSERT(list_empty(&the_lnet.ln_nis_zombie));
+       LASSERT(list_empty(&the_lnet.ln_nets));
 
        lnet_portals_destroy();
 
@@ -686,7 +703,7 @@ lnet_unprepare (void)
        lnet_res_container_cleanup(&the_lnet.ln_eq_container);
 
        lnet_msg_containers_destroy();
-       lnet_peer_tables_destroy();
+       lnet_peer_uninit();
        lnet_rtrpools_free(0);
 
        if (the_lnet.ln_counters != NULL) {
@@ -700,18 +717,17 @@ lnet_unprepare (void)
 }
 
 lnet_ni_t  *
-lnet_net2ni_locked(__u32 net, int cpt)
+lnet_net2ni_locked(__u32 net_id, int cpt)
 {
-       struct list_head *tmp;
-       lnet_ni_t        *ni;
+       struct lnet_ni   *ni;
+       struct lnet_net  *net;
 
        LASSERT(cpt != LNET_LOCK_EX);
 
-       list_for_each(tmp, &the_lnet.ln_nis) {
-               ni = list_entry(tmp, lnet_ni_t, ni_list);
-
-               if (LNET_NIDNET(ni->ni_nid) == net) {
-                       lnet_ni_addref_locked(ni, cpt);
+       list_for_each_entry(net, &the_lnet.ln_nets, net_list) {
+               if (net->net_id == net_id) {
+                       ni = list_entry(net->net_ni_list.next, struct lnet_ni,
+                                       ni_netlist);
                        return ni;
                }
        }
@@ -720,19 +736,34 @@ lnet_net2ni_locked(__u32 net, int cpt)
 }
 
 lnet_ni_t *
-lnet_net2ni(__u32 net)
+lnet_net2ni_addref(__u32 net)
 {
        lnet_ni_t *ni;
 
        lnet_net_lock(0);
        ni = lnet_net2ni_locked(net, 0);
+       if (ni)
+               lnet_ni_addref_locked(ni, 0);
        lnet_net_unlock(0);
 
        return ni;
 }
-EXPORT_SYMBOL(lnet_net2ni);
+EXPORT_SYMBOL(lnet_net2ni_addref);
+
+struct lnet_net *
+lnet_get_net_locked(__u32 net_id)
+{
+       struct lnet_net  *net;
+
+       list_for_each_entry(net, &the_lnet.ln_nets, net_list) {
+               if (net->net_id == net_id)
+                       return net;
+       }
+
+       return NULL;
+}
 
-static unsigned int
+unsigned int
 lnet_nid_cpt_hash(lnet_nid_t nid, unsigned int number)
 {
        __u64           key = nid;
@@ -752,31 +783,41 @@ lnet_nid_cpt_hash(lnet_nid_t nid, unsigned int number)
 }
 
 int
-lnet_cpt_of_nid_locked(lnet_nid_t nid)
+lnet_cpt_of_nid_locked(lnet_nid_t nid, struct lnet_ni *ni)
 {
-       struct lnet_ni *ni;
+       struct lnet_net *net;
 
        /* must called with hold of lnet_net_lock */
        if (LNET_CPT_NUMBER == 1)
                return 0; /* the only one */
 
-       /* take lnet_net_lock(any) would be OK */
-       if (!list_empty(&the_lnet.ln_nis_cpt)) {
-               list_for_each_entry(ni, &the_lnet.ln_nis_cpt, ni_cptlist) {
-                       if (LNET_NIDNET(ni->ni_nid) != LNET_NIDNET(nid))
-                               continue;
+       /*
+        * If NI is provided then use the CPT identified in the NI cpt
+        * list if one exists. If one doesn't exist, then that NI is
+        * associated with all CPTs and it follows that the net it belongs
+        * to is implicitly associated with all CPTs, so just hash the nid
+        * and return that.
+        */
+       if (ni != NULL) {
+               if (ni->ni_cpts != NULL)
+                       return ni->ni_cpts[lnet_nid_cpt_hash(nid,
+                                                            ni->ni_ncpts)];
+               else
+                       return lnet_nid_cpt_hash(nid, LNET_CPT_NUMBER);
+       }
 
-                       LASSERT(ni->ni_cpts != NULL);
-                       return ni->ni_cpts[lnet_nid_cpt_hash
-                                          (nid, ni->ni_ncpts)];
-               }
+       /* no NI provided so look at the net */
+       net = lnet_get_net_locked(LNET_NIDNET(nid));
+
+       if (net != NULL && net->net_cpts != NULL) {
+               return net->net_cpts[lnet_nid_cpt_hash(nid, net->net_ncpts)];
        }
 
        return lnet_nid_cpt_hash(nid, LNET_CPT_NUMBER);
 }
 
 int
-lnet_cpt_of_nid(lnet_nid_t nid)
+lnet_cpt_of_nid(lnet_nid_t nid, struct lnet_ni *ni)
 {
        int     cpt;
        int     cpt2;
@@ -784,11 +825,10 @@ lnet_cpt_of_nid(lnet_nid_t nid)
        if (LNET_CPT_NUMBER == 1)
                return 0; /* the only one */
 
-       if (list_empty(&the_lnet.ln_nis_cpt))
-               return lnet_nid_cpt_hash(nid, LNET_CPT_NUMBER);
-
        cpt = lnet_net_lock_current();
-       cpt2 = lnet_cpt_of_nid_locked(nid);
+
+       cpt2 = lnet_cpt_of_nid_locked(nid, ni);
+
        lnet_net_unlock(cpt);
 
        return cpt2;
@@ -796,42 +836,66 @@ lnet_cpt_of_nid(lnet_nid_t nid)
 EXPORT_SYMBOL(lnet_cpt_of_nid);
 
 int
-lnet_islocalnet(__u32 net)
+lnet_islocalnet(__u32 net_id)
 {
-       struct lnet_ni  *ni;
+       struct lnet_net *net;
        int             cpt;
+       bool            local;
 
        cpt = lnet_net_lock_current();
 
-       ni = lnet_net2ni_locked(net, cpt);
-       if (ni != NULL)
-               lnet_ni_decref_locked(ni, cpt);
+       net = lnet_get_net_locked(net_id);
+
+       local = net != NULL;
 
        lnet_net_unlock(cpt);
 
-       return ni != NULL;
+       return local;
+}
+
+bool
+lnet_is_ni_healthy_locked(struct lnet_ni *ni)
+{
+       if (ni->ni_state == LNET_NI_STATE_ACTIVE ||
+           ni->ni_state == LNET_NI_STATE_DEGRADED)
+               return true;
+
+       return false;
 }
 
 lnet_ni_t  *
 lnet_nid2ni_locked(lnet_nid_t nid, int cpt)
 {
+       struct lnet_net  *net;
        struct lnet_ni   *ni;
-       struct list_head *tmp;
 
        LASSERT(cpt != LNET_LOCK_EX);
 
-       list_for_each(tmp, &the_lnet.ln_nis) {
-               ni = list_entry(tmp, lnet_ni_t, ni_list);
-
-               if (ni->ni_nid == nid) {
-                       lnet_ni_addref_locked(ni, cpt);
-                       return ni;
+       list_for_each_entry(net, &the_lnet.ln_nets, net_list) {
+               list_for_each_entry(ni, &net->net_ni_list, ni_netlist) {
+                       if (ni->ni_nid == nid)
+                               return ni;
                }
        }
 
        return NULL;
 }
 
+lnet_ni_t *
+lnet_nid2ni_addref(lnet_nid_t nid)
+{
+       lnet_ni_t *ni;
+
+       lnet_net_lock(0);
+       ni = lnet_nid2ni_locked(nid, 0);
+       if (ni)
+               lnet_ni_addref_locked(ni, 0);
+       lnet_net_unlock(0);
+
+       return ni;
+}
+EXPORT_SYMBOL(lnet_nid2ni_addref);
+
 int
 lnet_islocalnid(lnet_nid_t nid)
 {
@@ -840,27 +904,24 @@ lnet_islocalnid(lnet_nid_t nid)
 
        cpt = lnet_net_lock_current();
        ni = lnet_nid2ni_locked(nid, cpt);
-       if (ni != NULL)
-               lnet_ni_decref_locked(ni, cpt);
        lnet_net_unlock(cpt);
 
        return ni != NULL;
 }
 
 int
-lnet_count_acceptor_nis (void)
+lnet_count_acceptor_nets(void)
 {
        /* Return the # of NIs that need the acceptor. */
        int              count = 0;
-       struct list_head *tmp;
-       struct lnet_ni   *ni;
+       struct lnet_net  *net;
        int              cpt;
 
        cpt = lnet_net_lock_current();
-       list_for_each(tmp, &the_lnet.ln_nis) {
-               ni = list_entry(tmp, lnet_ni_t, ni_list);
-
-               if (ni->ni_lnd->lnd_accept != NULL)
+       list_for_each_entry(net, &the_lnet.ln_nets, net_list) {
+               /* all socklnd type networks should have the acceptor
+                * thread started */
+               if (net->net_lnd->lnd_accept != NULL)
                        count++;
        }
 
@@ -891,15 +952,42 @@ lnet_ping_info_create(int num_ni)
 }
 
 static inline int
+lnet_get_net_ni_count_locked(struct lnet_net *net)
+{
+       struct lnet_ni  *ni;
+       int             count = 0;
+
+       list_for_each_entry(ni, &net->net_ni_list, ni_netlist)
+               count++;
+
+       return count;
+}
+
+static inline int
+lnet_get_net_ni_count_pre(struct lnet_net *net)
+{
+       struct lnet_ni  *ni;
+       int             count = 0;
+
+       list_for_each_entry(ni, &net->net_ni_added, ni_netlist)
+               count++;
+
+       return count;
+}
+
+static inline int
 lnet_get_ni_count(void)
 {
-       struct lnet_ni *ni;
-       int            count = 0;
+       struct lnet_ni  *ni;
+       struct lnet_net *net;
+       int             count = 0;
 
        lnet_net_lock(0);
 
-       list_for_each_entry(ni, &the_lnet.ln_nis, ni_list)
-               count++;
+       list_for_each_entry(net, &the_lnet.ln_nets, net_list) {
+               list_for_each_entry(ni, &net->net_ni_list, ni_netlist)
+                       count++;
+       }
 
        lnet_net_unlock(0);
 
@@ -917,14 +1005,17 @@ lnet_ping_info_free(struct lnet_ping_info *pinfo)
 static void
 lnet_ping_info_destroy(void)
 {
+       struct lnet_net *net;
        struct lnet_ni  *ni;
 
        lnet_net_lock(LNET_LOCK_EX);
 
-       list_for_each_entry(ni, &the_lnet.ln_nis, ni_list) {
-               lnet_ni_lock(ni);
-               ni->ni_status = NULL;
-               lnet_ni_unlock(ni);
+       list_for_each_entry(net, &the_lnet.ln_nets, net_list) {
+               list_for_each_entry(ni, &net->net_ni_list, ni_netlist) {
+                       lnet_ni_lock(ni);
+                       ni->ni_status = NULL;
+                       lnet_ni_unlock(ni);
+               }
        }
 
        lnet_ping_info_free(the_lnet.ln_ping_info);
@@ -1029,24 +1120,29 @@ static void
 lnet_ping_info_install_locked(struct lnet_ping_info *ping_info)
 {
        int                     i;
-       lnet_ni_t               *ni;
+       struct lnet_ni          *ni;
+       struct lnet_net         *net;
        struct lnet_ni_status *ns;
 
        i = 0;
-       list_for_each_entry(ni, &the_lnet.ln_nis, ni_list) {
-               LASSERT(i < ping_info->pi_nnis);
+       list_for_each_entry(net, &the_lnet.ln_nets, net_list) {
+               list_for_each_entry(ni, &net->net_ni_list, ni_netlist) {
+                       LASSERT(i < ping_info->pi_nnis);
 
-               ns = &ping_info->pi_ni[i];
+                       ns = &ping_info->pi_ni[i];
 
-               ns->ns_nid = ni->ni_nid;
+                       ns->ns_nid = ni->ni_nid;
 
-               lnet_ni_lock(ni);
-               ns->ns_status = (ni->ni_status != NULL) ?
-                               ni->ni_status->ns_status : LNET_NI_STATUS_UP;
-               ni->ni_status = ns;
-               lnet_ni_unlock(ni);
+                       lnet_ni_lock(ni);
+                       ns->ns_status = (ni->ni_status != NULL) ?
+                                       ni->ni_status->ns_status :
+                                               LNET_NI_STATUS_UP;
+                       ni->ni_status = ns;
+                       lnet_ni_unlock(ni);
+
+                       i++;
+               }
 
-               i++;
        }
 }
 
@@ -1101,11 +1197,11 @@ lnet_ni_tq_credits(lnet_ni_t *ni)
        LASSERT(ni->ni_ncpts >= 1);
 
        if (ni->ni_ncpts == 1)
-               return ni->ni_maxtxcredits;
+               return ni->ni_net->net_tunables.lct_max_tx_credits;
 
-       credits = ni->ni_maxtxcredits / ni->ni_ncpts;
-       credits = max(credits, 8 * ni->ni_peertxcredits);
-       credits = min(credits, ni->ni_maxtxcredits);
+       credits = ni->ni_net->net_tunables.lct_max_tx_credits / ni->ni_ncpts;
+       credits = max(credits, 8 * ni->ni_net->net_tunables.lct_peer_tx_credits);
+       credits = min(credits, ni->ni_net->net_tunables.lct_max_tx_credits);
 
        return credits;
 }
@@ -1119,37 +1215,43 @@ lnet_ni_unlink_locked(lnet_ni_t *ni)
        }
 
        /* move it to zombie list and nobody can find it anymore */
-       LASSERT(!list_empty(&ni->ni_list));
-       list_move(&ni->ni_list, &the_lnet.ln_nis_zombie);
-       lnet_ni_decref_locked(ni, 0);   /* drop ln_nis' ref */
+       LASSERT(!list_empty(&ni->ni_netlist));
+       list_move(&ni->ni_netlist, &ni->ni_net->net_ni_zombie);
+       lnet_ni_decref_locked(ni, 0);
 }
 
 static void
-lnet_clear_zombies_nis_locked(void)
+lnet_clear_zombies_nis_locked(struct lnet_net *net)
 {
        int             i;
        int             islo;
        lnet_ni_t       *ni;
+       struct list_head *zombie_list = &net->net_ni_zombie;
 
-       /* Now wait for the NI's I just nuked to show up on ln_zombie_nis
-        * and shut them down in guaranteed thread context */
+       /*
+        * Now wait for the NIs I just nuked to show up on the zombie
+        * list and shut them down in guaranteed thread context
+        */
        i = 2;
-       while (!list_empty(&the_lnet.ln_nis_zombie)) {
+       while (!list_empty(zombie_list)) {
                int     *ref;
                int     j;
 
-               ni = list_entry(the_lnet.ln_nis_zombie.next,
-                               lnet_ni_t, ni_list);
-               list_del_init(&ni->ni_list);
+               ni = list_entry(zombie_list->next,
+                               lnet_ni_t, ni_netlist);
+               list_del_init(&ni->ni_netlist);
+               /* the ni should be in deleting state. If it's not it's
+                * a bug */
+               LASSERT(ni->ni_state == LNET_NI_STATE_DELETING);
                cfs_percpt_for_each(ref, j, ni->ni_refs) {
                        if (*ref == 0)
                                continue;
                        /* still busy, add it back to zombie list */
-                       list_add(&ni->ni_list, &the_lnet.ln_nis_zombie);
+                       list_add(&ni->ni_netlist, zombie_list);
                        break;
                }
 
-               if (!list_empty(&ni->ni_list)) {
+               if (!list_empty(&ni->ni_netlist)) {
                        lnet_net_unlock(LNET_LOCK_EX);
                        ++i;
                        if ((i & (-i)) == i) {
@@ -1163,16 +1265,12 @@ lnet_clear_zombies_nis_locked(void)
                        continue;
                }
 
-               ni->ni_lnd->lnd_refcount--;
                lnet_net_unlock(LNET_LOCK_EX);
 
-               islo = ni->ni_lnd->lnd_type == LOLND;
+               islo = ni->ni_net->net_lnd->lnd_type == LOLND;
 
                LASSERT(!in_interrupt());
-               (ni->ni_lnd->lnd_shutdown)(ni);
-
-               /* can't deref lnd anymore now; it might have unregistered
-                * itself...  */
+               (net->net_lnd->lnd_shutdown)(ni);
 
                if (!islo)
                        CDEBUG(D_LNI, "Removed LNI %s\n",
@@ -1184,212 +1282,155 @@ lnet_clear_zombies_nis_locked(void)
        }
 }
 
-static void
-lnet_shutdown_lndnis(void)
-{
-       int             i;
-       lnet_ni_t       *ni;
-
-       /* NB called holding the global mutex */
-
-       /* All quiet on the API front */
-       LASSERT(!the_lnet.ln_shutdown);
-       LASSERT(the_lnet.ln_refcount == 0);
-       LASSERT(list_empty(&the_lnet.ln_nis_zombie));
-
-       lnet_net_lock(LNET_LOCK_EX);
-       the_lnet.ln_shutdown = 1;       /* flag shutdown */
-
-       /* Unlink NIs from the global table */
-       while (!list_empty(&the_lnet.ln_nis)) {
-               ni = list_entry(the_lnet.ln_nis.next,
-                               lnet_ni_t, ni_list);
-               lnet_ni_unlink_locked(ni);
-       }
-
-       /* Drop the cached loopback NI. */
-       if (the_lnet.ln_loni != NULL) {
-               lnet_ni_decref_locked(the_lnet.ln_loni, 0);
-               the_lnet.ln_loni = NULL;
-       }
-
-       lnet_net_unlock(LNET_LOCK_EX);
-
-       /* Clear lazy portals and drop delayed messages which hold refs
-        * on their lnet_msg_t::msg_rxpeer */
-       for (i = 0; i < the_lnet.ln_nportals; i++)
-               LNetClearLazyPortal(i);
-
-       /* Clear the peer table and wait for all peers to go (they hold refs on
-        * their NIs) */
-       lnet_peer_tables_cleanup(NULL);
-
-       lnet_net_lock(LNET_LOCK_EX);
-
-       lnet_clear_zombies_nis_locked();
-       the_lnet.ln_shutdown = 0;
-       lnet_net_unlock(LNET_LOCK_EX);
-}
-
 /* shutdown down the NI and release refcount */
 static void
 lnet_shutdown_lndni(struct lnet_ni *ni)
 {
        int i;
+       struct lnet_net *net = ni->ni_net;
 
        lnet_net_lock(LNET_LOCK_EX);
+       ni->ni_state = LNET_NI_STATE_DELETING;
        lnet_ni_unlink_locked(ni);
+       lnet_incr_dlc_seq();
        lnet_net_unlock(LNET_LOCK_EX);
 
        /* clear messages for this NI on the lazy portal */
        for (i = 0; i < the_lnet.ln_nportals; i++)
                lnet_clear_lazy_portal(ni, i, "Shutting down NI");
 
-       /* Do peer table cleanup for this ni */
-       lnet_peer_tables_cleanup(ni);
-
        lnet_net_lock(LNET_LOCK_EX);
-       lnet_clear_zombies_nis_locked();
+       lnet_clear_zombies_nis_locked(net);
        lnet_net_unlock(LNET_LOCK_EX);
 }
 
-static int
-lnet_startup_lndni(struct lnet_ni *ni, struct lnet_ioctl_config_data *conf)
+static void
+lnet_shutdown_lndnet(struct lnet_net *net)
 {
-       struct lnet_ioctl_config_lnd_tunables *lnd_tunables = NULL;
-       int                     rc = -EINVAL;
-       __u32                   lnd_type;
-       lnd_t                   *lnd;
-       struct lnet_tx_queue    *tq;
-       int                     i;
+       struct lnet_ni *ni;
+
+       lnet_net_lock(LNET_LOCK_EX);
 
-       lnd_type = LNET_NETTYP(LNET_NIDNET(ni->ni_nid));
+       net->net_state = LNET_NET_STATE_DELETING;
 
-       LASSERT(libcfs_isknown_lnd(lnd_type));
+       list_del_init(&net->net_list);
 
-       if (lnd_type == CIBLND || lnd_type == OPENIBLND ||
-           lnd_type == IIBLND || lnd_type == VIBLND) {
-               CERROR("LND %s obsoleted\n", libcfs_lnd2str(lnd_type));
-               goto failed0;
+       while (!list_empty(&net->net_ni_list)) {
+               ni = list_entry(net->net_ni_list.next,
+                               lnet_ni_t, ni_netlist);
+               lnet_net_unlock(LNET_LOCK_EX);
+               lnet_shutdown_lndni(ni);
+               lnet_net_lock(LNET_LOCK_EX);
        }
 
-       /* Make sure this new NI is unique. */
+       lnet_net_unlock(LNET_LOCK_EX);
+
+       /* Do peer table cleanup for this net */
+       lnet_peer_tables_cleanup(net);
+
        lnet_net_lock(LNET_LOCK_EX);
-       rc = lnet_net_unique(LNET_NIDNET(ni->ni_nid), &the_lnet.ln_nis);
+       /*
+        * decrement ref count on lnd only when the entire network goes
+        * away
+        */
+       net->net_lnd->lnd_refcount--;
+
        lnet_net_unlock(LNET_LOCK_EX);
 
-       if (!rc) {
-               if (lnd_type == LOLND) {
-                       lnet_ni_free(ni);
-                       return 0;
-               }
+       lnet_net_free(net);
+}
 
-               CERROR("Net %s is not unique\n",
-                      libcfs_net2str(LNET_NIDNET(ni->ni_nid)));
+static void
+lnet_shutdown_lndnets(void)
+{
+       struct lnet_net *net;
 
-               rc = -EEXIST;
-               goto failed0;
-       }
+       /* NB called holding the global mutex */
 
-       mutex_lock(&the_lnet.ln_lnd_mutex);
-       lnd = lnet_find_lnd_by_type(lnd_type);
+       /* All quiet on the API front */
+       LASSERT(!the_lnet.ln_shutdown);
+       LASSERT(the_lnet.ln_refcount == 0);
 
-       if (lnd == NULL) {
-               mutex_unlock(&the_lnet.ln_lnd_mutex);
-               rc = request_module("%s", libcfs_lnd2modname(lnd_type));
-               mutex_lock(&the_lnet.ln_lnd_mutex);
+       lnet_net_lock(LNET_LOCK_EX);
+       the_lnet.ln_shutdown = 1;       /* flag shutdown */
 
-               lnd = lnet_find_lnd_by_type(lnd_type);
-               if (lnd == NULL) {
-                       mutex_unlock(&the_lnet.ln_lnd_mutex);
-                       CERROR("Can't load LND %s, module %s, rc=%d\n",
-                              libcfs_lnd2str(lnd_type),
-                              libcfs_lnd2modname(lnd_type), rc);
-#ifndef HAVE_MODULE_LOADING_SUPPORT
-                       LCONSOLE_ERROR_MSG(0x104, "Your kernel must be "
-                                          "compiled with kernel module "
-                                          "loading support.");
-#endif
-                       rc = -EINVAL;
-                       goto failed0;
-               }
+       while (!list_empty(&the_lnet.ln_nets)) {
+               /*
+                * move the nets to the zombie list to avoid them being
+                * picked up for new work. LONET is also included in the
+                * Nets that will be moved to the zombie list
+                */
+               net = list_entry(the_lnet.ln_nets.next,
+                                struct lnet_net, net_list);
+               list_move(&net->net_list, &the_lnet.ln_net_zombie);
        }
 
-       lnet_net_lock(LNET_LOCK_EX);
-       lnd->lnd_refcount++;
+       /* Drop the cached loopback Net. */
+       if (the_lnet.ln_loni != NULL) {
+               lnet_ni_decref_locked(the_lnet.ln_loni, 0);
+               the_lnet.ln_loni = NULL;
+       }
        lnet_net_unlock(LNET_LOCK_EX);
 
-       ni->ni_lnd = lnd;
+       /* iterate through the net zombie list and delete each net */
+       while (!list_empty(&the_lnet.ln_net_zombie)) {
+               net = list_entry(the_lnet.ln_net_zombie.next,
+                                struct lnet_net, net_list);
+               lnet_shutdown_lndnet(net);
+       }
 
-       if (conf && conf->cfg_hdr.ioc_len > sizeof(*conf))
-               lnd_tunables = (struct lnet_ioctl_config_lnd_tunables *)conf->cfg_bulk;
+       lnet_net_lock(LNET_LOCK_EX);
+       the_lnet.ln_shutdown = 0;
+       lnet_net_unlock(LNET_LOCK_EX);
+}
 
-       if (lnd_tunables != NULL) {
-               LIBCFS_ALLOC(ni->ni_lnd_tunables,
-                            sizeof(*ni->ni_lnd_tunables));
-               if (ni->ni_lnd_tunables == NULL) {
-                       mutex_unlock(&the_lnet.ln_lnd_mutex);
-                       rc = -ENOMEM;
-                       goto failed0;
-               }
-               memcpy(ni->ni_lnd_tunables, lnd_tunables,
-                      sizeof(*ni->ni_lnd_tunables));
-       }
+static int
+lnet_startup_lndni(struct lnet_ni *ni, struct lnet_lnd_tunables *tun)
+{
+       int                     rc = -EINVAL;
+       struct lnet_tx_queue    *tq;
+       int                     i;
+       struct lnet_net         *net = ni->ni_net;
+
+       mutex_lock(&the_lnet.ln_lnd_mutex);
 
-       /* If given some LND tunable parameters, parse those now to
-        * override the values in the NI structure. */
-       if (conf) {
-               if (conf->cfg_config_u.cfg_net.net_peer_rtr_credits >= 0)
-                       ni->ni_peerrtrcredits =
-                               conf->cfg_config_u.cfg_net.net_peer_rtr_credits;
-               if (conf->cfg_config_u.cfg_net.net_peer_timeout >= 0)
-                       ni->ni_peertimeout =
-                               conf->cfg_config_u.cfg_net.net_peer_timeout;
-               if (conf->cfg_config_u.cfg_net.net_peer_tx_credits >= 0)
-                       ni->ni_peertxcredits =
-                               conf->cfg_config_u.cfg_net.net_peer_tx_credits;
-               if (conf->cfg_config_u.cfg_net.net_max_tx_credits >= 0)
-                       ni->ni_maxtxcredits =
-                               conf->cfg_config_u.cfg_net.net_max_tx_credits;
+       if (tun) {
+               memcpy(&ni->ni_lnd_tunables, tun, sizeof(*tun));
+               ni->ni_lnd_tunables_set = true;
        }
 
-       rc = (lnd->lnd_startup)(ni);
+       rc = (net->net_lnd->lnd_startup)(ni);
 
        mutex_unlock(&the_lnet.ln_lnd_mutex);
 
        if (rc != 0) {
                LCONSOLE_ERROR_MSG(0x105, "Error %d starting up LNI %s\n",
-                                  rc, libcfs_lnd2str(lnd->lnd_type));
+                                  rc, libcfs_lnd2str(net->net_lnd->lnd_type));
                lnet_net_lock(LNET_LOCK_EX);
-               lnd->lnd_refcount--;
+               net->net_lnd->lnd_refcount--;
                lnet_net_unlock(LNET_LOCK_EX);
                goto failed0;
        }
 
-       LASSERT(ni->ni_peertimeout <= 0 || lnd->lnd_query != NULL);
+       ni->ni_state = LNET_NI_STATE_ACTIVE;
 
-       lnet_net_lock(LNET_LOCK_EX);
-       /* refcount for ln_nis */
-       lnet_ni_addref_locked(ni, 0);
-       list_add_tail(&ni->ni_list, &the_lnet.ln_nis);
-       if (ni->ni_cpts != NULL) {
-               lnet_ni_addref_locked(ni, 0);
-               list_add_tail(&ni->ni_cptlist, &the_lnet.ln_nis_cpt);
-       }
-
-       lnet_net_unlock(LNET_LOCK_EX);
-
-       if (lnd->lnd_type == LOLND) {
+       /* We keep a reference on the loopback net through the loopback NI */
+       if (net->net_lnd->lnd_type == LOLND) {
                lnet_ni_addref(ni);
                LASSERT(the_lnet.ln_loni == NULL);
                the_lnet.ln_loni = ni;
+               ni->ni_net->net_tunables.lct_peer_tx_credits = 0;
+               ni->ni_net->net_tunables.lct_peer_rtr_credits = 0;
+               ni->ni_net->net_tunables.lct_max_tx_credits = 0;
+               ni->ni_net->net_tunables.lct_peer_timeout = 0;
                return 0;
        }
 
-       if (ni->ni_peertxcredits == 0 || ni->ni_maxtxcredits == 0) {
+       if (ni->ni_net->net_tunables.lct_peer_tx_credits == 0 ||
+           ni->ni_net->net_tunables.lct_max_tx_credits == 0) {
                LCONSOLE_ERROR_MSG(0x107, "LNI %s has no %scredits\n",
-                                  libcfs_lnd2str(lnd->lnd_type),
-                                  ni->ni_peertxcredits == 0 ?
+                                  libcfs_lnd2str(net->net_lnd->lnd_type),
+                                  ni->ni_net->net_tunables.lct_peer_tx_credits == 0 ?
                                        "" : "per-peer ");
                /* shutdown the NI since if we get here then it must've already
                 * been started
@@ -1404,10 +1445,15 @@ lnet_startup_lndni(struct lnet_ni *ni, struct lnet_ioctl_config_data *conf)
                tq->tq_credits = lnet_ni_tq_credits(ni);
        }
 
+       atomic_set(&ni->ni_tx_credits,
+                  lnet_ni_tq_credits(ni) * ni->ni_ncpts);
+
        CDEBUG(D_LNI, "Added LNI %s [%d/%d/%d/%d]\n",
-               libcfs_nid2str(ni->ni_nid), ni->ni_peertxcredits,
+               libcfs_nid2str(ni->ni_nid),
+               ni->ni_net->net_tunables.lct_peer_tx_credits,
                lnet_ni_tq_credits(ni) * LNET_CPT_NUMBER,
-               ni->ni_peerrtrcredits, ni->ni_peertimeout);
+               ni->ni_net->net_tunables.lct_peer_rtr_credits,
+               ni->ni_net->net_tunables.lct_peer_timeout);
 
        return 0;
 failed0:
@@ -1416,73 +1462,265 @@ failed0:
 }
 
 static int
-lnet_startup_lndnis(struct list_head *nilist)
+lnet_startup_lndnet(struct lnet_net *net, struct lnet_lnd_tunables *tun)
 {
        struct lnet_ni          *ni;
+       struct lnet_net         *net_l = NULL;
+       struct list_head        local_ni_list;
        int                     rc;
        int                     ni_count = 0;
+       __u32                   lnd_type;
+       lnd_t                   *lnd;
+       int                     peer_timeout =
+               net->net_tunables.lct_peer_timeout;
+       int                     maxtxcredits =
+               net->net_tunables.lct_max_tx_credits;
+       int                     peerrtrcredits =
+               net->net_tunables.lct_peer_rtr_credits;
 
-       while (!list_empty(nilist)) {
-               ni = list_entry(nilist->next, lnet_ni_t, ni_list);
-               list_del(&ni->ni_list);
-               rc = lnet_startup_lndni(ni, NULL);
+       INIT_LIST_HEAD(&local_ni_list);
 
-               if (rc < 0)
-                       goto failed;
+       /*
+        * make sure that this net is unique. If it isn't then
+        * we are adding interfaces to an already existing network, and
+        * 'net' is just a convenient way to pass in the list.
+        * if it is unique we need to find the LND and load it if
+        * necessary.
+        */
+       if (lnet_net_unique(net->net_id, &the_lnet.ln_nets, &net_l)) {
+               lnd_type = LNET_NETTYP(net->net_id);
 
-               ni_count++;
-       }
+               LASSERT(libcfs_isknown_lnd(lnd_type));
 
-       return ni_count;
-failed:
-       lnet_shutdown_lndnis();
+               if (lnd_type == CIBLND || lnd_type == OPENIBLND ||
+                   lnd_type == IIBLND || lnd_type == VIBLND) {
+                       CERROR("LND %s obsoleted\n", libcfs_lnd2str(lnd_type));
+                       rc = -EINVAL;
+                       goto failed0;
+               }
 
-       return rc;
-}
+               mutex_lock(&the_lnet.ln_lnd_mutex);
+               lnd = lnet_find_lnd_by_type(lnd_type);
 
-/**
- * Initialize LNet library.
- *
- * Automatically called at module loading time. Caller has to call
- * lnet_lib_exit() after a call to lnet_lib_init(), if and only if the
- * latter returned 0. It must be called exactly once.
- *
- * \retval 0 on success
- * \retval -ve on failures.
- */
-int lnet_lib_init(void)
-{
-       int rc;
+               if (lnd == NULL) {
+                       mutex_unlock(&the_lnet.ln_lnd_mutex);
+                       rc = request_module("%s", libcfs_lnd2modname(lnd_type));
+                       mutex_lock(&the_lnet.ln_lnd_mutex);
+
+                       lnd = lnet_find_lnd_by_type(lnd_type);
+                       if (lnd == NULL) {
+                               mutex_unlock(&the_lnet.ln_lnd_mutex);
+                               CERROR("Can't load LND %s, module %s, rc=%d\n",
+                               libcfs_lnd2str(lnd_type),
+                               libcfs_lnd2modname(lnd_type), rc);
+#ifndef HAVE_MODULE_LOADING_SUPPORT
+                               LCONSOLE_ERROR_MSG(0x104, "Your kernel must be "
+                                               "compiled with kernel module "
+                                               "loading support.");
+#endif
+                               rc = -EINVAL;
+                               goto failed0;
+                       }
+               }
 
-       lnet_assert_wire_constants();
+               lnet_net_lock(LNET_LOCK_EX);
+               lnd->lnd_refcount++;
+               lnet_net_unlock(LNET_LOCK_EX);
 
-       memset(&the_lnet, 0, sizeof(the_lnet));
+               net->net_lnd = lnd;
 
-       /* refer to global cfs_cpt_table for now */
-       the_lnet.ln_cpt_table   = cfs_cpt_table;
-       the_lnet.ln_cpt_number  = cfs_cpt_number(cfs_cpt_table);
+               mutex_unlock(&the_lnet.ln_lnd_mutex);
 
-       LASSERT(the_lnet.ln_cpt_number > 0);
-       if (the_lnet.ln_cpt_number > LNET_CPT_MAX) {
-               /* we are under risk of consuming all lh_cookie */
-               CERROR("Can't have %d CPTs for LNet (max allowed is %d), "
-                      "please change setting of CPT-table and retry\n",
-                      the_lnet.ln_cpt_number, LNET_CPT_MAX);
-               return -E2BIG;
+               net_l = net;
        }
 
-       while ((1 << the_lnet.ln_cpt_bits) < the_lnet.ln_cpt_number)
-               the_lnet.ln_cpt_bits++;
+       /*
+        * net_l: if the network being added is unique then net_l
+        *        will point to that network
+        *        if the network being added is not unique then
+        *        net_l points to the existing network.
+        *
+        * When we enter the loop below, we'll pick NIs off the
+        * network being added and start them up, then add them to
+        * a local ni list. Once we've successfully started all
+        * the NIs then we join the local NI list (of started up
+        * networks) with the net_l->net_ni_list, which should
+        * point to the correct network to add the new ni list to
+        *
+        * If any of the new NIs fail to start up, then we want to
+        * iterate through the local ni list, which should include
+        * any NIs which were successfully started up, and shut
+        * them down.
+        *
+        * After that we want to delete the network being added,
+        * to avoid a memory leak.
+        */
 
-       rc = lnet_create_locks();
-       if (rc != 0) {
-               CERROR("Can't create LNet global locks: %d\n", rc);
-               return rc;
+       /*
+        * When a network uses TCP bonding then all its interfaces
+        * must be specified when the network is first defined: the
+        * TCP bonding code doesn't allow for interfaces to be added
+        * or removed.
+        */
+       if (net_l != net && net_l != NULL && use_tcp_bonding &&
+           LNET_NETTYP(net_l->net_id) == SOCKLND) {
+               rc = -EINVAL;
+               goto failed0;
        }
 
-       the_lnet.ln_refcount = 0;
-       LNetInvalidateHandle(&the_lnet.ln_rc_eqh);
+       while (!list_empty(&net->net_ni_added)) {
+               ni = list_entry(net->net_ni_added.next, struct lnet_ni,
+                               ni_netlist);
+               list_del_init(&ni->ni_netlist);
+
+               /* make sure that the NI we're about to start
+                * up is actually unique. if it's not, fail. */
+               if (!lnet_ni_unique_net(&net_l->net_ni_list,
+                                       ni->ni_interfaces[0])) {
+                       rc = -EINVAL;
+                       goto failed1;
+               }
+
+               /* adjust the pointer to the parent network, just in case
+                * the net is a duplicate */
+               ni->ni_net = net_l;
+
+               rc = lnet_startup_lndni(ni, tun);
+
+               LASSERT(ni->ni_net->net_tunables.lct_peer_timeout <= 0 ||
+                       ni->ni_net->net_lnd->lnd_query != NULL);
+
+               if (rc < 0)
+                       goto failed1;
+
+               lnet_ni_addref(ni);
+               list_add_tail(&ni->ni_netlist, &local_ni_list);
+
+               ni_count++;
+       }
+
+       lnet_net_lock(LNET_LOCK_EX);
+       list_splice_tail(&local_ni_list, &net_l->net_ni_list);
+       lnet_incr_dlc_seq();
+       lnet_net_unlock(LNET_LOCK_EX);
+
+       /* if the network is not unique then we don't want to keep
+        * it around after we're done. Free it. Otherwise add that
+        * net to the global the_lnet.ln_nets */
+       if (net_l != net && net_l != NULL) {
+               /*
+                * TODO - note. currently the tunables can not be updated
+                * once added
+                */
+               lnet_net_free(net);
+       } else {
+               net->net_state = LNET_NET_STATE_ACTIVE;
+               /*
+                * restore tunables after they have been overwritten by the
+                * lnd
+                */
+               if (peer_timeout != -1)
+                       net->net_tunables.lct_peer_timeout = peer_timeout;
+               if (maxtxcredits != -1)
+                       net->net_tunables.lct_max_tx_credits = maxtxcredits;
+               if (peerrtrcredits != -1)
+                       net->net_tunables.lct_peer_rtr_credits = peerrtrcredits;
+
+               lnet_net_lock(LNET_LOCK_EX);
+               list_add_tail(&net->net_list, &the_lnet.ln_nets);
+               lnet_net_unlock(LNET_LOCK_EX);
+       }
+
+       return ni_count;
+
+failed1:
+       /*
+        * shutdown the new NIs that are being started up
+        * free the NET being started
+        */
+       while (!list_empty(&local_ni_list)) {
+               ni = list_entry(local_ni_list.next, struct lnet_ni,
+                               ni_netlist);
+
+               lnet_shutdown_lndni(ni);
+       }
+
+failed0:
+       lnet_net_free(net);
+
+       return rc;
+}
+
+static int
+lnet_startup_lndnets(struct list_head *netlist)
+{
+       struct lnet_net         *net;
+       int                     rc;
+       int                     ni_count = 0;
+
+       while (!list_empty(netlist)) {
+               net = list_entry(netlist->next, struct lnet_net, net_list);
+               list_del_init(&net->net_list);
+
+               rc = lnet_startup_lndnet(net, NULL);
+
+               if (rc < 0)
+                       goto failed;
+
+               ni_count += rc;
+       }
+
+       return ni_count;
+failed:
+       lnet_shutdown_lndnets();
+
+       return rc;
+}
+
+/**
+ * Initialize LNet library.
+ *
+ * Automatically called at module loading time. Caller has to call
+ * lnet_lib_exit() after a call to lnet_lib_init(), if and only if the
+ * latter returned 0. It must be called exactly once.
+ *
+ * \retval 0 on success
+ * \retval -ve on failures.
+ */
+int lnet_lib_init(void)
+{
+       int rc;
+
+       lnet_assert_wire_constants();
+
+       memset(&the_lnet, 0, sizeof(the_lnet));
+
+       /* refer to global cfs_cpt_table for now */
+       the_lnet.ln_cpt_table   = cfs_cpt_table;
+       the_lnet.ln_cpt_number  = cfs_cpt_number(cfs_cpt_table);
+
+       LASSERT(the_lnet.ln_cpt_number > 0);
+       if (the_lnet.ln_cpt_number > LNET_CPT_MAX) {
+               /* we are under risk of consuming all lh_cookie */
+               CERROR("Can't have %d CPTs for LNet (max allowed is %d), "
+                      "please change setting of CPT-table and retry\n",
+                      the_lnet.ln_cpt_number, LNET_CPT_MAX);
+               return -E2BIG;
+       }
+
+       while ((1 << the_lnet.ln_cpt_bits) < the_lnet.ln_cpt_number)
+               the_lnet.ln_cpt_bits++;
+
+       rc = lnet_create_locks();
+       if (rc != 0) {
+               CERROR("Can't create LNet global locks: %d\n", rc);
+               return rc;
+       }
+
+       the_lnet.ln_refcount = 0;
+       LNetInvalidateHandle(&the_lnet.ln_rc_eqh);
        INIT_LIST_HEAD(&the_lnet.ln_lnds);
+       INIT_LIST_HEAD(&the_lnet.ln_net_zombie);
        INIT_LIST_HEAD(&the_lnet.ln_rcd_zombie);
        INIT_LIST_HEAD(&the_lnet.ln_rcd_deathrow);
 
@@ -1543,6 +1781,7 @@ LNetNIInit(lnet_pid_t requested_pid)
        struct lnet_ping_info   *pinfo;
        lnet_handle_md_t        md_handle;
        struct list_head        net_head;
+       struct lnet_net         *net;
 
        INIT_LIST_HEAD(&net_head);
 
@@ -1562,8 +1801,15 @@ LNetNIInit(lnet_pid_t requested_pid)
                return rc;
        }
 
-       /* Add in the loopback network */
-       if (lnet_ni_alloc(LNET_MKNET(LOLND, 0), NULL, &net_head) == NULL) {
+       /* create a network for Loopback network */
+       net = lnet_net_alloc(LNET_MKNET(LOLND, 0), &net_head);
+       if (net == NULL) {
+               rc = -ENOMEM;
+               goto err_empty_list;
+       }
+
+       /* Add in the loopback NI */
+       if (lnet_ni_alloc(net, NULL, NULL) == NULL) {
                rc = -ENOMEM;
                goto err_empty_list;
        }
@@ -1575,13 +1821,13 @@ LNetNIInit(lnet_pid_t requested_pid)
         * in this case.  On cleanup in case of failure only clean up
         * routes if it has been loaded */
        if (!the_lnet.ln_nis_from_mod_params) {
-               rc = lnet_parse_networks(&net_head,
-                                        lnet_get_networks());
+               rc = lnet_parse_networks(&net_head, lnet_get_networks(),
+                                        use_tcp_bonding);
                if (rc < 0)
                        goto err_empty_list;
        }
 
-       ni_count = lnet_startup_lndnis(&net_head);
+       ni_count = lnet_startup_lndnets(&net_head);
        if (ni_count < 0) {
                rc = ni_count;
                goto err_empty_list;
@@ -1634,17 +1880,17 @@ err_destroy_routes:
        if (!the_lnet.ln_nis_from_mod_params)
                lnet_destroy_routes();
 err_shutdown_lndnis:
-       lnet_shutdown_lndnis();
+       lnet_shutdown_lndnets();
 err_empty_list:
        lnet_unprepare();
        LASSERT(rc < 0);
        mutex_unlock(&the_lnet.ln_api_mutex);
        while (!list_empty(&net_head)) {
-               struct lnet_ni *ni;
+               struct lnet_net *net;
 
-               ni = list_entry(net_head.next, struct lnet_ni, ni_list);
-               list_del_init(&ni->ni_list);
-               lnet_ni_free(ni);
+               net = list_entry(net_head.next, struct lnet_net, net_list);
+               list_del_init(&net->net_list);
+               lnet_net_free(net);
        }
        return rc;
 }
@@ -1682,7 +1928,7 @@ LNetNIFini()
 
                lnet_acceptor_stop();
                lnet_destroy_routes();
-               lnet_shutdown_lndnis();
+               lnet_shutdown_lndnets();
                lnet_unprepare();
        }
 
@@ -1691,21 +1937,98 @@ LNetNIFini()
 }
 EXPORT_SYMBOL(LNetNIFini);
 
+
+static int lnet_handle_dbg_task(struct lnet_ioctl_dbg *dbg,
+                               struct lnet_dbg_task_info *dbg_info)
+{
+       switch (dbg->dbg_task) {
+       case LNET_DBG_INCR_DLC_SEQ:
+               lnet_incr_dlc_seq();
+       }
+
+       return 0;
+}
 /**
  * Grabs the ni data from the ni structure and fills the out
  * parameters
  *
  * \param[in] ni network       interface structure
- * \param[out] cpt_count       the number of cpts the ni is on
- * \param[out] nid             Network Interface ID
- * \param[out] peer_timeout    NI peer timeout
- * \param[out] peer_tx_crdits  NI peer transmit credits
- * \param[out] peer_rtr_credits NI peer router credits
- * \param[out] max_tx_credits  NI max transmit credit
- * \param[out] net_config      Network configuration
+ * \param[out] cfg_ni          NI config information
+ * \param[out] tun             network and LND tunables
  */
 static void
-lnet_fill_ni_info(struct lnet_ni *ni, struct lnet_ioctl_config_data *config)
+lnet_fill_ni_info(struct lnet_ni *ni, struct lnet_ioctl_config_ni *cfg_ni,
+                  struct lnet_ioctl_config_lnd_tunables *tun,
+                  struct lnet_ioctl_element_stats *stats,
+                  __u32 tun_size)
+{
+       size_t min_size = 0;
+       int i;
+
+       if (!ni || !cfg_ni || !tun)
+               return;
+
+       if (ni->ni_interfaces[0] != NULL) {
+               for (i = 0; i < ARRAY_SIZE(ni->ni_interfaces); i++) {
+                       if (ni->ni_interfaces[i] != NULL) {
+                               strncpy(cfg_ni->lic_ni_intf[i],
+                                       ni->ni_interfaces[i],
+                                       sizeof(cfg_ni->lic_ni_intf[i]));
+                       }
+               }
+       }
+
+       cfg_ni->lic_nid = ni->ni_nid;
+       cfg_ni->lic_status = ni->ni_status->ns_status;
+       cfg_ni->lic_tcp_bonding = use_tcp_bonding;
+       cfg_ni->lic_dev_cpt = ni->ni_dev_cpt;
+
+       memcpy(&tun->lt_cmn, &ni->ni_net->net_tunables, sizeof(tun->lt_cmn));
+
+       if (stats) {
+               stats->send_count = atomic_read(&ni->ni_stats.send_count);
+               stats->recv_count = atomic_read(&ni->ni_stats.recv_count);
+       }
+
+       /*
+        * tun->lt_tun will always be present, but in order to be
+        * backwards compatible, we need to deal with the cases when
+        * tun->lt_tun is smaller than what the kernel has, because it
+        * comes from an older version of a userspace program, then we'll
+        * need to copy as much information as we have available space.
+        */
+       min_size = tun_size - sizeof(tun->lt_cmn);
+       memcpy(&tun->lt_tun, &ni->ni_lnd_tunables, min_size);
+
+       /* copy over the cpts */
+       if (ni->ni_ncpts == LNET_CPT_NUMBER &&
+           ni->ni_cpts == NULL)  {
+               for (i = 0; i < ni->ni_ncpts; i++)
+                       cfg_ni->lic_cpts[i] = i;
+       } else {
+               for (i = 0;
+                    ni->ni_cpts != NULL && i < ni->ni_ncpts &&
+                    i < LNET_MAX_SHOW_NUM_CPT;
+                    i++)
+                       cfg_ni->lic_cpts[i] = ni->ni_cpts[i];
+       }
+       cfg_ni->lic_ncpts = ni->ni_ncpts;
+}
+
+/**
+ * NOTE: This is a legacy function left in the code to be backwards
+ * compatible with older userspace programs. It should eventually be
+ * removed.
+ *
+ * Grabs the ni data from the ni structure and fills the out
+ * parameters
+ *
+ * \param[in] ni network       interface structure
+ * \param[out] config          config information
+ */
+static void
+lnet_fill_ni_info_legacy(struct lnet_ni *ni,
+                        struct lnet_ioctl_config_data *config)
 {
        struct lnet_ioctl_net_config *net_config;
        struct lnet_ioctl_config_lnd_tunables *lnd_cfg = NULL;
@@ -1732,10 +2055,14 @@ lnet_fill_ni_info(struct lnet_ni *ni, struct lnet_ioctl_config_data *config)
        }
 
        config->cfg_nid = ni->ni_nid;
-       config->cfg_config_u.cfg_net.net_peer_timeout = ni->ni_peertimeout;
-       config->cfg_config_u.cfg_net.net_max_tx_credits = ni->ni_maxtxcredits;
-       config->cfg_config_u.cfg_net.net_peer_tx_credits = ni->ni_peertxcredits;
-       config->cfg_config_u.cfg_net.net_peer_rtr_credits = ni->ni_peerrtrcredits;
+       config->cfg_config_u.cfg_net.net_peer_timeout =
+               ni->ni_net->net_tunables.lct_peer_timeout;
+       config->cfg_config_u.cfg_net.net_max_tx_credits =
+               ni->ni_net->net_tunables.lct_max_tx_credits;
+       config->cfg_config_u.cfg_net.net_peer_tx_credits =
+               ni->ni_net->net_tunables.lct_peer_tx_credits;
+       config->cfg_config_u.cfg_net.net_peer_rtr_credits =
+               ni->ni_net->net_tunables.lct_peer_rtr_credits;
 
        net_config->ni_status = ni->ni_status->ns_status;
 
@@ -1757,46 +2084,99 @@ lnet_fill_ni_info(struct lnet_ni *ni, struct lnet_ioctl_config_data *config)
        if (config->cfg_hdr.ioc_len > min_size)
                tunable_size = config->cfg_hdr.ioc_len - min_size;
 
-       /* Don't copy to much data to user space */
-       min_size = min(tunable_size, sizeof(*ni->ni_lnd_tunables));
+       /* Don't copy too much data to user space */
+       min_size = min(tunable_size, sizeof(ni->ni_lnd_tunables));
        lnd_cfg = (struct lnet_ioctl_config_lnd_tunables *)net_config->cfg_bulk;
 
-       if (ni->ni_lnd_tunables && lnd_cfg && min_size) {
-               memcpy(lnd_cfg, ni->ni_lnd_tunables, min_size);
+       if (lnd_cfg && min_size) {
+               memcpy(&lnd_cfg->lt_tun, &ni->ni_lnd_tunables, min_size);
                config->cfg_config_u.cfg_net.net_interface_count = 1;
 
                /* Tell user land that kernel side has less data */
-               if (tunable_size > sizeof(*ni->ni_lnd_tunables)) {
+               if (tunable_size > sizeof(ni->ni_lnd_tunables)) {
                        min_size = tunable_size - sizeof(ni->ni_lnd_tunables);
                        config->cfg_hdr.ioc_len -= min_size;
                }
        }
 }
 
-static int
+struct lnet_ni *
+lnet_get_ni_idx_locked(int idx)
+{
+       struct lnet_ni          *ni;
+       struct lnet_net         *net;
+
+       list_for_each_entry(net, &the_lnet.ln_nets, net_list) {
+               list_for_each_entry(ni, &net->net_ni_list, ni_netlist) {
+                       if (idx-- == 0)
+                               return ni;
+               }
+       }
+
+       return NULL;
+}
+
+struct lnet_ni *
+lnet_get_next_ni_locked(struct lnet_net *mynet, struct lnet_ni *prev)
+{
+       struct lnet_ni          *ni;
+       struct lnet_net         *net = mynet;
+
+       if (prev == NULL) {
+               if (net == NULL)
+                       net = list_entry(the_lnet.ln_nets.next, struct lnet_net,
+                                       net_list);
+               ni = list_entry(net->net_ni_list.next, struct lnet_ni,
+                               ni_netlist);
+
+               return ni;
+       }
+
+       if (prev->ni_netlist.next == &prev->ni_net->net_ni_list) {
+               /* if you reached the end of the ni list and the net is
+                * specified, then there are no more nis in that net */
+               if (net != NULL)
+                       return NULL;
+
+               /* we reached the end of this net ni list. move to the
+                * next net */
+               if (prev->ni_net->net_list.next == &the_lnet.ln_nets)
+                       /* no more nets and no more NIs. */
+                       return NULL;
+
+               /* get the next net */
+               net = list_entry(prev->ni_net->net_list.next, struct lnet_net,
+                                net_list);
+               /* get the ni on it */
+               ni = list_entry(net->net_ni_list.next, struct lnet_ni,
+                               ni_netlist);
+
+               return ni;
+       }
+
+       /* there are more nis left */
+       ni = list_entry(prev->ni_netlist.next, struct lnet_ni, ni_netlist);
+
+       return ni;
+}
+
+int
 lnet_get_net_config(struct lnet_ioctl_config_data *config)
 {
        struct lnet_ni *ni;
-       struct list_head *tmp;
-       int idx = config->cfg_count;
+       int cpt;
        int rc = -ENOENT;
-       int cpt, i = 0;
-
-       if (unlikely(!config->cfg_bulk))
-               return -EINVAL;
+       int idx = config->cfg_count;
 
        cpt = lnet_net_lock_current();
 
-       list_for_each(tmp, &the_lnet.ln_nis) {
-               if (i++ != idx)
-                       continue;
+       ni = lnet_get_ni_idx_locked(idx);
 
-               ni = list_entry(tmp, lnet_ni_t, ni_list);
+       if (ni != NULL) {
+               rc = 0;
                lnet_ni_lock(ni);
-               lnet_fill_ni_info(ni, config);
+               lnet_fill_ni_info_legacy(ni, config);
                lnet_ni_unlock(ni);
-               rc = 0;
-               break;
        }
 
        lnet_net_unlock(cpt);
@@ -1804,127 +2184,421 @@ lnet_get_net_config(struct lnet_ioctl_config_data *config)
 }
 
 int
-lnet_dyn_add_ni(lnet_pid_t requested_pid, struct lnet_ioctl_config_data *conf)
+lnet_get_ni_config(struct lnet_ioctl_config_ni *cfg_ni,
+                  struct lnet_ioctl_config_lnd_tunables *tun,
+                  struct lnet_ioctl_element_stats *stats,
+                  __u32 tun_size)
 {
-       char                    *nets = conf->cfg_config_u.cfg_net.net_intf;
-       struct lnet_ping_info   *pinfo;
-       lnet_handle_md_t        md_handle;
        struct lnet_ni          *ni;
-       struct list_head        net_head;
-       int                     rc;
-       lnet_remotenet_t        *rnet;
+       int                     cpt;
+       int                     rc = -ENOENT;
 
-       INIT_LIST_HEAD(&net_head);
+       if (!cfg_ni || !tun || !stats)
+               return -EINVAL;
 
-       /* Create a ni structure for the network string */
-       rc = lnet_parse_networks(&net_head, nets);
-       if (rc <= 0)
-               return rc == 0 ? -EINVAL : rc;
+       cpt = lnet_net_lock_current();
 
-       mutex_lock(&the_lnet.ln_api_mutex);
+       ni = lnet_get_ni_idx_locked(cfg_ni->lic_idx);
 
-       if (rc > 1) {
-               rc = -EINVAL; /* only add one interface per call */
-               goto failed0;
+       if (ni) {
+               rc = 0;
+               lnet_ni_lock(ni);
+               lnet_fill_ni_info(ni, cfg_ni, tun, stats, tun_size);
+               lnet_ni_unlock(ni);
        }
 
-       ni = list_entry(net_head.next, struct lnet_ni, ni_list);
+       lnet_net_unlock(cpt);
+       return rc;
+}
+
+static int lnet_add_net_common(struct lnet_net *net,
+                              struct lnet_ioctl_config_lnd_tunables *tun)
+{
+       __u32                   net_id;
+       lnet_ping_info_t        *pinfo;
+       lnet_handle_md_t        md_handle;
+       int                     rc;
+       lnet_remotenet_t        *rnet;
+       int                     net_ni_count;
+       int                     num_acceptor_nets;
 
        lnet_net_lock(LNET_LOCK_EX);
-       rnet = lnet_find_net_locked(LNET_NIDNET(ni->ni_nid));
+       rnet = lnet_find_rnet_locked(net->net_id);
        lnet_net_unlock(LNET_LOCK_EX);
-       /* make sure that the net added doesn't invalidate the current
-        * configuration LNet is keeping */
-       if (rnet != NULL) {
+       /*
+        * make sure that the net added doesn't invalidate the current
+        * configuration LNet is keeping
+        */
+       if (rnet) {
                CERROR("Adding net %s will invalidate routing configuration\n",
-                      nets);
-               rc = -EUSERS;
-               goto failed0;
+                      libcfs_net2str(net->net_id));
+               lnet_net_free(net);
+               return -EUSERS;
        }
 
-       rc = lnet_ping_info_setup(&pinfo, &md_handle, 1 + lnet_get_ni_count(),
+       /*
+        * make sure you calculate the correct number of slots in the ping
+        * info. Since the ping info is a flattened list of all the NIs,
+        * we should allocate enough slots to accommodate the number of NIs
+        * which will be added.
+        *
+        * since ni hasn't been configured yet, use
+        * lnet_get_net_ni_count_pre() which checks the net_ni_added list
+        */
+       net_ni_count = lnet_get_net_ni_count_pre(net);
+
+       rc = lnet_ping_info_setup(&pinfo, &md_handle,
+                                 net_ni_count + lnet_get_ni_count(),
                                  false);
-       if (rc != 0)
-               goto failed0;
+       if (rc < 0) {
+               lnet_net_free(net);
+               return rc;
+       }
 
-       list_del_init(&ni->ni_list);
+       if (tun)
+               memcpy(&net->net_tunables,
+                      &tun->lt_cmn, sizeof(net->net_tunables));
+       else
+               memset(&net->net_tunables, -1, sizeof(net->net_tunables));
 
-       rc = lnet_startup_lndni(ni, conf);
-       if (rc != 0)
-               goto failed1;
+       /*
+        * before starting this network get a count of the current TCP
+        * networks which require the acceptor thread running. If that
+        * count is == 0 before we start up this network, then we'd want to
+        * start up the acceptor thread after starting up this network
+        */
+       num_acceptor_nets = lnet_count_acceptor_nets();
 
-       if (ni->ni_lnd->lnd_accept != NULL) {
+       net_id = net->net_id;
+
+       rc = lnet_startup_lndnet(net,
+                                (tun) ? &tun->lt_tun : NULL);
+       if (rc < 0)
+               goto failed;
+
+       lnet_net_lock(LNET_LOCK_EX);
+       net = lnet_get_net_locked(net_id);
+       lnet_net_unlock(LNET_LOCK_EX);
+
+       LASSERT(net);
+
+       /*
+        * Start the acceptor thread if this is the first network
+        * being added that requires the thread.
+        */
+       if (net->net_lnd->lnd_accept && num_acceptor_nets == 0) {
                rc = lnet_acceptor_start();
                if (rc < 0) {
-                       /* shutdown the ni that we just started */
+                       /* shutdown the net that we just started */
                        CERROR("Failed to start up acceptor thread\n");
-                       lnet_shutdown_lndni(ni);
-                       goto failed1;
+                       lnet_shutdown_lndnet(net);
+                       goto failed;
                }
        }
 
+       lnet_net_lock(LNET_LOCK_EX);
+       lnet_peer_net_added(net);
+       lnet_net_unlock(LNET_LOCK_EX);
+
        lnet_ping_target_update(pinfo, md_handle);
-       mutex_unlock(&the_lnet.ln_api_mutex);
 
        return 0;
 
-failed1:
+failed:
        lnet_ping_md_unlink(pinfo, &md_handle);
        lnet_ping_info_free(pinfo);
-failed0:
+       return rc;
+}
+
+static int lnet_handle_legacy_ip2nets(char *ip2nets,
+                                     struct lnet_ioctl_config_lnd_tunables *tun)
+{
+       struct lnet_net *net;
+       char *nets;
+       int rc;
+       struct list_head net_head;
+
+       INIT_LIST_HEAD(&net_head);
+
+       rc = lnet_parse_ip2nets(&nets, ip2nets);
+       if (rc < 0)
+               return rc;
+
+       rc = lnet_parse_networks(&net_head, nets, use_tcp_bonding);
+       if (rc < 0)
+               return rc;
+
+       mutex_lock(&the_lnet.ln_api_mutex);
+       while (!list_empty(&net_head)) {
+               net = list_entry(net_head.next, struct lnet_net, net_list);
+               list_del_init(&net->net_list);
+               rc = lnet_add_net_common(net, tun);
+               if (rc < 0)
+                       goto out;
+       }
+
+out:
        mutex_unlock(&the_lnet.ln_api_mutex);
+
        while (!list_empty(&net_head)) {
-               ni = list_entry(net_head.next, struct lnet_ni, ni_list);
-               list_del_init(&ni->ni_list);
-               lnet_ni_free(ni);
+               net = list_entry(net_head.next, struct lnet_net, net_list);
+               list_del_init(&net->net_list);
+               lnet_net_free(net);
        }
        return rc;
 }
 
-int
-lnet_dyn_del_ni(__u32 net)
+int lnet_dyn_add_ni(struct lnet_ioctl_config_ni *conf)
 {
-       lnet_ni_t        *ni;
-       struct lnet_ping_info *pinfo;
+       struct lnet_net *net;
+       struct lnet_ni *ni;
+       struct lnet_ioctl_config_lnd_tunables *tun = NULL;
+       int rc, i;
+       __u32 net_id;
+
+       /* get the tunables if they are available */
+       if (conf->lic_cfg_hdr.ioc_len >=
+           sizeof(*conf) + sizeof(*tun))
+               tun = (struct lnet_ioctl_config_lnd_tunables *)
+                       conf->lic_bulk;
+
+       /* handle legacy ip2nets from DLC */
+       if (conf->lic_legacy_ip2nets[0] != '\0')
+               return lnet_handle_legacy_ip2nets(conf->lic_legacy_ip2nets,
+                                                 tun);
+
+       net_id = LNET_NIDNET(conf->lic_nid);
+
+       net = lnet_net_alloc(net_id, NULL);
+       if (!net)
+               return -ENOMEM;
+
+       for (i = 0; i < conf->lic_ncpts; i++) {
+               if (conf->lic_cpts[i] >= LNET_CPT_NUMBER)
+                       return -EINVAL;
+       }
+
+       ni = lnet_ni_alloc_w_cpt_array(net, conf->lic_cpts, conf->lic_ncpts,
+                                      conf->lic_ni_intf[0]);
+       if (!ni)
+               return -ENOMEM;
+
+       mutex_lock(&the_lnet.ln_api_mutex);
+
+       rc = lnet_add_net_common(net, tun);
+
+       mutex_unlock(&the_lnet.ln_api_mutex);
+
+       return rc;
+}
+
+int lnet_dyn_del_ni(struct lnet_ioctl_config_ni *conf)
+{
+       struct lnet_net  *net;
+       struct lnet_ni *ni;
+       __u32 net_id = LNET_NIDNET(conf->lic_nid);
+       lnet_ping_info_t *pinfo;
        lnet_handle_md_t  md_handle;
        int               rc;
+       int               net_count;
+       __u32             addr;
 
        /* don't allow userspace to shutdown the LOLND */
-       if (LNET_NETTYP(net) == LOLND)
+       if (LNET_NETTYP(net_id) == LOLND)
                return -EINVAL;
 
        mutex_lock(&the_lnet.ln_api_mutex);
+
+       lnet_net_lock(0);
+
+       net = lnet_get_net_locked(net_id);
+       if (!net) {
+               CERROR("net %s not found\n",
+                      libcfs_net2str(net_id));
+               rc = -ENOENT;
+               goto net_unlock;
+       }
+
+       addr = LNET_NIDADDR(conf->lic_nid);
+       if (addr == 0) {
+               /* remove the entire net */
+               net_count = lnet_get_net_ni_count_locked(net);
+
+               lnet_net_unlock(0);
+
+               /* create and link a new ping info, before removing the old one */
+               rc = lnet_ping_info_setup(&pinfo, &md_handle,
+                                       lnet_get_ni_count() - net_count,
+                                       false);
+               if (rc != 0)
+                       goto out;
+
+               lnet_shutdown_lndnet(net);
+
+               if (lnet_count_acceptor_nets() == 0)
+                       lnet_acceptor_stop();
+
+               lnet_ping_target_update(pinfo, md_handle);
+
+               goto out;
+       }
+
+       ni = lnet_nid2ni_locked(conf->lic_nid, 0);
+       if (!ni) {
+               CERROR("nid %s not found \n",
+                      libcfs_nid2str(conf->lic_nid));
+               rc = -ENOENT;
+               goto net_unlock;
+       }
+
+       net_count = lnet_get_net_ni_count_locked(net);
+
+       lnet_net_unlock(0);
+
        /* create and link a new ping info, before removing the old one */
        rc = lnet_ping_info_setup(&pinfo, &md_handle,
                                  lnet_get_ni_count() - 1, false);
        if (rc != 0)
                goto out;
 
-       ni = lnet_net2ni(net);
-       if (ni == NULL) {
-               rc = -EINVAL;
+       lnet_shutdown_lndni(ni);
+
+       if (lnet_count_acceptor_nets() == 0)
+               lnet_acceptor_stop();
+
+       lnet_ping_target_update(pinfo, md_handle);
+
+       /* check if the net is empty and remove it if it is */
+       if (net_count == 1)
+               lnet_shutdown_lndnet(net);
+
+       goto out;
+
+net_unlock:
+       lnet_net_unlock(0);
+out:
+       mutex_unlock(&the_lnet.ln_api_mutex);
+
+       return rc;
+}
+
+/*
+ * lnet_dyn_add_net and lnet_dyn_del_net are now deprecated.
+ * They are only expected to be called for unique networks.
+ * That can be as a result of older DLC library
+ * calls. Multi-Rail DLC and beyond no longer uses these APIs.
+ */
+int
+lnet_dyn_add_net(struct lnet_ioctl_config_data *conf)
+{
+       struct lnet_net         *net;
+       struct list_head        net_head;
+       int                     rc;
+       struct lnet_ioctl_config_lnd_tunables tun;
+       char *nets = conf->cfg_config_u.cfg_net.net_intf;
+
+       INIT_LIST_HEAD(&net_head);
+
+       /* Create a net/ni structures for the network string */
+       rc = lnet_parse_networks(&net_head, nets, use_tcp_bonding);
+       if (rc <= 0)
+               return rc == 0 ? -EINVAL : rc;
+
+       mutex_lock(&the_lnet.ln_api_mutex);
+
+       if (rc > 1) {
+               rc = -EINVAL; /* only add one network per call */
                goto failed;
        }
 
-       /* decrement the reference counter taken by lnet_net2ni() */
-       lnet_ni_decref_locked(ni, 0);
+       net = list_entry(net_head.next, struct lnet_net, net_list);
+       list_del_init(&net->net_list);
 
-       lnet_shutdown_lndni(ni);
+       LASSERT(lnet_net_unique(net->net_id, &the_lnet.ln_nets, NULL));
+
+       memset(&tun, 0, sizeof(tun));
+
+       tun.lt_cmn.lct_peer_timeout =
+         conf->cfg_config_u.cfg_net.net_peer_timeout;
+       tun.lt_cmn.lct_peer_tx_credits =
+         conf->cfg_config_u.cfg_net.net_peer_tx_credits;
+       tun.lt_cmn.lct_peer_rtr_credits =
+         conf->cfg_config_u.cfg_net.net_peer_rtr_credits;
+       tun.lt_cmn.lct_max_tx_credits =
+         conf->cfg_config_u.cfg_net.net_max_tx_credits;
+
+       rc = lnet_add_net_common(net, &tun);
+       if (rc != 0)
+               goto failed;
 
-       if (lnet_count_acceptor_nis() == 0)
+       return 0;
+
+failed:
+       mutex_unlock(&the_lnet.ln_api_mutex);
+       while (!list_empty(&net_head)) {
+               net = list_entry(net_head.next, struct lnet_net, net_list);
+               list_del_init(&net->net_list);
+               lnet_net_free(net);
+       }
+       return rc;
+}
+
+int
+lnet_dyn_del_net(__u32 net_id)
+{
+       struct lnet_net  *net;
+       struct lnet_ping_info *pinfo;
+       lnet_handle_md_t  md_handle;
+       int               rc;
+       int               net_ni_count;
+
+       /* don't allow userspace to shutdown the LOLND */
+       if (LNET_NETTYP(net_id) == LOLND)
+               return -EINVAL;
+
+       mutex_lock(&the_lnet.ln_api_mutex);
+
+       lnet_net_lock(0);
+
+       net = lnet_get_net_locked(net_id);
+       if (net == NULL) {
+               rc = -EINVAL;
+               goto out;
+       }
+
+       net_ni_count = lnet_get_net_ni_count_locked(net);
+
+       lnet_net_unlock(0);
+
+       /* create and link a new ping info, before removing the old one */
+       rc = lnet_ping_info_setup(&pinfo, &md_handle,
+                                 lnet_get_ni_count() - net_ni_count, false);
+       if (rc != 0)
+               goto out;
+
+       lnet_shutdown_lndnet(net);
+
+       if (lnet_count_acceptor_nets() == 0)
                lnet_acceptor_stop();
 
        lnet_ping_target_update(pinfo, md_handle);
-       goto out;
-failed:
-       lnet_ping_md_unlink(pinfo, &md_handle);
-       lnet_ping_info_free(pinfo);
+
 out:
        mutex_unlock(&the_lnet.ln_api_mutex);
 
        return rc;
 }
 
+void lnet_incr_dlc_seq(void)
+{
+       atomic_inc(&lnet_dlc_seq_no);
+}
+
+__u32 lnet_get_dlc_seq_locked(void)
+{
+       return atomic_read(&lnet_dlc_seq_no);
+}
+
 /**
  * LNet ioctl handler.
  *
@@ -1988,13 +2662,42 @@ LNetCtl(unsigned int cmd, void *arg)
                if (config->cfg_hdr.ioc_len < sizeof(*config))
                        return -EINVAL;
 
-               return lnet_get_route(config->cfg_count,
-                                     &config->cfg_net,
-                                     &config->cfg_config_u.cfg_route.rtr_hop,
-                                     &config->cfg_nid,
-                                     &config->cfg_config_u.cfg_route.rtr_flags,
-                                     &config->cfg_config_u.cfg_route.
+               mutex_lock(&the_lnet.ln_api_mutex);
+               rc = lnet_get_route(config->cfg_count,
+                                   &config->cfg_net,
+                                   &config->cfg_config_u.cfg_route.rtr_hop,
+                                   &config->cfg_nid,
+                                   &config->cfg_config_u.cfg_route.rtr_flags,
+                                   &config->cfg_config_u.cfg_route.
                                        rtr_priority);
+               mutex_unlock(&the_lnet.ln_api_mutex);
+               return rc;
+
+       case IOC_LIBCFS_GET_LOCAL_NI: {
+               struct lnet_ioctl_config_ni *cfg_ni;
+               struct lnet_ioctl_config_lnd_tunables *tun = NULL;
+               struct lnet_ioctl_element_stats *stats;
+               __u32 tun_size;
+
+               cfg_ni = arg;
+               /* get the tunables if they are available */
+               if (cfg_ni->lic_cfg_hdr.ioc_len <
+                   sizeof(*cfg_ni) + sizeof(*stats)+ sizeof(*tun))
+                       return -EINVAL;
+
+               stats = (struct lnet_ioctl_element_stats *)
+                       cfg_ni->lic_bulk;
+               tun = (struct lnet_ioctl_config_lnd_tunables *)
+                               (cfg_ni->lic_bulk + sizeof(*stats));
+
+               tun_size = cfg_ni->lic_cfg_hdr.ioc_len - sizeof(*cfg_ni) -
+                       sizeof(*stats);
+
+               mutex_lock(&the_lnet.ln_api_mutex);
+               rc = lnet_get_ni_config(cfg_ni, tun, stats, tun_size);
+               mutex_unlock(&the_lnet.ln_api_mutex);
+               return rc;
+       }
 
        case IOC_LIBCFS_GET_NET: {
                size_t total = sizeof(*config) +
@@ -2004,7 +2707,10 @@ LNetCtl(unsigned int cmd, void *arg)
                if (config->cfg_hdr.ioc_len < total)
                        return -EINVAL;
 
-               return lnet_get_net_config(config);
+               mutex_lock(&the_lnet.ln_api_mutex);
+               rc = lnet_get_net_config(config);
+               mutex_unlock(&the_lnet.ln_api_mutex);
+               return rc;
        }
 
        case IOC_LIBCFS_GET_LNET_STATS:
@@ -2014,7 +2720,9 @@ LNetCtl(unsigned int cmd, void *arg)
                if (lnet_stats->st_hdr.ioc_len < sizeof(*lnet_stats))
                        return -EINVAL;
 
+               mutex_lock(&the_lnet.ln_api_mutex);
                lnet_counters_get(&lnet_stats->st_cntrs);
+               mutex_unlock(&the_lnet.ln_api_mutex);
                return 0;
        }
 
@@ -2050,6 +2758,26 @@ LNetCtl(unsigned int cmd, void *arg)
                mutex_unlock(&the_lnet.ln_api_mutex);
                return rc;
 
+       case IOC_LIBCFS_SET_NUMA_RANGE: {
+               struct lnet_ioctl_numa_range *numa;
+               numa = arg;
+               if (numa->nr_hdr.ioc_len != sizeof(*numa))
+                       return -EINVAL;
+               mutex_lock(&the_lnet.ln_api_mutex);
+               lnet_numa_range = numa->nr_range;
+               mutex_unlock(&the_lnet.ln_api_mutex);
+               return 0;
+       }
+
+       case IOC_LIBCFS_GET_NUMA_RANGE: {
+               struct lnet_ioctl_numa_range *numa;
+               numa = arg;
+               if (numa->nr_hdr.ioc_len != sizeof(*numa))
+                       return -EINVAL;
+               numa->nr_range = lnet_numa_range;
+               return 0;
+       }
+
        case IOC_LIBCFS_GET_BUF: {
                struct lnet_ioctl_pool_cfg *pool_cfg;
                size_t total = sizeof(*config) + sizeof(*pool_cfg);
@@ -2060,7 +2788,38 @@ LNetCtl(unsigned int cmd, void *arg)
                        return -EINVAL;
 
                pool_cfg = (struct lnet_ioctl_pool_cfg *)config->cfg_bulk;
-               return lnet_get_rtr_pool_cfg(config->cfg_count, pool_cfg);
+
+               mutex_lock(&the_lnet.ln_api_mutex);
+               rc = lnet_get_rtr_pool_cfg(config->cfg_count, pool_cfg);
+               mutex_unlock(&the_lnet.ln_api_mutex);
+               return rc;
+       }
+
+       case IOC_LIBCFS_ADD_PEER_NI: {
+               struct lnet_ioctl_peer_cfg *cfg = arg;
+
+               if (cfg->prcfg_hdr.ioc_len < sizeof(*cfg))
+                       return -EINVAL;
+
+               mutex_lock(&the_lnet.ln_api_mutex);
+               rc = lnet_add_peer_ni_to_peer(cfg->prcfg_prim_nid,
+                                             cfg->prcfg_cfg_nid,
+                                             cfg->prcfg_mr);
+               mutex_unlock(&the_lnet.ln_api_mutex);
+               return rc;
+       }
+
+       case IOC_LIBCFS_DEL_PEER_NI: {
+               struct lnet_ioctl_peer_cfg *cfg = arg;
+
+               if (cfg->prcfg_hdr.ioc_len < sizeof(*cfg))
+                       return -EINVAL;
+
+               mutex_lock(&the_lnet.ln_api_mutex);
+               rc = lnet_del_peer_ni_from_peer(cfg->prcfg_prim_nid,
+                                               cfg->prcfg_cfg_nid);
+               mutex_unlock(&the_lnet.ln_api_mutex);
+               return rc;
        }
 
        case IOC_LIBCFS_GET_PEER_INFO: {
@@ -2069,7 +2828,8 @@ LNetCtl(unsigned int cmd, void *arg)
                if (peer_info->pr_hdr.ioc_len < sizeof(*peer_info))
                        return -EINVAL;
 
-               return lnet_get_peer_info(
+               mutex_lock(&the_lnet.ln_api_mutex);
+               rc = lnet_get_peer_ni_info(
                   peer_info->pr_count,
                   &peer_info->pr_nid,
                   peer_info->pr_lnd_u.pr_peer_credits.cr_aliveness,
@@ -2078,8 +2838,32 @@ LNetCtl(unsigned int cmd, void *arg)
                   &peer_info->pr_lnd_u.pr_peer_credits.cr_ni_peer_tx_credits,
                   &peer_info->pr_lnd_u.pr_peer_credits.cr_peer_tx_credits,
                   &peer_info->pr_lnd_u.pr_peer_credits.cr_peer_rtr_credits,
-                  &peer_info->pr_lnd_u.pr_peer_credits.cr_peer_min_rtr_credits,
+                  &peer_info->pr_lnd_u.pr_peer_credits.cr_peer_min_tx_credits,
                   &peer_info->pr_lnd_u.pr_peer_credits.cr_peer_tx_qnob);
+               mutex_unlock(&the_lnet.ln_api_mutex);
+               return rc;
+       }
+
+       case IOC_LIBCFS_GET_PEER_NI: {
+               struct lnet_ioctl_peer_cfg *cfg = arg;
+               struct lnet_peer_ni_credit_info *lpni_cri;
+               struct lnet_ioctl_element_stats *lpni_stats;
+               size_t total = sizeof(*cfg) + sizeof(*lpni_cri) +
+                              sizeof(*lpni_stats);
+
+               if (cfg->prcfg_hdr.ioc_len < total)
+                       return -EINVAL;
+
+               lpni_cri = (struct lnet_peer_ni_credit_info*) cfg->prcfg_bulk;
+               lpni_stats = (struct lnet_ioctl_element_stats *)
+                            (cfg->prcfg_bulk + sizeof(*lpni_cri));
+
+               mutex_lock(&the_lnet.ln_api_mutex);
+               rc = lnet_get_peer_info(cfg->prcfg_idx, &cfg->prcfg_prim_nid,
+                                       &cfg->prcfg_cfg_nid, &cfg->prcfg_mr,
+                                       lpni_cri, lpni_stats);
+               mutex_unlock(&the_lnet.ln_api_mutex);
+               return rc;
        }
 
        case IOC_LIBCFS_NOTIFY_ROUTER: {
@@ -2132,15 +2916,29 @@ LNetCtl(unsigned int cmd, void *arg)
                data->ioc_count = rc;
                return 0;
        }
+
+       case IOC_LIBCFS_DBG: {
+               struct lnet_ioctl_dbg *dbg = arg;
+               struct lnet_dbg_task_info *dbg_info;
+               size_t total = sizeof(*dbg) + sizeof(*dbg_info);
+
+               if (dbg->dbg_hdr.ioc_len < total)
+                       return -EINVAL;
+
+               dbg_info = (struct lnet_dbg_task_info*) dbg->dbg_bulk;
+
+               return lnet_handle_dbg_task(dbg, dbg_info);
+       }
+
        default:
-               ni = lnet_net2ni(data->ioc_net);
+               ni = lnet_net2ni_addref(data->ioc_net);
                if (ni == NULL)
                        return -EINVAL;
 
-               if (ni->ni_lnd->lnd_ctl == NULL)
+               if (ni->ni_net->net_lnd->lnd_ctl == NULL)
                        rc = -EINVAL;
                else
-                       rc = ni->ni_lnd->lnd_ctl(ni, cmd, arg);
+                       rc = ni->ni_net->net_lnd->lnd_ctl(ni, cmd, arg);
 
                lnet_ni_decref(ni);
                return rc;
@@ -2170,7 +2968,7 @@ int
 LNetGetId(unsigned int index, lnet_process_id_t *id)
 {
        struct lnet_ni   *ni;
-       struct list_head *tmp;
+       struct lnet_net  *net;
        int               cpt;
        int               rc = -ENOENT;
 
@@ -2178,16 +2976,16 @@ LNetGetId(unsigned int index, lnet_process_id_t *id)
 
        cpt = lnet_net_lock_current();
 
-       list_for_each(tmp, &the_lnet.ln_nis) {
-               if (index-- != 0)
-                       continue;
-
-               ni = list_entry(tmp, lnet_ni_t, ni_list);
+       list_for_each_entry(net, &the_lnet.ln_nets, net_list) {
+               list_for_each_entry(ni, &net->net_ni_list, ni_netlist) {
+                       if (index-- != 0)
+                               continue;
 
-               id->nid = ni->ni_nid;
-               id->pid = the_lnet.ln_pid;
-               rc = 0;
-               break;
+                       id->nid = ni->ni_nid;
+                       id->pid = the_lnet.ln_pid;
+                       rc = 0;
+                       break;
+               }
        }
 
        lnet_net_unlock(cpt);
index ba8b879..9469a11 100644 (file)
@@ -46,8 +46,11 @@ static int lnet_tbnob = 0;                   /* track text buf allocation */
 #define LNET_MAX_TEXTBUF_NOB    (64<<10)       /* bound allocation */
 #define LNET_SINGLE_TEXTBUF_NOB  (4<<10)
 
+#define SPACESTR " \t\v\r\n"
+#define DELIMITERS ":()[]"
+
 static void
-lnet_syntax(char *name, char *str, int offset, int width)
+lnet_syntax(const char *name, const char *str, int offset, int width)
 {
        static char dots[LNET_SINGLE_TEXTBUF_NOB];
        static char dashes[LNET_SINGLE_TEXTBUF_NOB];
@@ -76,20 +79,216 @@ lnet_issep (char c)
        }
 }
 
-int
-lnet_net_unique(__u32 net, struct list_head *nilist)
+bool
+lnet_net_unique(__u32 net_id, struct list_head *netlist,
+               struct lnet_net **net)
+{
+       struct lnet_net  *net_l;
+
+       if (!netlist)
+               return true;
+
+       list_for_each_entry(net_l, netlist, net_list) {
+               if (net_l->net_id == net_id) {
+                       if (net != NULL)
+                               *net = net_l;
+                       return false;
+               }
+       }
+
+       return true;
+}
+
+/* check that the NI is unique within the list of NIs already added to
+ * a network */
+bool
+lnet_ni_unique_net(struct list_head *nilist, char *iface)
 {
        struct list_head *tmp;
-       lnet_ni_t        *ni;
+       struct lnet_ni *ni;
 
        list_for_each(tmp, nilist) {
-               ni = list_entry(tmp, lnet_ni_t, ni_list);
+               ni = list_entry(tmp, struct lnet_ni, ni_netlist);
 
-               if (LNET_NIDNET(ni->ni_nid) == net)
-                       return 0;
+               if (ni->ni_interfaces[0] != NULL &&
+                   strncmp(ni->ni_interfaces[0], iface, strlen(iface)) == 0)
+                       return false;
        }
 
-       return 1;
+       return true;
+}
+
+/* check that the NI is unique to the interfaces with in the same NI.
+ * This is only a consideration if use_tcp_bonding is set */
+static bool
+lnet_ni_unique_ni(char *iface_list[LNET_MAX_INTERFACES], char *iface)
+{
+       int i;
+       for (i = 0; i < LNET_MAX_INTERFACES; i++) {
+               if (iface_list[i] != NULL &&
+                   strncmp(iface_list[i], iface, strlen(iface)) == 0)
+                       return false;
+       }
+
+       return true;
+}
+
+static bool
+in_array(__u32 *array, __u32 size, __u32 value)
+{
+       int i;
+
+       for (i = 0; i < size; i++) {
+               if (array[i] == value)
+                       return false;
+       }
+
+       return true;
+}
+
+static int
+lnet_net_append_cpts(__u32 *cpts, __u32 ncpts, struct lnet_net *net)
+{
+       __u32 *added_cpts = NULL;
+       int i, j = 0, rc = 0;
+
+       /*
+        * no need to go further since a subset of the NIs already exists on
+        * all CPTs
+        */
+       if (net->net_ncpts == LNET_CPT_NUMBER)
+               return 0;
+
+       if (cpts == NULL) {
+               /* there is an NI which will exist on all CPTs */
+               if (net->net_cpts != NULL)
+                       LIBCFS_FREE(net->net_cpts, sizeof(*net->net_cpts) *
+                                   net->net_ncpts);
+               net->net_cpts = NULL;
+               net->net_ncpts = LNET_CPT_NUMBER;
+               return 0;
+       }
+
+       if (net->net_cpts == NULL) {
+               LIBCFS_ALLOC(net->net_cpts, sizeof(*net->net_cpts) * ncpts);
+               if (net->net_cpts == NULL)
+                       return -ENOMEM;
+               memcpy(net->net_cpts, cpts, ncpts);
+               net->net_ncpts = ncpts;
+               return 0;
+       }
+
+       LIBCFS_ALLOC(added_cpts, sizeof(*added_cpts) * LNET_CPT_NUMBER);
+       if (added_cpts == NULL)
+               return -ENOMEM;
+
+       for (i = 0; i < ncpts; i++) {
+               if (!in_array(net->net_cpts, net->net_ncpts, cpts[i])) {
+                       added_cpts[j] = cpts[i];
+                       j++;
+               }
+       }
+
+       /* append the new cpts if any to the list of cpts in the net */
+       if (j > 0) {
+               __u32 *array = NULL, *loc;
+               __u32 total_entries = j + net->net_ncpts;
+
+               LIBCFS_ALLOC(array, sizeof(*net->net_cpts) * total_entries);
+               if (array == NULL) {
+                       rc = -ENOMEM;
+                       goto failed;
+               }
+
+               memcpy(array, net->net_cpts, net->net_ncpts);
+               loc = array + net->net_ncpts;
+               memcpy(loc, added_cpts, j);
+
+               LIBCFS_FREE(net->net_cpts, sizeof(*net->net_cpts) *
+                           net->net_ncpts);
+               net->net_ncpts = total_entries;
+               net->net_cpts = array;
+       }
+
+failed:
+       LIBCFS_FREE(added_cpts, sizeof(*added_cpts) * LNET_CPT_NUMBER);
+
+       return rc;
+}
+
+static void
+lnet_net_remove_cpts(__u32 *cpts, __u32 ncpts, struct lnet_net *net)
+{
+       struct lnet_ni *ni;
+       int rc;
+
+       /*
+        * Operation Assumption:
+        *      This function is called after an NI has been removed from
+        *      its parent net.
+        *
+        * if we're removing an NI which exists on all CPTs then
+        * we have to check if any of the other NIs on this net also
+        * exists on all CPTs. If none, then we need to build our Net CPT
+        * list based on the remaining NIs.
+        *
+        * If the NI being removed exist on a subset of the CPTs then we
+        * also rebuild the Net CPT list based on the remaining NIs, which
+        * should result in the expected Net CPT list.
+        */
+
+       /*
+        * sometimes this function can be called due to some failure
+        * creating an NI, before any of the cpts are allocated, so check
+        * for that case and don't do anything
+        */
+       if (ncpts == 0)
+               return;
+
+       if (ncpts == LNET_CPT_NUMBER) {
+               /*
+                * first iteration through the NI list in the net to see
+                * if any of the NIs exist on all the CPTs. If one is
+                * found then our job is done.
+                */
+               list_for_each_entry(ni, &net->net_ni_list, ni_netlist) {
+                       if (ni->ni_ncpts == LNET_CPT_NUMBER)
+                               return;
+               }
+       }
+
+       /*
+        * Rebuild the Net CPT list again, thereby only including only the
+        * CPTs which the remaining NIs are associated with.
+        */
+       if (net->net_cpts != NULL) {
+               LIBCFS_FREE(net->net_cpts,
+                       sizeof(*net->net_cpts) * net->net_ncpts);
+               net->net_cpts = NULL;
+       }
+
+       list_for_each_entry(ni, &net->net_ni_list, ni_netlist) {
+               rc = lnet_net_append_cpts(ni->ni_cpts, ni->ni_ncpts,
+                                         net);
+               if (rc != 0) {
+                       CERROR("Out of Memory\n");
+                       /*
+                        * do our best to keep on going. Delete
+                        * the net cpts and set it to NULL. This
+                        * way we can keep on going but less
+                        * efficiently, since memory accesses might be
+                        * across CPT lines.
+                        */
+                       if (net->net_cpts != NULL) {
+                               LIBCFS_FREE(net->net_cpts,
+                                               sizeof(*net->net_cpts) *
+                                               net->net_ncpts);
+                               net->net_cpts = NULL;
+                               net->net_ncpts = LNET_CPT_NUMBER;
+                       }
+                       return;
+               }
+       }
 }
 
 void
@@ -97,6 +296,8 @@ lnet_ni_free(struct lnet_ni *ni)
 {
        int i;
 
+       lnet_net_remove_cpts(ni->ni_cpts, ni->ni_ncpts, ni->ni_net);
+
        if (ni->ni_refs != NULL)
                cfs_percpt_free(ni->ni_refs);
 
@@ -106,9 +307,6 @@ lnet_ni_free(struct lnet_ni *ni)
        if (ni->ni_cpts != NULL)
                cfs_expr_list_values_free(ni->ni_cpts, ni->ni_ncpts);
 
-       if (ni->ni_lnd_tunables != NULL)
-               LIBCFS_FREE(ni->ni_lnd_tunables, sizeof(*ni->ni_lnd_tunables));
-
        for (i = 0; i < LNET_MAX_INTERFACES &&
                    ni->ni_interfaces[i] != NULL; i++) {
                LIBCFS_FREE(ni->ni_interfaces[i],
@@ -122,29 +320,142 @@ lnet_ni_free(struct lnet_ni *ni)
        LIBCFS_FREE(ni, sizeof(*ni));
 }
 
-lnet_ni_t *
-lnet_ni_alloc(__u32 net, struct cfs_expr_list *el, struct list_head *nilist)
+void
+lnet_net_free(struct lnet_net *net)
+{
+       struct list_head *tmp, *tmp2;
+       struct lnet_ni *ni;
+
+       LASSERT(list_empty(&net->net_ni_zombie));
+
+       /*
+        * delete any nis that haven't been added yet. This could happen
+        * if there is a failure on net startup
+        */
+       list_for_each_safe(tmp, tmp2, &net->net_ni_added) {
+               ni = list_entry(tmp, struct lnet_ni, ni_netlist);
+               list_del_init(&ni->ni_netlist);
+               lnet_ni_free(ni);
+       }
+
+       /* delete any nis which have been started. */
+       list_for_each_safe(tmp, tmp2, &net->net_ni_list) {
+               ni = list_entry(tmp, struct lnet_ni, ni_netlist);
+               list_del_init(&ni->ni_netlist);
+               lnet_ni_free(ni);
+       }
+
+       if (net->net_cpts != NULL)
+               LIBCFS_FREE(net->net_cpts,
+                           sizeof(*net->net_cpts) * net->net_ncpts);
+
+       LIBCFS_FREE(net, sizeof(*net));
+}
+
+struct lnet_net *
+lnet_net_alloc(__u32 net_id, struct list_head *net_list)
+{
+       struct lnet_net         *net;
+
+       if (!lnet_net_unique(net_id, net_list, NULL)) {
+               CERROR("Duplicate net %s. Ignore\n",
+                      libcfs_net2str(net_id));
+               return NULL;
+       }
+
+       LIBCFS_ALLOC(net, sizeof(*net));
+       if (net == NULL) {
+               CERROR("Out of memory creating network %s\n",
+                      libcfs_net2str(net_id));
+               return NULL;
+       }
+
+       INIT_LIST_HEAD(&net->net_list);
+       INIT_LIST_HEAD(&net->net_ni_list);
+       INIT_LIST_HEAD(&net->net_ni_added);
+       INIT_LIST_HEAD(&net->net_ni_zombie);
+
+       net->net_id = net_id;
+       net->net_state = LNET_NET_STATE_INIT;
+
+       /* initialize global parameters to undefined */
+       net->net_tunables.lct_peer_timeout = -1;
+       net->net_tunables.lct_max_tx_credits = -1;
+       net->net_tunables.lct_peer_tx_credits = -1;
+       net->net_tunables.lct_peer_rtr_credits = -1;
+
+       if (net_list)
+               list_add_tail(&net->net_list, net_list);
+
+       return net;
+}
+
+static int
+lnet_ni_add_interface(struct lnet_ni *ni, char *iface)
+{
+       int niface = 0;
+
+       if (ni == NULL)
+               return -ENOMEM;
+
+       if (!lnet_ni_unique_ni(ni->ni_interfaces, iface))
+               return -EINVAL;
+
+       /* Allocate a separate piece of memory and copy
+        * into it the string, so we don't have
+        * a dependency on the tokens string.  This way we
+        * can free the tokens at the end of the function.
+        * The newly allocated ni_interfaces[] can be
+        * freed when freeing the NI */
+       while (niface < LNET_MAX_INTERFACES &&
+              ni->ni_interfaces[niface] != NULL)
+               niface++;
+
+       if (niface >= LNET_MAX_INTERFACES) {
+               LCONSOLE_ERROR_MSG(0x115, "Too many interfaces "
+                                  "for net %s\n",
+                                  libcfs_net2str(LNET_NIDNET(ni->ni_nid)));
+               return -EINVAL;
+       }
+
+       LIBCFS_ALLOC(ni->ni_interfaces[niface],
+                    strlen(iface) + 1);
+
+       if (ni->ni_interfaces[niface] == NULL) {
+               CERROR("Can't allocate net interface name\n");
+               return -ENOMEM;
+       }
+
+       strncpy(ni->ni_interfaces[niface], iface,
+               strlen(iface) + 1);
+
+       return 0;
+}
+
+static struct lnet_ni *
+lnet_ni_alloc_common(struct lnet_net *net, char *iface)
 {
        struct lnet_tx_queue    *tq;
        struct lnet_ni          *ni;
-       int                     rc;
        int                     i;
 
-       if (!lnet_net_unique(net, nilist)) {
-               LCONSOLE_ERROR_MSG(0x111, "Duplicate network specified: %s\n",
-                                  libcfs_net2str(net));
-               return NULL;
-       }
+       if (iface != NULL)
+               /* make sure that this NI is unique in the net it's
+                * being added to */
+               if (!lnet_ni_unique_net(&net->net_ni_added, iface))
+                       return NULL;
 
        LIBCFS_ALLOC(ni, sizeof(*ni));
        if (ni == NULL) {
-               CERROR("Out of memory creating network %s\n",
-                      libcfs_net2str(net));
+               CERROR("Out of memory creating network interface %s%s\n",
+                      libcfs_net2str(net->net_id),
+                      (iface != NULL) ? iface : "");
                return NULL;
        }
 
        spin_lock_init(&ni->ni_lock);
        INIT_LIST_HEAD(&ni->ni_cptlist);
+       INIT_LIST_HEAD(&ni->ni_netlist);
        ni->ni_refs = cfs_percpt_alloc(lnet_cpt_table(),
                                       sizeof(*ni->ni_refs[0]));
        if (ni->ni_refs == NULL)
@@ -158,14 +469,54 @@ lnet_ni_alloc(__u32 net, struct cfs_expr_list *el, struct list_head *nilist)
        cfs_percpt_for_each(tq, i, ni->ni_tx_queues)
                INIT_LIST_HEAD(&tq->tq_delayed);
 
-       if (el == NULL) {
+       ni->ni_net = net;
+       /* LND will fill in the address part of the NID */
+       ni->ni_nid = LNET_MKNID(net->net_id, 0);
+
+       /* Store net namespace in which current ni is being created */
+       if (current->nsproxy->net_ns != NULL)
+               ni->ni_net_ns = get_net(current->nsproxy->net_ns);
+       else
+               ni->ni_net_ns = NULL;
+
+       ni->ni_last_alive = ktime_get_real_seconds();
+       ni->ni_state = LNET_NI_STATE_INIT;
+       list_add_tail(&ni->ni_netlist, &net->net_ni_added);
+
+       /*
+        * if an interface name is provided then make sure to add in that
+        * interface name in NI
+        */
+       if (iface)
+               if (lnet_ni_add_interface(ni, iface) != 0)
+                       goto failed;
+
+       return ni;
+failed:
+       lnet_ni_free(ni);
+       return NULL;
+}
+
+/* allocate and add to the provided network */
+struct lnet_ni *
+lnet_ni_alloc(struct lnet_net *net, struct cfs_expr_list *el, char *iface)
+{
+       struct lnet_ni          *ni;
+       int                     rc;
+
+       ni = lnet_ni_alloc_common(net, iface);
+       if (!ni)
+               return NULL;
+
+       if (!el) {
                ni->ni_cpts  = NULL;
                ni->ni_ncpts = LNET_CPT_NUMBER;
        } else {
                rc = cfs_expr_list_values(el, LNET_CPT_NUMBER, &ni->ni_cpts);
                if (rc <= 0) {
-                       CERROR("Failed to set CPTs for NI %s: %d\n",
-                              libcfs_net2str(net), rc);
+                       CERROR("Failed to set CPTs for NI %s(%s): %d\n",
+                              libcfs_net2str(net->net_id),
+                              (iface != NULL) ? iface : "", rc);
                        goto failed;
                }
 
@@ -178,35 +529,66 @@ lnet_ni_alloc(__u32 net, struct cfs_expr_list *el, struct list_head *nilist)
                ni->ni_ncpts = rc;
        }
 
-       /* LND will fill in the address part of the NID */
-       ni->ni_nid = LNET_MKNID(net, 0);
+       rc = lnet_net_append_cpts(ni->ni_cpts, ni->ni_ncpts, net);
+       if (rc != 0)
+               goto failed;
 
-       /* Store net namespace in which current ni is being created */
-       if (current->nsproxy->net_ns != NULL)
-               ni->ni_net_ns = get_net(current->nsproxy->net_ns);
-       else
-               ni->ni_net_ns = NULL;
+       return ni;
+failed:
+       lnet_ni_free(ni);
+       return NULL;
+}
+
+struct lnet_ni *
+lnet_ni_alloc_w_cpt_array(struct lnet_net *net, __u32 *cpts, __u32 ncpts,
+                         char *iface)
+{
+       struct lnet_ni          *ni;
+       int                     rc;
+
+       ni = lnet_ni_alloc_common(net, iface);
+       if (!ni)
+               return NULL;
+
+       if (ncpts == 0) {
+               ni->ni_cpts  = NULL;
+               ni->ni_ncpts = LNET_CPT_NUMBER;
+       } else {
+               size_t array_size = ncpts * sizeof(ni->ni_cpts[0]);
+               LIBCFS_ALLOC(ni->ni_cpts, array_size);
+               if (ni->ni_cpts == NULL)
+                       goto failed;
+               memcpy(ni->ni_cpts, cpts, array_size);
+               ni->ni_ncpts = ncpts;
+       }
+
+       rc = lnet_net_append_cpts(ni->ni_cpts, ni->ni_ncpts, net);
+       if (rc != 0)
+               goto failed;
 
-       ni->ni_last_alive = ktime_get_real_seconds();
-       list_add_tail(&ni->ni_list, nilist);
        return ni;
- failed:
+failed:
        lnet_ni_free(ni);
        return NULL;
 }
 
+/*
+ * Parse the networks string and create the matching set of NIs on the
+ * nilist.
+ */
 int
-lnet_parse_networks(struct list_head *nilist, char *networks)
+lnet_parse_networks(struct list_head *netlist, char *networks,
+                   bool use_tcp_bonding)
 {
-       struct cfs_expr_list *el = NULL;
+       struct cfs_expr_list *net_el = NULL;
+       struct cfs_expr_list *ni_el = NULL;
        int             tokensize;
        char            *tokens;
        char            *str;
-       char            *tmp;
-       struct lnet_ni  *ni;
-       __u32           net;
+       struct lnet_net *net;
+       struct lnet_ni  *ni = NULL;
+       __u32           net_id;
        int             nnets = 0;
-       struct list_head *temp_node;
 
        if (networks == NULL) {
                CERROR("networks string is undefined\n");
@@ -229,173 +611,238 @@ lnet_parse_networks(struct list_head *nilist, char *networks)
        }
 
        memcpy(tokens, networks, tokensize);
-       str = tmp = tokens;
-
-       while (str != NULL && *str != 0) {
-               char    *comma = strchr(str, ',');
-               char    *bracket = strchr(str, '(');
-               char    *square = strchr(str, '[');
-               char    *iface;
-               int     niface;
-               int     rc;
-
-               /* NB we don't check interface conflicts here; it's the LNDs
-                * responsibility (if it cares at all) */
-
-               if (square != NULL && (comma == NULL || square < comma)) {
-                       /* i.e: o2ib0(ib0)[1,2], number between square
-                        * brackets are CPTs this NI needs to be bond */
-                       if (bracket != NULL && bracket > square) {
-                               tmp = square;
+       str = tokens;
+
+       /*
+        * Main parser loop.
+        *
+        * NB we don't check interface conflicts here; it's the LNDs
+        * responsibility (if it cares at all)
+        */
+       do {
+               char *nistr;
+               char *elstr;
+               char *name;
+               int rc;
+
+               /*
+                * Parse a network string into its components.
+                *
+                * <name>{"("...")"}{"["<el>"]"}
+                */
+
+               /* Network name (mandatory) */
+               while (isspace(*str))
+                       *str++ = '\0';
+               if (!*str)
+                       break;
+               name = str;
+               str += strcspn(str, SPACESTR ":()[],");
+               while (isspace(*str))
+                       *str++ = '\0';
+
+               /* Interface list (optional) */
+               if (*str == '(') {
+                       *str++ = '\0';
+                       nistr = str;
+                       str += strcspn(str, ")");
+                       if (*str != ')') {
+                               str = nistr;
                                goto failed_syntax;
                        }
+                       do {
+                               *str++ = '\0';
+                       } while (isspace(*str));
+               } else {
+                       nistr = NULL;
+               }
 
-                       tmp = strchr(square, ']');
-                       if (tmp == NULL) {
-                               tmp = square;
+               /* CPT expression (optional) */
+               if (*str == '[') {
+                       elstr = str;
+                       str += strcspn(str, "]");
+                       if (*str != ']') {
+                               str = elstr;
                                goto failed_syntax;
                        }
-
-                       rc = cfs_expr_list_parse(square, tmp - square + 1,
-                                                0, LNET_CPT_NUMBER - 1, &el);
+                       rc = cfs_expr_list_parse(elstr, str - elstr + 1,
+                                               0, LNET_CPT_NUMBER - 1,
+                                               &net_el);
                        if (rc != 0) {
-                               tmp = square;
+                               str = elstr;
                                goto failed_syntax;
                        }
-
-                       while (square <= tmp)
-                               *square++ = ' ';
+                       *elstr = '\0';
+                       do {
+                               *str++ = '\0';
+                       } while (isspace(*str));
                }
 
-               if (bracket == NULL ||
-                   (comma != NULL && comma < bracket)) {
-
-                       /* no interface list specified */
-
-                       if (comma != NULL)
-                               *comma++ = 0;
-                       net = libcfs_str2net(cfs_trimwhite(str));
-
-                       if (net == LNET_NIDNET(LNET_NID_ANY)) {
-                               LCONSOLE_ERROR_MSG(0x113, "Unrecognised network"
-                                                  " type\n");
-                               tmp = str;
-                               goto failed_syntax;
-                       }
+               /* Bad delimiters */
+               if (*str && (strchr(DELIMITERS, *str) != NULL))
+                       goto failed_syntax;
 
-                       if (LNET_NETTYP(net) != LOLND && /* LO is implicit */
-                           lnet_ni_alloc(net, el, nilist) == NULL)
-                               goto failed;
+               /* go to the next net if it exists */
+               str += strcspn(str, ",");
+               if (*str == ',')
+                       *str++ = '\0';
+
+               /*
+                * At this point the name is properly terminated.
+                */
+               net_id = libcfs_str2net(name);
+               if (net_id == LNET_NIDNET(LNET_NID_ANY)) {
+                       LCONSOLE_ERROR_MSG(0x113,
+                                       "Unrecognised network type\n");
+                       str = name;
+                       goto failed_syntax;
+               }
 
-                       if (el != NULL) {
-                               cfs_expr_list_free(el);
-                               el = NULL;
+               if (LNET_NETTYP(net_id) == LOLND) {
+                       /* Loopback is implicit, and there can be only one. */
+                       if (net_el) {
+                               cfs_expr_list_free(net_el);
+                               net_el = NULL;
                        }
-
-                       str = comma;
+                       /* Should we error out instead? */
                        continue;
                }
 
-               *bracket = 0;
-               net = libcfs_str2net(cfs_trimwhite(str));
-               if (net == LNET_NIDNET(LNET_NID_ANY)) {
-                       tmp = str;
-                       goto failed_syntax;
-               }
+               /*
+                * All network parameters are now known.
+                */
+               nnets++;
 
-               ni = lnet_ni_alloc(net, el, nilist);
-               if (ni == NULL)
+               /* always allocate a net, since we will eventually add an
+                * interface to it, or we will fail, in which case we'll
+                * just delete it */
+               net = lnet_net_alloc(net_id, netlist);
+               if (IS_ERR_OR_NULL(net))
                        goto failed;
 
-               if (el != NULL) {
-                       cfs_expr_list_free(el);
-                       el = NULL;
-               }
-
-               niface = 0;
-               iface = bracket + 1;
+               if (!nistr ||
+                   (use_tcp_bonding && LNET_NETTYP(net_id) == SOCKLND)) {
+                       /*
+                        * No interface list was specified, allocate a
+                        * ni using the defaults.
+                        */
+                       ni = lnet_ni_alloc(net, net_el, NULL);
+                       if (IS_ERR_OR_NULL(ni))
+                               goto failed;
 
-               bracket = strchr(iface, ')');
-               if (bracket == NULL) {
-                       tmp = iface;
-                       goto failed_syntax;
+                       if (!nistr) {
+                               if (net_el) {
+                                       cfs_expr_list_free(net_el);
+                                       net_el = NULL;
+                               }
+                               continue;
+                       }
                }
 
-               *bracket = 0;
                do {
-                       comma = strchr(iface, ',');
-                       if (comma != NULL)
-                               *comma++ = 0;
+                       elstr = NULL;
+
+                       /* Interface name (mandatory) */
+                       while (isspace(*nistr))
+                               *nistr++ = '\0';
+                       name = nistr;
+                       nistr += strcspn(nistr, SPACESTR "[],");
+                       while (isspace(*nistr))
+                               *nistr++ = '\0';
+
+                       /* CPT expression (optional) */
+                       if (*nistr == '[') {
+                               elstr = nistr;
+                               nistr += strcspn(nistr, "]");
+                               if (*nistr != ']') {
+                                       str = elstr;
+                                       goto failed_syntax;
+                               }
+                               rc = cfs_expr_list_parse(elstr,
+                                                       nistr - elstr + 1,
+                                                       0, LNET_CPT_NUMBER - 1,
+                                                       &ni_el);
+                               if (rc != 0) {
+                                       str = elstr;
+                                       goto failed_syntax;
+                               }
+                               *elstr = '\0';
+                               do {
+                                       *nistr++ = '\0';
+                               } while (isspace(*nistr));
+                       } else {
+                               ni_el = net_el;
+                       }
 
-                       iface = cfs_trimwhite(iface);
-                       if (*iface == 0) {
-                               tmp = iface;
+                       /*
+                        * End of single interface specification,
+                        * advance to the start of the next one, if
+                        * any.
+                        */
+                       if (*nistr == ',') {
+                               do {
+                                       *nistr++ = '\0';
+                               } while (isspace(*nistr));
+                               if (!*nistr) {
+                                       str = nistr;
+                                       goto failed_syntax;
+                               }
+                       } else if (*nistr) {
+                               str = nistr;
                                goto failed_syntax;
                        }
 
-                       if (niface == LNET_MAX_INTERFACES) {
-                               LCONSOLE_ERROR_MSG(0x115, "Too many interfaces "
-                                                  "for net %s\n",
-                                                  libcfs_net2str(net));
-                               goto failed;
+                       /*
+                        * At this point the name is properly terminated.
+                        */
+                       if (!*name) {
+                               str = name;
+                               goto failed_syntax;
                        }
 
-                       /* Allocate a separate piece of memory and copy
-                        * into it the string, so we don't have
-                        * a depencency on the tokens string.  This way we
-                        * can free the tokens at the end of the function.
-                        * The newly allocated ni_interfaces[] can be
-                        * freed when freeing the NI */
-                       LIBCFS_ALLOC(ni->ni_interfaces[niface],
-                                    strlen(iface) + 1);
-                       if (ni->ni_interfaces[niface] == NULL) {
-                               CERROR("Can't allocate net interface name\n");
-                               goto failed;
+                       if (use_tcp_bonding &&
+                           LNET_NETTYP(net->net_id) == SOCKLND) {
+                               rc = lnet_ni_add_interface(ni, name);
+                               if (rc != 0)
+                                       goto failed;
+                       } else {
+                               ni = lnet_ni_alloc(net, ni_el, name);
+                               if (IS_ERR_OR_NULL(ni))
+                                       goto failed;
                        }
-                       strncpy(ni->ni_interfaces[niface], iface,
-                               strlen(iface));
-                       niface++;
-                       iface = comma;
-               } while (iface != NULL);
-
-               str = bracket + 1;
-               comma = strchr(bracket + 1, ',');
-               if (comma != NULL) {
-                       *comma = 0;
-                       str = cfs_trimwhite(str);
-                       if (*str != 0) {
-                               tmp = str;
-                               goto failed_syntax;
+
+                       if (ni_el) {
+                               if (ni_el != net_el) {
+                                       cfs_expr_list_free(ni_el);
+                                       ni_el = NULL;
+                               }
                        }
-                       str = comma + 1;
-                       continue;
-               }
+               } while (*nistr);
 
-               str = cfs_trimwhite(str);
-               if (*str != 0) {
-                       tmp = str;
-                       goto failed_syntax;
+               if (net_el) {
+                       cfs_expr_list_free(net_el);
+                       net_el = NULL;
                }
-       }
-
-       list_for_each(temp_node, nilist)
-               nnets++;
+       } while (*str);
 
        LIBCFS_FREE(tokens, tokensize);
        return nnets;
 
  failed_syntax:
-       lnet_syntax("networks", networks, (int)(tmp - tokens), strlen(tmp));
+       lnet_syntax("networks", networks, (int)(str - tokens), strlen(str));
  failed:
-       while (!list_empty(nilist)) {
-               ni = list_entry(nilist->next, lnet_ni_t, ni_list);
+       /* free the net list and all the nis on each net */
+       while (!list_empty(netlist)) {
+               net = list_entry(netlist->next, struct lnet_net, net_list);
 
-               list_del(&ni->ni_list);
-               lnet_ni_free(ni);
+               list_del_init(&net->net_list);
+               lnet_net_free(net);
        }
 
-       if (el != NULL)
-               cfs_expr_list_free(el);
+       if (ni_el && ni_el != net_el)
+               cfs_expr_list_free(ni_el);
+       if (net_el)
+               cfs_expr_list_free(net_el);
 
        LIBCFS_FREE(tokens, tokensize);
 
index 6732562..2236980 100644 (file)
@@ -80,6 +80,34 @@ lnet_md_unlink(lnet_libmd_t *md)
        lnet_md_free(md);
 }
 
+int
+lnet_cpt_of_md(lnet_libmd_t *md)
+{
+       int cpt = CFS_CPT_ANY;
+
+       if (!md)
+               return CFS_CPT_ANY;
+
+       if ((md->md_options & LNET_MD_BULK_HANDLE) != 0 &&
+           !LNetHandleIsInvalid(md->md_bulk_handle)) {
+               md = lnet_handle2md(&md->md_bulk_handle);
+
+               if (!md)
+                       return CFS_CPT_ANY;
+       }
+
+       if ((md->md_options & LNET_MD_KIOV) != 0) {
+               if (md->md_iov.kiov[0].kiov_page != NULL)
+                       cpt = cfs_cpt_of_node(lnet_cpt_table(),
+                               page_to_nid(md->md_iov.kiov[0].kiov_page));
+       } else if (md->md_iov.iov[0].iov_base != NULL) {
+               cpt = cfs_cpt_of_node(lnet_cpt_table(),
+                       page_to_nid(virt_to_page(md->md_iov.iov[0].iov_base)));
+       }
+
+       return cpt;
+}
+
 static int
 lnet_md_build(lnet_libmd_t *lmd, lnet_md_t *umd, int unlink)
 {
@@ -97,6 +125,7 @@ lnet_md_build(lnet_libmd_t *lmd, lnet_md_t *umd, int unlink)
        lmd->md_threshold = umd->threshold;
        lmd->md_refcount = 0;
        lmd->md_flags = (unlink == LNET_UNLINK) ? LNET_MD_FLAG_AUTO_UNLINK : 0;
+       lmd->md_bulk_handle = umd->bulk_handle;
 
        if ((umd->options & LNET_MD_IOVEC) != 0) {
 
index d93d061..e9dd9b3 100644 (file)
@@ -584,13 +584,14 @@ lnet_ni_recv(lnet_ni_t *ni, void *private, lnet_msg_t *msg, int delayed,
                        iov  = msg->msg_iov;
                        kiov = msg->msg_kiov;
 
-                       LASSERT(niov > 0);
-                       LASSERT((iov == NULL) != (kiov == NULL));
+                       LASSERT (niov > 0);
+                       LASSERT ((iov == NULL) != (kiov == NULL));
                }
        }
 
-       rc = (ni->ni_lnd->lnd_recv)(ni, private, msg, delayed,
-                                   niov, iov, kiov, offset, mlen, rlen);
+       rc = (ni->ni_net->net_lnd->lnd_recv)(ni, private, msg, delayed,
+                                            niov, iov, kiov, offset, mlen,
+                                            rlen);
        if (rc < 0)
                lnet_finalize(ni, msg, rc);
 }
@@ -626,12 +627,11 @@ lnet_prep_send(lnet_msg_t *msg, int type, lnet_process_id_t target,
        if (len != 0)
                lnet_setpayloadbuffer(msg);
 
-       memset(&msg->msg_hdr, 0, sizeof(msg->msg_hdr));
-       msg->msg_hdr.type           = cpu_to_le32(type);
-       msg->msg_hdr.dest_nid       = cpu_to_le64(target.nid);
-       msg->msg_hdr.dest_pid       = cpu_to_le32(target.pid);
+       memset (&msg->msg_hdr, 0, sizeof (msg->msg_hdr));
+       msg->msg_hdr.type           = cpu_to_le32(type);
+       msg->msg_hdr.dest_pid       = cpu_to_le32(target.pid);
        /* src_nid will be set later */
-       msg->msg_hdr.src_pid        = cpu_to_le32(the_lnet.ln_pid);
+       msg->msg_hdr.src_pid        = cpu_to_le32(the_lnet.ln_pid);
        msg->msg_hdr.payload_length = cpu_to_le32(len);
 }
 
@@ -645,7 +645,7 @@ lnet_ni_send(lnet_ni_t *ni, lnet_msg_t *msg)
        LASSERT (LNET_NETTYP(LNET_NIDNET(ni->ni_nid)) == LOLND ||
                 (msg->msg_txcredit && msg->msg_peertxcredit));
 
-       rc = (ni->ni_lnd->lnd_send)(ni, priv, msg);
+       rc = (ni->ni_net->net_lnd->lnd_send)(ni, priv, msg);
        if (rc < 0)
                lnet_finalize(ni, msg, rc);
 }
@@ -658,15 +658,15 @@ lnet_ni_eager_recv(lnet_ni_t *ni, lnet_msg_t *msg)
        LASSERT(!msg->msg_sending);
        LASSERT(msg->msg_receiving);
        LASSERT(!msg->msg_rx_ready_delay);
-       LASSERT(ni->ni_lnd->lnd_eager_recv != NULL);
+       LASSERT(ni->ni_net->net_lnd->lnd_eager_recv != NULL);
 
        msg->msg_rx_ready_delay = 1;
-       rc = (ni->ni_lnd->lnd_eager_recv)(ni, msg->msg_private, msg,
-                                         &msg->msg_private);
+       rc = (ni->ni_net->net_lnd->lnd_eager_recv)(ni, msg->msg_private, msg,
+                                                 &msg->msg_private);
        if (rc != 0) {
                CERROR("recv from %s / send to %s aborted: "
                       "eager_recv failed %d\n",
-                      libcfs_nid2str(msg->msg_rxpeer->lp_nid),
+                      libcfs_nid2str(msg->msg_rxpeer->lpni_nid),
                       libcfs_id2str(msg->msg_target), rc);
                LASSERT(rc < 0); /* required by my callers */
        }
@@ -674,52 +674,71 @@ lnet_ni_eager_recv(lnet_ni_t *ni, lnet_msg_t *msg)
        return rc;
 }
 
-/* NB: caller shall hold a ref on 'lp' as I'd drop lnet_net_lock */
+/*
+ * This function can be called from two paths:
+ *     1. when sending a message
+ *     2. when decommitting a message (lnet_msg_decommit_tx())
+ * In both these cases the peer_ni should have its reference count
+ * acquired by the caller and therefore it is safe to drop the spin
+ * lock before calling lnd_query()
+ */
 static void
-lnet_ni_query_locked(lnet_ni_t *ni, lnet_peer_t *lp)
+lnet_ni_query_locked(lnet_ni_t *ni, struct lnet_peer_ni *lp)
 {
        cfs_time_t last_alive = 0;
+       int cpt = lnet_cpt_of_nid_locked(lp->lpni_nid, ni);
 
        LASSERT(lnet_peer_aliveness_enabled(lp));
-       LASSERT(ni->ni_lnd->lnd_query != NULL);
+       LASSERT(ni->ni_net->net_lnd->lnd_query != NULL);
 
-       lnet_net_unlock(lp->lp_cpt);
-       (ni->ni_lnd->lnd_query)(ni, lp->lp_nid, &last_alive);
-       lnet_net_lock(lp->lp_cpt);
+       lnet_net_unlock(cpt);
+       (ni->ni_net->net_lnd->lnd_query)(ni, lp->lpni_nid, &last_alive);
+       lnet_net_lock(cpt);
 
-       lp->lp_last_query = cfs_time_current();
+       lp->lpni_last_query = cfs_time_current();
 
        if (last_alive != 0) /* NI has updated timestamp */
-               lp->lp_last_alive = last_alive;
+               lp->lpni_last_alive = last_alive;
 }
 
 /* NB: always called with lnet_net_lock held */
 static inline int
-lnet_peer_is_alive (lnet_peer_t *lp, cfs_time_t now)
+lnet_peer_is_alive (struct lnet_peer_ni *lp, cfs_time_t now)
 {
-       int        alive;
+       int        alive;
        cfs_time_t deadline;
 
-       LASSERT(lnet_peer_aliveness_enabled(lp));
+       LASSERT (lnet_peer_aliveness_enabled(lp));
 
-       /* Trust lnet_notify() if it has more recent aliveness news, but
+       /*
+        * Trust lnet_notify() if it has more recent aliveness news, but
         * ignore the initial assumed death (see lnet_peers_start_down()).
         */
-       if (!lp->lp_alive && lp->lp_alive_count > 0 &&
-           cfs_time_aftereq(lp->lp_timestamp, lp->lp_last_alive))
+       spin_lock(&lp->lpni_lock);
+       if (!lp->lpni_alive && lp->lpni_alive_count > 0 &&
+           cfs_time_aftereq(lp->lpni_timestamp, lp->lpni_last_alive)) {
+               spin_unlock(&lp->lpni_lock);
                return 0;
+       }
 
-       deadline = cfs_time_add(lp->lp_last_alive,
-                               cfs_time_seconds(lp->lp_ni->ni_peertimeout));
+       deadline =
+         cfs_time_add(lp->lpni_last_alive,
+                      cfs_time_seconds(lp->lpni_net->net_tunables.
+                                       lct_peer_timeout));
        alive = cfs_time_after(deadline, now);
 
-       /* Update obsolete lp_alive except for routers assumed to be dead
+       /*
+        * Update obsolete lp_alive except for routers assumed to be dead
         * initially, because router checker would update aliveness in this
-        * case, and moreover lp_last_alive at peer creation is assumed.
+        * case, and moreover lpni_last_alive at peer creation is assumed.
         */
-       if (alive && !lp->lp_alive &&
-           !(lnet_isrouter(lp) && lp->lp_alive_count == 0))
-               lnet_notify_locked(lp, 0, 1, lp->lp_last_alive);
+       if (alive && !lp->lpni_alive &&
+           !(lnet_isrouter(lp) && lp->lpni_alive_count == 0)) {
+               spin_unlock(&lp->lpni_lock);
+               lnet_notify_locked(lp, 0, 1, lp->lpni_last_alive);
+       } else {
+               spin_unlock(&lp->lpni_lock);
+       }
 
        return alive;
 }
@@ -728,7 +747,7 @@ lnet_peer_is_alive (lnet_peer_t *lp, cfs_time_t now)
 /* NB: returns 1 when alive, 0 when dead, negative when error;
  *     may drop the lnet_net_lock */
 static int
-lnet_peer_alive_locked (lnet_peer_t *lp)
+lnet_peer_alive_locked (struct lnet_ni *ni, struct lnet_peer_ni *lp)
 {
        cfs_time_t now = cfs_time_current();
 
@@ -738,34 +757,36 @@ lnet_peer_alive_locked (lnet_peer_t *lp)
        if (lnet_peer_is_alive(lp, now))
                return 1;
 
-       /* Peer appears dead, but we should avoid frequent NI queries (at
-        * most once per lnet_queryinterval seconds). */
-       if (lp->lp_last_query != 0) {
+       /*
+        * Peer appears dead, but we should avoid frequent NI queries (at
+        * most once per lnet_queryinterval seconds).
+        */
+       if (lp->lpni_last_query != 0) {
                static const int lnet_queryinterval = 1;
 
                cfs_time_t next_query =
-                          cfs_time_add(lp->lp_last_query,
+                          cfs_time_add(lp->lpni_last_query,
                                        cfs_time_seconds(lnet_queryinterval));
 
                if (cfs_time_before(now, next_query)) {
-                       if (lp->lp_alive)
+                       if (lp->lpni_alive)
                                CWARN("Unexpected aliveness of peer %s: "
                                      "%d < %d (%d/%d)\n",
-                                     libcfs_nid2str(lp->lp_nid),
+                                     libcfs_nid2str(lp->lpni_nid),
                                      (int)now, (int)next_query,
                                      lnet_queryinterval,
-                                     lp->lp_ni->ni_peertimeout);
+                                     lp->lpni_net->net_tunables.lct_peer_timeout);
                        return 0;
                }
        }
 
        /* query NI for latest aliveness news */
-       lnet_ni_query_locked(lp->lp_ni, lp);
+       lnet_ni_query_locked(ni, lp);
 
        if (lnet_peer_is_alive(lp, now))
                return 1;
 
-       lnet_notify_locked(lp, 0, 0, lp->lp_last_alive);
+       lnet_notify_locked(lp, 0, 0, lp->lpni_last_alive);
        return 0;
 }
 
@@ -783,8 +804,8 @@ lnet_peer_alive_locked (lnet_peer_t *lp)
 static int
 lnet_post_send_locked(lnet_msg_t *msg, int do_send)
 {
-       lnet_peer_t             *lp = msg->msg_txpeer;
-       lnet_ni_t               *ni = lp->lp_ni;
+       struct lnet_peer_ni     *lp = msg->msg_txpeer;
+       struct lnet_ni          *ni = msg->msg_txni;
        int                     cpt = msg->msg_tx_cpt;
        struct lnet_tx_queue    *tq = ni->ni_tx_queues[cpt];
 
@@ -795,10 +816,14 @@ lnet_post_send_locked(lnet_msg_t *msg, int do_send)
 
        /* NB 'lp' is always the next hop */
        if ((msg->msg_target.pid & LNET_PID_USERFLAG) == 0 &&
-           lnet_peer_alive_locked(lp) == 0) {
+           lnet_peer_alive_locked(ni, lp) == 0) {
                the_lnet.ln_counters[cpt]->drop_count++;
                the_lnet.ln_counters[cpt]->drop_length += msg->msg_len;
                lnet_net_unlock(cpt);
+               if (msg->msg_txpeer)
+                       atomic_inc(&msg->msg_txpeer->lpni_stats.drop_count);
+               if (msg->msg_txni)
+                       atomic_inc(&msg->msg_txni->ni_stats.drop_count);
 
                CNETERR("Dropping message for %s: peer not alive\n",
                        libcfs_id2str(msg->msg_target));
@@ -824,21 +849,24 @@ lnet_post_send_locked(lnet_msg_t *msg, int do_send)
        }
 
        if (!msg->msg_peertxcredit) {
-               LASSERT((lp->lp_txcredits < 0) ==
-                       !list_empty(&lp->lp_txq));
+               spin_lock(&lp->lpni_lock);
+               LASSERT((lp->lpni_txcredits < 0) ==
+                       !list_empty(&lp->lpni_txq));
 
                msg->msg_peertxcredit = 1;
-               lp->lp_txqnob += msg->msg_len + sizeof(lnet_hdr_t);
-               lp->lp_txcredits--;
+               lp->lpni_txqnob += msg->msg_len + sizeof(lnet_hdr_t);
+               lp->lpni_txcredits--;
 
-               if (lp->lp_txcredits < lp->lp_mintxcredits)
-                       lp->lp_mintxcredits = lp->lp_txcredits;
+               if (lp->lpni_txcredits < lp->lpni_mintxcredits)
+                       lp->lpni_mintxcredits = lp->lpni_txcredits;
 
-               if (lp->lp_txcredits < 0) {
+               if (lp->lpni_txcredits < 0) {
                        msg->msg_tx_delayed = 1;
-                       list_add_tail(&msg->msg_list, &lp->lp_txq);
+                       list_add_tail(&msg->msg_list, &lp->lpni_txq);
+                       spin_unlock(&lp->lpni_lock);
                        return LNET_CREDIT_WAIT;
                }
+               spin_unlock(&lp->lpni_lock);
        }
 
        if (!msg->msg_txcredit) {
@@ -847,6 +875,7 @@ lnet_post_send_locked(lnet_msg_t *msg, int do_send)
 
                msg->msg_txcredit = 1;
                tq->tq_credits--;
+               atomic_dec(&ni->ni_tx_credits);
 
                if (tq->tq_credits < tq->tq_credits_min)
                        tq->tq_credits_min = tq->tq_credits;
@@ -894,36 +923,39 @@ lnet_post_routed_recv_locked (lnet_msg_t *msg, int do_recv)
         * sets do_recv FALSE and I don't do the unlock/send/lock bit.
         * I return LNET_CREDIT_WAIT if msg blocked and LNET_CREDIT_OK if
         * received or OK to receive */
-       lnet_peer_t         *lp = msg->msg_rxpeer;
+       struct lnet_peer_ni *lp = msg->msg_rxpeer;
        lnet_rtrbufpool_t   *rbp;
-       lnet_rtrbuf_t       *rb;
+       lnet_rtrbuf_t       *rb;
 
-       LASSERT(msg->msg_iov == NULL);
-       LASSERT(msg->msg_kiov == NULL);
-       LASSERT(msg->msg_niov == 0);
-       LASSERT(msg->msg_routing);
-       LASSERT(msg->msg_receiving);
-       LASSERT(!msg->msg_sending);
+       LASSERT (msg->msg_iov == NULL);
+       LASSERT (msg->msg_kiov == NULL);
+       LASSERT (msg->msg_niov == 0);
+       LASSERT (msg->msg_routing);
+       LASSERT (msg->msg_receiving);
+       LASSERT (!msg->msg_sending);
 
        /* non-lnet_parse callers only receive delayed messages */
        LASSERT(!do_recv || msg->msg_rx_delayed);
 
        if (!msg->msg_peerrtrcredit) {
-               LASSERT((lp->lp_rtrcredits < 0) ==
-                       !list_empty(&lp->lp_rtrq));
+               spin_lock(&lp->lpni_lock);
+               LASSERT((lp->lpni_rtrcredits < 0) ==
+                       !list_empty(&lp->lpni_rtrq));
 
                msg->msg_peerrtrcredit = 1;
-               lp->lp_rtrcredits--;
-               if (lp->lp_rtrcredits < lp->lp_minrtrcredits)
-                       lp->lp_minrtrcredits = lp->lp_rtrcredits;
+               lp->lpni_rtrcredits--;
+               if (lp->lpni_rtrcredits < lp->lpni_minrtrcredits)
+                       lp->lpni_minrtrcredits = lp->lpni_rtrcredits;
 
-               if (lp->lp_rtrcredits < 0) {
+               if (lp->lpni_rtrcredits < 0) {
                        /* must have checked eager_recv before here */
                        LASSERT(msg->msg_rx_ready_delay);
                        msg->msg_rx_delayed = 1;
-                       list_add_tail(&msg->msg_list, &lp->lp_rtrq);
+                       list_add_tail(&msg->msg_list, &lp->lpni_rtrq);
+                       spin_unlock(&lp->lpni_lock);
                        return LNET_CREDIT_WAIT;
                }
+               spin_unlock(&lp->lpni_lock);
        }
 
        rbp = lnet_msg2bufpool(msg);
@@ -954,7 +986,7 @@ lnet_post_routed_recv_locked (lnet_msg_t *msg, int do_recv)
                int cpt = msg->msg_rx_cpt;
 
                lnet_net_unlock(cpt);
-               lnet_ni_recv(lp->lp_ni, msg->msg_private, msg, 1,
+               lnet_ni_recv(msg->msg_rxni, msg->msg_private, msg, 1,
                             0, msg->msg_len, msg->msg_len);
                lnet_net_lock(cpt);
        }
@@ -964,11 +996,12 @@ lnet_post_routed_recv_locked (lnet_msg_t *msg, int do_recv)
 void
 lnet_return_tx_credits_locked(lnet_msg_t *msg)
 {
-       lnet_peer_t     *txpeer = msg->msg_txpeer;
-       lnet_msg_t      *msg2;
+       struct lnet_peer_ni     *txpeer = msg->msg_txpeer;
+       struct lnet_ni          *txni = msg->msg_txni;
+       lnet_msg_t              *msg2;
 
        if (msg->msg_txcredit) {
-               struct lnet_ni       *ni = txpeer->lp_ni;
+               struct lnet_ni       *ni = msg->msg_txni;
                struct lnet_tx_queue *tq = ni->ni_tx_queues[msg->msg_tx_cpt];
 
                /* give back NI txcredits */
@@ -978,13 +1011,15 @@ lnet_return_tx_credits_locked(lnet_msg_t *msg)
                        !list_empty(&tq->tq_delayed));
 
                tq->tq_credits++;
+               atomic_inc(&ni->ni_tx_credits);
                if (tq->tq_credits <= 0) {
                        msg2 = list_entry(tq->tq_delayed.next,
                                          lnet_msg_t, msg_list);
                        list_del(&msg2->msg_list);
 
-                       LASSERT(msg2->msg_txpeer->lp_ni == ni);
+                       LASSERT(msg2->msg_txni == ni);
                        LASSERT(msg2->msg_tx_delayed);
+                       LASSERT(msg2->msg_tx_cpt == msg->msg_tx_cpt);
 
                        (void) lnet_post_send_locked(msg2, 1);
                }
@@ -994,28 +1029,54 @@ lnet_return_tx_credits_locked(lnet_msg_t *msg)
                /* give back peer txcredits */
                msg->msg_peertxcredit = 0;
 
-               LASSERT((txpeer->lp_txcredits < 0) ==
-                       !list_empty(&txpeer->lp_txq));
+               spin_lock(&txpeer->lpni_lock);
+               LASSERT((txpeer->lpni_txcredits < 0) ==
+                       !list_empty(&txpeer->lpni_txq));
 
-               txpeer->lp_txqnob -= msg->msg_len + sizeof(lnet_hdr_t);
-               LASSERT(txpeer->lp_txqnob >= 0);
+               txpeer->lpni_txqnob -= msg->msg_len + sizeof(lnet_hdr_t);
+               LASSERT(txpeer->lpni_txqnob >= 0);
 
-               txpeer->lp_txcredits++;
-               if (txpeer->lp_txcredits <= 0) {
-                       msg2 = list_entry(txpeer->lp_txq.next,
-                                             lnet_msg_t, msg_list);
+               txpeer->lpni_txcredits++;
+               if (txpeer->lpni_txcredits <= 0) {
+                       msg2 = list_entry(txpeer->lpni_txq.next,
+                                              lnet_msg_t, msg_list);
                        list_del(&msg2->msg_list);
+                       spin_unlock(&txpeer->lpni_lock);
 
                        LASSERT(msg2->msg_txpeer == txpeer);
                        LASSERT(msg2->msg_tx_delayed);
 
-                       (void) lnet_post_send_locked(msg2, 1);
+                       if (msg2->msg_tx_cpt != msg->msg_tx_cpt) {
+                               lnet_net_unlock(msg->msg_tx_cpt);
+                               lnet_net_lock(msg2->msg_tx_cpt);
+                       }
+                       (void) lnet_post_send_locked(msg2, 1);
+                       if (msg2->msg_tx_cpt != msg->msg_tx_cpt) {
+                               lnet_net_unlock(msg2->msg_tx_cpt);
+                               lnet_net_lock(msg->msg_tx_cpt);
+                       }
+               } else {
+                       spin_unlock(&txpeer->lpni_lock);
                }
+       }
+
+       if (txni != NULL) {
+               msg->msg_txni = NULL;
+               lnet_ni_decref_locked(txni, msg->msg_tx_cpt);
        }
 
        if (txpeer != NULL) {
+               /*
+                * TODO:
+                * Once the patch for the health comes in we need to set
+                * the health of the peer ni to bad when we fail to send
+                * a message.
+                * int status = msg->msg_ev.status;
+                * if (status != 0)
+                *      lnet_set_peer_ni_health_locked(txpeer, false)
+                */
                msg->msg_txpeer = NULL;
-               lnet_peer_decref_locked(txpeer);
+               lnet_peer_ni_decref_locked(txpeer);
        }
 }
 
@@ -1036,18 +1097,13 @@ lnet_schedule_blocked_locked(lnet_rtrbufpool_t *rbp)
 void
 lnet_drop_routed_msgs_locked(struct list_head *list, int cpt)
 {
-       lnet_msg_t       *msg;
-       lnet_msg_t       *tmp;
-       struct list_head drop;
-
-       INIT_LIST_HEAD(&drop);
-
-       list_splice_init(list, &drop);
+       lnet_msg_t *msg;
+       lnet_msg_t *tmp;
 
        lnet_net_unlock(cpt);
 
-       list_for_each_entry_safe(msg, tmp, &drop, msg_list) {
-               lnet_ni_recv(msg->msg_rxpeer->lp_ni, msg->msg_private, NULL,
+       list_for_each_entry_safe(msg, tmp, list, msg_list) {
+               lnet_ni_recv(msg->msg_rxni, msg->msg_private, NULL,
                             0, 0, 0, msg->msg_hdr.payload_length);
                list_del_init(&msg->msg_list);
                lnet_finalize(NULL, msg, -ECANCELED);
@@ -1059,8 +1115,9 @@ lnet_drop_routed_msgs_locked(struct list_head *list, int cpt)
 void
 lnet_return_rx_credits_locked(lnet_msg_t *msg)
 {
-       lnet_peer_t     *rxpeer = msg->msg_rxpeer;
-       lnet_msg_t      *msg2;
+       struct lnet_peer_ni     *rxpeer = msg->msg_rxpeer;
+       struct lnet_ni          *rxni = msg->msg_rxni;
+       lnet_msg_t              *msg2;
 
        if (msg->msg_rtrcredit) {
                /* give back global router credits */
@@ -1111,87 +1168,109 @@ routing_off:
                /* give back peer router credits */
                msg->msg_peerrtrcredit = 0;
 
-               LASSERT((rxpeer->lp_rtrcredits < 0) ==
-                       !list_empty(&rxpeer->lp_rtrq));
+               spin_lock(&rxpeer->lpni_lock);
+               LASSERT((rxpeer->lpni_rtrcredits < 0) ==
+                       !list_empty(&rxpeer->lpni_rtrq));
 
-               rxpeer->lp_rtrcredits++;
+               rxpeer->lpni_rtrcredits++;
 
                /* drop all messages which are queued to be routed on that
                 * peer. */
                if (!the_lnet.ln_routing) {
-                       lnet_drop_routed_msgs_locked(&rxpeer->lp_rtrq,
-                                                    msg->msg_rx_cpt);
-               } else if (rxpeer->lp_rtrcredits <= 0) {
-                       msg2 = list_entry(rxpeer->lp_rtrq.next,
+                       struct list_head drop;
+                       INIT_LIST_HEAD(&drop);
+                       list_splice_init(&rxpeer->lpni_rtrq, &drop);
+                       spin_unlock(&rxpeer->lpni_lock);
+                       lnet_drop_routed_msgs_locked(&drop, msg->msg_rx_cpt);
+               } else if (rxpeer->lpni_rtrcredits <= 0) {
+                       msg2 = list_entry(rxpeer->lpni_rtrq.next,
                                          lnet_msg_t, msg_list);
                        list_del(&msg2->msg_list);
-
+                       spin_unlock(&rxpeer->lpni_lock);
                        (void) lnet_post_routed_recv_locked(msg2, 1);
+               } else {
+                       spin_unlock(&rxpeer->lpni_lock);
                }
        }
+       if (rxni != NULL) {
+               msg->msg_rxni = NULL;
+               lnet_ni_decref_locked(rxni, msg->msg_rx_cpt);
+       }
        if (rxpeer != NULL) {
                msg->msg_rxpeer = NULL;
-               lnet_peer_decref_locked(rxpeer);
+               lnet_peer_ni_decref_locked(rxpeer);
        }
 }
 
 static int
+lnet_compare_peers(struct lnet_peer_ni *p1, struct lnet_peer_ni *p2)
+{
+       if (p1->lpni_txqnob < p2->lpni_txqnob)
+               return 1;
+
+       if (p1->lpni_txqnob > p2->lpni_txqnob)
+               return -1;
+
+       if (p1->lpni_txcredits > p2->lpni_txcredits)
+               return 1;
+
+       if (p1->lpni_txcredits < p2->lpni_txcredits)
+               return -1;
+
+       return 0;
+}
+
+static int
 lnet_compare_routes(lnet_route_t *r1, lnet_route_t *r2)
 {
-       lnet_peer_t *p1 = r1->lr_gateway;
-       lnet_peer_t *p2 = r2->lr_gateway;
+       struct lnet_peer_ni *p1 = r1->lr_gateway;
+       struct lnet_peer_ni *p2 = r2->lr_gateway;
        int r1_hops = (r1->lr_hops == LNET_UNDEFINED_HOPS) ? 1 : r1->lr_hops;
        int r2_hops = (r2->lr_hops == LNET_UNDEFINED_HOPS) ? 1 : r2->lr_hops;
+       int rc;
 
        if (r1->lr_priority < r2->lr_priority)
                return 1;
 
        if (r1->lr_priority > r2->lr_priority)
-               return -ERANGE;
+               return -1;
 
        if (r1_hops < r2_hops)
                return 1;
 
        if (r1_hops > r2_hops)
-               return -ERANGE;
+               return -1;
 
-       if (p1->lp_txqnob < p2->lp_txqnob)
-               return 1;
-
-       if (p1->lp_txqnob > p2->lp_txqnob)
-               return -ERANGE;
-
-       if (p1->lp_txcredits > p2->lp_txcredits)
-               return 1;
-
-       if (p1->lp_txcredits < p2->lp_txcredits)
-               return -ERANGE;
+       rc = lnet_compare_peers(p1, p2);
+       if (rc)
+               return rc;
 
        if (r1->lr_seq - r2->lr_seq <= 0)
                return 1;
 
-       return -ERANGE;
+       return -1;
 }
 
-static lnet_peer_t *
-lnet_find_route_locked(lnet_ni_t *ni, lnet_nid_t target, lnet_nid_t rtr_nid)
+static struct lnet_peer_ni *
+lnet_find_route_locked(struct lnet_net *net, lnet_nid_t target,
+                      lnet_nid_t rtr_nid)
 {
        lnet_remotenet_t        *rnet;
        lnet_route_t            *route;
        lnet_route_t            *best_route;
        lnet_route_t            *last_route;
-       struct lnet_peer        *lp_best;
-       struct lnet_peer        *lp;
+       struct lnet_peer_ni     *lpni_best;
+       struct lnet_peer_ni     *lp;
        int                     rc;
 
        /* If @rtr_nid is not LNET_NID_ANY, return the gateway with
         * rtr_nid nid, otherwise find the best gateway I can use */
 
-       rnet = lnet_find_net_locked(LNET_NIDNET(target));
+       rnet = lnet_find_rnet_locked(LNET_NIDNET(target));
        if (rnet == NULL)
                return NULL;
 
-       lp_best = NULL;
+       lpni_best = NULL;
        best_route = last_route = NULL;
        list_for_each_entry(route, &rnet->lrn_routes, lr_list) {
                lp = route->lr_gateway;
@@ -1199,15 +1278,15 @@ lnet_find_route_locked(lnet_ni_t *ni, lnet_nid_t target, lnet_nid_t rtr_nid)
                if (!lnet_is_route_alive(route))
                        continue;
 
-               if (ni != NULL && lp->lp_ni != ni)
+               if (net != NULL && lp->lpni_net != net)
                        continue;
 
-               if (lp->lp_nid == rtr_nid) /* it's pre-determined router */
+               if (lp->lpni_nid == rtr_nid) /* it's pre-determined router */
                        return lp;
 
-               if (lp_best == NULL) {
+               if (lpni_best == NULL) {
                        best_route = last_route = route;
-                       lp_best = lp;
+                       lpni_best = lp;
                        continue;
                }
 
@@ -1220,7 +1299,7 @@ lnet_find_route_locked(lnet_ni_t *ni, lnet_nid_t target, lnet_nid_t rtr_nid)
                        continue;
 
                best_route = route;
-               lp_best = lp;
+               lpni_best = lp;
        }
 
        /* set sequence number on the best router to the latest sequence + 1
@@ -1228,179 +1307,635 @@ lnet_find_route_locked(lnet_ni_t *ni, lnet_nid_t target, lnet_nid_t rtr_nid)
         * harmless and functional  */
        if (best_route != NULL)
                best_route->lr_seq = last_route->lr_seq + 1;
-       return lp_best;
+       return lpni_best;
 }
 
-int
-lnet_send(lnet_nid_t src_nid, lnet_msg_t *msg, lnet_nid_t rtr_nid)
+static struct lnet_ni *
+lnet_get_best_ni(struct lnet_net *local_net, struct lnet_ni *cur_ni,
+                int md_cpt)
 {
-       lnet_nid_t              dst_nid = msg->msg_target.nid;
-       struct lnet_ni          *src_ni;
-       struct lnet_ni          *local_ni;
-       struct lnet_peer        *lp;
-       int                     cpt;
-       int                     cpt2;
-       int                     rc;
+       struct lnet_ni *ni = NULL, *best_ni = cur_ni;
+       unsigned int shortest_distance;
+       int best_credits;
 
-       /* NB: rtr_nid is set to LNET_NID_ANY for all current use-cases,
-        * but we might want to use pre-determined router for ACK/REPLY
-        * in the future */
-       /* NB: ni != NULL == interface pre-determined (ACK/REPLY) */
-       LASSERT(msg->msg_txpeer == NULL);
-       LASSERT(!msg->msg_sending);
-       LASSERT(!msg->msg_target_is_router);
-       LASSERT(!msg->msg_receiving);
+       if (best_ni == NULL) {
+               shortest_distance = UINT_MAX;
+               best_credits = INT_MIN;
+       } else {
+               shortest_distance = cfs_cpt_distance(lnet_cpt_table(), md_cpt,
+                                                    best_ni->ni_dev_cpt);
+               best_credits = atomic_read(&best_ni->ni_tx_credits);
+       }
 
-       msg->msg_sending = 1;
+       while ((ni = lnet_get_next_ni_locked(local_net, ni))) {
+               unsigned int distance;
+               int ni_credits;
 
-       LASSERT(!msg->msg_tx_committed);
-       cpt = lnet_cpt_of_nid(rtr_nid == LNET_NID_ANY ? dst_nid : rtr_nid);
- again:
-       lnet_net_lock(cpt);
+               if (!lnet_is_ni_healthy_locked(ni))
+                       continue;
+
+               ni_credits = atomic_read(&ni->ni_tx_credits);
+
+               /*
+                * calculate the distance from the CPT on which
+                * the message memory is allocated to the CPT of
+                * the NI's physical device
+                */
+               distance = cfs_cpt_distance(lnet_cpt_table(),
+                                           md_cpt,
+                                           ni->ni_dev_cpt);
+
+               /*
+                * All distances smaller than the NUMA range
+                * are treated equally.
+                */
+               if (distance < lnet_numa_range)
+                       distance = lnet_numa_range;
+
+               /*
+                * Select on shorter distance, then available
+                * credits, then round-robin.
+                */
+               if (distance > shortest_distance) {
+                       continue;
+               } else if (distance < shortest_distance) {
+                       shortest_distance = distance;
+               } else if (ni_credits < best_credits) {
+                       continue;
+               } else if (ni_credits == best_credits) {
+                       if (best_ni && (best_ni)->ni_seq <= ni->ni_seq)
+                               continue;
+               }
+               best_ni = ni;
+               best_credits = ni_credits;
+       }
+
+       return best_ni;
+}
+
+static int
+lnet_select_pathway(lnet_nid_t src_nid, lnet_nid_t dst_nid,
+                   struct lnet_msg *msg, lnet_nid_t rtr_nid)
+{
+       struct lnet_ni          *best_ni;
+       struct lnet_peer_ni     *best_lpni;
+       struct lnet_peer_ni     *best_gw;
+       struct lnet_peer_ni     *lpni;
+       struct lnet_peer_ni     *final_dst;
+       struct lnet_peer        *peer;
+       struct lnet_peer_net    *peer_net;
+       struct lnet_net         *local_net;
+       __u32                   seq;
+       int                     cpt, cpt2, rc;
+       bool                    routing;
+       bool                    routing2;
+       bool                    ni_is_pref;
+       bool                    preferred;
+       bool                    local_found;
+       int                     best_lpni_credits;
+       int                     md_cpt;
+
+       /*
+        * get an initial CPT to use for locking. The idea here is not to
+        * serialize the calls to select_pathway, so that as many
+        * operations can run concurrently as possible. To do that we use
+        * the CPT where this call is being executed. Later on when we
+        * determine the CPT to use in lnet_message_commit, we switch the
+        * lock and check if there was any configuration change.  If none,
+        * then we proceed, if there is, then we restart the operation.
+        */
+       cpt = lnet_net_lock_current();
+
+       md_cpt = lnet_cpt_of_md(msg->msg_md);
+       if (md_cpt == CFS_CPT_ANY)
+               md_cpt = cpt;
+
+again:
+       best_ni = NULL;
+       best_lpni = NULL;
+       best_gw = NULL;
+       final_dst = NULL;
+       local_net = NULL;
+       routing = false;
+       routing2 = false;
+       local_found = false;
+
+       seq = lnet_get_dlc_seq_locked();
 
        if (the_lnet.ln_shutdown) {
                lnet_net_unlock(cpt);
                return -ESHUTDOWN;
        }
 
-       if (src_nid == LNET_NID_ANY) {
-               src_ni = NULL;
-       } else {
-               src_ni = lnet_nid2ni_locked(src_nid, cpt);
-               if (src_ni == NULL) {
+       peer = lnet_find_or_create_peer_locked(dst_nid, cpt);
+       if (IS_ERR(peer)) {
+               lnet_net_unlock(cpt);
+               return PTR_ERR(peer);
+       }
+
+       /* If peer is not healthy then can not send anything to it */
+       if (!lnet_is_peer_healthy_locked(peer)) {
+               lnet_net_unlock(cpt);
+               return -EHOSTUNREACH;
+       }
+
+       if (!peer->lp_multi_rail && lnet_get_num_peer_nis(peer) > 1) {
+               CERROR("peer %s is declared to be non MR capable, yet "
+                      "configured with more than one NID\n", libcfs_nid2str(dst_nid));
+               /* drop the net lock taken by lnet_net_lock_current() above */
+               lnet_net_unlock(cpt);
+               return -EINVAL;
+       }
+
+       /*
+        * STEP 1: first jab at determining best_ni
+        * if src_nid is explicitly specified, then best_ni is already
+        * pre-determiend for us. Otherwise we need to select the best
+        * one to use later on
+        */
+       if (src_nid != LNET_NID_ANY) {
+               best_ni = lnet_nid2ni_locked(src_nid, cpt);
+               if (!best_ni) {
                        lnet_net_unlock(cpt);
                        LCONSOLE_WARN("Can't send to %s: src %s is not a "
                                      "local nid\n", libcfs_nid2str(dst_nid),
                                      libcfs_nid2str(src_nid));
                        return -EINVAL;
                }
-               LASSERT(!msg->msg_routing);
        }
 
-       /* Is this for someone on a local network? */
-       local_ni = lnet_net2ni_locked(LNET_NIDNET(dst_nid), cpt);
+       if (msg->msg_type == LNET_MSG_REPLY ||
+           msg->msg_type == LNET_MSG_ACK ||
+           !peer->lp_multi_rail ||
+           best_ni) {
+               /*
+                * for replies we want to respond on the same peer_ni we
+                * received the message on if possible. If not, then pick
+                * a peer_ni to send to
+                *
+                * if the peer is non-multi-rail then you want to send to
+                * the dst_nid provided as well.
+                *
+                * If the best_ni has already been determined, IE the
+                * src_nid has been specified, then use the
+                * destination_nid provided as well, since we're
+                * continuing a series of related messages for the same
+                * RPC.
+                *
+                * It is expected to find the lpni using dst_nid, since we
+                * created it earlier.
+                */
+               best_lpni = lnet_find_peer_ni_locked(dst_nid);
+               if (best_lpni)
+                       lnet_peer_ni_decref_locked(best_lpni);
+
+               if (best_lpni && !lnet_get_net_locked(LNET_NIDNET(dst_nid))) {
+                       /*
+                        * this lpni is not on a local network so we need
+                        * to route this reply.
+                        */
+                       best_gw = lnet_find_route_locked(NULL,
+                                                        best_lpni->lpni_nid,
+                                                        rtr_nid);
+                       if (best_gw) {
+                               /*
+                                * RULE: Each node considers only the next-hop
+                                *
+                                * We're going to route the message, so change the peer to
+                                * the router.
+                                */
+                               LASSERT(best_gw->lpni_peer_net);
+                               LASSERT(best_gw->lpni_peer_net->lpn_peer);
+                               peer = best_gw->lpni_peer_net->lpn_peer;
+
+                               /*
+                                * if the router is not multi-rail then use the best_gw
+                                * found to send the message to
+                                */
+                               if (!peer->lp_multi_rail)
+                                       best_lpni = best_gw;
+                               else
+                                       best_lpni = NULL;
 
-       if (local_ni != NULL) {
-               if (src_ni == NULL) {
-                       src_ni = local_ni;
-                       src_nid = src_ni->ni_nid;
-               } else if (src_ni == local_ni) {
-                       lnet_ni_decref_locked(local_ni, cpt);
-               } else {
-                       lnet_ni_decref_locked(local_ni, cpt);
-                       lnet_ni_decref_locked(src_ni, cpt);
+                               routing = true;
+                       } else {
+                               best_lpni = NULL;
+                       }
+               } else if (!best_lpni) {
                        lnet_net_unlock(cpt);
-                       LCONSOLE_WARN("No route to %s via from %s\n",
-                                     libcfs_nid2str(dst_nid),
-                                     libcfs_nid2str(src_nid));
+                       CERROR("unable to send msg_type %d to "
+                             "originating %s. Destination NID not in DB\n",
+                             msg->msg_type, libcfs_nid2str(dst_nid));
                        return -EINVAL;
                }
+       }
+
+       /*
+        * if the peer is not MR capable, then we should always send to it
+        * using the first NI in the NET we determined.
+        */
+       if (!peer->lp_multi_rail) {
+               if (!best_lpni) {
+                       lnet_net_unlock(cpt);
+                       CERROR("no route to %s\n",
+                              libcfs_nid2str(dst_nid));
+                       return -EHOSTUNREACH;
+               }
 
-               LASSERT(src_nid != LNET_NID_ANY);
-               lnet_msg_commit(msg, cpt);
+               /* best ni could be set because src_nid was provided */
+               if (!best_ni) {
+                       best_ni = lnet_net2ni_locked(best_lpni->lpni_net->net_id, cpt);
+                       if (!best_ni) {
+                               lnet_net_unlock(cpt);
+                               CERROR("no path to %s from net %s\n",
+                               libcfs_nid2str(best_lpni->lpni_nid),
+                               libcfs_net2str(best_lpni->lpni_net->net_id));
+                               return -EHOSTUNREACH;
+                       }
+               }
+       }
 
+       if (best_ni == the_lnet.ln_loni) {
+               /* No send credit hassles with LOLND */
+               lnet_ni_addref_locked(best_ni, cpt);
+               msg->msg_hdr.dest_nid = cpu_to_le64(best_ni->ni_nid);
                if (!msg->msg_routing)
-                       msg->msg_hdr.src_nid = cpu_to_le64(src_nid);
+                       msg->msg_hdr.src_nid = cpu_to_le64(best_ni->ni_nid);
+               msg->msg_target.nid = best_ni->ni_nid;
+               lnet_msg_commit(msg, cpt);
+               msg->msg_txni = best_ni;
+               lnet_net_unlock(cpt);
 
-               if (src_ni == the_lnet.ln_loni) {
-                       /* No send credit hassles with LOLND */
-                       lnet_net_unlock(cpt);
-                       lnet_ni_send(src_ni, msg);
+               return LNET_CREDIT_OK;
+       }
 
-                       lnet_net_lock(cpt);
-                       lnet_ni_decref_locked(src_ni, cpt);
-                       lnet_net_unlock(cpt);
-                       return 0;
-               }
+       /*
+        * if we already found a best_ni because src_nid is specified and
+        * best_lpni because we are replying to a message then just send
+        * the message
+        */
+       if (best_ni && best_lpni)
+               goto send;
 
-               rc = lnet_nid2peer_locked(&lp, dst_nid, cpt);
-               /* lp has ref on src_ni; lose mine */
-               lnet_ni_decref_locked(src_ni, cpt);
-               if (rc != 0) {
-                       lnet_net_unlock(cpt);
-                       LCONSOLE_WARN("Error %d finding peer %s\n", rc,
-                                     libcfs_nid2str(dst_nid));
-                       /* ENOMEM or shutting down */
-                       return rc;
+       /*
+        * If we already found a best_ni because src_nid is specified then
+        * pick the peer then send the message
+        */
+       if (best_ni)
+               goto pick_peer;
+
+       /*
+        * pick the best_ni by going through all the possible networks of
+        * that peer and see which local NI is best suited to talk to that
+        * peer.
+        *
+        * Locally connected networks will always be preferred over
+        * a routed network. If there are only routed paths to the peer,
+        * then the best route is chosen. If all routes are equal then
+        * they are used in round robin.
+        */
+       list_for_each_entry(peer_net, &peer->lp_peer_nets, lpn_on_peer_list) {
+               if (!lnet_is_peer_net_healthy_locked(peer_net))
+                       continue;
+
+               local_net = lnet_get_net_locked(peer_net->lpn_net_id);
+               if (!local_net && !routing && !local_found) {
+                       struct lnet_peer_ni *net_gw;
+
+                       lpni = list_entry(peer_net->lpn_peer_nis.next,
+                                         struct lnet_peer_ni,
+                                         lpni_on_peer_net_list);
+
+                       net_gw = lnet_find_route_locked(NULL,
+                                                       lpni->lpni_nid,
+                                                       rtr_nid);
+                       if (!net_gw)
+                               continue;
+
+                       if (best_gw) {
+                               /*
+                                * lnet_find_route_locked() call
+                                * will return the best_Gw on the
+                                * lpni->lpni_nid network.
+                                * However, best_gw and net_gw can
+                                * be on different networks.
+                                * Therefore need to compare them
+                                * to pick the better of either.
+                                */
+                               if (lnet_compare_peers(best_gw, net_gw) > 0)
+                                       continue;
+                               if (best_gw->lpni_gw_seq <= net_gw->lpni_gw_seq)
+                                       continue;
+                       }
+                       best_gw = net_gw;
+                       final_dst = lpni;
+
+                       routing2 = true;
+               } else {
+                       best_gw = NULL;
+                       final_dst = NULL;
+                       routing2 = false;
+                       local_found = true;
                }
-               LASSERT(lp->lp_ni == src_ni);
-       } else {
-               /* sending to a remote network */
-               lp = lnet_find_route_locked(src_ni, dst_nid, rtr_nid);
-               if (lp == NULL) {
-                       if (src_ni != NULL)
-                               lnet_ni_decref_locked(src_ni, cpt);
-                       lnet_net_unlock(cpt);
 
-                       LCONSOLE_WARN("No route to %s via %s "
-                                     "(all routers down)\n",
-                                     libcfs_id2str(msg->msg_target),
-                                     libcfs_nid2str(src_nid));
+               /*
+                * a gw on this network is found, but there could be
+                * other better gateways on other networks. So don't pick
+                * the best_ni until we determine the best_gw.
+                */
+               if (best_gw)
+                       continue;
+
+               /* if no local_net found continue */
+               if (!local_net)
+                       continue;
+
+               /*
+                * Iterate through the NIs in this local Net and select
+                * the NI to send from. The selection is determined by
+                * these 3 criterion in the following priority:
+                *      1. NUMA
+                *      2. NI available credits
+                *      3. Round Robin
+                */
+               best_ni = lnet_get_best_ni(local_net, best_ni, md_cpt);
+       }
+
+       if (!best_ni && !best_gw) {
+               lnet_net_unlock(cpt);
+               LCONSOLE_WARN("No local ni found to send from to %s\n",
+                       libcfs_nid2str(dst_nid));
+               return -EINVAL;
+       }
+
+       if (!best_ni) {
+               LASSERT(best_gw);
+               best_ni = lnet_get_best_ni(best_gw->lpni_net, best_ni, md_cpt);
+               LASSERT(best_ni);
+               /*
+                * We're going to route the message, so change the peer to
+                * the router.
+                */
+               LASSERT(best_gw->lpni_peer_net);
+               LASSERT(best_gw->lpni_peer_net->lpn_peer);
+               best_gw->lpni_gw_seq++;
+               peer = best_gw->lpni_peer_net->lpn_peer;
+       }
+
+       /*
+        * Now that we selected the NI to use increment its sequence
+        * number so the Round Robin algorithm will detect that it has
+        * been used and pick the next NI.
+        */
+       best_ni->ni_seq++;
+
+pick_peer:
+       /*
+        * At this point the best_ni is on a local network on which
+        * the peer has a peer_ni as well
+        */
+       peer_net = lnet_peer_get_net_locked(peer,
+                                           best_ni->ni_net->net_id);
+       /*
+        * peer_net is not available or the src_nid is explicitly defined
+        * and the peer_net for that src_nid is unhealthy. find a route to
+        * the destination nid.
+        */
+       if (!peer_net ||
+           (src_nid != LNET_NID_ANY &&
+            !lnet_is_peer_net_healthy_locked(peer_net))) {
+               best_gw = lnet_find_route_locked(best_ni->ni_net,
+                                                dst_nid,
+                                                rtr_nid);
+               /*
+                * if no route is found for that network then
+                * move onto the next peer_ni in the peer
+                */
+               if (!best_gw) {
+                       lnet_net_unlock(cpt);
+                       LCONSOLE_WARN("No route to peer from %s\n",
+                               libcfs_nid2str(best_ni->ni_nid));
                        return -EHOSTUNREACH;
                }
 
-               /* rtr_nid is LNET_NID_ANY or NID of pre-determined router,
-                * it's possible that rtr_nid isn't LNET_NID_ANY and lp isn't
-                * pre-determined router, this can happen if router table
-                * was changed when we release the lock */
-               if (rtr_nid != lp->lp_nid) {
-                       cpt2 = lnet_cpt_of_nid_locked(lp->lp_nid);
-                       if (cpt2 != cpt) {
-                               if (src_ni != NULL)
-                                       lnet_ni_decref_locked(src_ni, cpt);
-                               lnet_net_unlock(cpt);
+               CDEBUG(D_NET, "Best route to %s via %s for %s %d\n",
+                       libcfs_nid2str(dst_nid),
+                       libcfs_nid2str(best_gw->lpni_nid),
+                       lnet_msgtyp2str(msg->msg_type), msg->msg_len);
 
-                               rtr_nid = lp->lp_nid;
-                               cpt = cpt2;
-                               goto again;
+               routing2 = true;
+               /*
+                * RULE: Each node considers only the next-hop
+                *
+                * We're going to route the message, so change the peer to
+                * the router.
+                */
+               LASSERT(best_gw->lpni_peer_net);
+               LASSERT(best_gw->lpni_peer_net->lpn_peer);
+               peer = best_gw->lpni_peer_net->lpn_peer;
+       } else if (!lnet_is_peer_net_healthy_locked(peer_net)) {
+               /*
+                * this peer_net is unhealthy but we still have an opportunity
+                * to find another peer_net that we can use
+                */
+               __u32 net_id = peer_net->lpn_net_id;
+               LCONSOLE_WARN("peer net %s unhealthy\n",
+                             libcfs_net2str(net_id));
+               goto again;
+       }
+
+       /*
+        * Look at the peer NIs for the destination peer that connect
+        * to the chosen net. If a peer_ni is preferred when using the
+        * best_ni to communicate, we use that one. If there is no
+        * preferred peer_ni, or there are multiple preferred peer_ni,
+        * the available transmit credits are used. If the transmit
+        * credits are equal, we round-robin over the peer_ni.
+        */
+       lpni = NULL;
+       best_lpni_credits = INT_MIN;
+       preferred = false;
+       best_lpni = NULL;
+       while ((lpni = lnet_get_next_peer_ni_locked(peer, peer_net, lpni))) {
+               /*
+                * if this peer ni is not healthy just skip it, no point in
+                * examining it further
+                */
+               if (!lnet_is_peer_ni_healthy_locked(lpni))
+                       continue;
+               ni_is_pref = lnet_peer_is_ni_pref_locked(lpni, best_ni);
+
+               /* if this is a preferred peer use it */
+               if (!preferred && ni_is_pref) {
+                       preferred = true;
+               } else if (preferred && !ni_is_pref) {
+                       /*
+                        * this is not the preferred peer so let's ignore
+                        * it.
+                        */
+                       continue;
+               } else if (lpni->lpni_txcredits < best_lpni_credits) {
+                       /*
+                        * We already have a peer that has more credits
+                        * available than this one. No need to consider
+                        * this peer further.
+                        */
+                       continue;
+               } else if (lpni->lpni_txcredits == best_lpni_credits) {
+                       /*
+                        * The best peer found so far and the current peer
+                        * have the same number of available credits let's
+                        * make sure to select between them using Round
+                        * Robin
+                        */
+                       if (best_lpni) {
+                               if (best_lpni->lpni_seq <= lpni->lpni_seq)
+                                       continue;
                        }
                }
 
-               CDEBUG(D_NET, "Best route to %s via %s for %s %d\n",
-                      libcfs_nid2str(dst_nid), libcfs_nid2str(lp->lp_nid),
-                      lnet_msgtyp2str(msg->msg_type), msg->msg_len);
+               best_lpni = lpni;
+               best_lpni_credits = lpni->lpni_txcredits;
+       }
 
-               if (src_ni == NULL) {
-                       src_ni = lp->lp_ni;
-                       src_nid = src_ni->ni_nid;
-               } else {
-                       LASSERT(src_ni == lp->lp_ni);
-                       lnet_ni_decref_locked(src_ni, cpt);
-               }
+       /* if we still can't find a peer ni then we can't reach it */
+       if (!best_lpni) {
+               __u32 net_id = (peer_net) ? peer_net->lpn_net_id :
+                       LNET_NIDNET(dst_nid);
+               lnet_net_unlock(cpt);
+               LCONSOLE_WARN("no peer_ni found on peer net %s\n",
+                               libcfs_net2str(net_id));
+               return -EHOSTUNREACH;
+       }
 
-               lnet_peer_addref_locked(lp);
 
-               LASSERT(src_nid != LNET_NID_ANY);
-               lnet_msg_commit(msg, cpt);
+send:
+       routing = routing || routing2;
 
-               if (!msg->msg_routing) {
-                       /* I'm the source and now I know which NI to send on */
-                       msg->msg_hdr.src_nid = cpu_to_le64(src_nid);
+       /*
+        * Increment sequence number of the peer selected so that we
+        * pick the next one in Round Robin.
+        */
+       best_lpni->lpni_seq++;
+
+       /*
+        * grab a reference on the peer_ni so it sticks around even if
+        * we need to drop and relock the lnet_net_lock below.
+        */
+       lnet_peer_ni_addref_locked(best_lpni);
+
+       /*
+        * Use lnet_cpt_of_nid() to determine the CPT used to commit the
+        * message. This ensures that we get a CPT that is correct for
+        * the NI when the NI has been restricted to a subset of all CPTs.
+        * If the selected CPT differs from the one currently locked, we
+        * must unlock and relock the lnet_net_lock(), and then check whether
+        * the configuration has changed. We don't have a hold on the best_ni
+        * yet, and it may have vanished.
+        */
+       cpt2 = lnet_cpt_of_nid_locked(best_lpni->lpni_nid, best_ni);
+       if (cpt != cpt2) {
+               lnet_net_unlock(cpt);
+               cpt = cpt2;
+               lnet_net_lock(cpt);
+               if (seq != lnet_get_dlc_seq_locked()) {
+                       lnet_peer_ni_decref_locked(best_lpni);
+                       goto again;
                }
+       }
+
+       /*
+        * store the best_lpni in the message right away to avoid having
+        * to do the same operation under different conditions
+        */
+       msg->msg_txpeer = best_lpni;
+       msg->msg_txni = best_ni;
+
+       /*
+        * grab a reference for the best_ni since now it's in use in this
+        * send. the reference will need to be dropped when the message is
+        * finished in lnet_finalize()
+        */
+       lnet_ni_addref_locked(msg->msg_txni, cpt);
+
+       /*
+        * Always set the target.nid to the best peer picked. Either the
+        * nid will be one of the preconfigured NIDs, or the same NID as
+        * what was originally set in the target or it will be the NID of
+        * a router if this message should be routed
+        */
+       msg->msg_target.nid = msg->msg_txpeer->lpni_nid;
 
+       /*
+        * lnet_msg_commit assigns the correct cpt to the message, which
+        * is used to decrement the correct refcount on the ni when it's
+        * time to return the credits
+        */
+       lnet_msg_commit(msg, cpt);
+
+       /*
+        * If we are routing the message then we don't need to overwrite
+        * the src_nid since it would've been set at the origin. Otherwise
+        * we are the originator so we need to set it.
+        */
+       if (!msg->msg_routing)
+               msg->msg_hdr.src_nid = cpu_to_le64(msg->msg_txni->ni_nid);
+
+       if (routing) {
                msg->msg_target_is_router = 1;
-               msg->msg_target.nid = lp->lp_nid;
                msg->msg_target.pid = LNET_PID_LUSTRE;
+               /*
+                * since we're routing we want to ensure that the
+                * msg_hdr.dest_nid is set to the final destination. When
+                * the router receives this message it knows how to route
+                * it.
+                */
+               msg->msg_hdr.dest_nid =
+                       cpu_to_le64(final_dst ? final_dst->lpni_nid : dst_nid);
+       } else {
+               /*
+                * if we're not routing set the dest_nid to the best peer
+                * ni that we picked earlier in the algorithm.
+                */
+               msg->msg_hdr.dest_nid = cpu_to_le64(msg->msg_txpeer->lpni_nid);
        }
 
-       /* 'lp' is our best choice of peer */
+       rc = lnet_post_send_locked(msg, 0);
 
-       LASSERT(!msg->msg_peertxcredit);
-       LASSERT(!msg->msg_txcredit);
-       LASSERT(msg->msg_txpeer == NULL);
+       lnet_net_unlock(cpt);
 
-       msg->msg_txpeer = lp;                   /* msg takes my ref on lp */
+       return rc;
+}
 
-       rc = lnet_post_send_locked(msg, 0);
-       lnet_net_unlock(cpt);
+int
+lnet_send(lnet_nid_t src_nid, lnet_msg_t *msg, lnet_nid_t rtr_nid)
+{
+       lnet_nid_t              dst_nid = msg->msg_target.nid;
+       int                     rc;
 
+       /*
+        * NB: rtr_nid is set to LNET_NID_ANY for all current use-cases,
+        * but we might want to use pre-determined router for ACK/REPLY
+        * in the future
+        */
+       /* NB: ni != NULL == interface pre-determined (ACK/REPLY) */
+       LASSERT (msg->msg_txpeer == NULL);
+       LASSERT (!msg->msg_sending);
+       LASSERT (!msg->msg_target_is_router);
+       LASSERT (!msg->msg_receiving);
+
+       msg->msg_sending = 1;
+
+       LASSERT(!msg->msg_tx_committed);
+
+       rc = lnet_select_pathway(src_nid, dst_nid, msg, rtr_nid);
        if (rc < 0)
                return rc;
 
        if (rc == LNET_CREDIT_OK)
-               lnet_ni_send(src_ni, msg);
+               lnet_ni_send(msg->msg_txni, msg);
 
-       return 0; /* rc == LNET_CREDIT_OK or LNET_CREDIT_WAIT */
+       /* rc == LNET_CREDIT_OK or LNET_CREDIT_WAIT */
+       return 0;
 }
 
 void
@@ -1446,15 +1981,17 @@ lnet_parse_put(lnet_ni_t *ni, lnet_msg_t *msg)
        hdr->msg.put.ptl_index  = le32_to_cpu(hdr->msg.put.ptl_index);
        hdr->msg.put.offset     = le32_to_cpu(hdr->msg.put.offset);
 
-       info.mi_id.nid  = hdr->src_nid;
+       /* Primary peer NID. */
+       info.mi_id.nid  = msg->msg_initiator;
        info.mi_id.pid  = hdr->src_pid;
        info.mi_opc     = LNET_MD_OP_PUT;
        info.mi_portal  = hdr->msg.put.ptl_index;
        info.mi_rlength = hdr->payload_length;
        info.mi_roffset = hdr->msg.put.offset;
        info.mi_mbits   = hdr->msg.put.match_bits;
+       info.mi_cpt     = lnet_cpt_of_nid(msg->msg_rxpeer->lpni_nid, ni);
 
-       msg->msg_rx_ready_delay = ni->ni_lnd->lnd_eager_recv == NULL;
+       msg->msg_rx_ready_delay = ni->ni_net->net_lnd->lnd_eager_recv == NULL;
        ready_delay = msg->msg_rx_ready_delay;
 
  again:
@@ -1495,6 +2032,7 @@ lnet_parse_get(lnet_ni_t *ni, lnet_msg_t *msg, int rdma_get)
 {
        struct lnet_match_info  info;
        lnet_hdr_t              *hdr = &msg->msg_hdr;
+       lnet_process_id_t       source_id;
        struct lnet_handle_wire reply_wmd;
        int                     rc;
 
@@ -1504,13 +2042,17 @@ lnet_parse_get(lnet_ni_t *ni, lnet_msg_t *msg, int rdma_get)
        hdr->msg.get.sink_length  = le32_to_cpu(hdr->msg.get.sink_length);
        hdr->msg.get.src_offset   = le32_to_cpu(hdr->msg.get.src_offset);
 
-       info.mi_id.nid  = hdr->src_nid;
+       source_id.nid = hdr->src_nid;
+       source_id.pid = hdr->src_pid;
+       /* Primary peer NID */
+       info.mi_id.nid  = msg->msg_initiator;
        info.mi_id.pid  = hdr->src_pid;
        info.mi_opc     = LNET_MD_OP_GET;
        info.mi_portal  = hdr->msg.get.ptl_index;
        info.mi_rlength = hdr->msg.get.sink_length;
        info.mi_roffset = hdr->msg.get.src_offset;
        info.mi_mbits   = hdr->msg.get.match_bits;
+       info.mi_cpt     = lnet_cpt_of_nid(msg->msg_rxpeer->lpni_nid, ni);
 
        rc = lnet_ptl_match_md(&info, msg);
        if (rc == LNET_MATCHMD_DROP) {
@@ -1527,7 +2069,7 @@ lnet_parse_get(lnet_ni_t *ni, lnet_msg_t *msg, int rdma_get)
 
        reply_wmd = hdr->msg.get.return_wmd;
 
-       lnet_prep_send(msg, LNET_MSG_REPLY, info.mi_id,
+       lnet_prep_send(msg, LNET_MSG_REPLY, source_id,
                       msg->msg_offset, msg->msg_wanted);
 
        msg->msg_hdr.msg.reply.dst_wmd = reply_wmd;
@@ -1685,9 +2227,9 @@ lnet_parse_forward_locked(lnet_ni_t *ni, lnet_msg_t *msg)
        if (!the_lnet.ln_routing)
                return -ECANCELED;
 
-       if (msg->msg_rxpeer->lp_rtrcredits <= 0 ||
+       if (msg->msg_rxpeer->lpni_rtrcredits <= 0 ||
            lnet_msg2bufpool(msg)->rbp_credits <= 0) {
-               if (ni->ni_lnd->lnd_eager_recv == NULL) {
+               if (ni->ni_net->net_lnd->lnd_eager_recv == NULL) {
                        msg->msg_rx_ready_delay = 1;
                } else {
                        lnet_net_unlock(msg->msg_rx_cpt);
@@ -1820,8 +2362,9 @@ lnet_parse(lnet_ni_t *ni, lnet_hdr_t *hdr, lnet_nid_t from_nid,
        lnet_pid_t     dest_pid;
        lnet_nid_t     dest_nid;
        lnet_nid_t     src_nid;
-       __u32          payload_length;
-       __u32          type;
+       struct lnet_peer_ni *lpni;
+       __u32          payload_length;
+       __u32          type;
 
        LASSERT (!in_interrupt ());
 
@@ -1832,7 +2375,7 @@ lnet_parse(lnet_ni_t *ni, lnet_hdr_t *hdr, lnet_nid_t from_nid,
        payload_length = le32_to_cpu(hdr->payload_length);
 
        for_me = (ni->ni_nid == dest_nid);
-       cpt = lnet_cpt_of_nid(from_nid);
+       cpt = lnet_cpt_of_nid(from_nid, ni);
 
        switch (type) {
        case LNET_MSG_ACK:
@@ -1979,21 +2522,26 @@ lnet_parse(lnet_ni_t *ni, lnet_hdr_t *hdr, lnet_nid_t from_nid,
                msg->msg_hdr.dest_pid   = dest_pid;
                msg->msg_hdr.payload_length = payload_length;
        }
+       /* Multi-Rail: Primary NID of source. */
+       msg->msg_initiator = lnet_peer_primary_nid(src_nid);
 
        lnet_net_lock(cpt);
-       rc = lnet_nid2peer_locked(&msg->msg_rxpeer, from_nid, cpt);
-       if (rc != 0) {
+       lpni = lnet_nid2peerni_locked(from_nid, cpt);
+       if (IS_ERR(lpni)) {
                lnet_net_unlock(cpt);
                CERROR("%s, src %s: Dropping %s "
-                      "(error %d looking up sender)\n",
+                      "(error %ld looking up sender)\n",
                       libcfs_nid2str(from_nid), libcfs_nid2str(src_nid),
-                      lnet_msgtyp2str(type), rc);
+                      lnet_msgtyp2str(type), PTR_ERR(lpni));
                lnet_msg_free(msg);
                if (rc == -ESHUTDOWN)
                        /* We are shutting down.  Don't do anything more */
                        return 0;
                goto drop;
        }
+       msg->msg_rxpeer = lpni;
+       msg->msg_rxni = ni;
+       lnet_ni_addref_locked(ni, cpt);
 
        if (lnet_isrouter(msg->msg_rxpeer)) {
                lnet_peer_set_alive(msg->msg_rxpeer);
@@ -2078,15 +2626,14 @@ lnet_drop_delayed_msg_list(struct list_head *head, char *reason)
                 * called lnet_drop_message(), so I just hang onto msg as well
                 * until that's done */
 
-               lnet_drop_message(msg->msg_rxpeer->lp_ni,
-                                 msg->msg_rxpeer->lp_cpt,
+               lnet_drop_message(msg->msg_rxni, msg->msg_rx_cpt,
                                  msg->msg_private, msg->msg_len);
                /*
                 * NB: message will not generate event because w/o attached MD,
                 * but we still should give error code so lnet_msg_decommit()
                 * can skip counters operations and other checks.
                 */
-               lnet_finalize(msg->msg_rxpeer->lp_ni, msg, -ENOENT);
+               lnet_finalize(msg->msg_rxni, msg, -ENOENT);
        }
 }
 
@@ -2109,6 +2656,7 @@ lnet_recv_delayed_msg_list(struct list_head *head)
                LASSERT(msg->msg_rx_delayed);
                LASSERT(msg->msg_md != NULL);
                LASSERT(msg->msg_rxpeer != NULL);
+               LASSERT(msg->msg_rxni != NULL);
                LASSERT(msg->msg_hdr.type == LNET_MSG_PUT);
 
                CDEBUG(D_NET, "Resuming delayed PUT from %s portal %d "
@@ -2118,7 +2666,7 @@ lnet_recv_delayed_msg_list(struct list_head *head)
                        msg->msg_hdr.msg.put.offset,
                        msg->msg_hdr.payload_length);
 
-               lnet_recv_put(msg->msg_rxpeer->lp_ni, msg);
+               lnet_recv_put(msg->msg_rxni, msg);
        }
 }
 
@@ -2294,6 +2842,8 @@ lnet_create_reply_msg (lnet_ni_t *ni, lnet_msg_t *getmsg)
               libcfs_nid2str(ni->ni_nid), libcfs_id2str(peer_id), getmd);
 
        /* setup information for lnet_build_msg_event */
+       msg->msg_initiator = lnet_peer_primary_nid(peer_id.nid);
+       /* Cheaper: msg->msg_initiator = getmsg->msg_txpeer->lp_nid; */
        msg->msg_from = peer_id.nid;
        msg->msg_type = LNET_MSG_GET; /* flag this msg as an "optimized" GET */
        msg->msg_hdr.src_nid = peer_id.nid;
@@ -2303,7 +2853,7 @@ lnet_create_reply_msg (lnet_ni_t *ni, lnet_msg_t *getmsg)
        lnet_msg_attach_md(msg, getmd, getmd->md_offset, getmd->md_length);
        lnet_res_unlock(cpt);
 
-       cpt = lnet_cpt_of_nid(peer_id.nid);
+       cpt = lnet_cpt_of_nid(peer_id.nid, ni);
 
        lnet_net_lock(cpt);
        lnet_msg_commit(msg, cpt);
@@ -2314,7 +2864,7 @@ lnet_create_reply_msg (lnet_ni_t *ni, lnet_msg_t *getmsg)
        return msg;
 
  drop:
-       cpt = lnet_cpt_of_nid(peer_id.nid);
+       cpt = lnet_cpt_of_nid(peer_id.nid, ni);
 
        lnet_net_lock(cpt);
        the_lnet.ln_counters[cpt]->drop_count++;
@@ -2461,7 +3011,7 @@ int
 LNetDist(lnet_nid_t dstnid, lnet_nid_t *srcnidp, __u32 *orderp)
 {
        struct list_head        *e;
-       struct lnet_ni          *ni;
+       struct lnet_ni          *ni = NULL;
        lnet_remotenet_t        *rnet;
        __u32                   dstnet = LNET_NIDNET(dstnid);
        int                     hops;
@@ -2478,9 +3028,7 @@ LNetDist(lnet_nid_t dstnid, lnet_nid_t *srcnidp, __u32 *orderp)
 
        cpt = lnet_net_lock_current();
 
-       list_for_each(e, &the_lnet.ln_nis) {
-               ni = list_entry(e, lnet_ni_t, ni_list);
-
+       while ((ni = lnet_get_next_ni_locked(NULL, ni))) {
                if (ni->ni_nid == dstnid) {
                        if (srcnidp != NULL)
                                *srcnidp = dstnid;
@@ -2540,8 +3088,12 @@ LNetDist(lnet_nid_t dstnid, lnet_nid_t *srcnidp, __u32 *orderp)
 
                        LASSERT(shortest != NULL);
                        hops = shortest_hops;
-                       if (srcnidp != NULL)
-                               *srcnidp = shortest->lr_gateway->lp_ni->ni_nid;
+                       if (srcnidp != NULL) {
+                               ni = lnet_get_next_ni_locked(
+                                       shortest->lr_gateway->lpni_net,
+                                       NULL);
+                               *srcnidp = ni->ni_nid;
+                       }
                        if (orderp != NULL)
                                *orderp = order;
                        lnet_net_unlock(cpt);
index cb3a7cd..a6ffe8e 100644 (file)
@@ -72,6 +72,8 @@ lnet_build_msg_event(lnet_msg_t *msg, lnet_event_kind_t ev_type)
                ev->target.pid    = le32_to_cpu(hdr->dest_pid);
                ev->initiator.nid = LNET_NID_ANY;
                ev->initiator.pid = the_lnet.ln_pid;
+               ev->source.nid    = LNET_NID_ANY;
+               ev->source.pid    = the_lnet.ln_pid;
                ev->sender        = LNET_NID_ANY;
 
        } else {
@@ -79,8 +81,12 @@ lnet_build_msg_event(lnet_msg_t *msg, lnet_event_kind_t ev_type)
                ev->target.pid    = hdr->dest_pid;
                ev->target.nid    = hdr->dest_nid;
                ev->initiator.pid = hdr->src_pid;
-               ev->initiator.nid = hdr->src_nid;
-               ev->rlength       = hdr->payload_length;
+               /* Multi-Rail: resolve src_nid to "primary" peer NID */
+               ev->initiator.nid = msg->msg_initiator;
+               /* Multi-Rail: track source NID. */
+               ev->source.pid    = hdr->src_pid;
+               ev->source.nid    = hdr->src_nid;
+               ev->rlength       = hdr->payload_length;
                ev->sender        = msg->msg_from;
                ev->mlength       = msg->msg_wanted;
                ev->offset        = msg->msg_offset;
@@ -210,6 +216,10 @@ lnet_msg_decommit_tx(lnet_msg_t *msg, int status)
        }
 
        counters->send_count++;
+       if (msg->msg_txpeer)
+               atomic_inc(&msg->msg_txpeer->lpni_stats.send_count);
+       if (msg->msg_txni)
+               atomic_inc(&msg->msg_txni->ni_stats.send_count);
  out:
        lnet_return_tx_credits_locked(msg);
        msg->msg_tx_committed = 0;
@@ -261,6 +271,10 @@ lnet_msg_decommit_rx(lnet_msg_t *msg, int status)
        }
 
        counters->recv_count++;
+       if (msg->msg_rxpeer)
+               atomic_inc(&msg->msg_rxpeer->lpni_stats.recv_count);
+       if (msg->msg_rxni)
+               atomic_inc(&msg->msg_rxni->ni_stats.recv_count);
        if (ev->type == LNET_EVENT_PUT || ev->type == LNET_EVENT_REPLY)
                counters->recv_length += msg->msg_wanted;
 
@@ -376,7 +390,7 @@ lnet_complete_msg_locked(lnet_msg_t *msg, int cpt)
 
                ack_wmd = msg->msg_hdr.msg.put.ack_wmd;
 
-               lnet_prep_send(msg, LNET_MSG_ACK, msg->msg_ev.initiator, 0, 0);
+               lnet_prep_send(msg, LNET_MSG_ACK, msg->msg_ev.source, 0, 0);
 
                msg->msg_hdr.msg.ack.dst_wmd = ack_wmd;
                msg->msg_hdr.msg.ack.match_bits = msg->msg_ev.match_bits;
index acba755..0b31878 100644 (file)
@@ -222,7 +222,7 @@ lnet_match2mt(struct lnet_portal *ptl, lnet_process_id_t id, __u64 mbits)
 
        /* if it's a unique portal, return match-table hashed by NID */
        return lnet_ptl_is_unique(ptl) ?
-              ptl->ptl_mtables[lnet_cpt_of_nid(id.nid)] : NULL;
+              ptl->ptl_mtables[lnet_cpt_of_nid(id.nid, NULL)] : NULL;
 }
 
 struct lnet_match_table *
@@ -292,7 +292,7 @@ lnet_mt_of_match(struct lnet_match_info *info, struct lnet_msg *msg)
 
        rotor = ptl->ptl_rotor++; /* get round-robin factor */
        if (portal_rotor == LNET_PTL_ROTOR_HASH_RT && routed)
-               cpt = lnet_cpt_of_nid(msg->msg_hdr.src_nid);
+               cpt = info->mi_cpt;
        else
                cpt = rotor % LNET_CPT_NUMBER;
 
@@ -682,7 +682,8 @@ lnet_ptl_attach_md(lnet_me_t *me, lnet_libmd_t *md,
                LASSERT(msg->msg_rx_delayed || head == &ptl->ptl_msg_stealing);
 
                hdr   = &msg->msg_hdr;
-               info.mi_id.nid  = hdr->src_nid;
+               /* Multi-Rail: Primary peer NID */
+               info.mi_id.nid  = msg->msg_initiator;
                info.mi_id.pid  = hdr->src_pid;
                info.mi_opc     = LNET_MD_OP_PUT;
                info.mi_portal  = hdr->msg.put.ptl_index;
@@ -941,7 +942,7 @@ lnet_clear_lazy_portal(struct lnet_ni *ni, int portal, char *reason)
                /* grab all messages which are on the NI passed in */
                list_for_each_entry_safe(msg, tmp, &ptl->ptl_msg_delayed,
                                         msg_list) {
-                       if (msg->msg_rxpeer->lp_ni == ni)
+                       if (msg->msg_txni == ni || msg->msg_rxni == ni)
                                list_move(&msg->msg_list, &zombies);
                }
        } else {
index 673f9b3..cda649b 100644 (file)
@@ -94,7 +94,7 @@ lolnd_shutdown(lnet_ni_t *ni)
 static int
 lolnd_startup (lnet_ni_t *ni)
 {
-       LASSERT (ni->ni_lnd == &the_lolnd);
+       LASSERT (ni->ni_net->net_lnd == &the_lolnd);
        LASSERT (!lolnd_instanced);
        lolnd_instanced = 1;
 
index c080eb4..a7190dd 100644 (file)
@@ -91,7 +91,7 @@ lnet_unconfigure (void)
 }
 
 static int
-lnet_dyn_configure(struct libcfs_ioctl_hdr *hdr)
+lnet_dyn_configure_net(struct libcfs_ioctl_hdr *hdr)
 {
        struct lnet_ioctl_config_data *conf =
          (struct lnet_ioctl_config_data *)hdr;
@@ -101,19 +101,17 @@ lnet_dyn_configure(struct libcfs_ioctl_hdr *hdr)
                return -EINVAL;
 
        mutex_lock(&lnet_config_mutex);
-       if (!the_lnet.ln_niinit_self) {
+       if (the_lnet.ln_niinit_self)
+               rc = lnet_dyn_add_net(conf);
+       else
                rc = -EINVAL;
-               goto out_unlock;
-       }
-       rc = lnet_dyn_add_ni(LNET_PID_LUSTRE, conf);
-out_unlock:
        mutex_unlock(&lnet_config_mutex);
 
        return rc;
 }
 
 static int
-lnet_dyn_unconfigure(struct libcfs_ioctl_hdr *hdr)
+lnet_dyn_unconfigure_net(struct libcfs_ioctl_hdr *hdr)
 {
        struct lnet_ioctl_config_data *conf =
          (struct lnet_ioctl_config_data *) hdr;
@@ -123,12 +121,50 @@ lnet_dyn_unconfigure(struct libcfs_ioctl_hdr *hdr)
                return -EINVAL;
 
        mutex_lock(&lnet_config_mutex);
-       if (!the_lnet.ln_niinit_self) {
+       if (the_lnet.ln_niinit_self)
+               rc = lnet_dyn_del_net(conf->cfg_net);
+       else
+               rc = -EINVAL;
+       mutex_unlock(&lnet_config_mutex);
+
+       return rc;
+}
+
+static int
+lnet_dyn_configure_ni(struct libcfs_ioctl_hdr *hdr)
+{
+       struct lnet_ioctl_config_ni *conf =
+         (struct lnet_ioctl_config_ni *)hdr;
+       int                           rc;
+
+       if (conf->lic_cfg_hdr.ioc_len < sizeof(*conf))
+               return -EINVAL;
+
+       mutex_lock(&lnet_config_mutex);
+       if (the_lnet.ln_niinit_self)
+               rc = lnet_dyn_add_ni(conf);
+       else
+               rc = -EINVAL;
+       mutex_unlock(&lnet_config_mutex);
+
+       return rc;
+}
+
+static int
+lnet_dyn_unconfigure_ni(struct libcfs_ioctl_hdr *hdr)
+{
+       struct lnet_ioctl_config_ni *conf =
+         (struct lnet_ioctl_config_ni *) hdr;
+       int                           rc;
+
+       if (conf->lic_cfg_hdr.ioc_len < sizeof(*conf))
+               return -EINVAL;
+
+       mutex_lock(&lnet_config_mutex);
+       if (the_lnet.ln_niinit_self)
+               rc = lnet_dyn_del_ni(conf);
+       else
                rc = -EINVAL;
-               goto out_unlock;
-       }
-       rc = lnet_dyn_del_ni(conf->cfg_net);
-out_unlock:
        mutex_unlock(&lnet_config_mutex);
 
        return rc;
@@ -155,10 +191,16 @@ lnet_ioctl(unsigned int cmd, struct libcfs_ioctl_hdr *hdr)
                return lnet_unconfigure();
 
        case IOC_LIBCFS_ADD_NET:
-               return lnet_dyn_configure(hdr);
+               return lnet_dyn_configure_net(hdr);
 
        case IOC_LIBCFS_DEL_NET:
-               return lnet_dyn_unconfigure(hdr);
+               return lnet_dyn_unconfigure_net(hdr);
+
+       case IOC_LIBCFS_ADD_LOCAL_NI:
+               return lnet_dyn_configure_ni(hdr);
+
+       case IOC_LIBCFS_DEL_LOCAL_NI:
+               return lnet_dyn_unconfigure_ni(hdr);
 
        default:
                /* Passing LNET_PID_ANY only gives me a ref if the net is up
index 083b169..91c9c6b 100644 (file)
@@ -617,8 +617,9 @@ delayed_msg_process(struct list_head *msg_list, bool drop)
 
                msg = list_entry(msg_list->next, struct lnet_msg, msg_list);
                LASSERT(msg->msg_rxpeer != NULL);
+               LASSERT(msg->msg_rxni != NULL);
 
-               ni = msg->msg_rxpeer->lp_ni;
+               ni = msg->msg_rxni;
                cpt = msg->msg_rx_cpt;
 
                list_del_init(&msg->msg_list);
index 523d5b3..a5758fa 100644 (file)
 #include <lnet/lib-lnet.h>
 #include <lnet/lib-dlc.h>
 
+static void
+lnet_peer_remove_from_remote_list(struct lnet_peer_ni *lpni)
+{
+       if (!list_empty(&lpni->lpni_on_remote_peer_ni_list)) {
+               list_del_init(&lpni->lpni_on_remote_peer_ni_list);
+               lnet_peer_ni_decref_locked(lpni);
+       }
+}
+
+void
+lnet_peer_net_added(struct lnet_net *net)
+{
+       struct lnet_peer_ni *lpni, *tmp;
+
+       list_for_each_entry_safe(lpni, tmp, &the_lnet.ln_remote_peer_ni_list,
+                                lpni_on_remote_peer_ni_list) {
+
+               if (LNET_NIDNET(lpni->lpni_nid) == net->net_id) {
+                       lpni->lpni_net = net;
+
+                       spin_lock(&lpni->lpni_lock);
+                       lpni->lpni_txcredits =
+                               lpni->lpni_net->net_tunables.lct_peer_tx_credits;
+                       lpni->lpni_mintxcredits = lpni->lpni_txcredits;
+                       lpni->lpni_rtrcredits =
+                               lnet_peer_buffer_credits(lpni->lpni_net);
+                       lpni->lpni_minrtrcredits = lpni->lpni_rtrcredits;
+                       spin_unlock(&lpni->lpni_lock);
+
+                       lnet_peer_remove_from_remote_list(lpni);
+               }
+       }
+}
+
+static void
+lnet_peer_tables_destroy(void)
+{
+       struct lnet_peer_table  *ptable;
+       struct list_head        *hash;
+       int                     i;
+       int                     j;
+
+       if (!the_lnet.ln_peer_tables)
+               return;
+
+       cfs_percpt_for_each(ptable, i, the_lnet.ln_peer_tables) {
+               hash = ptable->pt_hash;
+               if (!hash) /* not intialized */
+                       break;
+
+               LASSERT(list_empty(&ptable->pt_zombie_list));
+
+               ptable->pt_hash = NULL;
+               for (j = 0; j < LNET_PEER_HASH_SIZE; j++)
+                       LASSERT(list_empty(&hash[j]));
+
+               LIBCFS_FREE(hash, LNET_PEER_HASH_SIZE * sizeof(*hash));
+       }
+
+       cfs_percpt_free(the_lnet.ln_peer_tables);
+       the_lnet.ln_peer_tables = NULL;
+}
+
 int
 lnet_peer_tables_create(void)
 {
@@ -53,8 +116,6 @@ lnet_peer_tables_create(void)
        }
 
        cfs_percpt_for_each(ptable, i, the_lnet.ln_peer_tables) {
-               INIT_LIST_HEAD(&ptable->pt_deathrow);
-
                LIBCFS_CPT_ALLOC(hash, lnet_cpt_table(), i,
                                 LNET_PEER_HASH_SIZE * sizeof(*hash));
                if (hash == NULL) {
@@ -63,6 +124,9 @@ lnet_peer_tables_create(void)
                        return -ENOMEM;
                }
 
+               spin_lock_init(&ptable->pt_zombie_lock);
+               INIT_LIST_HEAD(&ptable->pt_zombie_list);
+
                for (j = 0; j < LNET_PEER_HASH_SIZE; j++)
                        INIT_LIST_HEAD(&hash[j]);
                ptable->pt_hash = hash; /* sign of initialization */
@@ -71,63 +135,252 @@ lnet_peer_tables_create(void)
        return 0;
 }
 
-void
-lnet_peer_tables_destroy(void)
+static struct lnet_peer_ni *
+lnet_peer_ni_alloc(lnet_nid_t nid)
 {
-       struct lnet_peer_table  *ptable;
-       struct list_head        *hash;
-       int                     i;
-       int                     j;
+       struct lnet_peer_ni *lpni;
+       struct lnet_net *net;
+       int cpt;
+
+       cpt = lnet_nid_cpt_hash(nid, LNET_CPT_NUMBER);
+
+       LIBCFS_CPT_ALLOC(lpni, lnet_cpt_table(), cpt, sizeof(*lpni));
+       if (!lpni)
+               return NULL;
+
+       INIT_LIST_HEAD(&lpni->lpni_txq);
+       INIT_LIST_HEAD(&lpni->lpni_rtrq);
+       INIT_LIST_HEAD(&lpni->lpni_routes);
+       INIT_LIST_HEAD(&lpni->lpni_hashlist);
+       INIT_LIST_HEAD(&lpni->lpni_on_peer_net_list);
+       INIT_LIST_HEAD(&lpni->lpni_on_remote_peer_ni_list);
+
+       spin_lock_init(&lpni->lpni_lock);
+
+       lpni->lpni_alive = !lnet_peers_start_down(); /* 1 bit!! */
+       lpni->lpni_last_alive = cfs_time_current(); /* assumes alive */
+       lpni->lpni_ping_feats = LNET_PING_FEAT_INVAL;
+       lpni->lpni_nid = nid;
+       lpni->lpni_cpt = cpt;
+       lnet_set_peer_ni_health_locked(lpni, true);
+
+       net = lnet_get_net_locked(LNET_NIDNET(nid));
+       lpni->lpni_net = net;
+       if (net) {
+               lpni->lpni_txcredits = net->net_tunables.lct_peer_tx_credits;
+               lpni->lpni_mintxcredits = lpni->lpni_txcredits;
+               lpni->lpni_rtrcredits = lnet_peer_buffer_credits(net);
+               lpni->lpni_minrtrcredits = lpni->lpni_rtrcredits;
+       } else {
+               /*
+                * This peer_ni is not on a local network, so we
+                * cannot add the credits here. In case the net is
+                * added later, add the peer_ni to the remote peer ni
+                * list so it can be easily found and revisited.
+                */
+               /* FIXME: per-net implementation instead? */
+               atomic_inc(&lpni->lpni_refcount);
+               list_add_tail(&lpni->lpni_on_remote_peer_ni_list,
+                             &the_lnet.ln_remote_peer_ni_list);
+       }
+
+       /* TODO: update flags */
+
+       return lpni;
+}
+
+static struct lnet_peer_net *
+lnet_peer_net_alloc(__u32 net_id)
+{
+       struct lnet_peer_net *lpn;
+
+       LIBCFS_CPT_ALLOC(lpn, lnet_cpt_table(), CFS_CPT_ANY, sizeof(*lpn));
+       if (!lpn)
+               return NULL;
 
-       if (the_lnet.ln_peer_tables == NULL)
+       INIT_LIST_HEAD(&lpn->lpn_on_peer_list);
+       INIT_LIST_HEAD(&lpn->lpn_peer_nis);
+       lpn->lpn_net_id = net_id;
+
+       return lpn;
+}
+
+static struct lnet_peer *
+lnet_peer_alloc(lnet_nid_t nid)
+{
+       struct lnet_peer *lp;
+
+       LIBCFS_CPT_ALLOC(lp, lnet_cpt_table(), CFS_CPT_ANY, sizeof(*lp));
+       if (!lp)
+               return NULL;
+
+       INIT_LIST_HEAD(&lp->lp_on_lnet_peer_list);
+       INIT_LIST_HEAD(&lp->lp_peer_nets);
+       lp->lp_primary_nid = nid;
+
+       /* TODO: update flags */
+
+       return lp;
+}
+
+
+static void
+lnet_try_destroy_peer_hierarchy_locked(struct lnet_peer_ni *lpni)
+{
+       struct lnet_peer_net *peer_net;
+       struct lnet_peer *peer;
+
+       /* TODO: could the below situation happen? accessing an already
+        * destroyed peer? */
+       if (lpni->lpni_peer_net == NULL ||
+           lpni->lpni_peer_net->lpn_peer == NULL)
                return;
 
-       cfs_percpt_for_each(ptable, i, the_lnet.ln_peer_tables) {
-               hash = ptable->pt_hash;
-               if (hash == NULL) /* not intialized */
-                       break;
+       peer_net = lpni->lpni_peer_net;
+       peer = lpni->lpni_peer_net->lpn_peer;
 
-               LASSERT(list_empty(&ptable->pt_deathrow));
+       list_del_init(&lpni->lpni_on_peer_net_list);
+       lpni->lpni_peer_net = NULL;
 
-               ptable->pt_hash = NULL;
-               for (j = 0; j < LNET_PEER_HASH_SIZE; j++)
-                       LASSERT(list_empty(&hash[j]));
+       /* if peer_net is empty, then remove it from the peer */
+       if (list_empty(&peer_net->lpn_peer_nis)) {
+               list_del_init(&peer_net->lpn_on_peer_list);
+               peer_net->lpn_peer = NULL;
+               LIBCFS_FREE(peer_net, sizeof(*peer_net));
 
-               LIBCFS_FREE(hash, LNET_PEER_HASH_SIZE * sizeof(*hash));
+               /* if the peer is empty then remove it from the
+                * the_lnet.ln_peers */
+               if (list_empty(&peer->lp_peer_nets)) {
+                       list_del_init(&peer->lp_on_lnet_peer_list);
+                       LIBCFS_FREE(peer, sizeof(*peer));
+               }
        }
+}
 
-       cfs_percpt_free(the_lnet.ln_peer_tables);
-       the_lnet.ln_peer_tables = NULL;
+/* called with lnet_net_lock LNET_LOCK_EX held */
+static int
+lnet_peer_ni_del_locked(struct lnet_peer_ni *lpni)
+{
+       struct lnet_peer_table *ptable = NULL;
+
+       /* don't remove a peer_ni if it's also a gateway */
+       if (lpni->lpni_rtr_refcount > 0) {
+               CERROR("Peer NI %s is a gateway. Can not delete it\n",
+                      libcfs_nid2str(lpni->lpni_nid));
+               return -EBUSY;
+       }
+
+       lnet_peer_remove_from_remote_list(lpni);
+
+       /* remove peer ni from the hash list. */
+       list_del_init(&lpni->lpni_hashlist);
+
+       /* decrement the ref count on the peer table */
+       ptable = the_lnet.ln_peer_tables[lpni->lpni_cpt];
+       LASSERT(atomic_read(&ptable->pt_number) > 0);
+       atomic_dec(&ptable->pt_number);
+
+       /*
+        * The peer_ni can no longer be found with a lookup. But there
+        * can be current users, so keep track of it on the zombie
+        * list until the reference count has gone to zero.
+        *
+        * The last reference may be lost in a place where the
+        * lnet_net_lock locks only a single cpt, and that cpt may not
+        * be lpni->lpni_cpt. So the zombie list of this peer_table
+        * has its own lock.
+        */
+       spin_lock(&ptable->pt_zombie_lock);
+       list_add(&lpni->lpni_hashlist, &ptable->pt_zombie_list);
+       ptable->pt_zombies++;
+       spin_unlock(&ptable->pt_zombie_lock);
+
+       /* no need to keep this peer on the hierarchy anymore */
+       lnet_try_destroy_peer_hierarchy_locked(lpni);
+
+       /* decrement reference on peer */
+       lnet_peer_ni_decref_locked(lpni);
+
+       return 0;
+}
+
+void lnet_peer_uninit(void)
+{
+       struct lnet_peer_ni *lpni, *tmp;
+
+       lnet_net_lock(LNET_LOCK_EX);
+
+       /* remove all peer_nis from the remote peer and the hash list */
+       list_for_each_entry_safe(lpni, tmp, &the_lnet.ln_remote_peer_ni_list,
+                                lpni_on_remote_peer_ni_list)
+               lnet_peer_ni_del_locked(lpni);
+
+       lnet_peer_tables_destroy();
+
+       lnet_net_unlock(LNET_LOCK_EX);
+}
+
+static int
+lnet_peer_del_locked(struct lnet_peer *peer)
+{
+       struct lnet_peer_ni *lpni = NULL, *lpni2;
+       int rc = 0, rc2 = 0;
+
+       lpni = lnet_get_next_peer_ni_locked(peer, NULL, lpni);
+       while (lpni != NULL) {
+               lpni2 = lnet_get_next_peer_ni_locked(peer, NULL, lpni);
+               rc = lnet_peer_ni_del_locked(lpni);
+               if (rc != 0)
+                       rc2 = rc;
+               lpni = lpni2;
+       }
+
+       return rc2;
 }
 
 static void
-lnet_peer_table_cleanup_locked(lnet_ni_t *ni, struct lnet_peer_table *ptable)
+lnet_peer_table_cleanup_locked(struct lnet_net *net,
+                              struct lnet_peer_table *ptable)
 {
-       int              i;
-       lnet_peer_t     *lp;
-       lnet_peer_t     *tmp;
+       int                      i;
+       struct lnet_peer_ni     *next;
+       struct lnet_peer_ni     *lpni;
+       struct lnet_peer        *peer;
 
        for (i = 0; i < LNET_PEER_HASH_SIZE; i++) {
-               list_for_each_entry_safe(lp, tmp, &ptable->pt_hash[i],
-                                        lp_hashlist) {
-                       if (ni != NULL && ni != lp->lp_ni)
+               list_for_each_entry_safe(lpni, next, &ptable->pt_hash[i],
+                                        lpni_hashlist) {
+                       if (net != NULL && net != lpni->lpni_net)
                                continue;
-                       list_del_init(&lp->lp_hashlist);
-                       /* Lose hash table's ref */
-                       ptable->pt_zombies++;
-                       lnet_peer_decref_locked(lp);
+
+                       peer = lpni->lpni_peer_net->lpn_peer;
+                       if (peer->lp_primary_nid != lpni->lpni_nid) {
+                               lnet_peer_ni_del_locked(lpni);
+                               continue;
+                       }
+                       /*
+                        * Removing the primary NID implies removing
+                        * the entire peer. Advance next beyond any
+                        * peer_ni that belongs to the same peer.
+                        */
+                       list_for_each_entry_from(next, &ptable->pt_hash[i],
+                                                lpni_hashlist) {
+                               if (next->lpni_peer_net->lpn_peer != peer)
+                                       break;
+                       }
+                       lnet_peer_del_locked(peer);
                }
        }
 }
 
 static void
-lnet_peer_table_deathrow_wait_locked(struct lnet_peer_table *ptable,
-                                    int cpt_locked)
+lnet_peer_ni_finalize_wait(struct lnet_peer_table *ptable)
 {
-       int     i;
+       int     i = 3;
 
-       for (i = 3; ptable->pt_zombies != 0; i++) {
-               lnet_net_unlock(cpt_locked);
+       spin_lock(&ptable->pt_zombie_lock);
+       while (ptable->pt_zombies) {
+               spin_unlock(&ptable->pt_zombie_lock);
 
                if (IS_PO2(i)) {
                        CDEBUG(D_WARNING,
@@ -136,268 +389,738 @@ lnet_peer_table_deathrow_wait_locked(struct lnet_peer_table *ptable,
                }
                set_current_state(TASK_UNINTERRUPTIBLE);
                schedule_timeout(cfs_time_seconds(1) >> 1);
-               lnet_net_lock(cpt_locked);
+               spin_lock(&ptable->pt_zombie_lock);
        }
+       spin_unlock(&ptable->pt_zombie_lock);
 }
 
 static void
-lnet_peer_table_del_rtrs_locked(lnet_ni_t *ni, struct lnet_peer_table *ptable,
-                               int cpt_locked)
+lnet_peer_table_del_rtrs_locked(struct lnet_net *net,
+                               struct lnet_peer_table *ptable)
 {
-       lnet_peer_t     *lp;
-       lnet_peer_t     *tmp;
-       lnet_nid_t       lp_nid;
-       int              i;
+       struct lnet_peer_ni     *lp;
+       struct lnet_peer_ni     *tmp;
+       lnet_nid_t              lpni_nid;
+       int                     i;
 
        for (i = 0; i < LNET_PEER_HASH_SIZE; i++) {
                list_for_each_entry_safe(lp, tmp, &ptable->pt_hash[i],
-                                        lp_hashlist) {
-                       if (ni != lp->lp_ni)
+                                        lpni_hashlist) {
+                       if (net != lp->lpni_net)
                                continue;
 
-                       if (lp->lp_rtr_refcount == 0)
+                       if (lp->lpni_rtr_refcount == 0)
                                continue;
 
-                       lp_nid = lp->lp_nid;
+                       lpni_nid = lp->lpni_nid;
 
-                       lnet_net_unlock(cpt_locked);
-                       lnet_del_route(LNET_NIDNET(LNET_NID_ANY), lp_nid);
-                       lnet_net_lock(cpt_locked);
+                       lnet_net_unlock(LNET_LOCK_EX);
+                       lnet_del_route(LNET_NIDNET(LNET_NID_ANY), lpni_nid);
+                       lnet_net_lock(LNET_LOCK_EX);
                }
        }
 }
 
 void
-lnet_peer_tables_cleanup(lnet_ni_t *ni)
+lnet_peer_tables_cleanup(struct lnet_net *net)
 {
-       int                     i;
-       struct lnet_peer_table  *ptable;
-       lnet_peer_t             *lp;
-       struct list_head        deathrow;
-
-       INIT_LIST_HEAD(&deathrow);
+       int                             i;
+       struct lnet_peer_table          *ptable;
 
-       LASSERT(the_lnet.ln_shutdown || ni != NULL);
+       LASSERT(the_lnet.ln_shutdown || net != NULL);
        /* If just deleting the peers for a NI, get rid of any routes these
         * peers are gateways for. */
        cfs_percpt_for_each(ptable, i, the_lnet.ln_peer_tables) {
-               lnet_net_lock(i);
-               lnet_peer_table_del_rtrs_locked(ni, ptable, i);
-               lnet_net_unlock(i);
+               lnet_net_lock(LNET_LOCK_EX);
+               lnet_peer_table_del_rtrs_locked(net, ptable);
+               lnet_net_unlock(LNET_LOCK_EX);
        }
 
-       /* Start the process of moving the applicable peers to
-        * deathrow. */
+       /* Start the cleanup process */
        cfs_percpt_for_each(ptable, i, the_lnet.ln_peer_tables) {
-               lnet_net_lock(i);
-               lnet_peer_table_cleanup_locked(ni, ptable);
-               lnet_net_unlock(i);
+               lnet_net_lock(LNET_LOCK_EX);
+               lnet_peer_table_cleanup_locked(net, ptable);
+               lnet_net_unlock(LNET_LOCK_EX);
        }
 
-       /* Cleanup all entries on deathrow. */
-       cfs_percpt_for_each(ptable, i, the_lnet.ln_peer_tables) {
-               lnet_net_lock(i);
-               lnet_peer_table_deathrow_wait_locked(ptable, i);
-               list_splice_init(&ptable->pt_deathrow, &deathrow);
-               lnet_net_unlock(i);
-       }
+       cfs_percpt_for_each(ptable, i, the_lnet.ln_peer_tables)
+               lnet_peer_ni_finalize_wait(ptable);
+}
 
-       while (!list_empty(&deathrow)) {
-               lp = list_entry(deathrow.next, lnet_peer_t, lp_hashlist);
-               list_del(&lp->lp_hashlist);
-               LIBCFS_FREE(lp, sizeof(*lp));
+static struct lnet_peer_ni *
+lnet_get_peer_ni_locked(struct lnet_peer_table *ptable, lnet_nid_t nid)
+{
+       struct list_head        *peers;
+       struct lnet_peer_ni     *lp;
+
+       LASSERT(!the_lnet.ln_shutdown);
+
+       peers = &ptable->pt_hash[lnet_nid2peerhash(nid)];
+       list_for_each_entry(lp, peers, lpni_hashlist) {
+               if (lp->lpni_nid == nid) {
+                       lnet_peer_ni_addref_locked(lp);
+                       return lp;
+               }
        }
+
+       return NULL;
 }
 
-void
-lnet_destroy_peer_locked(lnet_peer_t *lp)
+struct lnet_peer_ni *
+lnet_find_peer_ni_locked(lnet_nid_t nid)
 {
+       struct lnet_peer_ni *lpni;
        struct lnet_peer_table *ptable;
+       int cpt;
 
-       LASSERT(lp->lp_refcount == 0);
-       LASSERT(lp->lp_rtr_refcount == 0);
-       LASSERT(list_empty(&lp->lp_txq));
-       LASSERT(list_empty(&lp->lp_hashlist));
-       LASSERT(lp->lp_txqnob == 0);
-
-       ptable = the_lnet.ln_peer_tables[lp->lp_cpt];
-       LASSERT(ptable->pt_number > 0);
-       ptable->pt_number--;
+       cpt = lnet_nid_cpt_hash(nid, LNET_CPT_NUMBER);
 
-       lnet_ni_decref_locked(lp->lp_ni, lp->lp_cpt);
-       lp->lp_ni = NULL;
+       ptable = the_lnet.ln_peer_tables[cpt];
+       lpni = lnet_get_peer_ni_locked(ptable, nid);
 
-       list_add(&lp->lp_hashlist, &ptable->pt_deathrow);
-       LASSERT(ptable->pt_zombies > 0);
-       ptable->pt_zombies--;
+       return lpni;
 }
 
-lnet_peer_t *
-lnet_find_peer_locked(struct lnet_peer_table *ptable, lnet_nid_t nid)
+struct lnet_peer *
+lnet_find_or_create_peer_locked(lnet_nid_t dst_nid, int cpt)
 {
-       struct list_head *peers;
-       lnet_peer_t      *lp;
+       struct lnet_peer_ni *lpni;
+       struct lnet_peer *lp;
+
+       lpni = lnet_find_peer_ni_locked(dst_nid);
+       if (!lpni) {
+               lpni = lnet_nid2peerni_locked(dst_nid, cpt);
+               if (IS_ERR(lpni))
+                       return ERR_CAST(lpni);
+       }
 
-       LASSERT(!the_lnet.ln_shutdown);
+       lp = lpni->lpni_peer_net->lpn_peer;
+       lnet_peer_ni_decref_locked(lpni);
 
-       peers = &ptable->pt_hash[lnet_nid2peerhash(nid)];
-       list_for_each_entry(lp, peers, lp_hashlist) {
-               if (lp->lp_nid == nid) {
-                       lnet_peer_addref_locked(lp);
-                       return lp;
+       return lp;
+}
+
+struct lnet_peer_ni *
+lnet_get_peer_ni_idx_locked(int idx, struct lnet_peer_net **lpn,
+                           struct lnet_peer **lp)
+{
+       struct lnet_peer_ni     *lpni;
+
+       list_for_each_entry((*lp), &the_lnet.ln_peers, lp_on_lnet_peer_list) {
+               list_for_each_entry((*lpn), &((*lp)->lp_peer_nets), lpn_on_peer_list) {
+                       list_for_each_entry(lpni, &((*lpn)->lpn_peer_nis),
+                                           lpni_on_peer_net_list)
+                               if (idx-- == 0)
+                                       return lpni;
                }
        }
 
        return NULL;
 }
 
-int
-lnet_nid2peer_locked(lnet_peer_t **lpp, lnet_nid_t nid, int cpt)
+struct lnet_peer_ni *
+lnet_get_next_peer_ni_locked(struct lnet_peer *peer,
+                            struct lnet_peer_net *peer_net,
+                            struct lnet_peer_ni *prev)
 {
-       struct lnet_peer_table  *ptable;
-       lnet_peer_t             *lp = NULL;
-       lnet_peer_t             *lp2;
-       int                     cpt2;
-       int                     rc = 0;
+       struct lnet_peer_ni *lpni;
+       struct lnet_peer_net *net = peer_net;
+
+       if (!prev) {
+               if (!net)
+                       net = list_entry(peer->lp_peer_nets.next,
+                                        struct lnet_peer_net,
+                                        lpn_on_peer_list);
+               lpni = list_entry(net->lpn_peer_nis.next, struct lnet_peer_ni,
+                                 lpni_on_peer_net_list);
+
+               return lpni;
+       }
 
-       *lpp = NULL;
-       if (the_lnet.ln_shutdown) /* it's shutting down */
-               return -ESHUTDOWN;
+       if (prev->lpni_on_peer_net_list.next ==
+           &prev->lpni_peer_net->lpn_peer_nis) {
+               /*
+                * if you reached the end of the peer ni list and the peer
+                * net is specified then there are no more peer nis in that
+                * net.
+                */
+               if (net)
+                       return NULL;
+
+               /*
+                * we reached the end of this net ni list. move to the
+                * next net
+                */
+               if (prev->lpni_peer_net->lpn_on_peer_list.next ==
+                   &peer->lp_peer_nets)
+                       /* no more nets and no more NIs. */
+                       return NULL;
+
+               /* get the next net */
+               net = list_entry(prev->lpni_peer_net->lpn_on_peer_list.next,
+                                struct lnet_peer_net,
+                                lpn_on_peer_list);
+               /* get the ni on it */
+               lpni = list_entry(net->lpn_peer_nis.next, struct lnet_peer_ni,
+                                 lpni_on_peer_net_list);
+
+               return lpni;
+       }
 
-       /* cpt can be LNET_LOCK_EX if it's called from router functions */
-       cpt2 = cpt != LNET_LOCK_EX ? cpt : lnet_cpt_of_nid_locked(nid);
+       /* there are more nis left */
+       lpni = list_entry(prev->lpni_on_peer_net_list.next,
+                         struct lnet_peer_ni, lpni_on_peer_net_list);
 
-       ptable = the_lnet.ln_peer_tables[cpt2];
-       lp = lnet_find_peer_locked(ptable, nid);
-       if (lp != NULL) {
-               *lpp = lp;
-               return 0;
+       return lpni;
+}
+
+bool
+lnet_peer_is_ni_pref_locked(struct lnet_peer_ni *lpni, struct lnet_ni *ni)
+{
+       int i;
+
+       for (i = 0; i < lpni->lpni_pref_nnids; i++) {
+               if (lpni->lpni_pref_nids[i] == ni->ni_nid)
+                       return true;
        }
+       return false;
+}
 
-       if (!list_empty(&ptable->pt_deathrow)) {
-               lp = list_entry(ptable->pt_deathrow.next,
-                               lnet_peer_t, lp_hashlist);
-               list_del(&lp->lp_hashlist);
+lnet_nid_t
+lnet_peer_primary_nid(lnet_nid_t nid)
+{
+       struct lnet_peer_ni *lpni;
+       lnet_nid_t primary_nid = nid;
+       int cpt;
+
+       cpt = lnet_net_lock_current();
+       lpni = lnet_find_peer_ni_locked(nid);
+       if (lpni) {
+               primary_nid = lpni->lpni_peer_net->lpn_peer->lp_primary_nid;
+               lnet_peer_ni_decref_locked(lpni);
        }
+       lnet_net_unlock(cpt);
+
+       return primary_nid;
+}
+
+lnet_nid_t
+LNetPrimaryNID(lnet_nid_t nid)
+{
+       struct lnet_peer_ni *lpni;
+       lnet_nid_t primary_nid = nid;
+       int rc = 0;
+       int cpt;
+
+       cpt = lnet_net_lock_current();
+       lpni = lnet_nid2peerni_locked(nid, cpt);
+       if (IS_ERR(lpni)) {
+               rc = PTR_ERR(lpni);
+               goto out_unlock;
+       }
+       primary_nid = lpni->lpni_peer_net->lpn_peer->lp_primary_nid;
+       lnet_peer_ni_decref_locked(lpni);
+out_unlock:
+       lnet_net_unlock(cpt);
+
+       CDEBUG(D_NET, "NID %s primary NID %s rc %d\n", libcfs_nid2str(nid),
+              libcfs_nid2str(primary_nid), rc);
+       return primary_nid;
+}
+EXPORT_SYMBOL(LNetPrimaryNID);
+
+struct lnet_peer_net *
+lnet_peer_get_net_locked(struct lnet_peer *peer, __u32 net_id)
+{
+       struct lnet_peer_net *peer_net;
+       list_for_each_entry(peer_net, &peer->lp_peer_nets, lpn_on_peer_list) {
+               if (peer_net->lpn_net_id == net_id)
+                       return peer_net;
+       }
+       return NULL;
+}
+
+static int
+lnet_peer_setup_hierarchy(struct lnet_peer *lp, struct lnet_peer_ni *lpni,
+                         lnet_nid_t nid)
+{
+       struct lnet_peer_net *lpn = NULL;
+       struct lnet_peer_table *ptable;
+        __u32 net_id = LNET_NIDNET(nid);
 
        /*
-        * take extra refcount in case another thread has shutdown LNet
-        * and destroyed locks and peer-table before I finish the allocation
+        * Create the peer_ni, peer_net, and peer if they don't exist
+        * yet.
         */
-       ptable->pt_number++;
-       lnet_net_unlock(cpt);
+       if (lp) {
+               lpn = lnet_peer_get_net_locked(lp, net_id);
+       } else {
+               lp = lnet_peer_alloc(nid);
+               if (!lp)
+                       goto out_enomem;
+       }
 
-       if (lp != NULL)
-               memset(lp, 0, sizeof(*lp));
-       else
-               LIBCFS_CPT_ALLOC(lp, lnet_cpt_table(), cpt2, sizeof(*lp));
-
-       if (lp == NULL) {
-               rc = -ENOMEM;
-               lnet_net_lock(cpt);
-               goto out;
-       }
-
-       INIT_LIST_HEAD(&lp->lp_txq);
-       INIT_LIST_HEAD(&lp->lp_rtrq);
-       INIT_LIST_HEAD(&lp->lp_routes);
-
-       lp->lp_notify = 0;
-       lp->lp_notifylnd = 0;
-       lp->lp_notifying = 0;
-       lp->lp_alive_count = 0;
-       lp->lp_timestamp = 0;
-       lp->lp_alive = !lnet_peers_start_down(); /* 1 bit!! */
-       lp->lp_last_alive = cfs_time_current(); /* assumes alive */
-       lp->lp_last_query = 0; /* haven't asked NI yet */
-       lp->lp_ping_timestamp = 0;
-       lp->lp_ping_feats = LNET_PING_FEAT_INVAL;
-       lp->lp_nid = nid;
-       lp->lp_cpt = cpt2;
-       lp->lp_refcount = 2;    /* 1 for caller; 1 for hash */
-       lp->lp_rtr_refcount = 0;
+       if (!lpn) {
+               lpn = lnet_peer_net_alloc(net_id);
+               if (!lpn)
+                       goto out_maybe_free_lp;
+       }
 
-       lnet_net_lock(cpt);
+       if (!lpni) {
+               lpni = lnet_peer_ni_alloc(nid);
+               if (!lpni)
+                       goto out_maybe_free_lpn;
+       }
 
-       if (the_lnet.ln_shutdown) {
-               rc = -ESHUTDOWN;
-               goto out;
+       /* Install the new peer_ni */
+       lnet_net_lock(LNET_LOCK_EX);
+       /* Add peer_ni to global peer table hash, if necessary. */
+       if (list_empty(&lpni->lpni_hashlist)) {
+               ptable = the_lnet.ln_peer_tables[lpni->lpni_cpt];
+               list_add_tail(&lpni->lpni_hashlist,
+                             &ptable->pt_hash[lnet_nid2peerhash(nid)]);
+               ptable->pt_version++;
+               atomic_inc(&ptable->pt_number);
+               atomic_inc(&lpni->lpni_refcount);
+       }
+
+       /* Detach the peer_ni from an existing peer, if necessary. */
+       if (lpni->lpni_peer_net && lpni->lpni_peer_net->lpn_peer != lp)
+               lnet_try_destroy_peer_hierarchy_locked(lpni);
+
+       /* Add peer_ni to peer_net */
+       lpni->lpni_peer_net = lpn;
+       list_add_tail(&lpni->lpni_on_peer_net_list, &lpn->lpn_peer_nis);
+
+       /* Add peer_net to peer */
+       if (!lpn->lpn_peer) {
+               lpn->lpn_peer = lp;
+               list_add_tail(&lpn->lpn_on_peer_list, &lp->lp_peer_nets);
+       }
+
+       /* Add peer to global peer list */
+       if (list_empty(&lp->lp_on_lnet_peer_list))
+               list_add_tail(&lp->lp_on_lnet_peer_list, &the_lnet.ln_peers);
+       lnet_net_unlock(LNET_LOCK_EX);
+
+       return 0;
+
+out_maybe_free_lpn:
+       if (list_empty(&lpn->lpn_on_peer_list))
+               LIBCFS_FREE(lpn, sizeof(*lpn));
+out_maybe_free_lp:
+       if (list_empty(&lp->lp_on_lnet_peer_list))
+               LIBCFS_FREE(lp, sizeof(*lp));
+out_enomem:
+       return -ENOMEM;
+}
+
+static int
+lnet_add_prim_lpni(lnet_nid_t nid)
+{
+       int rc;
+       struct lnet_peer *peer;
+       struct lnet_peer_ni *lpni;
+
+       LASSERT(nid != LNET_NID_ANY);
+
+       /*
+        * lookup the NID and its peer
+        *  if the peer doesn't exist, create it.
+        *  if this is a non-MR peer then change its state to MR and exit.
+        *  if this is an MR peer and it's a primary NI: NO-OP.
+        *  if this is an MR peer and it's not a primary NI. Operation not
+        *     allowed.
+        *
+        * The adding and deleting of peer nis is being serialized through
+        * the api_mutex. So we can look up peers with the mutex locked
+        * safely. Only when we need to change the ptable, do we need to
+        * exclusively lock the lnet_net_lock()
+        */
+       lpni = lnet_find_peer_ni_locked(nid);
+       if (!lpni) {
+               rc = lnet_peer_setup_hierarchy(NULL, NULL, nid);
+               if (rc != 0)
+                       return rc;
+               lpni = lnet_find_peer_ni_locked(nid);
        }
 
-       lp2 = lnet_find_peer_locked(ptable, nid);
-       if (lp2 != NULL) {
-               *lpp = lp2;
-               goto out;
+       LASSERT(lpni);
+
+       lnet_peer_ni_decref_locked(lpni);
+
+       peer = lpni->lpni_peer_net->lpn_peer;
+
+       /*
+        * If we found a lpni with the same nid as the NID we're trying to
+        * create, then we're trying to create an already existing lpni 
+        * that belongs to a different peer
+        */
+       if (peer->lp_primary_nid != nid)
+               return -EEXIST;
+
+       /*
+        * if we found an lpni that is not a multi-rail, which could occur
+        * if lpni is already created as a non-mr lpni or we just created
+        * it, then make sure you indicate that this lpni is a primary mr
+        * capable peer.
+        *
+        * TODO: update flags if necessary
+        */
+       if (!peer->lp_multi_rail && peer->lp_primary_nid == nid)
+               peer->lp_multi_rail = true;
+
+       return rc;
+}
+
+static int
+lnet_add_peer_ni_to_prim_lpni(lnet_nid_t prim_nid, lnet_nid_t nid)
+{
+       struct lnet_peer *peer, *primary_peer;
+       struct lnet_peer_ni *lpni = NULL, *klpni = NULL;
+
+       LASSERT(prim_nid != LNET_NID_ANY && nid != LNET_NID_ANY);
+
+       /*
+        * key nid must be created by this point. If not then this
+        * operation is not permitted
+        */
+       klpni = lnet_find_peer_ni_locked(prim_nid);
+       if (!klpni)
+               return -ENOENT;
+
+       lnet_peer_ni_decref_locked(klpni);
+
+       primary_peer = klpni->lpni_peer_net->lpn_peer;
+
+       lpni = lnet_find_peer_ni_locked(nid);
+       if (lpni) {
+               lnet_peer_ni_decref_locked(lpni);
+
+               peer = lpni->lpni_peer_net->lpn_peer;
+               /*
+                * lpni already exists in the system but it belongs to
+                * a different peer. We can't re-add it
+                */
+               if (peer->lp_primary_nid != prim_nid && peer->lp_multi_rail) {
+                       CERROR("Cannot add NID %s owned by peer %s to peer %s\n",
+                              libcfs_nid2str(lpni->lpni_nid),
+                              libcfs_nid2str(peer->lp_primary_nid),
+                              libcfs_nid2str(prim_nid));
+                       return -EEXIST;
+               } else if (peer->lp_primary_nid == prim_nid) {
+                       /*
+                        * found a peer_ni that is already part of the
+                        * peer. This is a no-op operation.
+                        */
+                       return 0;
+               }
+
+               /*
+                * TODO: else if (peer->lp_primary_nid != prim_nid &&
+                *                !peer->lp_multi_rail)
+                * peer is not an MR peer and it will be moved in the next
+                * step to klpni, so update its flags accordingly.
+                * lnet_move_peer_ni()
+                */
+
+               /*
+                * TODO: call lnet_update_peer() from here to update the
+                * flags. This is the case when the lpni you're trying to
+                * add is already part of the peer. This could've been
+                * added by the DD previously, so go ahead and do any
+                * updates to the state if necessary
+                */
+
        }
 
-       lp->lp_ni = lnet_net2ni_locked(LNET_NIDNET(nid), cpt2);
-       if (lp->lp_ni == NULL) {
-               rc = -EHOSTUNREACH;
-               goto out;
+       /*
+        * When we get here we either have found an existing lpni, which
+        * we can switch to the new peer. Or we need to create one and
+        * add it to the new peer
+        */
+       return lnet_peer_setup_hierarchy(primary_peer, lpni, nid);
+}
+
+/*
+ * lpni creation initiated due to traffic either sending or receiving.
+ */
+static int
+lnet_peer_ni_traffic_add(lnet_nid_t nid)
+{
+       struct lnet_peer_ni *lpni;
+       int rc = 0;
+
+       if (nid == LNET_NID_ANY)
+               return -EINVAL;
+
+       /* lnet_net_lock is not needed here because ln_api_lock is held */
+       lpni = lnet_find_peer_ni_locked(nid);
+       if (lpni) {
+               /*
+                * TODO: lnet_update_primary_nid() but not all of it
+                * only indicate if we're converting this to MR capable
+                * Can happen due to DD
+                */
+               lnet_peer_ni_decref_locked(lpni);
+       } else {
+               rc = lnet_peer_setup_hierarchy(NULL, NULL, nid);
        }
 
-       lp->lp_txcredits    =
-       lp->lp_mintxcredits = lp->lp_ni->ni_peertxcredits;
-       lp->lp_rtrcredits    =
-       lp->lp_minrtrcredits = lnet_peer_buffer_credits(lp->lp_ni);
+       return rc;
 
-       list_add_tail(&lp->lp_hashlist,
-                     &ptable->pt_hash[lnet_nid2peerhash(nid)]);
-       ptable->pt_version++;
-       *lpp = lp;
+}
+
+static int
+lnet_peer_ni_add_non_mr(lnet_nid_t nid)
+{
+       struct lnet_peer_ni *lpni;
+
+       lpni = lnet_find_peer_ni_locked(nid);
+       if (lpni) {
+               CERROR("Cannot add %s as non-mr when it already exists\n",
+                      libcfs_nid2str(nid));
+               lnet_peer_ni_decref_locked(lpni);
+               return -EEXIST;
+       }
+
+       return lnet_peer_setup_hierarchy(NULL, NULL, nid);
+}
+
+/*
+ * This API handles the following combinations:
+ *     Create a primary NI if only the prim_nid is provided
+ *     Create or add an lpni to a primary NI. Primary NI must've already
+ *     been created
+ *     Create a non-MR peer.
+ */
+int
+lnet_add_peer_ni_to_peer(lnet_nid_t prim_nid, lnet_nid_t nid, bool mr)
+{
+       /*
+        * The caller is trying to set up an MR-like peer hierarchy but
+        * specifying it to be non-MR. This is not allowed.
+        */
+       if (prim_nid != LNET_NID_ANY &&
+           nid != LNET_NID_ANY && !mr)
+               return -EPERM;
+
+       /* Add the primary NID of a peer */
+       if (prim_nid != LNET_NID_ANY &&
+           nid == LNET_NID_ANY && mr)
+               return lnet_add_prim_lpni(prim_nid);
+
+       /* Add a NID to an existing peer */
+       if (prim_nid != LNET_NID_ANY &&
+           nid != LNET_NID_ANY && mr)
+               return lnet_add_peer_ni_to_prim_lpni(prim_nid, nid);
+
+       /* Add a non-MR peer NI */
+       if (((prim_nid != LNET_NID_ANY &&
+             nid == LNET_NID_ANY) ||
+            (prim_nid == LNET_NID_ANY &&
+             nid != LNET_NID_ANY)) && !mr)
+               return lnet_peer_ni_add_non_mr(prim_nid != LNET_NID_ANY ?
+                                                        prim_nid : nid);
 
        return 0;
-out:
-       if (lp != NULL)
-               list_add(&lp->lp_hashlist, &ptable->pt_deathrow);
-       ptable->pt_number--;
+}
+
+int
+lnet_del_peer_ni_from_peer(lnet_nid_t prim_nid, lnet_nid_t nid)
+{
+       lnet_nid_t local_nid;
+       struct lnet_peer *peer;
+       struct lnet_peer_ni *lpni;
+       int rc;
+
+       if (prim_nid == LNET_NID_ANY)
+               return -EINVAL;
+
+       local_nid = (nid != LNET_NID_ANY) ? nid : prim_nid;
+
+       lpni = lnet_find_peer_ni_locked(local_nid);
+       if (!lpni)
+               return -EINVAL;
+       lnet_peer_ni_decref_locked(lpni);
+
+       peer = lpni->lpni_peer_net->lpn_peer;
+       LASSERT(peer != NULL);
+
+       if (peer->lp_primary_nid == lpni->lpni_nid) {
+               /*
+                * deleting the primary ni is equivalent to deleting the
+                * entire peer
+                */
+               lnet_net_lock(LNET_LOCK_EX);
+               rc = lnet_peer_del_locked(peer);
+               lnet_net_unlock(LNET_LOCK_EX);
+
+               return rc;
+       }
+
+       lnet_net_lock(LNET_LOCK_EX);
+       rc = lnet_peer_ni_del_locked(lpni);
+       lnet_net_unlock(LNET_LOCK_EX);
+
        return rc;
 }
 
 void
+lnet_destroy_peer_ni_locked(struct lnet_peer_ni *lpni)
+{
+       struct lnet_peer_table *ptable;
+
+       LASSERT(atomic_read(&lpni->lpni_refcount) == 0);
+       LASSERT(lpni->lpni_rtr_refcount == 0);
+       LASSERT(list_empty(&lpni->lpni_txq));
+       LASSERT(lpni->lpni_txqnob == 0);
+
+       lpni->lpni_net = NULL;
+
+       /* remove the peer ni from the zombie list */
+       ptable = the_lnet.ln_peer_tables[lpni->lpni_cpt];
+       spin_lock(&ptable->pt_zombie_lock);
+       list_del_init(&lpni->lpni_hashlist);
+       ptable->pt_zombies--;
+       spin_unlock(&ptable->pt_zombie_lock);
+
+       LIBCFS_FREE(lpni, sizeof(*lpni));
+}
+
+struct lnet_peer_ni *
+lnet_nid2peerni_ex(lnet_nid_t nid, int cpt)
+{
+       struct lnet_peer_ni *lpni = NULL;
+       int rc;
+
+       if (the_lnet.ln_shutdown) /* it's shutting down */
+               return ERR_PTR(-ESHUTDOWN);
+
+       /*
+        * find if a peer_ni already exists.
+        * If so then just return that.
+        */
+       lpni = lnet_find_peer_ni_locked(nid);
+       if (lpni)
+               return lpni;
+
+       lnet_net_unlock(cpt);
+
+       rc = lnet_peer_ni_traffic_add(nid);
+       if (rc) {
+               lpni = ERR_PTR(rc);
+               goto out_net_relock;
+       }
+
+       lpni = lnet_find_peer_ni_locked(nid);
+       LASSERT(lpni);
+
+out_net_relock:
+       lnet_net_lock(cpt);
+
+       return lpni;
+}
+
+struct lnet_peer_ni *
+lnet_nid2peerni_locked(lnet_nid_t nid, int cpt)
+{
+       struct lnet_peer_ni *lpni = NULL;
+       int rc;
+
+       if (the_lnet.ln_shutdown) /* it's shutting down */
+               return ERR_PTR(-ESHUTDOWN);
+
+       /*
+        * find if a peer_ni already exists.
+        * If so then just return that.
+        */
+       lpni = lnet_find_peer_ni_locked(nid);
+       if (lpni)
+               return lpni;
+
+       /*
+        * Slow path:
+        * use the lnet_api_mutex to serialize the creation of the peer_ni
+        * and the creation/deletion of the local ni/net. When a local ni is
+        * created, if there exists a set of peer_nis on that network,
+        * they need to be traversed and updated. When a local NI is
+        * deleted, which could result in a network being deleted, then
+        * all peer nis on that network need to be removed as well.
+        *
+        * Creation through traffic should also be serialized with
+        * creation through DLC.
+        */
+       lnet_net_unlock(cpt);
+       mutex_lock(&the_lnet.ln_api_mutex);
+       /*
+        * Shutdown is only set while holding the ln_api_mutex, so a
+        * single check here is sufficient.
+        */
+       if (the_lnet.ln_shutdown) {
+               lpni = ERR_PTR(-ESHUTDOWN);
+               goto out_mutex_unlock;
+       }
+
+       rc = lnet_peer_ni_traffic_add(nid);
+       if (rc) {
+               lpni = ERR_PTR(rc);
+               goto out_mutex_unlock;
+       }
+
+       lpni = lnet_find_peer_ni_locked(nid);
+       LASSERT(lpni);
+
+out_mutex_unlock:
+       mutex_unlock(&the_lnet.ln_api_mutex);
+       lnet_net_lock(cpt);
+
+       return lpni;
+}
+
+void
 lnet_debug_peer(lnet_nid_t nid)
 {
-       char            *aliveness = "NA";
-       lnet_peer_t     *lp;
-       int             rc;
-       int             cpt;
+       char                    *aliveness = "NA";
+       struct lnet_peer_ni     *lp;
+       int                     cpt;
 
-       cpt = lnet_cpt_of_nid(nid);
+       cpt = lnet_cpt_of_nid(nid, NULL);
        lnet_net_lock(cpt);
 
-       rc = lnet_nid2peer_locked(&lp, nid, cpt);
-       if (rc != 0) {
+       lp = lnet_nid2peerni_locked(nid, cpt);
+       if (IS_ERR(lp)) {
                lnet_net_unlock(cpt);
                CDEBUG(D_WARNING, "No peer %s\n", libcfs_nid2str(nid));
                return;
        }
 
        if (lnet_isrouter(lp) || lnet_peer_aliveness_enabled(lp))
-               aliveness = lp->lp_alive ? "up" : "down";
+               aliveness = lp->lpni_alive ? "up" : "down";
 
        CDEBUG(D_WARNING, "%-24s %4d %5s %5d %5d %5d %5d %5d %ld\n",
-              libcfs_nid2str(lp->lp_nid), lp->lp_refcount,
-              aliveness, lp->lp_ni->ni_peertxcredits,
-              lp->lp_rtrcredits, lp->lp_minrtrcredits,
-              lp->lp_txcredits, lp->lp_mintxcredits, lp->lp_txqnob);
+              libcfs_nid2str(lp->lpni_nid), atomic_read(&lp->lpni_refcount),
+              aliveness, lp->lpni_net->net_tunables.lct_peer_tx_credits,
+              lp->lpni_rtrcredits, lp->lpni_minrtrcredits,
+              lp->lpni_txcredits, lp->lpni_mintxcredits, lp->lpni_txqnob);
 
-       lnet_peer_decref_locked(lp);
+       lnet_peer_ni_decref_locked(lp);
 
        lnet_net_unlock(cpt);
 }
 
-int lnet_get_peer_info(__u32 peer_index, __u64 *nid,
-                      char aliveness[LNET_MAX_STR_LEN],
-                      __u32 *cpt_iter, __u32 *refcount,
-                      __u32 *ni_peer_tx_credits, __u32 *peer_tx_credits,
-                      __u32 *peer_rtr_credits, __u32 *peer_min_rtr_credits,
-                      __u32 *peer_tx_qnob)
+int lnet_get_peer_ni_info(__u32 peer_index, __u64 *nid,
+                         char aliveness[LNET_MAX_STR_LEN],
+                         __u32 *cpt_iter, __u32 *refcount,
+                         __u32 *ni_peer_tx_credits, __u32 *peer_tx_credits,
+                         __u32 *peer_rtr_credits, __u32 *peer_min_rtr_credits,
+                         __u32 *peer_tx_qnob)
 {
-       struct lnet_peer_table  *peer_table;
-       lnet_peer_t             *lp;
-       int                     j;
-       int                     lncpt;
-       bool                    found = false;
+       struct lnet_peer_table          *peer_table;
+       struct lnet_peer_ni             *lp;
+       int                             j;
+       int                             lncpt;
+       bool                            found = false;
 
        /* get the number of CPTs */
        lncpt = cfs_percpt_number(the_lnet.ln_peer_tables);
@@ -419,7 +1142,7 @@ int lnet_get_peer_info(__u32 peer_index, __u64 *nid,
        for (j = 0; j < LNET_PEER_HASH_SIZE && !found; j++) {
                struct list_head *peers = &peer_table->pt_hash[j];
 
-               list_for_each_entry(lp, peers, lp_hashlist) {
+               list_for_each_entry(lp, peers, lpni_hashlist) {
                        if (peer_index-- > 0)
                                continue;
 
@@ -427,15 +1150,16 @@ int lnet_get_peer_info(__u32 peer_index, __u64 *nid,
                        if (lnet_isrouter(lp) ||
                                lnet_peer_aliveness_enabled(lp))
                                snprintf(aliveness, LNET_MAX_STR_LEN,
-                                        lp->lp_alive ? "up" : "down");
+                                        lp->lpni_alive ? "up" : "down");
 
-                       *nid = lp->lp_nid;
-                       *refcount = lp->lp_refcount;
-                       *ni_peer_tx_credits = lp->lp_ni->ni_peertxcredits;
-                       *peer_tx_credits = lp->lp_txcredits;
-                       *peer_rtr_credits = lp->lp_rtrcredits;
-                       *peer_min_rtr_credits = lp->lp_mintxcredits;
-                       *peer_tx_qnob = lp->lp_txqnob;
+                       *nid = lp->lpni_nid;
+                       *refcount = atomic_read(&lp->lpni_refcount);
+                       *ni_peer_tx_credits =
+                               lp->lpni_net->net_tunables.lct_peer_tx_credits;
+                       *peer_tx_credits = lp->lpni_txcredits;
+                       *peer_rtr_credits = lp->lpni_rtrcredits;
+                       *peer_min_rtr_credits = lp->lpni_mintxcredits;
+                       *peer_tx_qnob = lp->lpni_txqnob;
 
                        found = true;
                }
@@ -447,3 +1171,41 @@ int lnet_get_peer_info(__u32 peer_index, __u64 *nid,
 
        return found ? 0 : -ENOENT;
 }
+
+int lnet_get_peer_info(__u32 idx, lnet_nid_t *primary_nid, lnet_nid_t *nid,
+                      bool *mr, struct lnet_peer_ni_credit_info *peer_ni_info,
+                      struct lnet_ioctl_element_stats *peer_ni_stats)
+{
+       struct lnet_peer_ni *lpni = NULL;
+       struct lnet_peer_net *lpn = NULL;
+       struct lnet_peer *lp = NULL;
+
+       lpni = lnet_get_peer_ni_idx_locked(idx, &lpn, &lp);
+
+       if (!lpni)
+               return -ENOENT;
+
+       *primary_nid = lp->lp_primary_nid;
+       *mr = lp->lp_multi_rail;
+       *nid = lpni->lpni_nid;
+       snprintf(peer_ni_info->cr_aliveness, LNET_MAX_STR_LEN, "NA");
+       if (lnet_isrouter(lpni) ||
+               lnet_peer_aliveness_enabled(lpni))
+               snprintf(peer_ni_info->cr_aliveness, LNET_MAX_STR_LEN,
+                        lpni->lpni_alive ? "up" : "down");
+
+       peer_ni_info->cr_refcount = atomic_read(&lpni->lpni_refcount);
+       peer_ni_info->cr_ni_peer_tx_credits = (lpni->lpni_net != NULL) ?
+               lpni->lpni_net->net_tunables.lct_peer_tx_credits : 0;
+       peer_ni_info->cr_peer_tx_credits = lpni->lpni_txcredits;
+       peer_ni_info->cr_peer_rtr_credits = lpni->lpni_rtrcredits;
+       peer_ni_info->cr_peer_min_rtr_credits = lpni->lpni_minrtrcredits;
+       peer_ni_info->cr_peer_min_tx_credits = lpni->lpni_mintxcredits;
+       peer_ni_info->cr_peer_tx_qnob = lpni->lpni_txqnob;
+
+       peer_ni_stats->send_count = atomic_read(&lpni->lpni_stats.send_count);
+       peer_ni_stats->recv_count = atomic_read(&lpni->lpni_stats.recv_count);
+       peer_ni_stats->drop_count = atomic_read(&lpni->lpni_stats.drop_count);
+
+       return 0;
+}
index 3ae2ba3..502ff84 100644 (file)
@@ -55,17 +55,17 @@ module_param(auto_down, int, 0444);
 MODULE_PARM_DESC(auto_down, "Automatically mark peers down on comms error");
 
 int
-lnet_peer_buffer_credits(lnet_ni_t *ni)
+lnet_peer_buffer_credits(struct lnet_net *net)
 {
        /* NI option overrides LNet default */
-       if (ni->ni_peerrtrcredits > 0)
-               return ni->ni_peerrtrcredits;
+       if (net->net_tunables.lct_peer_rtr_credits > 0)
+               return net->net_tunables.lct_peer_rtr_credits;
        if (peer_buffer_credits > 0)
                return peer_buffer_credits;
 
        /* As an approximation, allow this peer the same number of router
         * buffers as it is allowed outstanding sends */
-       return ni->ni_peertxcredits;
+       return net->net_tunables.lct_peer_tx_credits;
 }
 
 /* forward ref's */
@@ -98,125 +98,152 @@ lnet_peers_start_down(void)
 }
 
 void
-lnet_notify_locked(lnet_peer_t *lp, int notifylnd, int alive, cfs_time_t when)
+lnet_notify_locked(struct lnet_peer_ni *lp, int notifylnd, int alive,
+                  cfs_time_t when)
 {
-       if (cfs_time_before(when, lp->lp_timestamp)) { /* out of date information */
+       if (cfs_time_before(when, lp->lpni_timestamp)) { /* out of date information */
                CDEBUG(D_NET, "Out of date\n");
                return;
        }
 
-       lp->lp_timestamp = when;                /* update timestamp */
-       lp->lp_ping_deadline = 0;               /* disable ping timeout */
+       /*
+        * This function can be called with different cpt locks being
+        * held. lpni_alive_count modification needs to be properly protected.
+        * Significant reads of lpni_alive_count are also protected with
+        * the same lock.
+        */
+       spin_lock(&lp->lpni_lock);
 
-       if (lp->lp_alive_count != 0 &&          /* got old news */
-           (!lp->lp_alive) == (!alive)) {      /* new date for old news */
+       lp->lpni_timestamp = when;                /* update timestamp */
+       lp->lpni_ping_deadline = 0;               /* disable ping timeout */
+
+       if (lp->lpni_alive_count != 0 &&          /* got old news */
+           (!lp->lpni_alive) == (!alive)) {      /* new date for old news */
+               spin_unlock(&lp->lpni_lock);
                CDEBUG(D_NET, "Old news\n");
                return;
        }
 
        /* Flag that notification is outstanding */
 
-       lp->lp_alive_count++;
-       lp->lp_alive = !(!alive);               /* 1 bit! */
-       lp->lp_notify = 1;
-       lp->lp_notifylnd |= notifylnd;
-       if (lp->lp_alive)
-               lp->lp_ping_feats = LNET_PING_FEAT_INVAL; /* reset */
+       lp->lpni_alive_count++;
+       lp->lpni_alive = (alive) ? 1 : 0;
+       lp->lpni_notify = 1;
+       lp->lpni_notifylnd = notifylnd;
+       if (lp->lpni_alive)
+               lp->lpni_ping_feats = LNET_PING_FEAT_INVAL; /* reset */
+
+       spin_unlock(&lp->lpni_lock);
 
-       CDEBUG(D_NET, "set %s %d\n", libcfs_nid2str(lp->lp_nid), alive);
+       CDEBUG(D_NET, "set %s %d\n", libcfs_nid2str(lp->lpni_nid), alive);
 }
 
+/*
+ * This function will always be called with lp->lpni_cpt lock held.
+ */
 static void
-lnet_ni_notify_locked(lnet_ni_t *ni, lnet_peer_t *lp)
+lnet_ni_notify_locked(lnet_ni_t *ni, struct lnet_peer_ni *lp)
 {
-       int        alive;
-       int        notifylnd;
+       int alive;
+       int notifylnd;
 
        /* Notify only in 1 thread at any time to ensure ordered notification.
         * NB individual events can be missed; the only guarantee is that you
         * always get the most recent news */
 
-       if (lp->lp_notifying || ni == NULL)
+       spin_lock(&lp->lpni_lock);
+
+       if (lp->lpni_notifying || ni == NULL) {
+               spin_unlock(&lp->lpni_lock);
                return;
+       }
 
-       lp->lp_notifying = 1;
+       lp->lpni_notifying = 1;
 
-       while (lp->lp_notify) {
-               alive     = lp->lp_alive;
-               notifylnd = lp->lp_notifylnd;
+       /*
+        * lp->lpni_notify needs to be protected because it can be set in
+        * lnet_notify_locked().
+        */
+       while (lp->lpni_notify) {
+               alive     = lp->lpni_alive;
+               notifylnd = lp->lpni_notifylnd;
 
-               lp->lp_notifylnd = 0;
-               lp->lp_notify    = 0;
+               lp->lpni_notifylnd = 0;
+               lp->lpni_notify    = 0;
 
-               if (notifylnd && ni->ni_lnd->lnd_notify != NULL) {
-                       lnet_net_unlock(lp->lp_cpt);
+               if (notifylnd && ni->ni_net->net_lnd->lnd_notify != NULL) {
+                       spin_unlock(&lp->lpni_lock);
+                       lnet_net_unlock(lp->lpni_cpt);
 
                        /* A new notification could happen now; I'll handle it
                         * when control returns to me */
 
-                       (ni->ni_lnd->lnd_notify)(ni, lp->lp_nid, alive);
+                       (ni->ni_net->net_lnd->lnd_notify)(ni, lp->lpni_nid,
+                                                         alive);
 
-                       lnet_net_lock(lp->lp_cpt);
+                       lnet_net_lock(lp->lpni_cpt);
+                       spin_lock(&lp->lpni_lock);
                }
        }
 
-       lp->lp_notifying = 0;
+       lp->lpni_notifying = 0;
+       spin_unlock(&lp->lpni_lock);
 }
 
-
 static void
-lnet_rtr_addref_locked(lnet_peer_t *lp)
+lnet_rtr_addref_locked(struct lnet_peer_ni *lp)
 {
-       LASSERT(lp->lp_refcount > 0);
-       LASSERT(lp->lp_rtr_refcount >= 0);
+       LASSERT(atomic_read(&lp->lpni_refcount) > 0);
+       LASSERT(lp->lpni_rtr_refcount >= 0);
 
        /* lnet_net_lock must be exclusively locked */
-       lp->lp_rtr_refcount++;
-       if (lp->lp_rtr_refcount == 1) {
+       lp->lpni_rtr_refcount++;
+       if (lp->lpni_rtr_refcount == 1) {
                struct list_head *pos;
 
                /* a simple insertion sort */
                list_for_each_prev(pos, &the_lnet.ln_routers) {
-                       lnet_peer_t *rtr = list_entry(pos, lnet_peer_t,
-                                                     lp_rtr_list);
+                       struct lnet_peer_ni *rtr =
+                         list_entry(pos, struct lnet_peer_ni,
+                                    lpni_rtr_list);
 
-                       if (rtr->lp_nid < lp->lp_nid)
+                       if (rtr->lpni_nid < lp->lpni_nid)
                                break;
                }
 
-               list_add(&lp->lp_rtr_list, pos);
+               list_add(&lp->lpni_rtr_list, pos);
                /* addref for the_lnet.ln_routers */
-               lnet_peer_addref_locked(lp);
+               lnet_peer_ni_addref_locked(lp);
                the_lnet.ln_routers_version++;
        }
 }
 
 static void
-lnet_rtr_decref_locked(lnet_peer_t *lp)
+lnet_rtr_decref_locked(struct lnet_peer_ni *lp)
 {
-       LASSERT(lp->lp_refcount > 0);
-       LASSERT(lp->lp_rtr_refcount > 0);
+       LASSERT(atomic_read(&lp->lpni_refcount) > 0);
+       LASSERT(lp->lpni_rtr_refcount > 0);
 
        /* lnet_net_lock must be exclusively locked */
-       lp->lp_rtr_refcount--;
-       if (lp->lp_rtr_refcount == 0) {
-               LASSERT(list_empty(&lp->lp_routes));
+       lp->lpni_rtr_refcount--;
+       if (lp->lpni_rtr_refcount == 0) {
+               LASSERT(list_empty(&lp->lpni_routes));
 
-               if (lp->lp_rcd != NULL) {
-                       list_add(&lp->lp_rcd->rcd_list,
+               if (lp->lpni_rcd != NULL) {
+                       list_add(&lp->lpni_rcd->rcd_list,
                                 &the_lnet.ln_rcd_deathrow);
-                       lp->lp_rcd = NULL;
+                       lp->lpni_rcd = NULL;
                }
 
-               list_del(&lp->lp_rtr_list);
+               list_del(&lp->lpni_rtr_list);
                /* decref for the_lnet.ln_routers */
-               lnet_peer_decref_locked(lp);
+               lnet_peer_ni_decref_locked(lp);
                the_lnet.ln_routers_version++;
        }
 }
 
 lnet_remotenet_t *
-lnet_find_net_locked (__u32 net)
+lnet_find_rnet_locked(__u32 net)
 {
        lnet_remotenet_t *rnet;
        struct list_head *tmp;
@@ -240,8 +267,7 @@ static void lnet_shuffle_seed(void)
        __u32 lnd_type;
        __u32 seed[2];
        struct timespec64 ts;
-       lnet_ni_t *ni;
-       struct list_head *tmp;
+       lnet_ni_t *ni = NULL;
 
        if (seeded)
                return;
@@ -250,8 +276,7 @@ static void lnet_shuffle_seed(void)
 
        /* Nodes with small feet have little entropy
         * the NID for this node gives the most entropy in the low bits */
-       list_for_each(tmp, &the_lnet.ln_nis) {
-               ni = list_entry(tmp, lnet_ni_t, ni_list);
+       while ((ni = lnet_get_next_ni_locked(NULL, ni))) {
                lnd_type = LNET_NETTYP(LNET_NIDNET(ni->ni_nid));
 
                if (lnd_type != LOLND)
@@ -286,7 +311,7 @@ lnet_add_route_to_rnet(lnet_remotenet_t *rnet, lnet_route_t *route)
                offset--;
        }
        list_add(&route->lr_list, e);
-       list_add(&route->lr_gwlist, &route->lr_gateway->lp_routes);
+       list_add(&route->lr_gwlist, &route->lr_gateway->lpni_routes);
 
        the_lnet.ln_remote_nets_version++;
        lnet_rtr_addref_locked(route->lr_gateway);
@@ -301,6 +326,7 @@ lnet_add_route(__u32 net, __u32 hops, lnet_nid_t gateway,
        lnet_remotenet_t        *rnet2;
        lnet_route_t            *route;
        lnet_ni_t               *ni;
+       struct lnet_peer_ni     *lpni;
        int                     add_route;
        int                     rc;
 
@@ -339,13 +365,14 @@ lnet_add_route(__u32 net, __u32 hops, lnet_nid_t gateway,
 
        lnet_net_lock(LNET_LOCK_EX);
 
-       rc = lnet_nid2peer_locked(&route->lr_gateway, gateway, LNET_LOCK_EX);
-       if (rc != 0) {
+       lpni = lnet_nid2peerni_ex(gateway, LNET_LOCK_EX);
+       if (IS_ERR(lpni)) {
                lnet_net_unlock(LNET_LOCK_EX);
 
                LIBCFS_FREE(route, sizeof(*route));
                LIBCFS_FREE(rnet, sizeof(*rnet));
 
+               rc = PTR_ERR(lpni);
                if (rc == -EHOSTUNREACH) /* gateway is not on a local net. */
                        return rc;       /* ignore the route entry */
                CERROR("Error %d creating route %s %d %s\n", rc,
@@ -353,10 +380,10 @@ lnet_add_route(__u32 net, __u32 hops, lnet_nid_t gateway,
                        libcfs_nid2str(gateway));
                return rc;
        }
-
+       route->lr_gateway = lpni;
        LASSERT(!the_lnet.ln_shutdown);
 
-       rnet2 = lnet_find_net_locked(net);
+       rnet2 = lnet_find_rnet_locked(net);
        if (rnet2 == NULL) {
                /* new network */
                list_add_tail(&rnet->lrn_list, lnet_net2rnethash(net));
@@ -374,25 +401,25 @@ lnet_add_route(__u32 net, __u32 hops, lnet_nid_t gateway,
                }
 
                /* our lookups must be true */
-               LASSERT(route2->lr_gateway->lp_nid != gateway);
+               LASSERT(route2->lr_gateway->lpni_nid != gateway);
        }
 
        if (add_route) {
-               lnet_peer_addref_locked(route->lr_gateway); /* +1 for notify */
+               lnet_peer_ni_addref_locked(route->lr_gateway); /* +1 for notify */
                lnet_add_route_to_rnet(rnet2, route);
 
-               ni = route->lr_gateway->lp_ni;
+               ni = lnet_get_next_ni_locked(route->lr_gateway->lpni_net, NULL);
                lnet_net_unlock(LNET_LOCK_EX);
 
                /* XXX Assume alive */
-               if (ni->ni_lnd->lnd_notify != NULL)
-                       (ni->ni_lnd->lnd_notify)(ni, gateway, 1);
+               if (ni->ni_net->net_lnd->lnd_notify != NULL)
+                       (ni->ni_net->net_lnd->lnd_notify)(ni, gateway, 1);
 
                lnet_net_lock(LNET_LOCK_EX);
        }
 
        /* -1 for notify or !add_route */
-       lnet_peer_decref_locked(route->lr_gateway);
+       lnet_peer_ni_decref_locked(route->lr_gateway);
        lnet_net_unlock(LNET_LOCK_EX);
 
        rc = 0;
@@ -444,12 +471,12 @@ lnet_check_routes(void)
                                        continue;
                                }
 
-                               if (route->lr_gateway->lp_ni ==
-                                   route2->lr_gateway->lp_ni)
+                               if (route->lr_gateway->lpni_net ==
+                                   route2->lr_gateway->lpni_net)
                                        continue;
 
-                               nid1 = route->lr_gateway->lp_nid;
-                               nid2 = route2->lr_gateway->lp_nid;
+                               nid1 = route->lr_gateway->lpni_nid;
+                               nid2 = route2->lr_gateway->lpni_nid;
                                net = rnet->lrn_net;
 
                                lnet_net_unlock(cpt);
@@ -471,7 +498,7 @@ lnet_check_routes(void)
 int
 lnet_del_route(__u32 net, lnet_nid_t gw_nid)
 {
-       struct lnet_peer        *gateway;
+       struct lnet_peer_ni     *gateway;
        lnet_remotenet_t        *rnet;
        lnet_route_t            *route;
        struct list_head        *e1;
@@ -505,7 +532,7 @@ again:
 
                        gateway = route->lr_gateway;
                        if (!(gw_nid == LNET_NID_ANY ||
-                             gw_nid == gateway->lp_nid))
+                             gw_nid == gateway->lpni_nid))
                                continue;
 
                        list_del(&route->lr_list);
@@ -518,7 +545,7 @@ again:
                                rnet = NULL;
 
                        lnet_rtr_decref_locked(gateway);
-                       lnet_peer_decref_locked(gateway);
+                       lnet_peer_ni_decref_locked(gateway);
 
                        lnet_net_unlock(LNET_LOCK_EX);
 
@@ -608,7 +635,7 @@ lnet_get_route(int idx, __u32 *net, __u32 *hops,
                                        *net      = rnet->lrn_net;
                                        *hops     = route->lr_hops;
                                        *priority = route->lr_priority;
-                                       *gateway  = route->lr_gateway->lp_nid;
+                                       *gateway  = route->lr_gateway->lpni_nid;
                                        *alive    = lnet_is_route_alive(route);
                                        lnet_net_unlock(cpt);
                                        return 0;
@@ -647,39 +674,49 @@ static void
 lnet_parse_rc_info(lnet_rc_data_t *rcd)
 {
        struct lnet_ping_info   *info = rcd->rcd_pinginfo;
-       struct lnet_peer        *gw   = rcd->rcd_gateway;
+       struct lnet_peer_ni     *gw   = rcd->rcd_gateway;
        lnet_route_t            *rte;
 
-       if (!gw->lp_alive)
+       if (!gw->lpni_alive)
                return;
 
+       /*
+        * Protect gw->lpni_ping_feats. This can be set from
+        * lnet_notify_locked with different locks being held
+        */
+       spin_lock(&gw->lpni_lock);
+
        if (info->pi_magic == __swab32(LNET_PROTO_PING_MAGIC))
                lnet_swap_pinginfo(info);
 
        /* NB always racing with network! */
        if (info->pi_magic != LNET_PROTO_PING_MAGIC) {
                CDEBUG(D_NET, "%s: Unexpected magic %08x\n",
-                      libcfs_nid2str(gw->lp_nid), info->pi_magic);
-               gw->lp_ping_feats = LNET_PING_FEAT_INVAL;
+                      libcfs_nid2str(gw->lpni_nid), info->pi_magic);
+               gw->lpni_ping_feats = LNET_PING_FEAT_INVAL;
+               spin_unlock(&gw->lpni_lock);
                return;
        }
 
-       gw->lp_ping_feats = info->pi_features;
-       if ((gw->lp_ping_feats & LNET_PING_FEAT_MASK) == 0) {
+       gw->lpni_ping_feats = info->pi_features;
+       if ((gw->lpni_ping_feats & LNET_PING_FEAT_MASK) == 0) {
                CDEBUG(D_NET, "%s: Unexpected features 0x%x\n",
-                      libcfs_nid2str(gw->lp_nid), gw->lp_ping_feats);
+                      libcfs_nid2str(gw->lpni_nid), gw->lpni_ping_feats);
+               spin_unlock(&gw->lpni_lock);
                return; /* nothing I can understand */
        }
 
-       if ((gw->lp_ping_feats & LNET_PING_FEAT_NI_STATUS) == 0)
+       if ((gw->lpni_ping_feats & LNET_PING_FEAT_NI_STATUS) == 0) {
+               spin_unlock(&gw->lpni_lock);
                return; /* can't carry NI status info */
+       }
 
-       list_for_each_entry(rte, &gw->lp_routes, lr_gwlist) {
+       list_for_each_entry(rte, &gw->lpni_routes, lr_gwlist) {
                int     down = 0;
                int     up = 0;
                int     i;
 
-               if ((gw->lp_ping_feats & LNET_PING_FEAT_RTE_DISABLED) != 0) {
+               if ((gw->lpni_ping_feats & LNET_PING_FEAT_RTE_DISABLED) != 0) {
                        rte->lr_downis = 1;
                        continue;
                }
@@ -690,8 +727,9 @@ lnet_parse_rc_info(lnet_rc_data_t *rcd)
 
                        if (nid == LNET_NID_ANY) {
                                CDEBUG(D_NET, "%s: unexpected LNET_NID_ANY\n",
-                                      libcfs_nid2str(gw->lp_nid));
-                               gw->lp_ping_feats = LNET_PING_FEAT_INVAL;
+                                      libcfs_nid2str(gw->lpni_nid));
+                               gw->lpni_ping_feats = LNET_PING_FEAT_INVAL;
+                               spin_unlock(&gw->lpni_lock);
                                return;
                        }
 
@@ -712,8 +750,9 @@ lnet_parse_rc_info(lnet_rc_data_t *rcd)
                        }
 
                        CDEBUG(D_NET, "%s: Unexpected status 0x%x\n",
-                              libcfs_nid2str(gw->lp_nid), stat->ns_status);
-                       gw->lp_ping_feats = LNET_PING_FEAT_INVAL;
+                              libcfs_nid2str(gw->lpni_nid), stat->ns_status);
+                       gw->lpni_ping_feats = LNET_PING_FEAT_INVAL;
+                       spin_unlock(&gw->lpni_lock);
                        return;
                }
 
@@ -728,13 +767,15 @@ lnet_parse_rc_info(lnet_rc_data_t *rcd)
 
                rte->lr_downis = down;
        }
+
+       spin_unlock(&gw->lpni_lock);
 }
 
 static void
 lnet_router_checker_event(lnet_event_t *event)
 {
-       lnet_rc_data_t          *rcd = event->md.user_ptr;
-       struct lnet_peer        *lp;
+       lnet_rc_data_t *rcd = event->md.user_ptr;
+       struct lnet_peer_ni *lp;
 
        LASSERT(rcd != NULL);
 
@@ -752,14 +793,14 @@ lnet_router_checker_event(lnet_event_t *event)
         /* NB: it's called with holding lnet_res_lock, we have a few
          * places need to hold both locks at the same time, please take
          * care of lock ordering */
-       lnet_net_lock(lp->lp_cpt);
-       if (!lnet_isrouter(lp) || lp->lp_rcd != rcd) {
+       lnet_net_lock(lp->lpni_cpt);
+       if (!lnet_isrouter(lp) || lp->lpni_rcd != rcd) {
                /* ignore if no longer a router or rcd is replaced */
                goto out;
        }
 
        if (event->type == LNET_EVENT_SEND) {
-               lp->lp_ping_notsent = 0;
+               lp->lpni_ping_notsent = 0;
                if (event->status == 0)
                        goto out;
        }
@@ -780,15 +821,15 @@ lnet_router_checker_event(lnet_event_t *event)
                lnet_parse_rc_info(rcd);
 
  out:
-       lnet_net_unlock(lp->lp_cpt);
+       lnet_net_unlock(lp->lpni_cpt);
 }
 
 static void
 lnet_wait_known_routerstate(void)
 {
-       lnet_peer_t      *rtr;
+       struct lnet_peer_ni *rtr;
        struct list_head *entry;
-       int               all_known;
+       int all_known;
 
        LASSERT(the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING);
 
@@ -797,12 +838,17 @@ lnet_wait_known_routerstate(void)
 
                all_known = 1;
                list_for_each(entry, &the_lnet.ln_routers) {
-                       rtr = list_entry(entry, lnet_peer_t, lp_rtr_list);
+                       rtr = list_entry(entry, struct lnet_peer_ni,
+                                        lpni_rtr_list);
 
-                       if (rtr->lp_alive_count == 0) {
+                       spin_lock(&rtr->lpni_lock);
+
+                       if (rtr->lpni_alive_count == 0) {
                                all_known = 0;
+                               spin_unlock(&rtr->lpni_lock);
                                break;
                        }
+                       spin_unlock(&rtr->lpni_lock);
                }
 
                lnet_net_unlock(cpt);
@@ -816,12 +862,12 @@ lnet_wait_known_routerstate(void)
 }
 
 void
-lnet_router_ni_update_locked(lnet_peer_t *gw, __u32 net)
+lnet_router_ni_update_locked(struct lnet_peer_ni *gw, __u32 net)
 {
        lnet_route_t *rte;
 
-       if ((gw->lp_ping_feats & LNET_PING_FEAT_NI_STATUS) != 0) {
-               list_for_each_entry(rte, &gw->lp_routes, lr_gwlist) {
+       if ((gw->lpni_ping_feats & LNET_PING_FEAT_NI_STATUS) != 0) {
+               list_for_each_entry(rte, &gw->lpni_routes, lr_gwlist) {
                        if (rte->lr_net == net) {
                                rte->lr_downis = 0;
                                break;
@@ -833,8 +879,8 @@ lnet_router_ni_update_locked(lnet_peer_t *gw, __u32 net)
 static void
 lnet_update_ni_status_locked(void)
 {
-       lnet_ni_t       *ni;
-       time64_t now;
+       lnet_ni_t       *ni = NULL;
+       time64_t        now;
        int             timeout;
 
        LASSERT(the_lnet.ln_routing);
@@ -843,8 +889,8 @@ lnet_update_ni_status_locked(void)
                  MAX(live_router_check_interval, dead_router_check_interval);
 
        now = ktime_get_real_seconds();
-       list_for_each_entry(ni, &the_lnet.ln_nis, ni_list) {
-               if (ni->ni_lnd->lnd_type == LOLND)
+       while ((ni = lnet_get_next_ni_locked(NULL, ni))) {
+               if (ni->ni_net->net_lnd->lnd_type == LOLND)
                        continue;
 
                if (now < ni->ni_last_alive + timeout)
@@ -878,10 +924,10 @@ lnet_destroy_rc_data(lnet_rc_data_t *rcd)
        LASSERT(LNetHandleIsInvalid(rcd->rcd_mdh));
 
        if (rcd->rcd_gateway != NULL) {
-               int cpt = rcd->rcd_gateway->lp_cpt;
+               int cpt = rcd->rcd_gateway->lpni_cpt;
 
                lnet_net_lock(cpt);
-               lnet_peer_decref_locked(rcd->rcd_gateway);
+               lnet_peer_ni_decref_locked(rcd->rcd_gateway);
                lnet_net_unlock(cpt);
        }
 
@@ -892,14 +938,14 @@ lnet_destroy_rc_data(lnet_rc_data_t *rcd)
 }
 
 static lnet_rc_data_t *
-lnet_create_rc_data_locked(lnet_peer_t *gateway)
+lnet_create_rc_data_locked(struct lnet_peer_ni *gateway)
 {
        lnet_rc_data_t          *rcd = NULL;
        struct lnet_ping_info   *pi;
        int                     rc;
        int                     i;
 
-       lnet_net_unlock(gateway->lp_cpt);
+       lnet_net_unlock(gateway->lpni_cpt);
 
        LIBCFS_ALLOC(rcd, sizeof(*rcd));
        if (rcd == NULL)
@@ -933,21 +979,21 @@ lnet_create_rc_data_locked(lnet_peer_t *gateway)
        }
        LASSERT(rc == 0);
 
-       lnet_net_lock(gateway->lp_cpt);
+       lnet_net_lock(gateway->lpni_cpt);
        /* router table changed or someone has created rcd for this gateway */
-       if (!lnet_isrouter(gateway) || gateway->lp_rcd != NULL) {
-               lnet_net_unlock(gateway->lp_cpt);
+       if (!lnet_isrouter(gateway) || gateway->lpni_rcd != NULL) {
+               lnet_net_unlock(gateway->lpni_cpt);
                goto out;
        }
 
-       lnet_peer_addref_locked(gateway);
+       lnet_peer_ni_addref_locked(gateway);
        rcd->rcd_gateway = gateway;
-       gateway->lp_rcd = rcd;
-       gateway->lp_ping_notsent = 0;
+       gateway->lpni_rcd = rcd;
+       gateway->lpni_ping_notsent = 0;
 
        return rcd;
 
- out:
+out:
        if (rcd != NULL) {
                if (!LNetHandleIsInvalid(rcd->rcd_mdh)) {
                        rc = LNetMDUnlink(rcd->rcd_mdh);
@@ -956,16 +1002,16 @@ lnet_create_rc_data_locked(lnet_peer_t *gateway)
                lnet_destroy_rc_data(rcd);
        }
 
-       lnet_net_lock(gateway->lp_cpt);
-       return gateway->lp_rcd;
+       lnet_net_lock(gateway->lpni_cpt);
+       return gateway->lpni_rcd;
 }
 
 static int
-lnet_router_check_interval (lnet_peer_t *rtr)
+lnet_router_check_interval (struct lnet_peer_ni *rtr)
 {
        int secs;
 
-       secs = rtr->lp_alive ? live_router_check_interval :
+       secs = rtr->lpni_alive ? live_router_check_interval :
                               dead_router_check_interval;
        if (secs < 0)
                secs = 0;
@@ -974,30 +1020,32 @@ lnet_router_check_interval (lnet_peer_t *rtr)
 }
 
 static void
-lnet_ping_router_locked (lnet_peer_t *rtr)
+lnet_ping_router_locked (struct lnet_peer_ni *rtr)
 {
        lnet_rc_data_t *rcd = NULL;
-       cfs_time_t      now = cfs_time_current();
-       int             secs;
+       cfs_time_t      now = cfs_time_current();
+       int             secs;
+       struct lnet_ni  *ni;
 
-       lnet_peer_addref_locked(rtr);
+       lnet_peer_ni_addref_locked(rtr);
 
-       if (rtr->lp_ping_deadline != 0 && /* ping timed out? */
-           cfs_time_after(now, rtr->lp_ping_deadline))
+       if (rtr->lpni_ping_deadline != 0 && /* ping timed out? */
+           cfs_time_after(now, rtr->lpni_ping_deadline))
                lnet_notify_locked(rtr, 1, 0, now);
 
        /* Run any outstanding notifications */
-       lnet_ni_notify_locked(rtr->lp_ni, rtr);
+       ni = lnet_get_next_ni_locked(rtr->lpni_net, NULL);
+       lnet_ni_notify_locked(ni, rtr);
 
        if (!lnet_isrouter(rtr) ||
            the_lnet.ln_rc_state != LNET_RC_STATE_RUNNING) {
                /* router table changed or router checker is shutting down */
-               lnet_peer_decref_locked(rtr);
+               lnet_peer_ni_decref_locked(rtr);
                return;
        }
 
-       rcd = rtr->lp_rcd != NULL ?
-             rtr->lp_rcd : lnet_create_rc_data_locked(rtr);
+       rcd = rtr->lpni_rcd != NULL ?
+             rtr->lpni_rcd : lnet_create_rc_data_locked(rtr);
 
        if (rcd == NULL)
                return;
@@ -1006,43 +1054,43 @@ lnet_ping_router_locked (lnet_peer_t *rtr)
 
        CDEBUG(D_NET,
               "rtr %s %d: deadline %lu ping_notsent %d alive %d "
-              "alive_count %d lp_ping_timestamp %lu\n",
-              libcfs_nid2str(rtr->lp_nid), secs,
-              rtr->lp_ping_deadline, rtr->lp_ping_notsent,
-              rtr->lp_alive, rtr->lp_alive_count, rtr->lp_ping_timestamp);
+              "alive_count %d lpni_ping_timestamp %lu\n",
+              libcfs_nid2str(rtr->lpni_nid), secs,
+              rtr->lpni_ping_deadline, rtr->lpni_ping_notsent,
+              rtr->lpni_alive, rtr->lpni_alive_count, rtr->lpni_ping_timestamp);
 
-       if (secs != 0 && !rtr->lp_ping_notsent &&
-           cfs_time_after(now, cfs_time_add(rtr->lp_ping_timestamp,
+       if (secs != 0 && !rtr->lpni_ping_notsent &&
+           cfs_time_after(now, cfs_time_add(rtr->lpni_ping_timestamp,
                                             cfs_time_seconds(secs)))) {
-               int               rc;
+               int               rc;
                lnet_process_id_t id;
                lnet_handle_md_t  mdh;
 
-               id.nid = rtr->lp_nid;
+               id.nid = rtr->lpni_nid;
                id.pid = LNET_PID_LUSTRE;
                CDEBUG(D_NET, "Check: %s\n", libcfs_id2str(id));
 
-               rtr->lp_ping_notsent   = 1;
-               rtr->lp_ping_timestamp = now;
+               rtr->lpni_ping_notsent   = 1;
+               rtr->lpni_ping_timestamp = now;
 
                mdh = rcd->rcd_mdh;
 
-               if (rtr->lp_ping_deadline == 0) {
-                       rtr->lp_ping_deadline =
+               if (rtr->lpni_ping_deadline == 0) {
+                       rtr->lpni_ping_deadline =
                                cfs_time_shift(router_ping_timeout);
                }
 
-               lnet_net_unlock(rtr->lp_cpt);
+               lnet_net_unlock(rtr->lpni_cpt);
 
                rc = LNetGet(LNET_NID_ANY, mdh, id, LNET_RESERVED_PORTAL,
                             LNET_PROTO_PING_MATCHBITS, 0);
 
-               lnet_net_lock(rtr->lp_cpt);
+               lnet_net_lock(rtr->lpni_cpt);
                if (rc != 0)
-                       rtr->lp_ping_notsent = 0; /* no event pending */
+                       rtr->lpni_ping_notsent = 0; /* no event pending */
        }
 
-       lnet_peer_decref_locked(rtr);
+       lnet_peer_ni_decref_locked(rtr);
        return;
 }
 
@@ -1121,7 +1169,7 @@ lnet_prune_rc_data(int wait_unlink)
 {
        lnet_rc_data_t          *rcd;
        lnet_rc_data_t          *tmp;
-       lnet_peer_t             *lp;
+       struct lnet_peer_ni     *lp;
        struct list_head         head;
        int                      i = 2;
 
@@ -1137,14 +1185,14 @@ lnet_prune_rc_data(int wait_unlink)
        if (the_lnet.ln_rc_state != LNET_RC_STATE_RUNNING) {
                /* router checker is stopping, prune all */
                list_for_each_entry(lp, &the_lnet.ln_routers,
-                                   lp_rtr_list) {
-                       if (lp->lp_rcd == NULL)
+                                   lpni_rtr_list) {
+                       if (lp->lpni_rcd == NULL)
                                continue;
 
-                       LASSERT(list_empty(&lp->lp_rcd->rcd_list));
-                       list_add(&lp->lp_rcd->rcd_list,
+                       LASSERT(list_empty(&lp->lpni_rcd->rcd_list));
+                       list_add(&lp->lpni_rcd->rcd_list,
                                 &the_lnet.ln_rcd_deathrow);
-                       lp->lp_rcd = NULL;
+                       lp->lpni_rcd = NULL;
                }
        }
 
@@ -1225,8 +1273,8 @@ lnet_router_checker_active(void)
 static int
 lnet_router_checker(void *arg)
 {
-       lnet_peer_t       *rtr;
-       struct list_head  *entry;
+       struct lnet_peer_ni *rtr;
+       struct list_head *entry;
 
        cfs_block_allsigs();
 
@@ -1240,9 +1288,10 @@ rescan:
                version = the_lnet.ln_routers_version;
 
                list_for_each(entry, &the_lnet.ln_routers) {
-                       rtr = list_entry(entry, lnet_peer_t, lp_rtr_list);
+                       rtr = list_entry(entry, struct lnet_peer_ni,
+                                        lpni_rtr_list);
 
-                       cpt2 = lnet_cpt_of_nid_locked(rtr->lp_nid);
+                       cpt2 = rtr->lpni_cpt;
                        if (cpt != cpt2) {
                                lnet_net_unlock(cpt);
                                cpt = cpt2;
@@ -1349,7 +1398,8 @@ lnet_rtrpool_free_bufs(lnet_rtrbufpool_t *rbp, int cpt)
        INIT_LIST_HEAD(&tmp);
 
        lnet_net_lock(cpt);
-       lnet_drop_routed_msgs_locked(&rbp->rbp_msgs, cpt);
+       list_splice_init(&rbp->rbp_msgs, &tmp);
+       lnet_drop_routed_msgs_locked(&tmp, cpt);
        list_splice_init(&rbp->rbp_bufs, &tmp);
        rbp->rbp_req_nbuffers = 0;
        rbp->rbp_nbuffers = rbp->rbp_credits = 0;
@@ -1716,9 +1766,9 @@ lnet_rtrpools_disable(void)
 int
 lnet_notify(lnet_ni_t *ni, lnet_nid_t nid, int alive, cfs_time_t when)
 {
-       struct lnet_peer        *lp = NULL;
-       cfs_time_t              now = cfs_time_current();
-       int                     cpt = lnet_cpt_of_nid(nid);
+       struct lnet_peer_ni *lp = NULL;
+       cfs_time_t now = cfs_time_current();
+       int cpt = lnet_cpt_of_nid(nid, ni);
 
        LASSERT (!in_interrupt ());
 
@@ -1758,7 +1808,7 @@ lnet_notify(lnet_ni_t *ni, lnet_nid_t nid, int alive, cfs_time_t when)
                return -ESHUTDOWN;
        }
 
-       lp = lnet_find_peer_locked(the_lnet.ln_peer_tables[cpt], nid);
+       lp = lnet_find_peer_ni_locked(nid);
        if (lp == NULL) {
                /* nid not found */
                lnet_net_unlock(cpt);
@@ -1766,19 +1816,31 @@ lnet_notify(lnet_ni_t *ni, lnet_nid_t nid, int alive, cfs_time_t when)
                return 0;
        }
 
+       /*
+        * It is possible for this function to be called for the same peer
+        * but with different NIs. We want to synchronize the notification
+        * between the different calls. So we will use the lpni_cpt to
+        * grab the net lock.
+        */
+       if (lp->lpni_cpt != cpt) {
+               lnet_net_unlock(cpt);
+               cpt = lp->lpni_cpt;
+               lnet_net_lock(cpt);
+       }
+
        /* We can't fully trust LND on reporting exact peer last_alive
         * if he notifies us about dead peer. For example ksocklnd can
         * call us with when == _time_when_the_node_was_booted_ if
         * no connections were successfully established */
-       if (ni != NULL && !alive && when < lp->lp_last_alive)
-               when = lp->lp_last_alive;
+       if (ni != NULL && !alive && when < lp->lpni_last_alive)
+               when = lp->lpni_last_alive;
 
        lnet_notify_locked(lp, ni == NULL, alive, when);
 
        if (ni != NULL)
                lnet_ni_notify_locked(ni, lp);
 
-       lnet_peer_decref_locked(lp);
+       lnet_peer_ni_decref_locked(lp);
 
        lnet_net_unlock(cpt);
        return 0;
index efec11b..209adab 100644 (file)
@@ -224,8 +224,8 @@ proc_lnet_routes(struct ctl_table *table, int write, void __user *buffer,
                        __u32        net        = rnet->lrn_net;
                        __u32 hops              = route->lr_hops;
                        unsigned int priority   = route->lr_priority;
-                       lnet_nid_t   nid        = route->lr_gateway->lp_nid;
-                       int          alive      = lnet_is_route_alive(route);
+                       lnet_nid_t   nid        = route->lr_gateway->lpni_nid;
+                       int          alive      = lnet_is_route_alive(route);
 
                        s += snprintf(s, tmpstr + tmpsiz - s,
                                      "%-8s %4u %8u %7s %s\n",
@@ -300,7 +300,7 @@ proc_lnet_routers(struct ctl_table *table, int write, void __user *buffer,
                *ppos = LNET_PROC_POS_MAKE(0, ver, 0, off);
        } else {
                struct list_head *r;
-               struct lnet_peer *peer = NULL;
+               struct lnet_peer_ni *peer = NULL;
                int               skip = off - 1;
 
                lnet_net_lock(0);
@@ -315,8 +315,9 @@ proc_lnet_routers(struct ctl_table *table, int write, void __user *buffer,
                r = the_lnet.ln_routers.next;
 
                while (r != &the_lnet.ln_routers) {
-                       lnet_peer_t *lp = list_entry(r, lnet_peer_t,
-                                                    lp_rtr_list);
+                       struct lnet_peer_ni *lp =
+                         list_entry(r, struct lnet_peer_ni,
+                                    lpni_rtr_list);
 
                        if (skip == 0) {
                                peer = lp;
@@ -328,22 +329,22 @@ proc_lnet_routers(struct ctl_table *table, int write, void __user *buffer,
                }
 
                if (peer != NULL) {
-                       lnet_nid_t nid = peer->lp_nid;
+                       lnet_nid_t nid = peer->lpni_nid;
                        cfs_time_t now = cfs_time_current();
-                       cfs_time_t deadline = peer->lp_ping_deadline;
-                       int nrefs     = peer->lp_refcount;
-                       int nrtrrefs  = peer->lp_rtr_refcount;
-                       int alive_cnt = peer->lp_alive_count;
-                       int alive     = peer->lp_alive;
-                       int pingsent  = !peer->lp_ping_notsent;
+                       cfs_time_t deadline = peer->lpni_ping_deadline;
+                       int nrefs     = atomic_read(&peer->lpni_refcount);
+                       int nrtrrefs  = peer->lpni_rtr_refcount;
+                       int alive_cnt = peer->lpni_alive_count;
+                       int alive     = peer->lpni_alive;
+                       int pingsent  = !peer->lpni_ping_notsent;
                        int last_ping = cfs_duration_sec(cfs_time_sub(now,
-                                                    peer->lp_ping_timestamp));
+                                                    peer->lpni_ping_timestamp));
                        int down_ni   = 0;
                        lnet_route_t *rtr;
 
-                       if ((peer->lp_ping_feats &
+                       if ((peer->lpni_ping_feats &
                             LNET_PING_FEAT_NI_STATUS) != 0) {
-                               list_for_each_entry(rtr, &peer->lp_routes,
+                               list_for_each_entry(rtr, &peer->lpni_routes,
                                                    lr_gwlist) {
                                        /* downis on any route should be the
                                         * number of downis on the gateway */
@@ -396,6 +397,8 @@ proc_lnet_routers(struct ctl_table *table, int write, void __user *buffer,
        return rc;
 }
 
+/* TODO: there should be no direct access to ptable. We should add a set
+ * of APIs that give access to the ptable and its members */
 static int
 proc_lnet_peers(struct ctl_table *table, int write, void __user *buffer,
                size_t *lenp, loff_t *ppos)
@@ -437,7 +440,7 @@ proc_lnet_peers(struct ctl_table *table, int write, void __user *buffer,
 
                hoff++;
        } else {
-               struct lnet_peer        *peer;
+               struct lnet_peer_ni     *peer;
                struct list_head        *p;
                int                     skip;
  again:
@@ -461,15 +464,16 @@ proc_lnet_peers(struct ctl_table *table, int write, void __user *buffer,
                                p = ptable->pt_hash[hash].next;
 
                        while (p != &ptable->pt_hash[hash]) {
-                               lnet_peer_t *lp = list_entry(p, lnet_peer_t,
-                                                            lp_hashlist);
+                               struct lnet_peer_ni *lp =
+                                 list_entry(p, struct lnet_peer_ni,
+                                            lpni_hashlist);
                                if (skip == 0) {
                                        peer = lp;
 
                                        /* minor optimization: start from idx+1
                                         * on next iteration if we've just
-                                        * drained lp_hashlist */
-                                       if (lp->lp_hashlist.next ==
+                                        * drained lpni_hashlist */
+                                       if (lp->lpni_hashlist.next ==
                                            &ptable->pt_hash[hash]) {
                                                hoff = 1;
                                                hash++;
@@ -481,7 +485,7 @@ proc_lnet_peers(struct ctl_table *table, int write, void __user *buffer,
                                }
 
                                skip--;
-                               p = lp->lp_hashlist.next;
+                               p = lp->lpni_hashlist.next;
                        }
 
                        if (peer != NULL)
@@ -490,29 +494,30 @@ proc_lnet_peers(struct ctl_table *table, int write, void __user *buffer,
                        p = NULL;
                        hoff = 1;
                        hash++;
-               }
+                }
 
                if (peer != NULL) {
-                       lnet_nid_t nid       = peer->lp_nid;
-                       int        nrefs     = peer->lp_refcount;
-                       int        lastalive = -1;
-                       char      *aliveness = "NA";
-                       int        maxcr     = peer->lp_ni->ni_peertxcredits;
-                       int        txcr      = peer->lp_txcredits;
-                       int        mintxcr   = peer->lp_mintxcredits;
-                       int        rtrcr     = peer->lp_rtrcredits;
-                       int        minrtrcr  = peer->lp_minrtrcredits;
-                       int        txqnob    = peer->lp_txqnob;
+                       lnet_nid_t nid = peer->lpni_nid;
+                       int nrefs = atomic_read(&peer->lpni_refcount);
+                       int lastalive = -1;
+                       char *aliveness = "NA";
+                       int maxcr = (peer->lpni_net) ?
+                         peer->lpni_net->net_tunables.lct_peer_tx_credits : 0;
+                       int txcr = peer->lpni_txcredits;
+                       int mintxcr = peer->lpni_mintxcredits;
+                       int rtrcr = peer->lpni_rtrcredits;
+                       int minrtrcr = peer->lpni_minrtrcredits;
+                       int txqnob = peer->lpni_txqnob;
 
                        if (lnet_isrouter(peer) ||
                            lnet_peer_aliveness_enabled(peer))
-                               aliveness = peer->lp_alive ? "up" : "down";
+                               aliveness = peer->lpni_alive ? "up" : "down";
 
                        if (lnet_peer_aliveness_enabled(peer)) {
-                               cfs_time_t     now = cfs_time_current();
+                               cfs_time_t now = cfs_time_current();
                                cfs_duration_t delta;
 
-                               delta = cfs_time_sub(now, peer->lp_last_alive);
+                               delta = cfs_time_sub(now, peer->lpni_last_alive);
                                lastalive = cfs_duration_sec(delta);
 
                                /* No need to mess up peers contents with
@@ -656,27 +661,14 @@ proc_lnet_nis(struct ctl_table *table, int write, void __user *buffer,
                              "%-24s %6s %5s %4s %4s %4s %5s %5s %5s\n",
                              "nid", "status", "alive", "refs", "peer",
                              "rtr", "max", "tx", "min");
-               LASSERT(tmpstr + tmpsiz - s > 0);
+               LASSERT (tmpstr + tmpsiz - s > 0);
        } else {
-               struct list_head  *n;
-               lnet_ni_t         *ni   = NULL;
-               int                skip = *ppos - 1;
+               lnet_ni_t         *ni   = NULL;
+               int                skip = *ppos - 1;
 
                lnet_net_lock(0);
 
-               n = the_lnet.ln_nis.next;
-
-               while (n != &the_lnet.ln_nis) {
-                       lnet_ni_t *a_ni = list_entry(n, lnet_ni_t, ni_list);
-
-                       if (skip == 0) {
-                               ni = a_ni;
-                               break;
-                       }
-
-                       skip--;
-                       n = n->next;
-               }
+               ni = lnet_get_ni_idx_locked(skip);
 
                if (ni != NULL) {
                        struct lnet_tx_queue    *tq;
@@ -690,7 +682,7 @@ proc_lnet_nis(struct ctl_table *table, int write, void __user *buffer,
                                last_alive = now - ni->ni_last_alive;
 
                        /* @lo forever alive */
-                       if (ni->ni_lnd->lnd_type == LOLND)
+                       if (ni->ni_net->net_lnd->lnd_type == LOLND)
                                last_alive = 0;
 
                        lnet_ni_lock(ni);
@@ -718,8 +710,8 @@ proc_lnet_nis(struct ctl_table *table, int write, void __user *buffer,
                                      "%-24s %6s %5d %4d %4d %4d %5d %5d %5d\n",
                                      libcfs_nid2str(ni->ni_nid), stat,
                                      last_alive, *ni->ni_refs[i],
-                                     ni->ni_peertxcredits,
-                                     ni->ni_peerrtrcredits,
+                                     ni->ni_net->net_tunables.lct_peer_tx_credits,
+                                     ni->ni_net->net_tunables.lct_peer_rtr_credits,
                                      tq->tq_credits_max,
                                      tq->tq_credits, tq->tq_credits_min);
                                if (i != 0)
index bfabc6c..51e3254 100644 (file)
@@ -120,7 +120,7 @@ brw_client_init (sfw_test_instance_t *tsi)
                return -EINVAL;
 
        list_for_each_entry(tsu, &tsi->tsi_units, tsu_list) {
-               bulk = srpc_alloc_bulk(lnet_cpt_of_nid(tsu->tsu_dest.nid),
+               bulk = srpc_alloc_bulk(lnet_cpt_of_nid(tsu->tsu_dest.nid, NULL),
                                       off, npg, len, opc == LST_BRW_READ);
                if (bulk == NULL) {
                        brw_client_fini(tsi);
index ae43d09..50cf411 100644 (file)
@@ -1032,7 +1032,8 @@ sfw_run_batch (sfw_batch_t *tsb)
                        wi = &tsu->tsu_worker;
                        swi_init_workitem(wi, tsu, sfw_run_test,
                                          lst_sched_test[\
-                                         lnet_cpt_of_nid(tsu->tsu_dest.nid)]);
+                                         lnet_cpt_of_nid(tsu->tsu_dest.nid,
+                                                         NULL)]);
                        swi_schedule_workitem(wi);
                }
        }
index 970f130..9fecaad 100644 (file)
@@ -524,7 +524,7 @@ srpc_init_client_rpc(srpc_client_rpc_t *rpc, lnet_process_id_t peer,
 
        INIT_LIST_HEAD(&rpc->crpc_list);
        swi_init_workitem(&rpc->crpc_wi, rpc, srpc_send_rpc,
-                         lst_sched_test[lnet_cpt_of_nid(peer.nid)]);
+                         lst_sched_test[lnet_cpt_of_nid(peer.nid, NULL)]);
        spin_lock_init(&rpc->crpc_lock);
        atomic_set(&rpc->crpc_refcount, 1); /* 1 ref for caller */
 
index 2a6f58f..de8804b 100644 (file)
@@ -160,6 +160,32 @@ static yaml_token_handler dispatch_tbl[] = {
        [YAML_SCALAR_TOKEN] = yaml_scalar,
 };
 
+/* dispatch table */
+static char *token_type_string[] = {
+       [YAML_NO_TOKEN] = "YAML_NO_TOKEN",
+       [YAML_STREAM_START_TOKEN] = "YAML_STREAM_START_TOKEN",
+       [YAML_STREAM_END_TOKEN] = "YAML_STREAM_END_TOKEN",
+       [YAML_VERSION_DIRECTIVE_TOKEN] = "YAML_VERSION_DIRECTIVE_TOKEN",
+       [YAML_TAG_DIRECTIVE_TOKEN] = "YAML_TAG_DIRECTIVE_TOKEN",
+       [YAML_DOCUMENT_START_TOKEN] = "YAML_DOCUMENT_START_TOKEN",
+       [YAML_DOCUMENT_END_TOKEN] = "YAML_DOCUMENT_END_TOKEN",
+       [YAML_BLOCK_SEQUENCE_START_TOKEN] = "YAML_BLOCK_SEQUENCE_START_TOKEN",
+       [YAML_BLOCK_MAPPING_START_TOKEN] = "YAML_BLOCK_MAPPING_START_TOKEN",
+       [YAML_BLOCK_END_TOKEN] = "YAML_BLOCK_END_TOKEN",
+       [YAML_FLOW_SEQUENCE_START_TOKEN] = "YAML_FLOW_SEQUENCE_START_TOKEN",
+       [YAML_FLOW_SEQUENCE_END_TOKEN] = "YAML_FLOW_SEQUENCE_END_TOKEN",
+       [YAML_FLOW_MAPPING_START_TOKEN] = "YAML_FLOW_MAPPING_START_TOKEN",
+       [YAML_FLOW_MAPPING_END_TOKEN] = "YAML_FLOW_MAPPING_END_TOKEN",
+       [YAML_BLOCK_ENTRY_TOKEN] = "YAML_BLOCK_ENTRY_TOKEN",
+       [YAML_FLOW_ENTRY_TOKEN] = "YAML_FLOW_ENTRY_TOKEN",
+       [YAML_KEY_TOKEN] = "YAML_KEY_TOKEN",
+       [YAML_VALUE_TOKEN] = "YAML_VALUE_TOKEN",
+       [YAML_ALIAS_TOKEN] = "YAML_ALIAS_TOKEN",
+       [YAML_ANCHOR_TOKEN] = "YAML_ANCHOR_TOKEN",
+       [YAML_TAG_TOKEN] = "YAML_TAG_TOKEN",
+       [YAML_SCALAR_TOKEN] = "YAML_SCALAR_TOKEN",
+};
+
 static void cYAML_ll_free(struct list_head *ll)
 {
        struct cYAML_ll *node, *tmp;
@@ -661,19 +687,23 @@ static bool find_obj_iter(struct cYAML *node, void *usr_data, void **out)
 
 struct cYAML *cYAML_get_object_item(struct cYAML *parent, const char *name)
 {
-       struct cYAML *node;
+       struct cYAML *node = parent, *found = NULL;
 
-       if (parent == NULL || parent->cy_child == NULL || name == NULL)
+       if (!node || !name)
                return NULL;
 
-       node = parent->cy_child;
-
-       while (node != NULL &&
-               strcmp(node->cy_string, name) != 0) {
-               node = node->cy_next;
+       if (node->cy_string) {
+               if (strcmp(node->cy_string, name) == 0)
+                       return node;
        }
 
-       return node;
+       if (node->cy_child)
+               found = cYAML_get_object_item(node->cy_child, name);
+
+       if (!found && node->cy_next)
+               found = cYAML_get_object_item(node->cy_next, name);
+
+       return found;
 }
 
 struct cYAML *cYAML_get_next_seq_item(struct cYAML *seq, struct cYAML **itm)
@@ -1097,7 +1127,8 @@ failed:
 struct cYAML *cYAML_build_tree(char *yaml_file,
                               const char *yaml_blk,
                               size_t yaml_blk_size,
-                              struct cYAML **err_rc)
+                              struct cYAML **err_rc,
+                              bool debug)
 {
        yaml_parser_t parser;
        yaml_token_t token;
@@ -1145,6 +1176,11 @@ struct cYAML *cYAML_build_tree(char *yaml_file,
                */
                yaml_parser_scan(&parser, &token);
 
+               if (debug)
+                       fprintf(stderr, "token.type = %s: %s\n",
+                               token_type_string[token.type],
+                               (token.type == YAML_SCALAR_TOKEN) ?
+                               (char*)token.data.scalar.value : "");
                rc = dispatch_tbl[token.type](&token, &tree);
                if (rc != CYAML_ERROR_NONE) {
                        snprintf(err_str, sizeof(err_str),
index 98f7a27..c9c21c7 100644 (file)
@@ -86,7 +86,7 @@ typedef bool (*cYAML_walk_cb)(struct cYAML *, void *, void**);
  */
 struct cYAML *cYAML_build_tree(char *yaml_file, const char *yaml_blk,
                                size_t yaml_blk_size,
-                               struct cYAML **err_str);
+                               struct cYAML **err_str, bool debug);
 
 /*
  * cYAML_print_tree
index 58ab064..54e9a26 100644 (file)
@@ -34,6 +34,6 @@ liblnetconfig_la_SOURCES  = liblnetconfig.c liblnetconfig.h \
                            liblnetconfig_lnd.c liblnd.h $(CYAML)
 liblnetconfig_la_CPPFLAGS = -D_LARGEFILE64_SOURCE=1 -D_FILE_OFFSET_BITS=64 \
                            -DLUSTRE_UTILS=1 -I$(top_builddir)/lnet/utils/cyaml
-liblnetconfig_la_LDFLAGS = -L$(top_builddir)/libcfs/libcfs -version-info 1:1:0
+liblnetconfig_la_LDFLAGS = -L$(top_builddir)/libcfs/libcfs -version-info 2:0:0
 
 EXTRA_DIST =
index 0de9fba..45f7893 100644 (file)
 #include "cyaml.h"
 
 int
-lustre_interface_show_net(struct cYAML *interfaces, unsigned int index,
-                         bool detail, struct lnet_ioctl_config_data *data,
-                         struct lnet_ioctl_net_config *net_config);
+lustre_net_show_tunables(struct cYAML *tunables,
+                        struct lnet_ioctl_config_lnd_cmn_tunables *cmn);
+
+int
+lustre_ni_show_tunables(struct cYAML *lnd_tunables,
+                       __u32 net_type,
+                       struct lnet_lnd_tunables *lnd);
 
 void
-lustre_interface_parse(struct cYAML *lndparams, const char *dev_name,
-                      struct lnet_ioctl_config_lnd_tunables *lnd_cfg);
+lustre_yaml_extract_lnd_tunables(struct cYAML *tree,
+                                __u32 net_type,
+                                struct lnet_lnd_tunables *tun);
 
 #endif /* LIB_LND_CONFIG_API_H */
index 3e50f81..d32ce27 100644 (file)
 
 #include <errno.h>
 #include <limits.h>
+#include <byteswap.h>
 #include <netdb.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <sys/ioctl.h>
+#include <net/if.h>
 #include <libcfs/util/ioctl.h>
 #include <lnet/lnetctl.h>
 #include <lnet/socklnd.h>
 #include "liblnd.h"
+#include <lnet/lnet.h>
+#include <sys/types.h>
+#include <ifaddrs.h>
 #include "liblnetconfig.h"
 #include "cyaml.h"
 
 #define ADD_CMD                        "add"
 #define DEL_CMD                        "del"
 #define SHOW_CMD               "show"
+#define DBG_CMD                        "dbg"
+
+/*
+ * lustre_lnet_ip_range_descr
+ *     Describes an IP range.
+ *     Each octet is an expression
+ */
+struct lustre_lnet_ip_range_descr {
+       struct list_head ipr_entry;
+       struct list_head ipr_expr;
+};
+
+/*
+ * lustre_lnet_ip2nets
+ *     Describes an ip2nets rule. This can be on a list of rules.
+ */
+struct lustre_lnet_ip2nets {
+       struct lnet_dlc_network_descr ip2nets_net;
+       struct list_head ip2nets_ip_ranges;
+};
+
+/*
+ * free_intf_descr
+ *     frees the memory allocated for an intf descriptor.
+ */
+void free_intf_descr(struct lnet_dlc_intf_descr *intf_descr)
+{
+       if (!intf_descr)
+               return;
+
+       if (intf_descr->cpt_expr != NULL)
+               cfs_expr_list_free(intf_descr->cpt_expr);
+       free(intf_descr);
+}
+
+/*
+ * lustre_lnet_add_ip_range
+ * Formatting:
+ *     given a string of the format:
+ *     <expr.expr.expr.expr> parse each expr into
+ *     a lustre_lnet_ip_range_descr structure and insert on the list.
+ *
+ *     This function is called from
+ *             YAML on each ip-range.
+ *             As a result of lnetctl command
+ *             When building a NID or P2P selection rules
+ */
+int lustre_lnet_add_ip_range(struct list_head *list, char *str_ip_range)
+{
+       struct lustre_lnet_ip_range_descr *ip_range;
+       int rc;
+
+       ip_range = calloc(1, sizeof(*ip_range));
+       if (ip_range == NULL)
+               return LUSTRE_CFG_RC_OUT_OF_MEM;
+
+       INIT_LIST_HEAD(&ip_range->ipr_entry);
+       INIT_LIST_HEAD(&ip_range->ipr_expr);
+
+       rc = cfs_ip_addr_parse(str_ip_range, strlen(str_ip_range),
+                              &ip_range->ipr_expr);
+       if (rc != 0) {
+               /* descriptor is not on any list yet - free it or it leaks */
+               free(ip_range);
+               return LUSTRE_CFG_RC_BAD_PARAM;
+       }
+
+       list_add_tail(&ip_range->ipr_entry, list);
+
+       return LUSTRE_CFG_RC_NO_ERR;
+}
+
+int lustre_lnet_add_intf_descr(struct list_head *list, char *intf, int len)
+{
+       char *open_sq_bracket = NULL, *close_sq_bracket = NULL,
+            *intf_name;
+       struct lnet_dlc_intf_descr *intf_descr = NULL;
+       int rc;
+       char intf_string[LNET_MAX_STR_LEN];
+
+       if (len >= LNET_MAX_STR_LEN)
+               return LUSTRE_CFG_RC_BAD_PARAM;
+
+       strncpy(intf_string, intf, len);
+       intf_string[len] = '\0';
+
+       intf_descr = calloc(1, sizeof(*intf_descr));
+       if (intf_descr == NULL)
+               return LUSTRE_CFG_RC_OUT_OF_MEM;
+
+       INIT_LIST_HEAD(&intf_descr->intf_on_network);
+
+       intf_name = intf_string;
+       open_sq_bracket = strchr(intf_string, '[');
+       if (open_sq_bracket != NULL) {
+               close_sq_bracket = strchr(intf_string, ']');
+               if (close_sq_bracket == NULL) {
+                       free(intf_descr);
+                       return LUSTRE_CFG_RC_BAD_PARAM;
+               }
+               rc = cfs_expr_list_parse(open_sq_bracket,
+                                        strlen(open_sq_bracket), 0, UINT_MAX,
+                                        &intf_descr->cpt_expr);
+               if (rc < 0) {
+                       free(intf_descr);
+                       return LUSTRE_CFG_RC_BAD_PARAM;
+               }
+               strncpy(intf_descr->intf_name, intf_name,
+                       open_sq_bracket - intf_name);
+               intf_descr->intf_name[open_sq_bracket - intf_name] = '\0';
+       } else {
+               strcpy(intf_descr->intf_name, intf_name);
+               intf_descr->cpt_expr = NULL;
+       }
+
+       list_add_tail(&intf_descr->intf_on_network, list);
+
+       return LUSTRE_CFG_RC_NO_ERR;
+}
+
+void lustre_lnet_init_nw_descr(struct lnet_dlc_network_descr *nw_descr)
+{
+       if (nw_descr != NULL) {
+               INIT_LIST_HEAD(&nw_descr->network_on_rule);
+               INIT_LIST_HEAD(&nw_descr->nw_intflist);
+       }
+}
+
+int lustre_lnet_parse_nids(char *nids, char **array, int size,
+                          char ***out_array)
+{
+       int num_nids = 0;
+       char *comma = nids, *cur, *entry;
+       char **new_array;
+       int i, len, start = 0, finish = 0;
+
+       if (nids == NULL || strlen(nids) == 0)
+               return size;
+
+       /* count the number of new nids, by counting the number of commas */
+       while (comma) {
+               comma = strchr(comma, ',');
+               if (comma)
+                       comma++;
+               num_nids++;
+       }
+
+       /*
+        * allocate an array large enough to house both the old and the
+        * new entries. calloc(nmemb, size): element count first, so the
+        * multiplication is overflow-checked.
+        */
+       new_array = calloc((size > 0) ? size + num_nids : num_nids,
+                          sizeof(char *));
+
+       if (!new_array)
+               goto failed;
+
+       /* parse out the new nids and add them to the tail of the array */
+       comma = nids;
+       cur = nids;
+       start = (size > 0) ? size : 0;
+       finish = (size > 0) ? size + num_nids : num_nids;
+       for (i = start; i < finish; i++) {
+               comma = strchr(comma, ',');
+               if (!comma)
+                       /*
+                        * the length of the string to be parsed out is
+                        * from cur to end of string. So it's good enough
+                        * to strlen(cur)
+                        */
+                       len = strlen(cur) + 1;
+               else
+                       /* length of the string is comma - cur */
+                       len = (comma - cur) + 1;
+
+               entry = calloc(1, len);
+               if (!entry) {
+                       /* entries [start, i) were allocated; free exactly
+                        * those in the failure path */
+                       finish = i;
+                       goto failed;
+               }
+               strncpy(entry, cur, len - 1);
+               /* buffer is len bytes: last valid index is len - 1 */
+               entry[len - 1] = '\0';
+               new_array[i] = entry;
+               if (comma) {
+                       comma++;
+                       cur = comma;
+               }
+       }
+
+       /* add the old entries in the array and delete the old array*/
+       for (i = 0; i < size; i++)
+               new_array[i] = array[i];
+
+       if (array)
+               free(array);
+
+       *out_array = new_array;
+
+       return finish;
+
+failed:
+       for (i = start; i < finish; i++)
+               free(new_array[i]);
+       if (new_array)
+               free(new_array);
+
+       return size;
+}
+
+/*
+ * format expected:
+ *     <intf>[<expr>], <intf>[<expr>],..
+ */
+int lustre_lnet_parse_interfaces(char *intf_str,
+                                struct lnet_dlc_network_descr *nw_descr)
+{
+       char *open_square;
+       char *close_square;
+       char *comma;
+       char *cur = intf_str, *next = NULL;
+       char *end = intf_str + strlen(intf_str);
+       int rc, len;
+       struct lnet_dlc_intf_descr *intf_descr, *tmp;
+
+       if (nw_descr == NULL)
+               return LUSTRE_CFG_RC_BAD_PARAM;
+
+       while (cur < end) {
+               open_square = strchr(cur, '[');
+               if (open_square != NULL) {
+                       close_square = strchr(cur, ']');
+                       if (close_square == NULL) {
+                               rc = LUSTRE_CFG_RC_BAD_PARAM;
+                               goto failed;
+                       }
+
+                       comma = strchr(cur, ',');
+                       if (comma != NULL && comma > close_square) {
+                               next = comma + 1;
+                               len = next - close_square;
+                       } else {
+                               len = strlen(cur);
+                               next = cur + len;
+                       }
+               } else {
+                       comma = strchr(cur, ',');
+                       if (comma != NULL) {
+                               next = comma + 1;
+                               len = comma - cur;
+                       } else {
+                               len = strlen(cur);
+                               next = cur + len;
+                       }
+               }
+
+               rc = lustre_lnet_add_intf_descr(&nw_descr->nw_intflist, cur, len);
+               if (rc != LUSTRE_CFG_RC_NO_ERR)
+                       goto failed;
+
+               cur = next;
+       }
+
+       return LUSTRE_CFG_RC_NO_ERR;
+
+failed:
+       list_for_each_entry_safe(intf_descr, tmp, &nw_descr->nw_intflist,
+                                intf_on_network) {
+               list_del(&intf_descr->intf_on_network);
+               free_intf_descr(intf_descr);
+       }
+
+       return rc;
+}
 
 int lustre_lnet_config_lib_init(void)
 {
@@ -59,6 +337,11 @@ int lustre_lnet_config_lib_init(void)
                                LNET_DEV_MAJOR, LNET_DEV_MINOR);
 }
 
+void lustre_lnet_config_lib_uninit(void)
+{
+       unregister_ioc_dev(LNET_DEV_ID);
+}
+
 int lustre_lnet_config_ni_system(bool up, bool load_ni_from_mod,
                                 int seq_no, struct cYAML **err_rc)
 {
@@ -92,6 +375,185 @@ int lustre_lnet_config_ni_system(bool up, bool load_ni_from_mod,
        return rc;
 }
 
+static lnet_nid_t *allocate_create_nid_array(char **nids, __u32 num_nids,
+                                            char *err_str)
+{
+       lnet_nid_t *array = NULL;
+       __u32 i;
+
+       if (!nids) {
+               snprintf(err_str, LNET_MAX_STR_LEN, "no NIDs to add");
+               return NULL;
+       }
+
+       /* calloc(nmemb, size): count first so the product is
+        * overflow-checked by the allocator */
+       array = calloc(num_nids, sizeof(*array));
+       if (array == NULL) {
+               snprintf(err_str, LNET_MAX_STR_LEN, "out of memory");
+               return NULL;
+       }
+
+       for (i = 0; i < num_nids; i++) {
+               array[i] = libcfs_str2nid(nids[i]);
+               if (array[i] == LNET_NID_ANY) {
+                       free(array);
+                       snprintf(err_str, LNET_MAX_STR_LEN,
+                                "bad NID: '%s'",
+                                nids[i]);
+                       return NULL;
+               }
+       }
+
+       return array;
+}
+
+static int dispatch_peer_ni_cmd(lnet_nid_t pnid, lnet_nid_t nid, __u32 cmd,
+                               struct lnet_ioctl_peer_cfg *data,
+                               char *err_str, char *cmd_str)
+{
+       int rc;
+
+       data->prcfg_prim_nid = pnid;
+       data->prcfg_cfg_nid = nid;
+
+       rc = l_ioctl(LNET_DEV_ID, cmd, data);
+       if (rc != 0) {
+               rc = -errno;
+               snprintf(err_str,
+                       LNET_MAX_STR_LEN,
+                       "\"cannot %s peer ni: %s\"",
+                       (cmd_str) ? cmd_str : "add", strerror(errno));
+       }
+
+       return rc;
+}
+
+int lustre_lnet_config_peer_nid(char *pnid, char **nid, int num_nids,
+                               bool mr, int seq_no, struct cYAML **err_rc)
+{
+       struct lnet_ioctl_peer_cfg data;
+       lnet_nid_t prim_nid = LNET_NID_ANY;
+       int rc = LUSTRE_CFG_RC_NO_ERR;
+       int idx = 0;
+       bool nid0_used = false;
+       char err_str[LNET_MAX_STR_LEN] = {0};
+       lnet_nid_t *nids = allocate_create_nid_array(nid, num_nids, err_str);
+
+       if (pnid) {
+               prim_nid = libcfs_str2nid(pnid);
+               if (prim_nid == LNET_NID_ANY) {
+                       snprintf(err_str, sizeof(err_str),
+                                "bad key NID: '%s'",
+                                pnid);
+                       rc = LUSTRE_CFG_RC_MISSING_PARAM;
+                       goto out;
+               }
+       } else if (!nids || nids[0] == LNET_NID_ANY) {
+               snprintf(err_str, sizeof(err_str),
+                        "no NIDs provided for configuration");
+               rc = LUSTRE_CFG_RC_MISSING_PARAM;
+               goto out;
+       } else {
+               prim_nid = LNET_NID_ANY;
+       }
+
+       snprintf(err_str, sizeof(err_str), "\"Success\"");
+
+       LIBCFS_IOC_INIT_V2(data, prcfg_hdr);
+       data.prcfg_mr = mr;
+
+       /*
+        * if prim_nid is not specified use the first nid in the list of
+        * nids provided as the prim_nid. NOTE: on entering 'if' we must
+        * have at least 1 NID
+        */
+       if (prim_nid == LNET_NID_ANY) {
+               nid0_used = true;
+               prim_nid = nids[0];
+       }
+
+       /* Create the prim_nid first */
+       rc = dispatch_peer_ni_cmd(prim_nid, LNET_NID_ANY,
+                                 IOC_LIBCFS_ADD_PEER_NI,
+                                 &data, err_str, "add");
+
+       if (rc != 0)
+               goto out;
+
+       /* add the rest of the nids to the key nid if any are available */
+       for (idx = nid0_used ? 1 : 0 ; nids && idx < num_nids; idx++) {
+               /*
+                * If prim_nid is not provided then the first nid in the
+                * list becomes the prim_nid. First time round the loop use
+                * LNET_NID_ANY for the first parameter, then use nid[0]
+                * as the key nid after wards
+                */
+               rc = dispatch_peer_ni_cmd(prim_nid, nids[idx],
+                                         IOC_LIBCFS_ADD_PEER_NI, &data,
+                                         err_str, "add");
+
+               if (rc != 0)
+                       goto out;
+       }
+
+out:
+       if (nids != NULL)
+               free(nids);
+       cYAML_build_error(rc, seq_no, ADD_CMD, "peer_ni", err_str, err_rc);
+       return rc;
+}
+
+int lustre_lnet_del_peer_nid(char *pnid, char **nid, int num_nids,
+                            int seq_no, struct cYAML **err_rc)
+{
+       struct lnet_ioctl_peer_cfg data;
+       lnet_nid_t prim_nid;
+       int rc = LUSTRE_CFG_RC_NO_ERR;
+       int idx = 0;
+       char err_str[LNET_MAX_STR_LEN] = {0};
+       lnet_nid_t *nids = allocate_create_nid_array(nid, num_nids, err_str);
+
+       if (pnid == NULL) {
+               snprintf(err_str, sizeof(err_str),
+                        "\"Primary nid is not provided\"");
+               rc = LUSTRE_CFG_RC_MISSING_PARAM;
+               goto out;
+       } else {
+               prim_nid = libcfs_str2nid(pnid);
+               if (prim_nid == LNET_NID_ANY) {
+                       rc = LUSTRE_CFG_RC_BAD_PARAM;
+                       snprintf(err_str, sizeof(err_str),
+                                "bad key NID: '%s'",
+                                pnid);
+                       goto out;
+               }
+       }
+
+       snprintf(err_str, sizeof(err_str), "\"Success\"");
+
+       LIBCFS_IOC_INIT_V2(data, prcfg_hdr);
+       if (!nids || nids[0] == LNET_NID_ANY) {
+               rc = dispatch_peer_ni_cmd(prim_nid, LNET_NID_ANY,
+                                         IOC_LIBCFS_DEL_PEER_NI,
+                                         &data, err_str, "del");
+               goto out;
+       }
+
+       for (idx = 0; nids && idx < num_nids; idx++) {
+               rc = dispatch_peer_ni_cmd(prim_nid, nids[idx],
+                                         IOC_LIBCFS_DEL_PEER_NI, &data,
+                                         err_str, "del");
+
+               if (rc != 0)
+                       goto out;
+       }
+
+out:
+       if (nids != NULL)
+               free(nids);
+       cYAML_build_error(rc, seq_no, DEL_CMD, "peer_ni", err_str, err_rc);
+       return rc;
+}
+
 int lustre_lnet_config_route(char *nw, char *gw, int hops, int prio,
                             int seq_no, struct cYAML **err_rc)
 {
@@ -440,167 +902,646 @@ out:
        return rc;
 }
 
-int lustre_lnet_config_net(char *net, char *intf, char *ip2net,
-                          int peer_to, int peer_cr, int peer_buf_cr,
-                          int credits, char *smp, int seq_no,
-                          struct lnet_ioctl_config_lnd_tunables *lnd_tunables,
-                          struct cYAML **err_rc)
+static int socket_intf_query(int request, char *intf,
+                            struct ifreq *ifr)
 {
-       struct lnet_ioctl_config_lnd_tunables *lnd = NULL;
-       struct lnet_ioctl_config_data *data;
-       size_t ioctl_size = sizeof(*data);
-       char buf[LNET_MAX_STR_LEN];
-       int rc = LUSTRE_CFG_RC_NO_ERR;
-       char err_str[LNET_MAX_STR_LEN];
+       int rc;
+       int sockfd;
 
-       snprintf(err_str, sizeof(err_str), "\"success\"");
+       if (strlen(intf) >= IFNAMSIZ || ifr == NULL)
+               return LUSTRE_CFG_RC_BAD_PARAM;
 
-       /* No need to register lo */
-       if (net != NULL && !strcmp(net, "lo"))
-               return 0;
+       sockfd = socket(AF_INET, SOCK_DGRAM, 0);
+       if (sockfd < 0)
+               return LUSTRE_CFG_RC_BAD_PARAM;
 
-       if (ip2net == NULL && (intf == NULL || net == NULL)) {
-               snprintf(err_str,
-                        sizeof(err_str),
-                        "\"mandatory parameter '%s' not specified."
-                        " Optionally specify ip2net parameter\"",
-                        (intf == NULL && net == NULL) ? "net, if" :
-                        (intf == NULL) ? "if" : "net");
-               rc = LUSTRE_CFG_RC_MISSING_PARAM;
-               goto out;
-       }
+       strcpy(ifr->ifr_name, intf);
+       rc = ioctl(sockfd, request, ifr);
+       /* close on both paths - sockfd was leaked on every call.
+        * NOTE(review): close() needs <unistd.h>; confirm it is in the
+        * include list of this file. */
+       close(sockfd);
+       if (rc != 0)
+               return LUSTRE_CFG_RC_BAD_PARAM;
 
-       if (peer_to != -1 && peer_to <= 0) {
-               snprintf(err_str,
-                        sizeof(err_str),
-                        "\"peer timeout %d, must be greater than 0\"",
-                        peer_to);
-               rc = LUSTRE_CFG_RC_OUT_OF_RANGE_PARAM;
-               goto out;
-       }
+       return 0;
+}
 
-       if (ip2net != NULL && strlen(ip2net) >= sizeof(buf)) {
-               snprintf(err_str,
-                        sizeof(err_str),
-                        "\"ip2net string too long %d\"",
-                               (int)strlen(ip2net));
-               rc = LUSTRE_CFG_RC_OUT_OF_RANGE_PARAM;
-               goto out;
-       }
+/*
+ * for each interface in the array of interfaces find the IP address of
+ * that interface, create its nid and add it to an array of NIDs.
+ * Stop if any of the interfaces is down
+ */
+static int lustre_lnet_intf2nids(struct lnet_dlc_network_descr *nw,
+                                lnet_nid_t **nids, __u32 *nnids)
+{
+       int i = 0, count = 0, rc;
+       struct ifreq ifr;
+       __u32 ip;
+       struct lnet_dlc_intf_descr *intf;
 
-       if (lnd_tunables != NULL)
-               ioctl_size += sizeof(*lnd_tunables);
+       if (nw == NULL || nids == NULL)
+               return LUSTRE_CFG_RC_BAD_PARAM;
 
-       data = calloc(1, ioctl_size);
-       if (data == NULL)
-               goto out;
+       list_for_each_entry(intf, &nw->nw_intflist, intf_on_network)
+               count++;
 
-       if (ip2net == NULL)
-               snprintf(buf, sizeof(buf) - 1, "%s(%s)%s",
-                       net, intf,
-                       (smp) ? smp : "");
+       *nids = calloc(count, sizeof(lnet_nid_t));
+       if (*nids == NULL)
+               return LUSTRE_CFG_RC_OUT_OF_MEM;
 
-       LIBCFS_IOC_INIT_V2(*data, cfg_hdr);
-       strncpy(data->cfg_config_u.cfg_net.net_intf,
-               (ip2net != NULL) ? ip2net : buf, sizeof(buf));
-       data->cfg_config_u.cfg_net.net_peer_timeout = peer_to;
-       data->cfg_config_u.cfg_net.net_peer_tx_credits = peer_cr;
-       data->cfg_config_u.cfg_net.net_peer_rtr_credits = peer_buf_cr;
-       data->cfg_config_u.cfg_net.net_max_tx_credits = credits;
-       /* Add in tunable settings if available */
-       if (lnd_tunables != NULL) {
-               lnd = (struct lnet_ioctl_config_lnd_tunables *)data->cfg_bulk;
+       list_for_each_entry(intf, &nw->nw_intflist, intf_on_network) {
+               memset(&ifr, 0, sizeof(ifr));
+               rc = socket_intf_query(SIOCGIFFLAGS, intf->intf_name, &ifr);
+               if (rc != 0)
+                       goto failed;
 
-               data->cfg_hdr.ioc_len = ioctl_size;
-               memcpy(lnd, lnd_tunables, sizeof(*lnd_tunables));
-       }
+               if ((ifr.ifr_flags & IFF_UP) == 0) {
+                       rc = LUSTRE_CFG_RC_BAD_PARAM;
+                       goto failed;
+               }
 
-       rc = l_ioctl(LNET_DEV_ID, IOC_LIBCFS_ADD_NET, data);
-       if (rc < 0) {
-               rc = -errno;
-               snprintf(err_str,
-                        sizeof(err_str),
-                        "\"cannot add network: %s\"", strerror(errno));
+               memset(&ifr, 0, sizeof(ifr));
+               rc = socket_intf_query(SIOCGIFADDR, intf->intf_name, &ifr);
+               if (rc != 0)
+                       goto failed;
+
+               ip = ((struct sockaddr_in *)&ifr.ifr_addr)->sin_addr.s_addr;
+               ip = bswap_32(ip);
+               (*nids)[i] = LNET_MKNID(nw->nw_id, ip);
+               i++;
        }
-       free(data);
 
-out:
-       cYAML_build_error(rc, seq_no, ADD_CMD, "net", err_str, err_rc);
+       *nnids = count;
+
+       return 0;
 
+failed:
+       free(*nids);
+       *nids = NULL;
        return rc;
 }
 
-int lustre_lnet_del_net(char *nw, int seq_no, struct cYAML **err_rc)
+/*
+ * called repeatedly until a match or no more ip range
+ * What do you have?
+ *     ip_range expression
+ *     interface list with all the interface names.
+ *     all the interfaces in the system.
+ *
+ *     try to match the ip_range expr to one of the interfaces' IPs in
+ *     the system. If we hit a match for an interface, check if that
+ *     interface name is in the list.
+ *
+ *     If there are more than one interface in the list, then make sure
+ *     that the IPs for all of these interfaces match the ip ranges
+ *     given.
+ *
+ *     for each interface in intf_list
+ *             look up the intf name in ifa
+ *             if not there then no match
+ *             check ip obtained from ifa against a match to any of the
+ *             ip_ranges given.
+ *             If no match, then fail
+ *
+ *     The result is that all the interfaces have to match.
+ */
+int lustre_lnet_match_ip_to_intf(struct ifaddrs *ifa,
+                                struct list_head *intf_list,
+                                struct list_head *ip_ranges)
 {
-       struct lnet_ioctl_config_data data;
-       __u32 net = LNET_NIDNET(LNET_NID_ANY);
-       int rc = LUSTRE_CFG_RC_NO_ERR;
-       char err_str[LNET_MAX_STR_LEN];
+       int rc;
+       __u32 ip;
+       struct lnet_dlc_intf_descr *intf_descr, *tmp;
+       struct ifaddrs *ifaddr = ifa;
+       struct lustre_lnet_ip_range_descr *ip_range;
+       int family;
+
+       /*
+        * if there are no explicit interfaces, and no ip ranges, then
+        * configure the first tcp interface we encounter.
+        */
+       if (list_empty(intf_list) && list_empty(ip_ranges)) {
+               for (ifaddr = ifa; ifaddr != NULL; ifaddr = ifaddr->ifa_next) {
+                       if (ifaddr->ifa_addr == NULL)
+                               continue;
+
+                       if ((ifaddr->ifa_flags & IFF_UP) == 0)
+                               continue;
+
+                       family = ifaddr->ifa_addr->sa_family;
+                       if (family == AF_INET &&
+                           strcmp(ifaddr->ifa_name, "lo") != 0) {
+                               rc = lustre_lnet_add_intf_descr
+                                       (intf_list, ifaddr->ifa_name,
+                                       strlen(ifaddr->ifa_name));
 
-       snprintf(err_str, sizeof(err_str), "\"success\"");
+                               if (rc != LUSTRE_CFG_RC_NO_ERR)
+                                       return rc;
 
-       if (nw == NULL) {
-               snprintf(err_str,
-                        sizeof(err_str),
-                        "\"missing mandatory parameter\"");
-               rc = LUSTRE_CFG_RC_MISSING_PARAM;
-               goto out;
+                               return LUSTRE_CFG_RC_MATCH;
+                       }
+               }
+               return LUSTRE_CFG_RC_NO_MATCH;
        }
 
-       net = libcfs_str2net(nw);
-       if (net == LNET_NIDNET(LNET_NID_ANY)) {
-               snprintf(err_str,
-                        sizeof(err_str),
-                        "\"cannot parse net '%s'\"", nw);
-               rc = LUSTRE_CFG_RC_BAD_PARAM;
-               goto out;
-       }
+       /*
+        * First interface which matches an IP pattern will be used
+        */
+       if (list_empty(intf_list)) {
+               /*
+                * no interfaces provided in the rule, but an ip range is
+                * provided, so try and match an interface to the ip
+                * range.
+                */
+               for (ifaddr = ifa; ifaddr != NULL; ifaddr = ifaddr->ifa_next) {
+                       if (ifaddr->ifa_addr == NULL)
+                               continue;
+
+                       if ((ifaddr->ifa_flags & IFF_UP) == 0)
+                               continue;
+
+                       family = ifaddr->ifa_addr->sa_family;
+                       if (family == AF_INET) {
+                               ip = ((struct sockaddr_in *)ifaddr->ifa_addr)->
+                                       sin_addr.s_addr;
+
+                               list_for_each_entry(ip_range, ip_ranges,
+                                                   ipr_entry) {
+                                       rc = cfs_ip_addr_match(bswap_32(ip),
+                                                       &ip_range->ipr_expr);
+                                       if (!rc)
+                                               continue;
+
+                                       rc = lustre_lnet_add_intf_descr
+                                         (intf_list, ifaddr->ifa_name,
+                                          strlen(ifaddr->ifa_name));
+
+                                       if (rc != LUSTRE_CFG_RC_NO_ERR)
+                                               return rc;
+                               }
+                       }
+               }
 
-       LIBCFS_IOC_INIT_V2(data, cfg_hdr);
-       data.cfg_net = net;
+               if (!list_empty(intf_list))
+                       return LUSTRE_CFG_RC_MATCH;
 
-       rc = l_ioctl(LNET_DEV_ID, IOC_LIBCFS_DEL_NET, &data);
-       if (rc != 0) {
-               rc = -errno;
-               snprintf(err_str,
-                        sizeof(err_str),
-                        "\"cannot delete network: %s\"", strerror(errno));
-               goto out;
+               return LUSTRE_CFG_RC_NO_MATCH;
        }
 
-out:
-       cYAML_build_error(rc, seq_no, DEL_CMD, "net", err_str, err_rc);
-
-       return rc;
-}
+       /*
+        * If an interface is explicitly specified the ip-range might or
+        * might not be specified. if specified the interface needs to match the
+        * ip-range. If no ip-range then the interfaces are
+        * automatically matched if they are all up.
+        * If > 1 interfaces all the interfaces must match for the NI to
+        * be configured.
+        */
+       list_for_each_entry_safe(intf_descr, tmp, intf_list, intf_on_network) {
+               for (ifaddr = ifa; ifaddr != NULL; ifaddr = ifaddr->ifa_next) {
+                       if (ifaddr->ifa_addr == NULL)
+                               continue;
+
+                       family = ifaddr->ifa_addr->sa_family;
+                       if (family == AF_INET &&
+                           strcmp(intf_descr->intf_name,
+                                  ifaddr->ifa_name) == 0)
+                               break;
+               }
+
+               if (ifaddr == NULL) {
+                       list_del(&intf_descr->intf_on_network);
+                       free_intf_descr(intf_descr);
+                       continue;
+               }
+
+               if ((ifaddr->ifa_flags & IFF_UP) == 0) {
+                       list_del(&intf_descr->intf_on_network);
+                       free_intf_descr(intf_descr);
+                       continue;
+               }
+
+               ip = ((struct sockaddr_in *)ifaddr->ifa_addr)->sin_addr.s_addr;
+
+               rc = 1;
+               list_for_each_entry(ip_range, ip_ranges, ipr_entry) {
+                       rc = cfs_ip_addr_match(bswap_32(ip), &ip_range->ipr_expr);
+                       if (rc)
+                               break;
+               }
+
+               if (!rc) {
+                       /* no match for this interface */
+                       list_del(&intf_descr->intf_on_network);
+                       free_intf_descr(intf_descr);
+               }
+       }
+
+       return LUSTRE_CFG_RC_MATCH;
+}
+
+int lustre_lnet_resolve_ip2nets_rule(struct lustre_lnet_ip2nets *ip2nets,
+                                    lnet_nid_t **nids, __u32 *nnids)
+{
+       struct ifaddrs *ifa;
+       int rc = LUSTRE_CFG_RC_NO_ERR;
+
+       rc = getifaddrs(&ifa);
+       if (rc < 0)
+               return -errno;
+
+       rc = lustre_lnet_match_ip_to_intf(ifa,
+                                         &ip2nets->ip2nets_net.nw_intflist,
+                                         &ip2nets->ip2nets_ip_ranges);
+       if (rc != LUSTRE_CFG_RC_MATCH) {
+               freeifaddrs(ifa);
+               return rc;
+       }
+
+       rc = lustre_lnet_intf2nids(&ip2nets->ip2nets_net, nids, nnids);
+       if (rc != LUSTRE_CFG_RC_NO_ERR) {
+               *nids = NULL;
+               *nnids = 0;
+       }
+
+       freeifaddrs(ifa);
+
+       return rc;
+}
+
+static int
+lustre_lnet_ioctl_config_ni(struct list_head *intf_list,
+                           struct lnet_ioctl_config_lnd_tunables *tunables,
+                           struct cfs_expr_list *global_cpts,
+                           lnet_nid_t *nids, char *err_str)
+{
+       char *data;
+       struct lnet_ioctl_config_ni *conf;
+       struct lnet_ioctl_config_lnd_tunables *tun = NULL;
+       int rc = LUSTRE_CFG_RC_NO_ERR, i = 0;
+       size_t len;
+       int count;
+       struct lnet_dlc_intf_descr *intf_descr;
+       __u32 *cpt_array;
+       struct cfs_expr_list *cpt_expr;
+
+       list_for_each_entry(intf_descr, intf_list,
+                           intf_on_network) {
+               if (i == 0 && tunables != NULL)
+                       len = sizeof(struct lnet_ioctl_config_ni) +
+                             sizeof(struct lnet_ioctl_config_lnd_tunables);
+               else
+                       len = sizeof(struct lnet_ioctl_config_ni);
+
+               data = calloc(1, len);
+               conf = (struct lnet_ioctl_config_ni*) data;
+               if (i == 0 && tunables != NULL)
+                       tun = (struct lnet_ioctl_config_lnd_tunables*)
+                               conf->lic_bulk;
+
+               LIBCFS_IOC_INIT_V2(*conf, lic_cfg_hdr);
+               conf->lic_cfg_hdr.ioc_len = len;
+               conf->lic_nid = nids[i];
+               strncpy(conf->lic_ni_intf[0], intf_descr->intf_name,
+                       LNET_MAX_STR_LEN);
+
+               if (intf_descr->cpt_expr != NULL)
+                       cpt_expr = intf_descr->cpt_expr;
+               else if (global_cpts != NULL)
+                       cpt_expr = global_cpts;
+               else
+                       cpt_expr = NULL;
+
+               if (cpt_expr != NULL) {
+                       count = cfs_expr_list_values(cpt_expr,
+                                                    LNET_MAX_SHOW_NUM_CPT,
+                                                    &cpt_array);
+                       if (count > 0) {
+                               memcpy(conf->lic_cpts, cpt_array,
+                                      sizeof(cpt_array[0]) * LNET_MAX_STR_LEN);
+                               free(cpt_array);
+                       } else {
+                               count = 0;
+                       }
+               } else {
+                       count = 0;
+               }
+
+               conf->lic_ncpts = count;
+
+               if (i == 0 && tunables != NULL)
+                       /* TODO put in the LND tunables */
+                       memcpy(tun, tunables, sizeof(*tunables));
+
+               rc = l_ioctl(LNET_DEV_ID, IOC_LIBCFS_ADD_LOCAL_NI, data);
+               if (rc < 0) {
+                       rc = -errno;
+                       snprintf(err_str,
+                                LNET_MAX_STR_LEN,
+                                "\"cannot add network: %s\"", strerror(errno));
+                       return rc;
+               }
+               i++;
+       }
+
+       return LUSTRE_CFG_RC_NO_ERR;
+}
+
+int
+lustre_lnet_config_ip2nets(struct lustre_lnet_ip2nets *ip2nets,
+                          struct lnet_ioctl_config_lnd_tunables *tunables,
+                          struct cfs_expr_list *global_cpts,
+                          int seq_no, struct cYAML **err_rc)
+{
+       lnet_nid_t *nids = NULL;
+       __u32 nnids = 0;
+       int rc;
+       char err_str[LNET_MAX_STR_LEN];
+
+       snprintf(err_str, sizeof(err_str), "\"success\"");
+
+       if (!ip2nets) {
+               snprintf(err_str,
+                        sizeof(err_str),
+                        "\"incomplete ip2nets information\"");
+               rc = LUSTRE_CFG_RC_BAD_PARAM;
+               goto out;
+       }
+
+       rc = lustre_lnet_resolve_ip2nets_rule(ip2nets, &nids, &nnids);
+       if (rc != LUSTRE_CFG_RC_NO_ERR && rc != LUSTRE_CFG_RC_MATCH) {
+               snprintf(err_str,
+                        sizeof(err_str),
+                        "\"cannot resolve ip2nets rule\"");
+               goto out;
+       }
+
+       if (list_empty(&ip2nets->ip2nets_net.nw_intflist)) {
+               snprintf(err_str, sizeof(err_str),
+                        "\"no interfaces match ip2nets rules\"");
+               goto out;
+       }
+
+       rc = lustre_lnet_ioctl_config_ni(&ip2nets->ip2nets_net.nw_intflist,
+                                        tunables, global_cpts, nids,
+                                        err_str);
+       if (rc != LUSTRE_CFG_RC_NO_ERR)
+               free(nids);
+
+out:
+       cYAML_build_error(rc, seq_no, ADD_CMD, "ip2nets", err_str, err_rc);
+       return rc;
+}
+
+int lustre_lnet_config_ni(struct lnet_dlc_network_descr *nw_descr,
+                         struct cfs_expr_list *global_cpts,
+                         char *ip2net,
+                         struct lnet_ioctl_config_lnd_tunables *tunables,
+                         int seq_no, struct cYAML **err_rc)
+{
+       char *data = NULL;
+       struct lnet_ioctl_config_ni *conf;
+       struct lnet_ioctl_config_lnd_tunables *tun = NULL;
+       char buf[LNET_MAX_STR_LEN];
+       int rc = LUSTRE_CFG_RC_NO_ERR;
+       char err_str[LNET_MAX_STR_LEN];
+       lnet_nid_t *nids = NULL;
+       __u32 nnids = 0;
+       size_t len;
+       int count;
+       struct lnet_dlc_intf_descr *intf_descr, *tmp;
+       __u32 *cpt_array;
+
+       snprintf(err_str, sizeof(err_str), "\"success\"");
+
+       if (ip2net == NULL && nw_descr == NULL) {
+               snprintf(err_str,
+                        sizeof(err_str),
+                        "\"mandatory parameters not specified.\"");
+               rc = LUSTRE_CFG_RC_MISSING_PARAM;
+               goto out;
+       }
+
+       if (ip2net != NULL && strlen(ip2net) >= sizeof(buf)) {
+               snprintf(err_str,
+                        sizeof(err_str),
+                        "\"ip2net string too long %d\"",
+                               (int)strlen(ip2net));
+               rc = LUSTRE_CFG_RC_OUT_OF_RANGE_PARAM;
+               goto out;
+       }
+
+       if (ip2net != NULL) {
+               if (tunables != NULL)
+                       len = sizeof(struct lnet_ioctl_config_ni) +
+                             sizeof(struct lnet_ioctl_config_lnd_tunables);
+               else
+                       len = sizeof(struct lnet_ioctl_config_ni);
+               data = calloc(1, len);
+               conf = (struct lnet_ioctl_config_ni*) data;
+               if (tunables != NULL)
+                       tun = (struct lnet_ioctl_config_lnd_tunables*)
+                               (data + sizeof(*conf));
+
+               LIBCFS_IOC_INIT_V2(*conf, lic_cfg_hdr);
+               conf->lic_cfg_hdr.ioc_len = len;
+               strncpy(conf->lic_legacy_ip2nets, ip2net,
+                       LNET_MAX_STR_LEN);
+
+               if (global_cpts != NULL) {
+                       count = cfs_expr_list_values(global_cpts,
+                                                    LNET_MAX_SHOW_NUM_CPT,
+                                                    &cpt_array);
+                       if (count > 0) {
+                               memcpy(conf->lic_cpts, cpt_array,
+                                      sizeof(cpt_array[0]) * LNET_MAX_STR_LEN);
+                               free(cpt_array);
+                       } else {
+                               count = 0;
+                       }
+               } else {
+                       count = 0;
+               }
+
+               conf->lic_ncpts = count;
+
+               if (tunables != NULL)
+                       memcpy(tun, tunables, sizeof(*tunables));
+
+               rc = l_ioctl(LNET_DEV_ID, IOC_LIBCFS_ADD_LOCAL_NI, data);
+               if (rc < 0) {
+                       rc = -errno;
+                       snprintf(err_str,
+                               sizeof(err_str),
+                               "\"cannot add network: %s\"", strerror(errno));
+                       goto out;
+               }
+
+               goto out;
+       }
+
+       if (LNET_NETTYP(nw_descr->nw_id) == LOLND)
+               return LUSTRE_CFG_RC_NO_ERR;
+
+       if (nw_descr->nw_id == LNET_NIDNET(LNET_NID_ANY)) {
+               snprintf(err_str,
+                       sizeof(err_str),
+                       "\"cannot parse net '%s'\"",
+                       libcfs_net2str(nw_descr->nw_id));
+               rc = LUSTRE_CFG_RC_BAD_PARAM;
+               goto out;
+       }
+
+       if (list_empty(&nw_descr->nw_intflist)) {
+               snprintf(err_str,
+                       sizeof(err_str),
+                       "\"no interface name provided\"");
+               rc = LUSTRE_CFG_RC_BAD_PARAM;
+               goto out;
+       }
+
+       rc = lustre_lnet_intf2nids(nw_descr, &nids, &nnids);
+       if (rc != 0) {
+               snprintf(err_str, sizeof(err_str),
+                        "\"bad parameter\"");
+               rc = LUSTRE_CFG_RC_BAD_PARAM;
+               goto out;
+       }
+
+       rc = lustre_lnet_ioctl_config_ni(&nw_descr->nw_intflist,
+                                        tunables, global_cpts, nids,
+                                        err_str);
+
+out:
+       if (nw_descr != NULL) {
+               list_for_each_entry_safe(intf_descr, tmp,
+                                        &nw_descr->nw_intflist,
+                                        intf_on_network) {
+                       list_del(&intf_descr->intf_on_network);
+                       free_intf_descr(intf_descr);
+               }
+       }
+
+       cYAML_build_error(rc, seq_no, ADD_CMD, "net", err_str, err_rc);
+
+       if (nids)
+               free(nids);
+
+       if (data)
+               free(data);
+
+       return rc;
+}
+
+int lustre_lnet_del_ni(struct lnet_dlc_network_descr *nw_descr,
+                      int seq_no, struct cYAML **err_rc)
+{
+       struct lnet_ioctl_config_ni data;
+       int rc = LUSTRE_CFG_RC_NO_ERR, i;
+       char err_str[LNET_MAX_STR_LEN];
+       lnet_nid_t *nids = NULL;
+       __u32 nnids = 0;
+       struct lnet_dlc_intf_descr *intf_descr, *tmp;
+
+       if (LNET_NETTYP(nw_descr->nw_id) == LOLND)
+               return LUSTRE_CFG_RC_NO_ERR;
+
+       snprintf(err_str, sizeof(err_str), "\"success\"");
+
+       if (nw_descr == NULL) {
+               snprintf(err_str,
+                        sizeof(err_str),
+                        "\"missing mandatory parameter\"");
+               rc = LUSTRE_CFG_RC_MISSING_PARAM;
+               goto out;
+       }
+
+       if (nw_descr->nw_id == LNET_NIDNET(LNET_NID_ANY)) {
+               snprintf(err_str,
+                        sizeof(err_str),
+                        "\"cannot parse net '%s'\"",
+                        libcfs_net2str(nw_descr->nw_id));
+               rc = LUSTRE_CFG_RC_BAD_PARAM;
+               goto out;
+       }
+
+       rc = lustre_lnet_intf2nids(nw_descr, &nids, &nnids);
+       if (rc != 0) {
+               snprintf(err_str, sizeof(err_str),
+                        "\"bad parameter\"");
+               rc = LUSTRE_CFG_RC_BAD_PARAM;
+               goto out;
+       }
+
+       /*
+        * no interfaces just the nw_id is specified
+        */
+       if (nnids == 0) {
+               nids = calloc(1, sizeof(*nids));
+               if (nids == NULL) {
+                       snprintf(err_str, sizeof(err_str),
+                               "\"out of memory\"");
+                       rc = LUSTRE_CFG_RC_OUT_OF_MEM;
+                       goto out;
+               }
+               nids[0] = LNET_MKNID(nw_descr->nw_id, 0);
+               nnids = 1;
+       }
+
+       for (i = 0; i < nnids; i++) {
+               LIBCFS_IOC_INIT_V2(data, lic_cfg_hdr);
+               data.lic_nid = nids[i];
+
+               rc = l_ioctl(LNET_DEV_ID, IOC_LIBCFS_DEL_LOCAL_NI, &data);
+               if (rc < 0) {
+                       rc = -errno;
+                       snprintf(err_str,
+                               sizeof(err_str),
+                               "\"cannot del network: %s\"", strerror(errno));
+               }
+       }
+
+       list_for_each_entry_safe(intf_descr, tmp, &nw_descr->nw_intflist,
+                                intf_on_network) {
+               list_del(&intf_descr->intf_on_network);
+               free_intf_descr(intf_descr);
+       }
+
+out:
+       cYAML_build_error(rc, seq_no, DEL_CMD, "net", err_str, err_rc);
+
+       if (nids != NULL)
+               free(nids);
+
+       return rc;
+}
 
 int lustre_lnet_show_net(char *nw, int detail, int seq_no,
                         struct cYAML **show_rc, struct cYAML **err_rc)
 {
        char *buf;
-       struct lnet_ioctl_config_lnd_tunables *lnd_cfg;
-       struct lnet_ioctl_config_data *data;
-       struct lnet_ioctl_net_config *net_config;
+       struct lnet_ioctl_config_ni *ni_data;
+       struct lnet_ioctl_config_lnd_tunables *lnd;
+       struct lnet_ioctl_element_stats *stats;
        __u32 net = LNET_NIDNET(LNET_NID_ANY);
+       __u32 prev_net = LNET_NIDNET(LNET_NID_ANY);
        int rc = LUSTRE_CFG_RC_OUT_OF_MEM, i, j;
        int l_errno = 0;
-       struct cYAML *root = NULL, *tunables = NULL, *net_node = NULL,
-               *interfaces = NULL, *item = NULL, *first_seq = NULL;
+       struct cYAML *root = NULL, *tunables = NULL,
+               *net_node = NULL, *interfaces = NULL,
+               *item = NULL, *first_seq = NULL,
+               *tmp = NULL, *statistics = NULL;
        int str_buf_len = LNET_MAX_SHOW_NUM_CPT * 2;
        char str_buf[str_buf_len];
        char *pos;
        char err_str[LNET_MAX_STR_LEN];
-       bool exist = false;
-       size_t buf_len;
+       bool exist = false, new_net = true;
+       int net_num = 0;
+       size_t buf_size = sizeof(*ni_data) + sizeof(*lnd) + sizeof(*stats);
 
        snprintf(err_str, sizeof(err_str), "\"out of memory\"");
 
-       buf_len = sizeof(*data) + sizeof(*net_config) + sizeof(*lnd_cfg);
-       buf = calloc(1, buf_len);
+       buf = calloc(1, buf_size);
        if (buf == NULL)
                goto out;
 
-       data = (struct lnet_ioctl_config_data *)buf;
+       ni_data = (struct lnet_ioctl_config_ni *)buf;
 
        if (nw != NULL) {
                net = libcfs_str2net(nw);
@@ -623,116 +1564,158 @@ int lustre_lnet_show_net(char *nw, int detail, int seq_no,
 
        for (i = 0;; i++) {
                pos = str_buf;
+               __u32 rc_net;
 
-               memset(buf, 0, buf_len);
+               memset(buf, 0, buf_size);
 
-               LIBCFS_IOC_INIT_V2(*data, cfg_hdr);
+               LIBCFS_IOC_INIT_V2(*ni_data, lic_cfg_hdr);
                /*
                 * set the ioc_len to the proper value since INIT assumes
                 * size of data
                 */
-               data->cfg_hdr.ioc_len = buf_len;
-               data->cfg_count = i;
+               ni_data->lic_cfg_hdr.ioc_len = buf_size;
+               ni_data->lic_idx = i;
 
-               rc = l_ioctl(LNET_DEV_ID, IOC_LIBCFS_GET_NET, data);
+               rc = l_ioctl(LNET_DEV_ID, IOC_LIBCFS_GET_LOCAL_NI, ni_data);
                if (rc != 0) {
                        l_errno = errno;
                        break;
                }
 
+               rc_net = LNET_NIDNET(ni_data->lic_nid);
+
                /* filter on provided data */
                if (net != LNET_NIDNET(LNET_NID_ANY) &&
-                   net != LNET_NIDNET(data->cfg_nid))
+                   net != rc_net)
                        continue;
 
                /* default rc to -1 in case we hit the goto */
                rc = -1;
                exist = true;
 
-               net_config = (struct lnet_ioctl_net_config *)data->cfg_bulk;
+               stats = (struct lnet_ioctl_element_stats *)ni_data->lic_bulk;
+               lnd = (struct lnet_ioctl_config_lnd_tunables *)
+                       (ni_data->lic_bulk + sizeof(*stats));
+
+               if (rc_net != prev_net) {
+                       prev_net = rc_net;
+                       new_net = true;
+                       net_num++;
+               }
+
+               if (new_net) {
+                       if (!cYAML_create_string(net_node, "net type",
+                                                libcfs_net2str(rc_net)))
+                               goto out;
+
+                       tmp = cYAML_create_seq(net_node, "local NI(s)");
+                       if (tmp == NULL)
+                               goto out;
+                       new_net = false;
+               }
 
                /* create the tree to be printed. */
-               item = cYAML_create_seq_item(net_node);
+               item = cYAML_create_seq_item(tmp);
                if (item == NULL)
                        goto out;
 
                if (first_seq == NULL)
                        first_seq = item;
 
-               if (cYAML_create_string(item, "net",
-                                       libcfs_net2str(
-                                               LNET_NIDNET(data->cfg_nid)))
-                   == NULL)
-                       goto out;
-
                if (cYAML_create_string(item, "nid",
-                                       libcfs_nid2str(data->cfg_nid)) == NULL)
+                                       libcfs_nid2str(ni_data->lic_nid)) == NULL)
                        goto out;
 
-               if (cYAML_create_string(item, "status",
-                                       (net_config->ni_status ==
+               if (cYAML_create_string(item,
+                                       "status",
+                                       (ni_data->lic_status ==
                                          LNET_NI_STATUS_UP) ?
                                            "up" : "down") == NULL)
                        goto out;
 
                /* don't add interfaces unless there is at least one
                 * interface */
-               if (strlen(net_config->ni_interfaces[0]) > 0) {
+               if (strlen(ni_data->lic_ni_intf[0]) > 0) {
                        interfaces = cYAML_create_object(item, "interfaces");
                        if (interfaces == NULL)
                                goto out;
 
                        for (j = 0; j < LNET_MAX_INTERFACES; j++) {
-                               if (lustre_interface_show_net(interfaces, j,
-                                                             detail, data,
-                                                             net_config) < 0)
-                                       goto out;
+                               if (strlen(ni_data->lic_ni_intf[j]) > 0) {
+                                       snprintf(str_buf,
+                                                sizeof(str_buf), "%d", j);
+                                       if (cYAML_create_string(interfaces,
+                                               str_buf,
+                                               ni_data->lic_ni_intf[j]) ==
+                                                       NULL)
+                                               goto out;
+                               }
                        }
                }
 
                if (detail) {
                        char *limit;
 
+                       statistics = cYAML_create_object(item, "statistics");
+                       if (statistics == NULL)
+                               goto out;
+
+                       if (cYAML_create_number(statistics, "send_count",
+                                               stats->send_count)
+                                                       == NULL)
+                               goto out;
+
+                       if (cYAML_create_number(statistics, "recv_count",
+                                               stats->recv_count)
+                                                       == NULL)
+                               goto out;
+
+                       if (cYAML_create_number(statistics, "drop_count",
+                                               stats->drop_count)
+                                                       == NULL)
+                               goto out;
+
                        tunables = cYAML_create_object(item, "tunables");
-                       if (tunables == NULL)
+                       if (!tunables)
+                               goto out;
+
+                       rc = lustre_net_show_tunables(tunables, &lnd->lt_cmn);
+                       if (rc != LUSTRE_CFG_RC_NO_ERR)
                                goto out;
 
-                       if (cYAML_create_number(tunables, "peer_timeout",
-                                               data->cfg_config_u.cfg_net.
-                                               net_peer_timeout) == NULL)
+                       tunables = cYAML_create_object(item, "lnd tunables");
+                       if (tunables == NULL)
                                goto out;
 
-                       if (cYAML_create_number(tunables, "peer_credits",
-                                               data->cfg_config_u.cfg_net.
-                                               net_peer_tx_credits) == NULL)
+                       rc = lustre_ni_show_tunables(tunables, LNET_NETTYP(rc_net),
+                                                    &lnd->lt_tun);
+                       if (rc != LUSTRE_CFG_RC_NO_ERR)
                                goto out;
 
-                       if (cYAML_create_number(tunables,
-                                               "peer_buffer_credits",
-                                               data->cfg_config_u.cfg_net.
-                                               net_peer_rtr_credits) == NULL)
+                       if (cYAML_create_number(item, "tcp bonding",
+                                               ni_data->lic_tcp_bonding)
+                                                       == NULL)
                                goto out;
 
-                       if (cYAML_create_number(tunables, "credits",
-                                               data->cfg_config_u.cfg_net.
-                                               net_max_tx_credits) == NULL)
+                       if (cYAML_create_number(item, "dev cpt",
+                                               ni_data->lic_dev_cpt) == NULL)
                                goto out;
 
                        /* out put the CPTs in the format: "[x,x,x,...]" */
                        limit = str_buf + str_buf_len - 3;
                        pos += snprintf(pos, limit - pos, "\"[");
-                       for (j = 0 ; data->cfg_ncpts > 1 &&
-                               j < data->cfg_ncpts &&
+                       for (j = 0 ; ni_data->lic_ncpts >= 1 &&
+                               j < ni_data->lic_ncpts &&
                                pos < limit; j++) {
                                pos += snprintf(pos, limit - pos,
-                                               "%d", net_config->ni_cpts[j]);
-                               if ((j + 1) < data->cfg_ncpts)
+                                               "%d", ni_data->lic_cpts[j]);
+                               if ((j + 1) < ni_data->lic_ncpts)
                                        pos += snprintf(pos, limit - pos, ",");
                        }
                        pos += snprintf(pos, 3, "]\"");
 
-                       if (data->cfg_ncpts > 1 &&
-                           cYAML_create_string(tunables, "CPT",
+                       if (ni_data->lic_ncpts >= 1 &&
+                           cYAML_create_string(item, "CPT",
                                                str_buf) == NULL)
                                goto out;
                }
@@ -811,6 +1794,40 @@ out:
        return rc;
 }
 
+int lustre_lnet_config_numa_range(int range, int seq_no, struct cYAML **err_rc)
+{
+       struct lnet_ioctl_numa_range data;
+       int rc = LUSTRE_CFG_RC_NO_ERR;
+       char err_str[LNET_MAX_STR_LEN];
+
+       snprintf(err_str, sizeof(err_str), "\"success\"");
+
+       if (range < 0) {
+               snprintf(err_str,
+                        sizeof(err_str),
+                        "\"range must be >= 0\"");
+               rc = LUSTRE_CFG_RC_OUT_OF_RANGE_PARAM;
+               goto out;
+       }
+
+       LIBCFS_IOC_INIT_V2(data, nr_hdr);
+       data.nr_range = range;
+
+       rc = l_ioctl(LNET_DEV_ID, IOC_LIBCFS_SET_NUMA_RANGE, &data);
+       if (rc != 0) {
+               rc = -errno;
+               snprintf(err_str,
+                        sizeof(err_str),
+                        "\"cannot configure buffers: %s\"", strerror(errno));
+               goto out;
+       }
+
+out:
+       cYAML_build_error(rc, seq_no, ADD_CMD, "numa_range", err_str, err_rc);
+
+       return rc;
+}
+
 int lustre_lnet_config_buffers(int tiny, int small, int large, int seq_no,
                               struct cYAML **err_rc)
 {
@@ -1002,20 +2019,35 @@ out:
        return rc;
 }
 
-int lustre_lnet_show_peer_credits(int seq_no, struct cYAML **show_rc,
-                                 struct cYAML **err_rc)
+int lustre_lnet_show_peer(char *knid, int detail, int seq_no,
+                         struct cYAML **show_rc, struct cYAML **err_rc)
 {
-       struct lnet_ioctl_peer peer_info;
+       /*
+        * TODO: This function is changing in a future patch to accommodate
+        * PEER_LIST and proper filtering on any nid of the peer
+        */
+       struct lnet_ioctl_peer_cfg *peer_info;
+       struct lnet_peer_ni_credit_info *lpni_cri;
+       struct lnet_ioctl_element_stats *lpni_stats;
        int rc = LUSTRE_CFG_RC_OUT_OF_MEM, ncpt = 0, i = 0, j = 0;
        int l_errno = 0;
-       struct cYAML *root = NULL, *peer = NULL, *first_seq = NULL,
-                    *peer_root = NULL;
+       struct cYAML *root = NULL, *peer = NULL, *peer_ni = NULL,
+                    *first_seq = NULL, *peer_root = NULL, *tmp = NULL;
        char err_str[LNET_MAX_STR_LEN];
-       bool ncpt_set = false;
+       lnet_nid_t prev_primary_nid = LNET_NID_ANY, primary_nid = LNET_NID_ANY;
+       int data_size = sizeof(*peer_info) + sizeof(*lpni_cri) +
+                       sizeof(*lpni_stats);
+       char *data = calloc(data_size, 1);
+       bool new_peer = true;
 
        snprintf(err_str, sizeof(err_str),
                 "\"out of memory\"");
 
+       if (data == NULL)
+               goto out;
+
+       peer_info = (struct lnet_ioctl_peer_cfg *)data;
+
        /* create struct cYAML root object */
        root = cYAML_create_object(NULL, NULL);
        if (root == NULL)
@@ -1025,84 +2057,128 @@ int lustre_lnet_show_peer_credits(int seq_no, struct cYAML **show_rc,
        if (peer_root == NULL)
                goto out;
 
+       if (knid != NULL)
+               primary_nid = libcfs_str2nid(knid);
+
        do {
                for (i = 0;; i++) {
-                       LIBCFS_IOC_INIT_V2(peer_info, pr_hdr);
-                       peer_info.pr_count = i;
-                       peer_info.pr_lnd_u.pr_peer_credits.cr_ncpt = j;
+                       memset(data, 0, data_size);
+                       LIBCFS_IOC_INIT_V2(*peer_info, prcfg_hdr);
+                       peer_info->prcfg_hdr.ioc_len = data_size;
+                       peer_info->prcfg_idx = i;
+
                        rc = l_ioctl(LNET_DEV_ID,
-                                    IOC_LIBCFS_GET_PEER_INFO, &peer_info);
+                                    IOC_LIBCFS_GET_PEER_NI, peer_info);
                        if (rc != 0) {
                                l_errno = errno;
                                break;
                        }
 
-                       if (ncpt_set != 0) {
-                               ncpt = peer_info.pr_lnd_u.pr_peer_credits.
-                                       cr_ncpt;
-                               ncpt_set = true;
-                       }
+                       if (primary_nid != LNET_NID_ANY &&
+                           primary_nid != peer_info->prcfg_prim_nid)
+                                       continue;
+
+                       lpni_cri = (struct lnet_peer_ni_credit_info*)peer_info->prcfg_bulk;
+                       lpni_stats = (struct lnet_ioctl_element_stats *)
+                                    (peer_info->prcfg_bulk +
+                                    sizeof(*lpni_cri));
 
                        peer = cYAML_create_seq_item(peer_root);
                        if (peer == NULL)
                                goto out;
 
+                       if (peer_info->prcfg_prim_nid != prev_primary_nid) {
+                               prev_primary_nid = peer_info->prcfg_prim_nid;
+                               new_peer = true;
+                       }
+
+                       if (new_peer) {
+                               lnet_nid_t pnid = peer_info->prcfg_prim_nid;
+                               if (cYAML_create_string(peer, "primary nid",
+                                                       libcfs_nid2str(pnid))
+                                   == NULL)
+                                       goto out;
+                               if (cYAML_create_string(peer, "Multi-Rail",
+                                                       peer_info->prcfg_mr ?
+                                                       "True" : "False")
+                                   == NULL)
+                                       goto out;
+                               tmp = cYAML_create_seq(peer, "peer ni");
+                               if (tmp == NULL)
+                                       goto out;
+                               new_peer = false;
+                       }
+
                        if (first_seq == NULL)
                                first_seq = peer;
 
-                       if (cYAML_create_string(peer, "nid",
-                                               libcfs_nid2str
-                                                (peer_info.pr_nid)) == NULL)
+                       peer_ni = cYAML_create_seq_item(tmp);
+                       if (peer_ni == NULL)
                                goto out;
 
-                       if (cYAML_create_string(peer, "state",
-                                               peer_info.pr_lnd_u.
-                                                 pr_peer_credits.
-                                                       cr_aliveness) ==
-                           NULL)
+                       if (cYAML_create_string(peer_ni, "nid",
+                                               libcfs_nid2str
+                                                (peer_info->prcfg_cfg_nid))
+                           == NULL)
                                goto out;
 
-                       if (cYAML_create_number(peer, "refcount",
-                                               peer_info.pr_lnd_u.
-                                                 pr_peer_credits.
-                                                       cr_refcount) == NULL)
+                       if (cYAML_create_string(peer_ni, "state",
+                                               lpni_cri->cr_aliveness)
+                           == NULL)
                                goto out;
 
-                       if (cYAML_create_number(peer, "max_ni_tx_credits",
-                                               peer_info.pr_lnd_u.
-                                                 pr_peer_credits.
-                                                   cr_ni_peer_tx_credits)
+                       if (!detail)
+                               continue;
+
+                       if (cYAML_create_number(peer_ni, "max_ni_tx_credits",
+                                               lpni_cri->cr_ni_peer_tx_credits)
                            == NULL)
                                goto out;
 
-                       if (cYAML_create_number(peer, "available_tx_credits",
-                                               peer_info.pr_lnd_u.
-                                                 pr_peer_credits.
-                                                   cr_peer_tx_credits)
+                       if (cYAML_create_number(peer_ni, "available_tx_credits",
+                                               lpni_cri->cr_peer_tx_credits)
                            == NULL)
                                goto out;
 
-                       if (cYAML_create_number(peer, "available_rtr_credits",
-                                               peer_info.pr_lnd_u.
-                                                 pr_peer_credits.
-                                                   cr_peer_rtr_credits)
+                       if (cYAML_create_number(peer_ni, "min_tx_credits",
+                                               lpni_cri->cr_peer_min_tx_credits)
                            == NULL)
                                goto out;
 
-                       if (cYAML_create_number(peer, "min_rtr_credits",
-                                               peer_info.pr_lnd_u.
-                                                 pr_peer_credits.
-                                                   cr_peer_min_rtr_credits)
+                       if (cYAML_create_number(peer_ni, "tx_q_num_of_buf",
+                                               lpni_cri->cr_peer_tx_qnob)
                            == NULL)
                                goto out;
 
-                       if (cYAML_create_number(peer, "tx_q_num_of_buf",
-                                               peer_info.pr_lnd_u.
-                                                 pr_peer_credits.
-                                                   cr_peer_tx_qnob)
+                       if (cYAML_create_number(peer_ni, "available_rtr_credits",
+                                               lpni_cri->cr_peer_rtr_credits)
                            == NULL)
                                goto out;
-               }
+
+                       if (cYAML_create_number(peer_ni, "min_rtr_credits",
+                                               lpni_cri->cr_peer_min_rtr_credits)
+                           == NULL)
+                               goto out;
+
+                       if (cYAML_create_number(peer_ni, "send_count",
+                                               lpni_stats->send_count)
+                           == NULL)
+                               goto out;
+
+                       if (cYAML_create_number(peer_ni, "recv_count",
+                                               lpni_stats->recv_count)
+                           == NULL)
+                               goto out;
+
+                       if (cYAML_create_number(peer_ni, "drop_count",
+                                               lpni_stats->drop_count)
+                           == NULL)
+                               goto out;
+
+                       if (cYAML_create_number(peer_ni, "refcount",
+                                               lpni_cri->cr_refcount) == NULL)
+                               goto out;
+               }
 
                if (l_errno != ENOENT) {
                        snprintf(err_str,
@@ -1132,7 +2208,7 @@ out:
                 * insert one.  Otherwise add to the one there
                 */
                show_node = cYAML_get_object_item(*show_rc,
-                                                 "peer_credits");
+                                                 "peer");
                if (show_node != NULL && cYAML_is_sequence(show_node)) {
                        cYAML_insert_child(show_node, first_seq);
                        free(peer_root);
@@ -1148,12 +2224,68 @@ out:
                *show_rc = root;
        }
 
-       cYAML_build_error(rc, seq_no, SHOW_CMD, "peer_credits", err_str,
+       cYAML_build_error(rc, seq_no, SHOW_CMD, "peer", err_str,
                          err_rc);
 
        return rc;
 }
 
+int lustre_lnet_show_numa_range(int seq_no, struct cYAML **show_rc,
+                               struct cYAML **err_rc)
+{
+       struct lnet_ioctl_numa_range data;
+       int rc = LUSTRE_CFG_RC_OUT_OF_MEM;
+       int l_errno;
+       char err_str[LNET_MAX_STR_LEN];
+       struct cYAML *root = NULL, *range = NULL;
+
+       snprintf(err_str, sizeof(err_str), "\"out of memory\"");
+
+       LIBCFS_IOC_INIT_V2(data, nr_hdr);
+
+       rc = l_ioctl(LNET_DEV_ID, IOC_LIBCFS_GET_NUMA_RANGE, &data);
+       if (rc != 0) {
+               l_errno = errno;
+               snprintf(err_str,
+                        sizeof(err_str),
+                        "\"cannot get numa range: %s\"",
+                        strerror(l_errno));
+               rc = -l_errno;
+               goto out;
+       }
+
+       root = cYAML_create_object(NULL, NULL);
+       if (root == NULL)
+               goto out;
+
+       range = cYAML_create_object(root, "numa");
+       if (range == NULL)
+               goto out;
+
+       if (cYAML_create_number(range, "range",
+                               data.nr_range) == NULL)
+               goto out;
+
+       if (show_rc == NULL)
+               cYAML_print_tree(root);
+
+       snprintf(err_str, sizeof(err_str), "\"success\"");
+out:
+       if (show_rc == NULL || rc != LUSTRE_CFG_RC_NO_ERR) {
+               cYAML_free_tree(root);
+       } else if (show_rc != NULL && *show_rc != NULL) {
+               cYAML_insert_sibling((*show_rc)->cy_child,
+                                       root->cy_child);
+               free(root);
+       } else {
+               *show_rc = root;
+       }
+
+       cYAML_build_error(rc, seq_no, SHOW_CMD, "numa", err_str, err_rc);
+
+       return rc;
+}
+
 int lustre_lnet_show_stats(int seq_no, struct cYAML **show_rc,
                           struct cYAML **err_rc)
 {
@@ -1273,79 +2405,466 @@ static int handle_yaml_config_route(struct cYAML *tree, struct cYAML **show_rc,
                                        err_rc);
 }
 
-static int handle_yaml_config_net(struct cYAML *tree, struct cYAML **show_rc,
-                                 struct cYAML **err_rc)
+static void yaml_free_string_array(char **array, int num)
+{
+       int i;
+       char **sub_array = array;
+
+       for (i = 0; i < num; i++) {
+               if (*sub_array != NULL)
+                       free(*sub_array);
+               sub_array++;
+       }
+       if (array)
+               free(array);
+}
+
+/*
+ *    interfaces:
+ *        0: <intf_name>['['<expr>']']
+ *        1: <intf_name>['['<expr>']']
+ */
+static int yaml_copy_intf_info(struct cYAML *intf_tree,
+                              struct lnet_dlc_network_descr *nw_descr)
+{
+       struct cYAML *child = NULL;
+       int intf_num = 0, rc = LUSTRE_CFG_RC_NO_ERR;
+       struct lnet_dlc_intf_descr *intf_descr, *tmp;
+
+       if (intf_tree == NULL || nw_descr == NULL)
+               return LUSTRE_CFG_RC_BAD_PARAM;
+
+       /* now grab all the interfaces and their cpts */
+       child = intf_tree->cy_child;
+       while (child != NULL) {
+               if (child->cy_valuestring == NULL) {
+                       child = child->cy_next;
+                       continue;
+               }
+
+               if (strlen(child->cy_valuestring) >= LNET_MAX_STR_LEN)
+                       goto failed;
+
+               rc = lustre_lnet_add_intf_descr(&nw_descr->nw_intflist,
+                                               child->cy_valuestring,
+                                               strlen(child->cy_valuestring));
+               if (rc != LUSTRE_CFG_RC_NO_ERR)
+                       goto failed;
+
+               intf_num++;
+               child = child->cy_next;
+       }
+
+       if (intf_num == 0)
+               return LUSTRE_CFG_RC_MISSING_PARAM;
+
+       return intf_num;
+
+failed:
+       list_for_each_entry_safe(intf_descr, tmp, &nw_descr->nw_intflist,
+                                intf_on_network) {
+               list_del(&intf_descr->intf_on_network);
+               free_intf_descr(intf_descr);
+       }
+
+       return rc;
+}
+
+static bool
+yaml_extract_cmn_tunables(struct cYAML *tree,
+                         struct lnet_ioctl_config_lnd_cmn_tunables *tunables,
+                         struct cfs_expr_list **global_cpts)
+{
+       struct cYAML *tun, *item, *smp;
+       int rc;
+
+       tun = cYAML_get_object_item(tree, "tunables");
+       if (tun != NULL) {
+               item = cYAML_get_object_item(tun, "peer_timeout");
+               if (item != NULL)
+                       tunables->lct_peer_timeout = item->cy_valueint;
+               item = cYAML_get_object_item(tun, "peer_credits");
+               if (item != NULL)
+                       tunables->lct_peer_tx_credits = item->cy_valueint;
+               item = cYAML_get_object_item(tun, "peer_buffer_credits");
+               if (item != NULL)
+                       tunables->lct_peer_rtr_credits = item->cy_valueint;
+               item = cYAML_get_object_item(tun, "credits");
+               if (item != NULL)
+                       tunables->lct_max_tx_credits = item->cy_valueint;
+               smp = cYAML_get_object_item(tun, "CPT");
+               if (smp != NULL) {
+                       rc = cfs_expr_list_parse(smp->cy_valuestring,
+                                                strlen(smp->cy_valuestring),
+                                                0, UINT_MAX, global_cpts);
+                       if (rc != 0)
+                               *global_cpts = NULL;
+               }
+
+               return true;
+       }
+
+       return false;
+}
+
+static bool
+yaml_extract_tunables(struct cYAML *tree,
+                     struct lnet_ioctl_config_lnd_tunables *tunables,
+                     struct cfs_expr_list **global_cpts,
+                     __u32 net_type)
+{
+       bool rc;
+
+       rc = yaml_extract_cmn_tunables(tree, &tunables->lt_cmn,
+                                      global_cpts);
+
+       if (!rc)
+               return rc;
+
+       lustre_yaml_extract_lnd_tunables(tree, net_type,
+                                        &tunables->lt_tun);
+
+       return rc;
+}
+
+/*
+ * net:
+ *    - net type: <net>[<NUM>]
+  *      local NI(s):
+ *        - nid: <ip>@<net>[<NUM>]
+ *          status: up
+ *          interfaces:
+ *               0: <intf_name>['['<expr>']']
+ *               1: <intf_name>['['<expr>']']
+ *        tunables:
+ *               peer_timeout: <NUM>
+ *               peer_credits: <NUM>
+ *               peer_buffer_credits: <NUM>
+ *               credits: <NUM>
+*         lnd tunables:
+ *               peercredits_hiw: <NUM>
+ *               map_on_demand: <NUM>
+ *               concurrent_sends: <NUM>
+ *               fmr_pool_size: <NUM>
+ *               fmr_flush_trigger: <NUM>
+ *               fmr_cache: <NUM>
+ *
+ * At least one interface is required. If no interfaces are provided the
+ * network interface can not be configured.
+ */
+static int handle_yaml_config_ni(struct cYAML *tree, struct cYAML **show_rc,
+                                struct cYAML **err_rc)
 {
-       struct cYAML *net, *intf, *tunables, *seq_no,
-             *peer_to = NULL, *peer_buf_cr = NULL, *peer_cr = NULL,
-             *credits = NULL, *ip2net = NULL, *smp = NULL, *child;
-       struct lnet_ioctl_config_lnd_tunables *lnd_tunables_p = NULL;
-       struct lnet_ioctl_config_lnd_tunables lnd_tunables;
-       char devs[LNET_MAX_STR_LEN];
-       char *loc = devs;
-       int size = LNET_MAX_STR_LEN;
-       int num;
-       bool intf_found = false;
+       struct cYAML *net, *intf, *seq_no, *ip2net = NULL, *local_nis = NULL,
+                    *item = NULL;
+       int num_entries = 0, rc;
+       struct lnet_dlc_network_descr nw_descr;
+       struct cfs_expr_list *global_cpts = NULL;
+       struct lnet_ioctl_config_lnd_tunables tunables;
+       bool found = false;
+
+       memset(&tunables, 0, sizeof(tunables));
+
+       INIT_LIST_HEAD(&nw_descr.network_on_rule);
+       INIT_LIST_HEAD(&nw_descr.nw_intflist);
 
        ip2net = cYAML_get_object_item(tree, "ip2net");
-       net = cYAML_get_object_item(tree, "net");
+       net = cYAML_get_object_item(tree, "net type");
+       if (net)
+               nw_descr.nw_id = libcfs_str2net(net->cy_valuestring);
+
+       /*
+        * if neither net nor ip2nets are present, then we can not
+        * configure the network.
+        */
+       if (!net && !ip2net)
+               return LUSTRE_CFG_RC_MISSING_PARAM;
+
+       local_nis = cYAML_get_object_item(tree, "local NI(s)");
+       if (local_nis == NULL)
+               return LUSTRE_CFG_RC_MISSING_PARAM;
+
+       if (!cYAML_is_sequence(local_nis))
+               return LUSTRE_CFG_RC_BAD_PARAM;
+
+       while (cYAML_get_next_seq_item(local_nis, &item) != NULL) {
+               intf = cYAML_get_object_item(item, "interfaces");
+               if (intf == NULL)
+                       continue;
+               num_entries = yaml_copy_intf_info(intf, &nw_descr);
+               if (num_entries <= 0) {
+                       cYAML_build_error(num_entries, -1, "ni", "add",
+                                       "bad interface list",
+                                       err_rc);
+                       return LUSTRE_CFG_RC_BAD_PARAM;
+               }
+       }
+
+       found = yaml_extract_tunables(tree, &tunables, &global_cpts,
+                                     LNET_NETTYP(nw_descr.nw_id));
+       seq_no = cYAML_get_object_item(tree, "seq_no");
+
+       rc = lustre_lnet_config_ni(&nw_descr,
+                                  global_cpts,
+                                  (ip2net) ? ip2net->cy_valuestring : NULL,
+                                  (found) ? &tunables: NULL,
+                                  (seq_no) ? seq_no->cy_valueint : -1,
+                                  err_rc);
+
+       if (global_cpts != NULL)
+               cfs_expr_list_free(global_cpts);
+
+       return rc;
+}
+
+/*
+ * ip2nets:
+ *  - net-spec: <tcp|o2ib|gni>[NUM]
+ *    interfaces:
+ *        0: <intf name>['['<expr>']']
+ *        1: <intf name>['['<expr>']']
+ *    ip-range:
+ *        0: <expr.expr.expr.expr>
+ *        1: <expr.expr.expr.expr>
+ */
+static int handle_yaml_config_ip2nets(struct cYAML *tree,
+                                     struct cYAML **show_rc,
+                                     struct cYAML **err_rc)
+{
+       struct cYAML *net, *ip_range, *item = NULL, *intf = NULL,
+                    *seq_no = NULL;
+       struct lustre_lnet_ip2nets ip2nets;
+       struct lustre_lnet_ip_range_descr *ip_range_descr = NULL,
+                                         *tmp = NULL;
+       int rc = LUSTRE_CFG_RC_NO_ERR;
+       struct cfs_expr_list *global_cpts = NULL;
+       struct cfs_expr_list *el, *el_tmp;
+       struct lnet_ioctl_config_lnd_tunables tunables;
+       struct lnet_dlc_intf_descr *intf_descr, *intf_tmp;
+       bool found = false;
+
+       memset(&tunables, 0, sizeof(tunables));
+
+       /* initialize all lists */
+       INIT_LIST_HEAD(&ip2nets.ip2nets_ip_ranges);
+       INIT_LIST_HEAD(&ip2nets.ip2nets_net.network_on_rule);
+       INIT_LIST_HEAD(&ip2nets.ip2nets_net.nw_intflist);
+
+       net = cYAML_get_object_item(tree, "net-spec");
+       if (net == NULL)
+               return LUSTRE_CFG_RC_BAD_PARAM;
+
+       if (net != NULL && net->cy_valuestring == NULL)
+               return LUSTRE_CFG_RC_BAD_PARAM;
+
+       /* assign the network id */
+       ip2nets.ip2nets_net.nw_id = libcfs_str2net(net->cy_valuestring);
+       if (ip2nets.ip2nets_net.nw_id == LNET_NID_ANY)
+               return LUSTRE_CFG_RC_BAD_PARAM;
+
+       seq_no = cYAML_get_object_item(tree, "seq_no");
+
        intf = cYAML_get_object_item(tree, "interfaces");
        if (intf != NULL) {
-               /* grab all the interfaces */
-               child = intf->cy_child;
-               while (child != NULL && size > 0) {
-                       struct cYAML *lnd_params;
-
-                       if (child->cy_valuestring == NULL)
-                               goto ignore_child;
-
-                       if (loc > devs)
-                               num  = snprintf(loc, size, ",%s",
-                                               child->cy_valuestring);
-                       else
-                               num = snprintf(loc, size, "%s",
-                                              child->cy_valuestring);
-                       size -= num;
-                       loc += num;
-                       intf_found = true;
-
-                       lnd_params = cYAML_get_object_item(intf,
-                                                          "lnd tunables");
-                       if (lnd_params != NULL) {
-                               const char *dev_name = child->cy_valuestring;
-                               lnd_tunables_p = &lnd_tunables;
-
-                               lustre_interface_parse(lnd_params, dev_name,
-                                                      lnd_tunables_p);
+               rc = yaml_copy_intf_info(intf, &ip2nets.ip2nets_net);
+               if (rc <= 0)
+                       return LUSTRE_CFG_RC_BAD_PARAM;
+       }
+
+       ip_range = cYAML_get_object_item(tree, "ip-range");
+       if (ip_range != NULL) {
+               item = ip_range->cy_child;
+               while (item != NULL) {
+                       if (item->cy_valuestring == NULL) {
+                               item = item->cy_next;
+                               continue;
                        }
-ignore_child:
-                       child = child->cy_next;
+
+                       rc = lustre_lnet_add_ip_range(&ip2nets.ip2nets_ip_ranges,
+                                                     item->cy_valuestring);
+
+                       if (rc != LUSTRE_CFG_RC_NO_ERR)
+                               goto out;
+
+                       item = item->cy_next;
+               }
+       }
+
+       found = yaml_extract_tunables(tree, &tunables, &global_cpts,
+                                     LNET_NETTYP(ip2nets.ip2nets_net.nw_id));
+
+       rc = lustre_lnet_config_ip2nets(&ip2nets,
+                       (found) ? &tunables : NULL,
+                       global_cpts,
+                       (seq_no) ? seq_no->cy_valueint : -1,
+                       err_rc);
+
+       /*
+        * don't stop because there was no match. Continue processing the
+        * rest of the rules. If non-match then nothing is configured
+        */
+       if (rc == LUSTRE_CFG_RC_NO_MATCH)
+               rc = LUSTRE_CFG_RC_NO_ERR;
+out:
+       list_for_each_entry_safe(intf_descr, intf_tmp,
+                                &ip2nets.ip2nets_net.nw_intflist,
+                                intf_on_network) {
+               list_del(&intf_descr->intf_on_network);
+               free_intf_descr(intf_descr);
+       }
+
+       list_for_each_entry_safe(ip_range_descr, tmp,
+                                &ip2nets.ip2nets_ip_ranges,
+                                ipr_entry) {
+               list_del(&ip_range_descr->ipr_entry);
+               list_for_each_entry_safe(el, el_tmp, &ip_range_descr->ipr_expr,
+                                        el_link) {
+                       list_del(&el->el_link);
+                       cfs_expr_list_free(el);
+               }
+               free(ip_range_descr);
+       }
+
+       return rc;
+}
+
+static int handle_yaml_del_ni(struct cYAML *tree, struct cYAML **show_rc,
+                             struct cYAML **err_rc)
+{
+       struct cYAML *net = NULL, *intf = NULL, *seq_no = NULL, *item = NULL,
+                    *local_nis = NULL;
+       int num_entries, rc;
+       struct lnet_dlc_network_descr nw_descr;
+
+       INIT_LIST_HEAD(&nw_descr.network_on_rule);
+       INIT_LIST_HEAD(&nw_descr.nw_intflist);
+
+       net = cYAML_get_object_item(tree, "net type");
+       if (net != NULL)
+               nw_descr.nw_id = libcfs_str2net(net->cy_valuestring);
+
+       local_nis = cYAML_get_object_item(tree, "local NI(s)");
+       if (local_nis == NULL)
+               return LUSTRE_CFG_RC_MISSING_PARAM;
+
+       if (!cYAML_is_sequence(local_nis))
+               return LUSTRE_CFG_RC_BAD_PARAM;
+
+       while (cYAML_get_next_seq_item(local_nis, &item) != NULL) {
+               intf = cYAML_get_object_item(item, "interfaces");
+               if (intf == NULL)
+                       continue;
+               num_entries = yaml_copy_intf_info(intf, &nw_descr);
+               if (num_entries <= 0) {
+                       cYAML_build_error(num_entries, -1, "ni", "add",
+                                       "bad interface list",
+                                       err_rc);
+                       return LUSTRE_CFG_RC_BAD_PARAM;
                }
        }
 
-       tunables = cYAML_get_object_item(tree, "tunables");
-       if (tunables != NULL) {
-               peer_to = cYAML_get_object_item(tunables, "peer_timeout");
-               peer_cr = cYAML_get_object_item(tunables, "peer_credits");
-               peer_buf_cr = cYAML_get_object_item(tunables,
-                                                   "peer_buffer_credits");
-               credits = cYAML_get_object_item(tunables, "credits");
-               smp = cYAML_get_object_item(tunables, "CPT");
+       seq_no = cYAML_get_object_item(tree, "seq_no");
+
+       rc = lustre_lnet_del_ni((net) ? &nw_descr : NULL,
+                               (seq_no) ? seq_no->cy_valueint : -1,
+                               err_rc);
+
+       return rc;
+}
+
+static int yaml_copy_peer_nids(struct cYAML *tree, char ***nidsppp)
+{
+       struct cYAML *nids_entry = NULL, *child = NULL, *entry = NULL;
+       char **nids = NULL;
+       int num = 0, rc = LUSTRE_CFG_RC_NO_ERR;
+
+       nids_entry = cYAML_get_object_item(tree, "peer ni");
+       if (cYAML_is_sequence(nids_entry)) {
+               while (cYAML_get_next_seq_item(nids_entry, &child))
+                       num++;
+       }
+
+       if (num == 0)
+               return LUSTRE_CFG_RC_MISSING_PARAM;
+
+       nids = calloc(sizeof(*nids) * num, 1);
+       if (nids == NULL)
+               return LUSTRE_CFG_RC_OUT_OF_MEM;
+
+       /* now grab all the nids */
+       num = 0;
+       child = NULL;
+       while (cYAML_get_next_seq_item(nids_entry, &child)) {
+               entry = cYAML_get_object_item(child, "nid");
+               if (!entry)
+                       continue;
+               nids[num] = calloc(strlen(entry->cy_valuestring) + 1, 1);
+               if (!nids[num]) {
+                       rc = LUSTRE_CFG_RC_OUT_OF_MEM;
+                       goto failed;
+               }
+               strncpy(nids[num], entry->cy_valuestring,
+                       strlen(entry->cy_valuestring));
+               num++;
        }
+       rc = num;
+
+       *nidsppp = nids;
+       return rc;
+
+failed:
+       if (nids != NULL)
+               yaml_free_string_array(nids, num);
+       *nidsppp = NULL;
+       return rc;
+}
+
+static int handle_yaml_config_peer(struct cYAML *tree, struct cYAML **show_rc,
+                                  struct cYAML **err_rc)
+{
+       char **nids = NULL;
+       int num, rc;
+       struct cYAML *seq_no, *prim_nid, *non_mr;
+
+       num = yaml_copy_peer_nids(tree, &nids);
+       if (num < 0)
+               return num;
+
        seq_no = cYAML_get_object_item(tree, "seq_no");
+       prim_nid = cYAML_get_object_item(tree, "primary nid");
+       non_mr = cYAML_get_object_item(tree, "non_mr");
+
+       rc = lustre_lnet_config_peer_nid((prim_nid) ? prim_nid->cy_valuestring : NULL,
+                                        nids, num,
+                                        (non_mr) ? false : true,
+                                        (seq_no) ? seq_no->cy_valueint : -1,
+                                        err_rc);
+
+       yaml_free_string_array(nids, num);
+       return rc;
+}
+
+static int handle_yaml_del_peer(struct cYAML *tree, struct cYAML **show_rc,
+                               struct cYAML **err_rc)
+{
+       char **nids = NULL;
+       int num, rc;
+       struct cYAML *seq_no, *prim_nid;
+
+       num = yaml_copy_peer_nids(tree, &nids);
+       if (num < 0)
+               return num;
 
-       return lustre_lnet_config_net((net) ? net->cy_valuestring : NULL,
-                                     (intf_found) ? devs : NULL,
-                                     (ip2net) ? ip2net->cy_valuestring : NULL,
-                                     (peer_to) ? peer_to->cy_valueint : -1,
-                                     (peer_cr) ? peer_cr->cy_valueint : -1,
-                                     (peer_buf_cr) ?
-                                       peer_buf_cr->cy_valueint : -1,
-                                     (credits) ? credits->cy_valueint : -1,
-                                     (smp) ? smp->cy_valuestring : NULL,
+       seq_no = cYAML_get_object_item(tree, "seq_no");
+       prim_nid = cYAML_get_object_item(tree, "primary nid");
+
+       rc = lustre_lnet_del_peer_nid((prim_nid) ? prim_nid->cy_valuestring : NULL,
+                                     nids, num,
                                      (seq_no) ? seq_no->cy_valueint : -1,
-                                     lnd_tunables_p,
                                      err_rc);
+
+       yaml_free_string_array(nids, num);
+       return rc;
 }
 
 static int handle_yaml_config_buffers(struct cYAML *tree,
@@ -1406,19 +2925,6 @@ static int handle_yaml_del_route(struct cYAML *tree, struct cYAML **show_rc,
                                     err_rc);
 }
 
-static int handle_yaml_del_net(struct cYAML *tree, struct cYAML **show_rc,
-                              struct cYAML **err_rc)
-{
-       struct cYAML *net, *seq_no;
-
-       net = cYAML_get_object_item(tree, "net");
-       seq_no = cYAML_get_object_item(tree, "seq_no");
-
-       return lustre_lnet_del_net((net) ? net->cy_valuestring : NULL,
-                                  (seq_no) ? seq_no->cy_valueint : -1,
-                                  err_rc);
-}
-
 static int handle_yaml_del_routing(struct cYAML *tree, struct cYAML **show_rc,
                                   struct cYAML **err_rc)
 {
@@ -1485,16 +2991,19 @@ static int handle_yaml_show_routing(struct cYAML *tree, struct cYAML **show_rc,
                                        show_rc, err_rc);
 }
 
-static int handle_yaml_show_credits(struct cYAML *tree, struct cYAML **show_rc,
-                                   struct cYAML **err_rc)
+static int handle_yaml_show_peers(struct cYAML *tree, struct cYAML **show_rc,
+                                 struct cYAML **err_rc)
 {
-       struct cYAML *seq_no;
+       struct cYAML *seq_no, *nid, *detail;
 
        seq_no = cYAML_get_object_item(tree, "seq_no");
+       detail = cYAML_get_object_item(tree, "detail");
+       nid = cYAML_get_object_item(tree, "nid");
 
-       return lustre_lnet_show_peer_credits((seq_no) ?
-                                               seq_no->cy_valueint : -1,
-                                            show_rc, err_rc);
+       return lustre_lnet_show_peer((nid) ? nid->cy_valuestring : NULL,
+                                    (detail) ? detail->cy_valueint : 0,
+                                    (seq_no) ? seq_no->cy_valueint : -1,
+                                    show_rc, err_rc);
 }
 
 static int handle_yaml_show_stats(struct cYAML *tree, struct cYAML **show_rc,
@@ -1508,6 +3017,41 @@ static int handle_yaml_show_stats(struct cYAML *tree, struct cYAML **show_rc,
                                      show_rc, err_rc);
 }
 
+static int handle_yaml_config_numa(struct cYAML *tree, struct cYAML **show_rc,
+                                 struct cYAML **err_rc)
+{
+       struct cYAML *seq_no, *range;
+
+       seq_no = cYAML_get_object_item(tree, "seq_no");
+       range = cYAML_get_object_item(tree, "range");
+
+       return lustre_lnet_config_numa_range(range ? range->cy_valueint : -1,
+                                            seq_no ? seq_no->cy_valueint : -1,
+                                            err_rc);
+}
+
+static int handle_yaml_del_numa(struct cYAML *tree, struct cYAML **show_rc,
+                              struct cYAML **err_rc)
+{
+       struct cYAML *seq_no;
+
+       seq_no = cYAML_get_object_item(tree, "seq_no");
+
+       return lustre_lnet_config_numa_range(0, seq_no ? seq_no->cy_valueint : -1,
+                                            err_rc);
+}
+
+static int handle_yaml_show_numa(struct cYAML *tree, struct cYAML **show_rc,
+                               struct cYAML **err_rc)
+{
+       struct cYAML *seq_no;
+
+       seq_no = cYAML_get_object_item(tree, "seq_no");
+
+       return lustre_lnet_show_numa_range(seq_no ? seq_no->cy_valueint : -1,
+                                          show_rc, err_rc);
+}
+
 struct lookup_cmd_hdlr_tbl {
        char *name;
        cmd_handler_t cb;
@@ -1515,16 +3059,21 @@ struct lookup_cmd_hdlr_tbl {
 
 static struct lookup_cmd_hdlr_tbl lookup_config_tbl[] = {
        {"route", handle_yaml_config_route},
-       {"net", handle_yaml_config_net},
+       {"net", handle_yaml_config_ni},
+       {"ip2nets", handle_yaml_config_ip2nets},
+       {"peer", handle_yaml_config_peer},
        {"routing", handle_yaml_config_routing},
        {"buffers", handle_yaml_config_buffers},
+       {"numa", handle_yaml_config_numa},
        {NULL, NULL}
 };
 
 static struct lookup_cmd_hdlr_tbl lookup_del_tbl[] = {
        {"route", handle_yaml_del_route},
-       {"net", handle_yaml_del_net},
+       {"net", handle_yaml_del_ni},
+       {"peer", handle_yaml_del_peer},
        {"routing", handle_yaml_del_routing},
+       {"numa", handle_yaml_del_numa},
        {NULL, NULL}
 };
 
@@ -1533,8 +3082,9 @@ static struct lookup_cmd_hdlr_tbl lookup_show_tbl[] = {
        {"net", handle_yaml_show_net},
        {"buffers", handle_yaml_show_routing},
        {"routing", handle_yaml_show_routing},
-       {"credits", handle_yaml_show_credits},
+       {"peer", handle_yaml_show_peers},
        {"statistics", handle_yaml_show_stats},
+       {"numa", handle_yaml_show_numa},
        {NULL, NULL}
 };
 
@@ -1561,7 +3111,7 @@ static int lustre_yaml_cb_helper(char *f, struct lookup_cmd_hdlr_tbl *table,
        char err_str[LNET_MAX_STR_LEN];
        int rc = LUSTRE_CFG_RC_NO_ERR, return_rc = LUSTRE_CFG_RC_NO_ERR;
 
-       tree = cYAML_build_tree(f, NULL, 0, err_rc);
+       tree = cYAML_build_tree(f, NULL, 0, err_rc, false);
        if (tree == NULL)
                return LUSTRE_CFG_RC_BAD_PARAM;
 
@@ -1616,3 +3166,47 @@ int lustre_yaml_show(char *f, struct cYAML **show_rc, struct cYAML **err_rc)
        return lustre_yaml_cb_helper(f, lookup_show_tbl,
                                     show_rc, err_rc);
 }
+
+int lustre_lnet_send_dbg_task(enum lnet_dbg_task dbg_task,
+                             struct lnet_dbg_task_info *dbg_info,
+                             struct cYAML **show_rc,
+                             struct cYAML **err_rc)
+{
+       struct lnet_ioctl_dbg *dbg;
+       struct lnet_dbg_task_info *info;
+       int rc = LUSTRE_CFG_RC_NO_ERR;
+       char err_str[LNET_MAX_STR_LEN];
+
+       snprintf(err_str, sizeof(err_str), "\"success\"");
+
+       dbg = calloc(1, sizeof(*dbg) + sizeof(*info));
+       if (!dbg) {
+               snprintf(err_str, sizeof(err_str), "\"out of memory\"");
+               rc = LUSTRE_CFG_RC_OUT_OF_MEM;
+               goto out;
+       }
+
+       info = (struct lnet_dbg_task_info *)dbg->dbg_bulk;
+
+       LIBCFS_IOC_INIT_V2(*dbg, dbg_hdr);
+
+       dbg->dbg_task = dbg_task;
+       if (dbg_info)
+               memcpy(info, dbg_info, sizeof(*info));
+
+       rc = l_ioctl(LNET_DEV_ID, IOC_LIBCFS_DBG, dbg);
+       if (rc != 0) {
+               rc = -errno;
+               snprintf(err_str,
+                        sizeof(err_str),
+                        "\"debug task failed %s\"", strerror(errno));
+               goto out;
+       }
+
+out:
+       cYAML_build_error(rc, -1, DBG_CMD,
+                        "debug", err_str, err_rc);
+
+       return rc;
+}
+
index a051220..3256726 100644 (file)
 #define LUSTRE_CFG_RC_OUT_OF_RANGE_PARAM       -3
 #define LUSTRE_CFG_RC_OUT_OF_MEM               -4
 #define LUSTRE_CFG_RC_GENERIC_ERR              -5
+#define LUSTRE_CFG_RC_NO_MATCH                 -6
+#define LUSTRE_CFG_RC_MATCH                    -7
+
+#include <lnet/lnet.h>
+#include <libcfs/util/string.h>
+
+struct lnet_dlc_network_descr {
+       struct list_head network_on_rule;
+       __u32 nw_id;
+       struct list_head nw_intflist;
+};
+
+struct lnet_dlc_intf_descr {
+       struct list_head intf_on_network;
+       char intf_name[LNET_MAX_STR_LEN];
+       struct cfs_expr_list *cpt_expr;
+};
 
 /* forward declaration of the cYAML structure. */
 struct cYAML;
@@ -47,6 +64,12 @@ struct cYAML;
 int lustre_lnet_config_lib_init();
 
 /*
+ * lustre_lnet_config_lib_uninit
+ *     Uninitialize the DLC Library
+ */
+void lustre_lnet_config_lib_uninit();
+
+/*
  * lustre_lnet_config_ni_system
  *   Initialize/Uninitialize the lnet NI system.
  *
@@ -108,39 +131,36 @@ int lustre_lnet_show_route(char *nw, char *gw,
                           struct cYAML **err_rc);
 
 /*
- * lustre_lnet_config_net
- *   Send down an IOCTL to configure a network.
+ * lustre_lnet_config_ni
+ *   Send down an IOCTL to configure a network interface. It implicitly
+ *   creates a network if one doesn't exist..
  *
- *   net - the network name
- *   intf - the interface of the network of the form net_name(intf)
+ *   nw_descr - network and interface descriptor
+ *   global_cpts - globally defined CPTs
  *   ip2net - this parameter allows configuring multiple networks.
  *     it takes precedence over the net and intf parameters
- *   peer_to - peer timeout
- *   peer_cr - peer credit
- *   peer_buf_cr - peer buffer credits
- *       - the above are LND tunable parameters and are optional
- *   credits - network interface credits
- *   smp - cpu affinity
+ *   tunables - LND tunables
  *   seq_no - sequence number of the request
  *   lnd_tunables - lnet specific tunable parameters
  *   err_rc - [OUT] struct cYAML tree describing the error. Freed by caller
  */
-int lustre_lnet_config_net(char *net, char *intf, char *ip2net,
-                          int peer_to, int peer_cr, int peer_buf_cr,
-                          int credits, char *smp, int seq_no,
-                          struct lnet_ioctl_config_lnd_tunables *lnd_tunables,
-                          struct cYAML **err_rc);
+int lustre_lnet_config_ni(struct lnet_dlc_network_descr *nw_descr,
+                         struct cfs_expr_list *global_cpts,
+                         char *ip2net,
+                         struct lnet_ioctl_config_lnd_tunables *tunables,
+                         int seq_no, struct cYAML **err_rc);
 
 /*
- * lustre_lnet_del_net
- *   Send down an IOCTL to delete a network.
+ * lustre_lnet_del_ni
+ *   Send down an IOCTL to delete a network interface. It implicitly
+ *   deletes a network if it becomes empty of nis
  *
- *   nw - network to delete.
+ *   nw  - network and interface list
  *   seq_no - sequence number of the request
  *   err_rc - [OUT] struct cYAML tree describing the error. Freed by caller
  */
-int lustre_lnet_del_net(char *nw, int seq_no,
-                       struct cYAML **err_rc);
+int lustre_lnet_del_ni(struct lnet_dlc_network_descr *nw,
+                      int seq_no, struct cYAML **err_rc);
 
 /*
  * lustre_lnet_show_net
@@ -169,6 +189,35 @@ int lustre_lnet_enable_routing(int enable, int seq_no,
                               struct cYAML **err_rc);
 
 /*
+ * lustre_lnet_config_numa_range
+ *   Set the NUMA range which impacts the NIs to be selected
+ *   during sending. If the NUMA range is large the NUMA
+ *   distance between the message memory and the NI becomes
+ *   less significant. The NUMA range is a relative number
+ *   with no other meaning besides allowing a wider breadth
+ *   for picking an NI to send from.
+ *
+ *   range - numa range value.
+ *   seq_no - sequence number of the request
+ *   err_rc - [OUT] struct cYAML tree describing the error. Freed by
+ *   caller
+ */
+int lustre_lnet_config_numa_range(int range, int seq_no,
+                                 struct cYAML **err_rc);
+
+/*
+ * lustre_lnet_show_numa_range
+ *   Get the currently set NUMA range
+ *
+ *   seq_no - sequence number of the request
+ *   show_rc - [OUT] struct cYAML tree containing NUMA range info
+ *   err_rc - [OUT] struct cYAML tree describing the error. Freed by
+ *   caller
+ */
+int lustre_lnet_show_numa_range(int seq_no, struct cYAML **show_rc,
+                               struct cYAML **err_rc);
+
+/*
  * lustre_lnet_config_buffers
  *   Send down an IOCTL to configure routing buffer sizes.  A value of 0 means
  *   default that particular buffer to default size. A value of -1 means
@@ -196,27 +245,70 @@ int lustre_lnet_show_routing(int seq_no, struct cYAML **show_rc,
                             struct cYAML **err_rc);
 
 /*
- * lustre_lnet_show_peer_credits
- *   Shows credit details on the peers in the system
+ * lustre_lnet_show_stats
+ *   Shows internal LNET statistics.  This is useful to display the
+ *   current LNET activity, such as number of messages routed, etc
  *
  *     seq_no - sequence number of the command
  *     show_rc - YAML structure of the resultant show
  *     err_rc - YAML strucutre of the resultant return code.
  */
-int lustre_lnet_show_peer_credits(int seq_no, struct cYAML **show_rc,
-                                 struct cYAML **err_rc);
+int lustre_lnet_show_stats(int seq_no, struct cYAML **show_rc,
+                          struct cYAML **err_rc);
 
 /*
- * lustre_lnet_show_stats
- *   Shows internal LNET statistics.  This is useful to display the
- *   current LNET activity, such as number of messages route, etc
+ * lustre_lnet_config_peer_nid
+ *   Add a peer nid to a peer with primary nid pnid. If no pnid is given
+ *   then the first nid in the nid list becomes the primary nid for
+ *   a newly created peer.
+ *   Otherwise if pnid is provided and it's unique then a new peer is
+ *   created with pnid as the primary NID and the nids in the nid list as
+ *   secondary nids.
+ *   If any of the peer nids provided, with the exception of the pnid, is
+ *   not unique, the operation fails. Some peer nids might have already
+ *   been added. It's the role of the caller of this API to remove the
+ *   added NIDs if they wish.
+ *
+ *     pnid - Primary NID of the peer
+ *     nid - list of nids to add
+ *     num_nids - number of nids in the nid array
+ *     mr - true if this peer is MR capable.
+ *     seq_no - sequence number of the command
+ *     err_rc - YAML structure of the resultant return code.
+ */
+int lustre_lnet_config_peer_nid(char *pnid, char **nid, int num_nids,
+                               bool mr, int seq_no, struct cYAML **err_rc);
+
+/*
+ * lustre_lnet_del_peer_nid
+ *  Delete the nids given in the nid list from the peer with primary NID
+ *  pnid. If pnid is NULL or it doesn't identify a peer the operation
+ *  fails and no change happens to the system.
+ *  The operation is aborted on the first NID that fails to be deleted.
  *
+ *     pnid - Primary NID of the peer
+ *     nid - list of nids to delete
+ *     num_nids - number of nids in the nid array
+ *     seq_no - sequence number of the command
+ *     err_rc - YAML structure of the resultant return code.
+ */
+int lustre_lnet_del_peer_nid(char *pnid, char **nid, int num_nids,
+                            int seq_no, struct cYAML **err_rc);
+
+/*
+ * lustre_lnet_show_peer
+ *   Show the peer identified by the NID knid. If knid is NULL, all
+ *   peers in the system are shown.
+ *
+ *     knid - A NID of the peer
+ *     detail - display detailed information
  *     seq_no - sequence number of the command
  *     show_rc - YAML structure of the resultant show
  *     err_rc - YAML strucutre of the resultant return code.
+ *
  */
-int lustre_lnet_show_stats(int seq_no, struct cYAML **show_rc,
-                          struct cYAML **err_rc);
+int lustre_lnet_show_peer(char *knid, int detail, int seq_no,
+                         struct cYAML **show_rc, struct cYAML **err_rc);
 
 /*
  * lustre_yaml_config
@@ -250,4 +342,52 @@ int lustre_yaml_del(char *f, struct cYAML **err_rc);
 int lustre_yaml_show(char *f, struct cYAML **show_rc,
                     struct cYAML **err_rc);
 
+/*
+ * lustre_lnet_init_nw_descr
+ *     initialize the network descriptor structure for use
+ */
+void lustre_lnet_init_nw_descr(struct lnet_dlc_network_descr *nw_descr);
+
+/*
+ * lustre_lnet_parse_interfaces
+ *     parse an interface string and populate descriptor structures
+ *             intf_str - interface string of the format
+ *                     <intf>[<expr>], <intf>[<expr>],..
+ *             nw_descr - network descriptor to populate
+ *                        (expected to be initialized by the caller)
+ */
+int lustre_lnet_parse_interfaces(char *intf_str,
+                                struct lnet_dlc_network_descr *nw_descr);
+
+/*
+ * lustre_lnet_parse_nids
+ *     Parse a set of nids into a locally allocated array and return the
+ *     pointer of the array to the caller. The caller is responsible for
+ *     freeing the array. If an initial array is provided then copy over
+ *     the contents of that array into the new array and append to it the
+ *     new content.
+ *     The nids can be of the form "nid [,nid, nid, nid]"
+ *             nids: nids string to be parsed
+ *             array: initial array of content
+ *             size: num of elements in the array
+ *             out_array: [OUT] new allocated array.
+ *     Returns size of array
+ *             sets the out_array to NULL on failure.
+ */
+int lustre_lnet_parse_nids(char *nids, char **array, int size,
+                          char ***out_array);
+
+/*
+ * lustre_lnet_send_dbg_task
+ *     send a debug task to be carried out in the kernel. This API will
+ *     not be exposed to the user through lnetctl utility. It can only be
+ *     executed by being called directly.
+ *             dbg_task: The task to be carried out
+ *             dbg_info: task specific information
+ */
+int lustre_lnet_send_dbg_task(enum lnet_dbg_task dbg_task,
+                             struct lnet_dbg_task_info *dbg_info,
+                             struct cYAML **show_rc,
+                             struct cYAML **err_rc);
+
 #endif /* LIB_LNET_CONFIG_API_H */
index 5f68b47..56e5975 100644 (file)
@@ -25,6 +25,9 @@
  * Author:
  *   James Simmons <jsimmons@infradead.org>
  */
+
+#include <limits.h>
+#include <stdlib.h>
 #include <stdio.h>
 #include <string.h>
 #include <libcfs/util/ioctl.h>
 #include "cyaml.h"
 
 static int
-lustre_ko2iblnd_show_net(struct cYAML *lndparams,
-                        struct lnet_ioctl_config_lnd_tunables *tunables)
+lustre_o2iblnd_show_tun(struct cYAML *lndparams,
+                       struct lnet_ioctl_config_o2iblnd_tunables *lnd_cfg)
 {
-       struct lnet_ioctl_config_o2iblnd_tunables *lnd_cfg;
-
-       lnd_cfg = &tunables->lt_tun_u.lt_o2ib;
-
        if (cYAML_create_number(lndparams, "peercredits_hiw",
                                lnd_cfg->lnd_peercredits_hiw) == NULL)
-               return -1;
+               return LUSTRE_CFG_RC_OUT_OF_MEM;
 
        if (cYAML_create_number(lndparams, "map_on_demand",
                                lnd_cfg->lnd_map_on_demand) == NULL)
-               return -1;
+               return LUSTRE_CFG_RC_OUT_OF_MEM;
 
        if (cYAML_create_number(lndparams, "concurrent_sends",
                                lnd_cfg->lnd_concurrent_sends) == NULL)
-               return -1;
+               return LUSTRE_CFG_RC_OUT_OF_MEM;
 
        if (cYAML_create_number(lndparams, "fmr_pool_size",
                                lnd_cfg->lnd_fmr_pool_size) == NULL)
-               return -1;
+               return LUSTRE_CFG_RC_OUT_OF_MEM;
 
        if (cYAML_create_number(lndparams, "fmr_flush_trigger",
                                lnd_cfg->lnd_fmr_flush_trigger) == NULL)
-               return -1;
+               return LUSTRE_CFG_RC_OUT_OF_MEM;
 
        if (cYAML_create_number(lndparams, "fmr_cache",
                                lnd_cfg->lnd_fmr_cache) == NULL)
-               return -1;
-       return 0;
+               return LUSTRE_CFG_RC_OUT_OF_MEM;
+
+       return LUSTRE_CFG_RC_NO_ERR;
 }
 
 int
-lustre_interface_show_net(struct cYAML *interfaces, unsigned int index,
-                         bool detail, struct lnet_ioctl_config_data *data,
-                         struct lnet_ioctl_net_config *net_config)
+lustre_net_show_tunables(struct cYAML *tunables,
+                        struct lnet_ioctl_config_lnd_cmn_tunables *cmn)
 {
-       char ni_index[2]; /* LNET_MAX_INTERFACES is only 16 */
-
-       if (strlen(net_config->ni_interfaces[index]) == 0)
-               return 0;
-
-       snprintf(ni_index, sizeof(ni_index), "%d", index);
-       if (cYAML_create_string(interfaces, ni_index,
-                               net_config->ni_interfaces[index]) == NULL)
-               return -1;
-
-       if (detail) {
-               __u32 net = LNET_NETTYP(LNET_NIDNET(data->cfg_nid));
-               struct lnet_ioctl_config_lnd_tunables *lnd_cfg;
-               struct cYAML *lndparams;
-
-               if (data->cfg_config_u.cfg_net.net_interface_count == 0 ||
-                   net != O2IBLND)
-                       return 0;
-
-               lndparams = cYAML_create_object(interfaces, "lnd tunables");
-               if (lndparams == NULL)
-                       return -1;
-
-               lnd_cfg = (struct lnet_ioctl_config_lnd_tunables *)net_config->cfg_bulk;
-               if (lustre_ko2iblnd_show_net(lndparams, lnd_cfg) < 0)
-                       return -1;
-       }
-       return 0;
+
+
+       if (cYAML_create_number(tunables, "peer_timeout",
+                               cmn->lct_peer_timeout)
+                                       == NULL)
+               goto out;
+
+       if (cYAML_create_number(tunables, "peer_credits",
+                               cmn->lct_peer_tx_credits)
+                                       == NULL)
+               goto out;
+
+       if (cYAML_create_number(tunables,
+                               "peer_buffer_credits",
+                               cmn->lct_peer_rtr_credits)
+                                       == NULL)
+               goto out;
+
+       if (cYAML_create_number(tunables, "credits",
+                               cmn->lct_max_tx_credits)
+                                       == NULL)
+               goto out;
+
+       return LUSTRE_CFG_RC_NO_ERR;
+
+out:
+       return LUSTRE_CFG_RC_OUT_OF_MEM;
+}
+
+int
+lustre_ni_show_tunables(struct cYAML *lnd_tunables,
+                       __u32 net_type,
+                       struct lnet_lnd_tunables *lnd)
+{
+       int rc = LUSTRE_CFG_RC_NO_ERR;
+
+       if (net_type == O2IBLND)
+               rc = lustre_o2iblnd_show_tun(lnd_tunables,
+                                            &lnd->lnd_tun_u.lnd_o2ib);
+
+       return rc;
 }
 
 static void
-lustre_ko2iblnd_parse_net(struct cYAML *lndparams,
-                         struct lnet_ioctl_config_lnd_tunables *lnd_cfg)
+yaml_extract_o2ib_tun(struct cYAML *tree,
+                     struct lnet_ioctl_config_o2iblnd_tunables *lnd_cfg)
 {
        struct cYAML *map_on_demand = NULL, *concurrent_sends = NULL;
        struct cYAML *fmr_pool_size = NULL, *fmr_cache = NULL;
-       struct cYAML *fmr_flush_trigger = NULL;
+       struct cYAML *fmr_flush_trigger = NULL, *lndparams = NULL;
+
+       lndparams = cYAML_get_object_item(tree, "lnd tunables");
+       if (!lndparams)
+               return;
 
        map_on_demand = cYAML_get_object_item(lndparams, "map_on_demand");
-       lnd_cfg->lt_tun_u.lt_o2ib.lnd_map_on_demand =
+       lnd_cfg->lnd_map_on_demand =
                (map_on_demand) ? map_on_demand->cy_valueint : 0;
 
        concurrent_sends = cYAML_get_object_item(lndparams, "concurrent_sends");
-       lnd_cfg->lt_tun_u.lt_o2ib.lnd_concurrent_sends =
+       lnd_cfg->lnd_concurrent_sends =
                (concurrent_sends) ? concurrent_sends->cy_valueint : 0;
 
        fmr_pool_size = cYAML_get_object_item(lndparams, "fmr_pool_size");
-       lnd_cfg->lt_tun_u.lt_o2ib.lnd_fmr_pool_size =
+       lnd_cfg->lnd_fmr_pool_size =
                (fmr_pool_size) ? fmr_pool_size->cy_valueint : 0;
 
        fmr_flush_trigger = cYAML_get_object_item(lndparams,
                                                  "fmr_flush_trigger");
-       lnd_cfg->lt_tun_u.lt_o2ib.lnd_fmr_flush_trigger =
+       lnd_cfg->lnd_fmr_flush_trigger =
                (fmr_flush_trigger) ? fmr_flush_trigger->cy_valueint : 0;
 
        fmr_cache = cYAML_get_object_item(lndparams, "fmr_cache");
-       lnd_cfg->lt_tun_u.lt_o2ib.lnd_fmr_cache =
+       lnd_cfg->lnd_fmr_cache =
                (fmr_cache) ? fmr_cache->cy_valueint : 0;
 }
 
+
 void
-lustre_interface_parse(struct cYAML *lndparams, const char *dev_name,
-                      struct lnet_ioctl_config_lnd_tunables *lnd_cfg)
+lustre_yaml_extract_lnd_tunables(struct cYAML *tree,
+                                __u32 net_type,
+                                struct lnet_lnd_tunables *tun)
 {
-       if (dev_name != NULL && strstr(dev_name, "ib"))
-               lustre_ko2iblnd_parse_net(lndparams, lnd_cfg);
+       if (net_type == O2IBLND)
+               yaml_extract_o2ib_tun(tree,
+                                     &tun->lnd_tun_u.lnd_o2ib);
+
 }
+
index b407daf..f0a6568 100644 (file)
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <limits.h>
 #include <libcfs/util/ioctl.h>
 #include <libcfs/util/parser.h>
 #include <lnet/lnetctl.h>
+#include <lnet/nidstr.h>
 #include "cyaml/cyaml.h"
 #include "lnetconfig/liblnetconfig.h"
 
 static int jt_config_lnet(int argc, char **argv);
 static int jt_unconfig_lnet(int argc, char **argv);
 static int jt_add_route(int argc, char **argv);
-static int jt_add_net(int argc, char **argv);
+static int jt_add_ni(int argc, char **argv);
 static int jt_set_routing(int argc, char **argv);
 static int jt_del_route(int argc, char **argv);
-static int jt_del_net(int argc, char **argv);
+static int jt_del_ni(int argc, char **argv);
 static int jt_show_route(int argc, char **argv);
 static int jt_show_net(int argc, char **argv);
 static int jt_show_routing(int argc, char **argv);
 static int jt_show_stats(int argc, char **argv);
-static int jt_show_peer_credits(int argc, char **argv);
+static int jt_show_peer(int argc, char **argv);
+static int jt_show_numa(int argc, char **argv);
 static int jt_set_tiny(int argc, char **argv);
 static int jt_set_small(int argc, char **argv);
 static int jt_set_large(int argc, char **argv);
+static int jt_set_numa(int argc, char **argv);
+static int jt_add_peer_nid(int argc, char **argv);
+static int jt_del_peer_nid(int argc, char **argv);
+/*static int jt_show_peer(int argc, char **argv);*/
 
 command_t lnet_cmds[] = {
        {"configure", jt_config_lnet, 0, "configure lnet\n"
@@ -78,7 +85,7 @@ command_t route_cmds[] = {
 };
 
 command_t net_cmds[] = {
-       {"add", jt_add_net, 0, "add a network\n"
+       {"add", jt_add_ni, 0, "add a network\n"
         "\t--net: net name (e.g. tcp0)\n"
         "\t--if: physical interface (e.g. eth0)\n"
         "\t--ip2net: specify networks based on IP address patterns\n"
@@ -87,8 +94,9 @@ command_t net_cmds[] = {
         "\t--peer-buffer-credits: the number of buffer credits per peer\n"
         "\t--credits: Network Interface credits\n"
         "\t--cpt: CPU Partitions configured net uses (e.g. [0,1]\n"},
-       {"del", jt_del_net, 0, "delete a network\n"
-        "\t--net: net name (e.g. tcp0)\n"},
+       {"del", jt_del_ni, 0, "delete a network\n"
+        "\t--net: net name (e.g. tcp0)\n"
+        "\t--if: physical interface (e.g. eth0)\n"},
        {"show", jt_show_net, 0, "show networks\n"
         "\t--net: net name (e.g. tcp0) to filter on\n"
         "\t--verbose: display detailed output per network\n"},
@@ -105,8 +113,8 @@ command_t stats_cmds[] = {
        { 0, 0, 0, NULL }
 };
 
-command_t credits_cmds[] = {
-       {"show", jt_show_peer_credits, 0, "show peer credits\n"},
+command_t numa_cmds[] = {
+       {"show", jt_show_numa, 0, "show NUMA range\n"},
        { 0, 0, 0, NULL }
 };
 
@@ -120,6 +128,25 @@ command_t set_cmds[] = {
        {"routing", jt_set_routing, 0, "enable/disable routing\n"
         "\t0 - disable routing\n"
         "\t1 - enable routing\n"},
+       {"numa_range", jt_set_numa, 0, "set NUMA range for NI selection\n"
+        "\tVALUE must be at least 0\n"},
+       { 0, 0, 0, NULL }
+};
+
+command_t peer_cmds[] = {
+       {"add", jt_add_peer_nid, 0, "add a peer NID\n"
+        "\t--prim_nid: Primary NID of the peer. If not provided then the first\n"
+        "\t            NID in the list becomes the Primary NID of a newly created\n"
+        "\t            peer. \n"
+        "\t--nid: one or more peer NIDs\n"
+        "\t--non_mr: create this peer as not Multi-Rail capable\n"},
+       {"del", jt_del_peer_nid, 0, "delete a peer NID\n"
+        "\t--prim_nid: Primary NID of the peer.\n"
+        "\t--nid: list of NIDs to remove. If none provided,\n"
+        "\t       peer is deleted\n"},
+       {"show", jt_show_peer, 0, "show peer information\n"
+        "\t--nid: NID of peer to filter on.\n"
+        "\t--verbose: Include  extended  statistics\n"},
        { 0, 0, 0, NULL }
 };
 
@@ -184,6 +211,33 @@ static int handle_help(const command_t *cmd_list, const char *cmd,
        return rc;
 }
 
+static int jt_set_numa(int argc, char **argv)
+{
+       long int value;
+       int rc;
+       struct cYAML *err_rc = NULL;
+
+       if (handle_help(set_cmds, "set", "numa_range", argc, argv) == 0)
+               return 0;
+
+       rc = parse_long(argv[1], &value);
+       if (rc != 0) {
+               cYAML_build_error(-1, -1, "parser", "set",
+                                 "cannot parse numa_range value", &err_rc);
+               cYAML_print_tree2file(stderr, err_rc);
+               cYAML_free_tree(err_rc);
+               return -1;
+       }
+
+       rc = lustre_lnet_config_numa_range(value, -1, &err_rc);
+       if (rc != LUSTRE_CFG_RC_NO_ERR)
+               cYAML_print_tree2file(stderr, err_rc);
+
+       cYAML_free_tree(err_rc);
+
+       return rc;
+}
+
 static int jt_set_tiny(int argc, char **argv)
 {
        long int value;
@@ -410,12 +464,19 @@ static int jt_add_route(int argc, char **argv)
        return rc;
 }
 
-static int jt_add_net(int argc, char **argv)
+static int jt_add_ni(int argc, char **argv)
 {
-       char *network = NULL, *intf = NULL, *ip2net = NULL, *cpt = NULL;
+       char *ip2net = NULL;
        long int pto = -1, pc = -1, pbc = -1, cre = -1;
        struct cYAML *err_rc = NULL;
-       int rc, opt;
+       int rc, opt, cpt_rc = -1;
+       struct lnet_dlc_network_descr nw_descr;
+       struct cfs_expr_list *global_cpts = NULL;
+       struct lnet_ioctl_config_lnd_tunables tunables;
+       bool found = false;
+
+       memset(&tunables, 0, sizeof(tunables));
+       lustre_lnet_init_nw_descr(&nw_descr);
 
        const char *const short_options = "n:i:p:t:c:b:r:s:h";
        const struct option long_options[] = {
@@ -435,10 +496,16 @@ static int jt_add_net(int argc, char **argv)
                                   long_options, NULL)) != -1) {
                switch (opt) {
                case 'n':
-                       network = optarg;
+                       nw_descr.nw_id = libcfs_str2net(optarg);
                        break;
                case 'i':
-                       intf = optarg;
+                       rc = lustre_lnet_parse_interfaces(optarg, &nw_descr);
+                       if (rc != 0) {
+                               cYAML_build_error(-1, -1, "ni", "add",
+                                               "bad interface list",
+                                               &err_rc);
+                               goto failed;
+                       }
                        break;
                case 'p':
                        ip2net = optarg;
@@ -476,7 +543,9 @@ static int jt_add_net(int argc, char **argv)
                        }
                        break;
                case 's':
-                       cpt = optarg;
+                       cpt_rc = cfs_expr_list_parse(optarg,
+                                                    strlen(optarg), 0,
+                                                    UINT_MAX, &global_cpts);
                        break;
                case 'h':
                        print_help(net_cmds, "net", "add");
@@ -486,9 +555,23 @@ static int jt_add_net(int argc, char **argv)
                }
        }
 
-       rc = lustre_lnet_config_net(network, intf, ip2net, pto, pc, pbc,
-                                   cre, cpt, -1, NULL, &err_rc);
+       if (pto > 0 || pc > 0 || pbc > 0 || cre > 0) {
+               tunables.lt_cmn.lct_peer_timeout = pto;
+               tunables.lt_cmn.lct_peer_tx_credits = pc;
+               tunables.lt_cmn.lct_peer_rtr_credits = pbc;
+               tunables.lt_cmn.lct_max_tx_credits = cre;
+               found = true;
+       }
+
+       rc = lustre_lnet_config_ni(&nw_descr,
+                                  (cpt_rc == 0) ? global_cpts: NULL,
+                                  ip2net, (found) ? &tunables : NULL,
+                                  -1, &err_rc);
+
+       if (global_cpts != NULL)
+               cfs_expr_list_free(global_cpts);
 
+failed:
        if (rc != LUSTRE_CFG_RC_NO_ERR)
                cYAML_print_tree2file(stderr, err_rc);
 
@@ -538,15 +621,18 @@ static int jt_del_route(int argc, char **argv)
        return rc;
 }
 
-static int jt_del_net(int argc, char **argv)
+static int jt_del_ni(int argc, char **argv)
 {
-       char *network = NULL;
        struct cYAML *err_rc = NULL;
        int rc, opt;
+       struct lnet_dlc_network_descr nw_descr;
+
+       lustre_lnet_init_nw_descr(&nw_descr);
 
-       const char *const short_options = "n:h";
+       const char *const short_options = "n:i:h";
        const struct option long_options[] = {
                { "net", 1, NULL, 'n' },
+               { "if", 1, NULL, 'i' },
                { "help", 0, NULL, 'h' },
                { NULL, 0, NULL, 0 },
        };
@@ -555,7 +641,16 @@ static int jt_del_net(int argc, char **argv)
                                   long_options, NULL)) != -1) {
                switch (opt) {
                case 'n':
-                       network = optarg;
+                       nw_descr.nw_id = libcfs_str2net(optarg);
+                       break;
+               case 'i':
+                       rc = lustre_lnet_parse_interfaces(optarg, &nw_descr);
+                       if (rc != 0) {
+                               cYAML_build_error(-1, -1, "ni", "add",
+                                               "bad interface list",
+                                               &err_rc);
+                               goto out;
+                       }
                        break;
                case 'h':
                        print_help(net_cmds, "net", "del");
@@ -565,8 +660,9 @@ static int jt_del_net(int argc, char **argv)
                }
        }
 
-       rc = lustre_lnet_del_net(network, -1, &err_rc);
+       rc = lustre_lnet_del_ni(&nw_descr, -1, &err_rc);
 
+out:
        if (rc != LUSTRE_CFG_RC_NO_ERR)
                cYAML_print_tree2file(stderr, err_rc);
 
@@ -729,15 +825,15 @@ static int jt_show_stats(int argc, char **argv)
        return rc;
 }
 
-static int jt_show_peer_credits(int argc, char **argv)
+static int jt_show_numa(int argc, char **argv)
 {
        int rc;
        struct cYAML *show_rc = NULL, *err_rc = NULL;
 
-       if (handle_help(credits_cmds, "peer_credits", "show", argc, argv) == 0)
+       if (handle_help(numa_cmds, "numa", "show", argc, argv) == 0)
                return 0;
 
-       rc = lustre_lnet_show_peer_credits(-1, &show_rc, &err_rc);
+       rc = lustre_lnet_show_numa_range(-1, &show_rc, &err_rc);
 
        if (rc != LUSTRE_CFG_RC_NO_ERR)
                cYAML_print_tree2file(stderr, err_rc);
@@ -810,16 +906,28 @@ static inline int jt_stats(int argc, char **argv)
        return Parser_execarg(argc - 1, &argv[1], stats_cmds);
 }
 
-static inline int jt_peer_credits(int argc, char **argv)
+static inline int jt_numa(int argc, char **argv)
 {
        if (argc < 2)
                return CMD_HELP;
 
        if (argc == 2 &&
-           handle_help(credits_cmds, "peer_credits", NULL, argc, argv) == 0)
+           handle_help(numa_cmds, "numa", NULL, argc, argv) == 0)
                return 0;
 
-       return Parser_execarg(argc - 1, &argv[1], credits_cmds);
+       return Parser_execarg(argc - 1, &argv[1], numa_cmds);
+}
+
+static inline int jt_peers(int argc, char **argv)
+{
+       if (argc < 2)
+               return CMD_HELP;
+
+       if (argc == 2 &&
+           handle_help(peer_cmds, "peer", NULL, argc, argv) == 0)
+               return 0;
+
+       return Parser_execarg(argc - 1, &argv[1], peer_cmds);
 }
 
 static inline int jt_set(int argc, char **argv)
@@ -895,8 +1003,7 @@ static int jt_import(int argc, char **argv)
                break;
        }
 
-       if (rc != LUSTRE_CFG_RC_NO_ERR)
-               cYAML_print_tree2file(stderr, err_rc);
+       cYAML_print_tree2file(stderr, err_rc);
 
        cYAML_free_tree(err_rc);
 
@@ -955,6 +1062,18 @@ static int jt_export(int argc, char **argv)
                cYAML_free_tree(err_rc);
        }
 
+       rc = lustre_lnet_show_peer(NULL, 1, -1, &show_rc, &err_rc);
+       if (rc != LUSTRE_CFG_RC_NO_ERR) {
+               cYAML_print_tree2file(stderr, err_rc);
+               cYAML_free_tree(err_rc);
+       }
+
+       rc = lustre_lnet_show_numa_range(-1, &show_rc, &err_rc);
+       if (rc != LUSTRE_CFG_RC_NO_ERR) {
+               cYAML_print_tree2file(stderr, err_rc);
+               cYAML_free_tree(err_rc);
+       }
+
        if (show_rc != NULL) {
                cYAML_print_tree2file(f, show_rc);
                cYAML_free_tree(show_rc);
@@ -966,6 +1085,162 @@ static int jt_export(int argc, char **argv)
        return 0;
 }
 
+static int jt_add_peer_nid(int argc, char **argv)
+{
+       char *prim_nid = NULL;
+       char **nids = NULL, **nids2 = NULL;
+       int size = 0;
+       struct cYAML *err_rc = NULL;
+       int rc = LUSTRE_CFG_RC_NO_ERR, opt, i;
+       bool non_mr = false;
+
+       const char *const short_options = "k:n:mh";
+       const struct option long_options[] = {
+               { "prim_nid", 1, NULL, 'k' },
+               { "nid", 1, NULL, 'n' },
+               { "non_mr", 0, NULL, 'm'},
+               { "help", 0, NULL, 'h' },
+               { NULL, 0, NULL, 0 },
+       };
+
+       while ((opt = getopt_long(argc, argv, short_options,
+                                 long_options, NULL)) != -1) {
+               switch (opt) {
+               case 'k':
+                       prim_nid = optarg;
+                       break;
+               case 'n':
+                       size = lustre_lnet_parse_nids(optarg, nids, size,
+                                                     &nids2);
+                       if (nids2 == NULL)
+                               goto failed;
+                       nids = nids2;
+                       rc = LUSTRE_CFG_RC_OUT_OF_MEM;
+                       break;
+               case 'm':
+                       non_mr = true;
+                       break;
+               case 'h':
+                       print_help(peer_cmds, "peer", "add");
+                       return 0;
+               default:
+                       return 0;
+               }
+       }
+
+       rc = lustre_lnet_config_peer_nid(prim_nid, nids, size,
+                                        !non_mr, -1, &err_rc);
+
+failed:
+       for (i = 0; i < size; i++)
+               free(nids[i]);
+       free(nids);
+
+       if (rc != LUSTRE_CFG_RC_NO_ERR)
+               cYAML_print_tree2file(stderr, err_rc);
+
+       cYAML_free_tree(err_rc);
+
+       return rc;
+}
+
+static int jt_del_peer_nid(int argc, char **argv)
+{
+       char *prim_nid = NULL;
+       char **nids = NULL, **nids2 = NULL;
+       struct cYAML *err_rc = NULL;
+       int rc = LUSTRE_CFG_RC_NO_ERR, opt, i, size = 0;
+
+       const char *const short_options = "k:n:h";
+       const struct option long_options[] = {
+               { "prim_nid", 1, NULL, 'k' },
+               { "nid", 1, NULL, 'n' },
+               { "help", 0, NULL, 'h' },
+               { NULL, 0, NULL, 0 },
+       };
+
+       while ((opt = getopt_long(argc, argv, short_options,
+                                 long_options, NULL)) != -1) {
+               switch (opt) {
+               case 'k':
+                       prim_nid = optarg;
+                       break;
+               case 'n':
+                       size = lustre_lnet_parse_nids(optarg, nids, size,
+                                                     &nids2);
+                       if (nids2 == NULL)
+                               goto failed;
+                       nids = nids2;
+                       rc = LUSTRE_CFG_RC_OUT_OF_MEM;
+                       break;
+               case 'h':
+                       print_help(peer_cmds, "peer", "del");
+                       return 0;
+               default:
+                       return 0;
+               }
+       }
+
+       rc = lustre_lnet_del_peer_nid(prim_nid, nids, size, -1, &err_rc);
+
+failed:
+       for (i = 0; i < size; i++)
+               free(nids[i]);
+       free(nids);
+
+       if (rc != LUSTRE_CFG_RC_NO_ERR)
+               cYAML_print_tree2file(stderr, err_rc);
+
+       cYAML_free_tree(err_rc);
+
+       return rc;
+}
+
+static int jt_show_peer(int argc, char **argv)
+{
+       char *nid = NULL;
+       int rc, opt;
+       struct cYAML *err_rc = NULL, *show_rc = NULL;
+       int detail = 0;
+
+       const char *const short_options = "n:vh";
+       const struct option long_options[] = {
+               { "nid", 1, NULL, 'n' },
+               { "verbose", 0, NULL, 'v' },
+               { "help", 0, NULL, 'h' },
+               { NULL, 0, NULL, 0 },
+       };
+
+       while ((opt = getopt_long(argc, argv, short_options,
+                                 long_options, NULL)) != -1) {
+               switch (opt) {
+               case 'n':
+                       nid = optarg;
+                       break;
+               case 'v':
+                       detail = 1;
+                       break;
+               case 'h':
+                       print_help(peer_cmds, "peer", "show");
+                       return 0;
+               default:
+                       return 0;
+               }
+       }
+
+       rc = lustre_lnet_show_peer(nid, detail, -1, &show_rc, &err_rc);
+
+       if (rc != LUSTRE_CFG_RC_NO_ERR)
+               cYAML_print_tree2file(stderr, err_rc);
+       else if (show_rc)
+               cYAML_print_tree(show_rc);
+
+       cYAML_free_tree(err_rc);
+       cYAML_free_tree(show_rc);
+
+       return rc;
+}
+
 command_t list[] = {
        {"lnet", jt_lnet, 0, "lnet {configure | unconfigure} [--all]"},
        {"route", jt_route, 0, "route {add | del | show | help}"},
@@ -977,7 +1252,8 @@ command_t list[] = {
                                 "--help} FILE.yaml"},
        {"export", jt_export, 0, "export {--help} FILE.yaml"},
        {"stats", jt_stats, 0, "stats {show | help}"},
-       {"peer_credits", jt_peer_credits, 0, "peer_credits {show | help}"},
+       {"numa", jt_numa, 0, "numa {show | help}"},
+       {"peer", jt_peers, 0, "peer {add | del | show | help}"},
        {"help", Parser_help, 0, "help"},
        {"exit", Parser_quit, 0, "quit"},
        {"quit", Parser_quit, 0, "quit"},
index 2dba6e2..00e8849 100644 (file)
@@ -102,6 +102,54 @@ parameter\.
 .
 .br
 \-\-verbose: display detailed output per network
+
+.
+.SS "Peer Configuration"
+.TP
+\fBlnetctl peer\fR add
+Configure an LNET peer with at least one supplied NID\.  By default, peers are marked as multi-rail capable\.  If prim_nid is not specified, the first NID in the supplied list is assumed to be the primary NID for the peer\.
+.
+.br
+.
+\-\-nid: one or more peer NIDs to add to the peer\.
+.
+.br
+.
+\-\-prim_nid: Primary NID of the peer\.
+.
+.br
+\-\-non_mr: create this peer as not Multi-Rail capable\.
+.
+.br
+
+.TP
+\fBlnetctl peer\fR del
+Delete a peer NID.  The primary NID must be specified.  If the removed NID is the primary NID, the peer entry will be deleted.
+.
+.br
+.
+\-\-nid: one or more peer NIDs to remove from the peer\.
+.
+.br
+.
+\-\-prim_nid: Primary NID of the peer\.
+.
+.br
+
+.TP
+\fBlnetctl peer\fR show
+Show configured peers.  By default, lists all peers and associated NIDs.
+.
+.br
+.
+\-\-nid: list of primary NIDs to filter on\.
+.
+.br
+.
+\-\-verbose: Include extended statistics, including credits and counters.
+.
+.br
+
 .
 .SS "Route Configuration"
 .
@@ -278,33 +326,6 @@ Show LNET statistics
 .br
 
 .
-.SS "Showing Peer Credits"
-.
-.TP
-\fBlnetctl peer_credits\fR
-Show details on configured peer credits
-.
-.br
-\-> Peer nid
-.
-.br
-\-> State
-.
-.br
-\-> Reference count on the peer
-.
-.br
-\-> Maximum transmit credits
-.
-.br
-\-> Available transmit credits
-.
-.br
-\-> Available router credits
-.
-.br
-\-> Minimum router credits\.
-.
 .SH "EXAMPLES"
 .
 .SS "Initializing LNet after load"
@@ -585,10 +606,10 @@ statistics:
 .
 .br
 .
-.SS "Showing peer credits information"
+.SS "Showing peer information"
 .
 .IP "\(bu" 4
-lnetctl peer_credits show
+lnetctl peer show
 .
 .IP "" 0
 .
@@ -596,28 +617,40 @@ lnetctl peer_credits show
 peer:
 .
 .br
-       \- nid: 192\.168\.205\.131@tcp1
+    \- primary nid: 10\.148\.0\.8@o2ib
+.
+.br
+      Multi\-Rail: True
+.
+.br
+      peer ni:
+.
+.br
+        \- nid: 10\.148\.0\.8@o2ib
+.
+.br
+          state: NA
 .
 .br
-         state: down
+    \- primary nid: 10\.148\.0\.20@o2ib
 .
 .br
-         refcount: 4
+      Multi\-Rail: True
 .
 .br
-         max_ni_tx_credits: 8
+      peer ni:
 .
 .br
-         available_tx_credits: 8
+        \- nid: 10\.148\.0\.20@o2ib
 .
 .br
-         available_rtr_credits: 8
+          state: NA
 .
 .br
-         min_rtr_credits: 7
+        \- nid: 10\.148\.0\.25@o2ib
 .
 .br
-         tx_q_num_of_buf: 0
+          state: NA
 .
 .br
 
index 833231a..08b2251 100644 (file)
@@ -1105,6 +1105,8 @@ struct ptlrpc_request {
        lnet_nid_t                       rq_self;
        /** Peer description (the other side) */
        lnet_process_id_t                rq_peer;
+       /** Descriptor for the NID from which the peer sent the request. */
+       lnet_process_id_t                rq_source;
        /**
         * service time estimate (secs)
         * If the request is not served by this time, it is marked as timed out.
index 8d8a365..8aaf5a4 100644 (file)
@@ -42,30 +42,31 @@ static struct cfs_hash_ops conn_hash_ops;
 
 struct ptlrpc_connection *
 ptlrpc_connection_get(lnet_process_id_t peer, lnet_nid_t self,
-                      struct obd_uuid *uuid)
+                     struct obd_uuid *uuid)
 {
-        struct ptlrpc_connection *conn, *conn2;
-        ENTRY;
+       struct ptlrpc_connection *conn, *conn2;
+       ENTRY;
 
-        conn = cfs_hash_lookup(conn_hash, &peer);
-        if (conn)
-                GOTO(out, conn);
+       peer.nid = LNetPrimaryNID(peer.nid);
+       conn = cfs_hash_lookup(conn_hash, &peer);
+       if (conn)
+               GOTO(out, conn);
 
-        OBD_ALLOC_PTR(conn);
-        if (!conn)
-                RETURN(NULL);
+       OBD_ALLOC_PTR(conn);
+       if (!conn)
+               RETURN(NULL);
 
-        conn->c_peer = peer;
-        conn->c_self = self;
+       conn->c_peer = peer;
+       conn->c_self = self;
        INIT_HLIST_NODE(&conn->c_hash);
        atomic_set(&conn->c_refcount, 1);
-        if (uuid)
-                obd_str2uuid(&conn->c_remote_uuid, uuid->uuid);
+       if (uuid)
+               obd_str2uuid(&conn->c_remote_uuid, uuid->uuid);
 
        /*
         * Add the newly created conn to the hash, on key collision we
         * lost a racing addition and must destroy our newly allocated
-        * connection.  The object which exists in the has will be
+        * connection.  The object which exists in the hash will be
         * returned and may be compared against out object.
         */
        /* In the function below, .hs_keycmp resolves to
index 77b73dc..2982837 100644 (file)
@@ -336,7 +336,9 @@ void request_in_callback(lnet_event_t *ev)
        if (ev->type == LNET_EVENT_PUT && ev->status == 0)
                req->rq_reqdata_len = ev->mlength;
        do_gettimeofday(&req->rq_arrival_time);
+       /* Multi-Rail: keep track of both initiator and source NID. */
        req->rq_peer = ev->initiator;
+       req->rq_source = ev->source;
        req->rq_self = ev->target.nid;
        req->rq_rqbd = rqbd;
        req->rq_phase = RQ_PHASE_NEW;
@@ -344,7 +346,8 @@ void request_in_callback(lnet_event_t *ev)
                CDEBUG(D_INFO, "incoming req@%p x%llu msgsize %u\n",
                       req, req->rq_xid, ev->mlength);
 
-       CDEBUG(D_RPCTRACE, "peer: %s\n", libcfs_id2str(req->rq_peer));
+       CDEBUG(D_RPCTRACE, "peer: %s (source: %s)\n",
+               libcfs_id2str(req->rq_peer), libcfs_id2str(req->rq_source));
 
        spin_lock(&svcpt->scp_lock);
 
index f86e6d2..268965f 100644 (file)
  * over \a conn connection to portal \a portal.
  * Returns 0 on success or error code.
  */
-static int ptl_send_buf (lnet_handle_md_t *mdh, void *base, int len,
-                         lnet_ack_req_t ack, struct ptlrpc_cb_id *cbid,
-                         struct ptlrpc_connection *conn, int portal, __u64 xid,
-                         unsigned int offset)
+static int ptl_send_buf(lnet_handle_md_t *mdh, void *base, int len,
+                       lnet_ack_req_t ack, struct ptlrpc_cb_id *cbid,
+                       lnet_nid_t self, lnet_process_id_t peer_id,
+                       int portal, __u64 xid, unsigned int offset,
+                       lnet_handle_md_t *bulk_cookie)
 {
-        int              rc;
-        lnet_md_t         md;
-        ENTRY;
+       int              rc;
+       lnet_md_t         md;
+       ENTRY;
 
-        LASSERT (portal != 0);
-        LASSERT (conn != NULL);
-        CDEBUG (D_INFO, "conn=%p id %s\n", conn, libcfs_id2str(conn->c_peer));
-        md.start     = base;
-        md.length    = len;
-        md.threshold = (ack == LNET_ACK_REQ) ? 2 : 1;
-        md.options   = PTLRPC_MD_OPTIONS;
-        md.user_ptr  = cbid;
-        md.eq_handle = ptlrpc_eq_h;
+       LASSERT (portal != 0);
+       CDEBUG (D_INFO, "peer_id %s\n", libcfs_id2str(peer_id));
+       md.start     = base;
+       md.length    = len;
+       md.threshold = (ack == LNET_ACK_REQ) ? 2 : 1;
+       md.options   = PTLRPC_MD_OPTIONS;
+       md.user_ptr  = cbid;
+       md.eq_handle = ptlrpc_eq_h;
+       LNetInvalidateHandle(&md.bulk_handle);
 
-        if (unlikely(ack == LNET_ACK_REQ &&
-                     OBD_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_ACK, OBD_FAIL_ONCE))){
-                /* don't ask for the ack to simulate failing client */
-                ack = LNET_NOACK_REQ;
-        }
+       if (bulk_cookie) {
+               md.bulk_handle = *bulk_cookie;
+               md.options |= LNET_MD_BULK_HANDLE;
+       }
 
-        rc = LNetMDBind (md, LNET_UNLINK, mdh);
-        if (unlikely(rc != 0)) {
-                CERROR ("LNetMDBind failed: %d\n", rc);
-                LASSERT (rc == -ENOMEM);
-                RETURN (-ENOMEM);
-        }
+       if (unlikely(ack == LNET_ACK_REQ &&
+                    OBD_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_ACK, OBD_FAIL_ONCE))){
+               /* don't ask for the ack to simulate failing client */
+               ack = LNET_NOACK_REQ;
+       }
+
+       rc = LNetMDBind (md, LNET_UNLINK, mdh);
+       if (unlikely(rc != 0)) {
+               CERROR ("LNetMDBind failed: %d\n", rc);
+               LASSERT (rc == -ENOMEM);
+               RETURN (-ENOMEM);
+       }
 
        CDEBUG(D_NET, "Sending %d bytes to portal %d, xid %lld, offset %u\n",
-               len, portal, xid, offset);
-
-        rc = LNetPut (conn->c_self, *mdh, ack,
-                      conn->c_peer, portal, xid, offset, 0);
-        if (unlikely(rc != 0)) {
-                int rc2;
-                /* We're going to get an UNLINK event when I unlink below,
-                 * which will complete just like any other failed send, so
-                 * I fall through and return success here! */
+              len, portal, xid, offset);
+
+       rc = LNetPut(self, *mdh, ack,
+                    peer_id, portal, xid, offset, 0);
+       if (unlikely(rc != 0)) {
+               int rc2;
+               /* We're going to get an UNLINK event when I unlink below,
+                * which will complete just like any other failed send, so
+                * I fall through and return success here! */
                CERROR("LNetPut(%s, %d, %lld) failed: %d\n",
-                       libcfs_id2str(conn->c_peer), portal, xid, rc);
-                rc2 = LNetMDUnlink(*mdh);
-                LASSERTF(rc2 == 0, "rc2 = %d\n", rc2);
-        }
+                      libcfs_id2str(peer_id), portal, xid, rc);
+               rc2 = LNetMDUnlink(*mdh);
+               LASSERTF(rc2 == 0, "rc2 = %d\n", rc2);
+       }
 
-        RETURN (0);
+       RETURN (0);
 }
 
 static void mdunlink_iterate_helper(lnet_handle_md_t *bd_mds, int count)
@@ -148,7 +154,8 @@ EXPORT_SYMBOL(ptlrpc_prep_bulk_exp);
 int ptlrpc_start_bulk_transfer(struct ptlrpc_bulk_desc *desc)
 {
        struct obd_export        *exp = desc->bd_export;
-       struct ptlrpc_connection *conn = exp->exp_connection;
+       lnet_nid_t                self_nid;
+       lnet_process_id_t         peer_id;
        int                       rc = 0;
        __u64                     mbits;
        int                       posted_md;
@@ -166,6 +173,14 @@ int ptlrpc_start_bulk_transfer(struct ptlrpc_bulk_desc *desc)
        LASSERT(desc->bd_cbid.cbid_fn == server_bulk_callback);
        LASSERT(desc->bd_cbid.cbid_arg == desc);
 
+       /*
+        * Multi-Rail: get the preferred self and peer NIDs from the
+        * request, so they are based on the route taken by the
+        * message.
+        */
+       self_nid = desc->bd_req->rq_self;
+       peer_id = desc->bd_req->rq_source;
+
        /* NB total length may be 0 for a read past EOF, so we send 0
         * length bulks, since the client expects bulk events.
         *
@@ -211,18 +226,18 @@ int ptlrpc_start_bulk_transfer(struct ptlrpc_bulk_desc *desc)
 
                /* Network is about to get at the memory */
                if (ptlrpc_is_bulk_put_source(desc->bd_type))
-                       rc = LNetPut(conn->c_self, desc->bd_mds[posted_md],
-                                    LNET_ACK_REQ, conn->c_peer,
+                       rc = LNetPut(self_nid, desc->bd_mds[posted_md],
+                                    LNET_ACK_REQ, peer_id,
                                     desc->bd_portal, mbits, 0, 0);
                else
-                       rc = LNetGet(conn->c_self, desc->bd_mds[posted_md],
-                                    conn->c_peer, desc->bd_portal, mbits, 0);
+                       rc = LNetGet(self_nid, desc->bd_mds[posted_md],
+                                    peer_id, desc->bd_portal, mbits, 0);
 
                posted_md++;
                if (rc != 0) {
                        CERROR("%s: failed bulk transfer with %s:%u x%llu: "
                               "rc = %d\n", exp->exp_obd->obd_name,
-                              libcfs_id2str(conn->c_peer), desc->bd_portal,
+                              libcfs_id2str(peer_id), desc->bd_portal,
                               mbits, rc);
                        break;
                }
@@ -243,7 +258,7 @@ int ptlrpc_start_bulk_transfer(struct ptlrpc_bulk_desc *desc)
 
        CDEBUG(D_NET, "Transferring %u pages %u bytes via portal %d "
               "id %s mbits %#llx-%#llx\n", desc->bd_iov_count,
-              desc->bd_nob, desc->bd_portal, libcfs_id2str(conn->c_peer),
+              desc->bd_nob, desc->bd_portal, libcfs_id2str(peer_id),
               mbits - posted_md, mbits - 1);
 
        RETURN(0);
@@ -537,9 +552,9 @@ static void ptlrpc_at_set_reply(struct ptlrpc_request *req, int flags)
  */
 int ptlrpc_send_reply(struct ptlrpc_request *req, int flags)
 {
-        struct ptlrpc_reply_state *rs = req->rq_reply_state;
-        struct ptlrpc_connection  *conn;
-        int                        rc;
+       struct ptlrpc_reply_state *rs = req->rq_reply_state;
+       struct ptlrpc_connection  *conn;
+       int                        rc;
 
         /* We must already have a reply buffer (only ptlrpc_error() may be
          * called without one). The reply generated by sptlrpc layer (e.g.
@@ -608,12 +623,12 @@ int ptlrpc_send_reply(struct ptlrpc_request *req, int flags)
 
         req->rq_sent = cfs_time_current_sec();
 
-        rc = ptl_send_buf (&rs->rs_md_h, rs->rs_repbuf, rs->rs_repdata_len,
-                           (rs->rs_difficult && !rs->rs_no_ack) ?
-                           LNET_ACK_REQ : LNET_NOACK_REQ,
-                          &rs->rs_cb_id, conn,
-                          ptlrpc_req2svc(req)->srv_rep_portal,
-                           req->rq_xid, req->rq_reply_off);
+       rc = ptl_send_buf(&rs->rs_md_h, rs->rs_repbuf, rs->rs_repdata_len,
+                         (rs->rs_difficult && !rs->rs_no_ack) ?
+                         LNET_ACK_REQ : LNET_NOACK_REQ,
+                         &rs->rs_cb_id, req->rq_self, req->rq_source,
+                         ptlrpc_req2svc(req)->srv_rep_portal,
+                         req->rq_xid, req->rq_reply_off, NULL);
 out:
         if (unlikely(rc != 0))
                 ptlrpc_req_drop_rs(req);
@@ -669,15 +684,18 @@ int ptlrpc_error(struct ptlrpc_request *req)
  */
 int ptl_send_rpc(struct ptlrpc_request *request, int noreply)
 {
-        int rc;
-        int rc2;
-        int mpflag = 0;
-        struct ptlrpc_connection *connection;
-        lnet_handle_me_t  reply_me_h;
-        lnet_md_t         reply_md;
+       int rc;
+       int rc2;
+       int mpflag = 0;
+       lnet_handle_md_t bulk_cookie;
+       struct ptlrpc_connection *connection;
+       lnet_handle_me_t  reply_me_h;
+       lnet_md_t         reply_md;
        struct obd_import *imp = request->rq_import;
        struct obd_device *obd = imp->imp_obd;
-        ENTRY;
+       ENTRY;
+
+       LNetInvalidateHandle(&bulk_cookie);
 
         if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_DROP_RPC))
                 RETURN(0);
@@ -767,12 +785,18 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply)
        if (rc)
                GOTO(out, rc);
 
-        /* bulk register should be done after wrap_request() */
-        if (request->rq_bulk != NULL) {
-                rc = ptlrpc_register_bulk (request);
-                if (rc != 0)
-                        GOTO(out, rc);
-        }
+       /* bulk register should be done after wrap_request() */
+       if (request->rq_bulk != NULL) {
+               rc = ptlrpc_register_bulk (request);
+               if (rc != 0)
+                       GOTO(out, rc);
+               /*
+                * All the mds in the request will have the same cpt
+                * encoded in the cookie. So we can just get the first
+                * one.
+                */
+               bulk_cookie = request->rq_bulk->bd_mds[0];
+       }
 
         if (!noreply) {
                 LASSERT (request->rq_replen != 0);
@@ -869,14 +893,14 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply)
 
        ptlrpc_pinger_sending_on_import(imp);
 
-        DEBUG_REQ(D_INFO, request, "send flg=%x",
-                  lustre_msg_get_flags(request->rq_reqmsg));
-        rc = ptl_send_buf(&request->rq_req_md_h,
-                          request->rq_reqbuf, request->rq_reqdata_len,
-                          LNET_NOACK_REQ, &request->rq_req_cbid,
-                          connection,
-                          request->rq_request_portal,
-                          request->rq_xid, 0);
+       DEBUG_REQ(D_INFO, request, "send flg=%x",
+                 lustre_msg_get_flags(request->rq_reqmsg));
+       rc = ptl_send_buf(&request->rq_req_md_h,
+                         request->rq_reqbuf, request->rq_reqdata_len,
+                         LNET_NOACK_REQ, &request->rq_req_cbid,
+                         LNET_NID_ANY, connection->c_peer,
+                         request->rq_request_portal,
+                         request->rq_xid, 0, &bulk_cookie);
        if (likely(rc == 0))
                GOTO(out, rc);